Example #1
def preProcess():
    print 'PreProcess Reuters Corpus'
    start_time = time.time()
    docs = 0
    bad = 0
    tokenizer = Tokenizer()

    if not os.path.isdir(Paths.base):
        os.makedirs(Paths.base)

    with open(Paths.text_index, 'w') as fileid_out:
        with codecs.open(Paths.texts_clean, 'w', 'utf-8-sig') as out:
            with codecs.open(Paths.reuter_test, 'w', 'utf-8-sig') as test:
                for f in reuters.fileids():
                    contents = reuters.open(f).read()
                    try:
                        tokens = tokenizer.tokenize(contents)
                        docs += 1
                        if docs % 1000 == 0:
                            print "Normalised %d documents" % (docs)

                        out.write(' '.join(tokens) + "\n")
                        # if f.startswith("train"):
                        #
                        # else:
                        #     test.write(' '.join(tokens) + "\n")
                        fileid_out.write(f + "\n")

                    except UnicodeDecodeError:
                        bad += 1
    print "Normalised %d documents" % (docs)
    print "Skipped %d bad documents" % (bad)
    print 'Finished building train file ' + Paths.texts_clean
    end_time = time.time()
    print '(Time to preprocess Reuters Corpus: %s)' % (end_time - start_time)
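Example #1 is shown without its imports. A minimal sketch of the surrounding module, assuming the NLTK Reuters corpus reader and the reuters_nlp helpers that Example #4 imports (the exact module layout of the original project is an assumption):

import codecs
import os
import time

# Assumption: 'reuters' is the NLTK Reuters corpus reader; Tokenizer and Paths
# come from the project's reuters_nlp module, as in Example #4.
from nltk.corpus import reuters
from reuters_nlp import Tokenizer, Paths

if __name__ == '__main__':
    preProcess()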
Example #2
def test(dictionary, model, index, test_document):
        write_data_c=[]
        write_contents=[]
        write_contents_tokens=[]
        token=tokenize(test_document.strip())
        a=' '.join(token)+"\n"
        print "ACTUAL SENTENCE: " + a
        test_model=model[dictionary.doc2bow(token)]
        similarities=index[test_model]
        similarities=sorted(enumerate(similarities), key=lambda item : -item[1])
        #print "SIMILARITIES"
        #print similarities     
        for i in range(0,5):
                (file_no,score)=similarities[i]
                fileid=fileids[file_no]
                matched=reuters.open(fileid).read()
                #tokens_contents=wordpunct_tokenize(matched.strip())            
                match_tokens=tokenize(matched)
                match_lsi=model[dictionary.doc2bow(match_tokens)]
                difference= np.absolute(np.array([e[1] for e in test_model]) - np.array([e[1] for e in match_lsi]))
                #deltas=np.absolute(deltas)
                #print deltas
                #print list1
                topics=sorted(enumerate(difference), key= lambda e: -e[1])
                topic=model.show_topic(topics[0][0])
                words=[e[0] for e in topic]
                #print token
                print "The popular stems were"
                for word in words:
                        print word
                print "The closest document to the query is : %s \n"%fileid
                print matched,"\n"
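The test() helper above also depends on module-level fileids and tokenize(), which Example #5 sets up. A minimal sketch of calling it with the objects built there (the query string is purely illustrative):

# dictionary, lsi, similarityModel and fileids come from Example #5
query = "crude oil prices rose in asian trading"   # illustrative query text
test(dictionary, lsi, similarityModel, query)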
Example #3
def main():
    q = input("enter a query to be processed> ")
    while not q:
        q = input("no empty queries please> ")

    dp = DataProcessor()
    # list_doc = dp.process_texts(sys.argv[1:])


    reuters_texts = []
    #Working with the first 200 files from the reuters library
    reuters_data = reuters.fileids()[:200]
    for data in reuters_data:
        file_str = "" #concatinate file to string
        file = reuters.open(data)
        for line in file:
            file_str = file_str + line
        file_str = file_str.replace('\n','')
        file_str = file_str.replace("  "," ")
        file_str = file_str.replace("   ", " ")
        reuters_texts.append(file_str)

    # for text in reuters_texts:
    #     print(str(text)+"\n")
    # #print(reuters_texts) # used for debugging purposes

    # inverted_index returns the document frequency and the per-document term
    # frequency; both are required when calculating the TF-IDF weights.
    [document_frequency, term_frequency_document] = dp.inverted_index(reuters_texts)
    term_weights = dp.compute_weights(term_frequency_document,reuters_texts)
    # print the term weights
    # for term,weights in term_weights.items():
    #     print(term," ",weights)

    print("document_frequency: ", document_frequency)
    [total_collection, total_distinct_terms] = dp.get_collection_lengths(reuters_texts)
    [similarity,sorted_doc_list] = dp.bm25(reuters_texts,document_frequency,term_frequency_document,q)
    document_lengths = dp.get_doc_length(reuters_texts)
    query_likelyhood_scores = dp.query_likelyhood(reuters_texts,document_lengths,total_collection,total_distinct_terms,.5)
    modded_query_vector = dp.rocchioAlgorithm(reuters_texts,term_weights,q,1,1,1)
    precision_score = dp.precision(q,reuters_texts)


    #output statements
    #print("total_collection: ",total_collection)
    #print("document lengths: " ,document_lengths)
    print("Query: ",q)
    print("using bm25 smoothing: ", similarity)
    #print("sorted_doc_list: ",sorted_doc_list)
    print("query_likelyhood_scores: ",query_likelyhood_scores)
    print("modded_query_vector taken from Rocchios algorithm: ",modded_query_vector)
    print("precision score from precision function for the query " + q + ": ", precision_score)
Example #4
import codecs

# Assumption: 'reuters' is the NLTK Reuters corpus reader.
from nltk.corpus import reuters
from reuters_nlp import Tokenizer, Paths

# Preprocess script - build a single text file with cleaned, normalised documents
#  - tokenised, stemmed, one document per line.
# Track fileids to retrieve document text later

docs = 0
bad = 0

tokenizer = Tokenizer()

with open(Paths.text_index, "w") as fileid_out:
    with codecs.open(Paths.texts_clean, "w", "utf-8-sig") as out:

        for f in reuters.fileids():
            contents = reuters.open(f).read()

            try:
                tokens = tokenizer.tokenize(contents)
                docs += 1
                if docs % 1000 == 0:
                    print "Normalised %d documents" % (docs)

                out.write(" ".join(tokens) + "\n")
                fileid_out.write(f + "\n")

            except UnicodeDecodeError:
                bad += 1

print "Normalised %d documents" % (docs)
print "Skipped %d bad documents" % (bad)
Example #5
documents=reuters.fileids()
fileids=[]              # track fileids so matches can be mapped back to documents
write_data_filtered=[]
for each_doc in documents:
        fileids.append(each_doc)
        doc=reuters.open(each_doc).read()
        write_data=tokenize(doc)
        write_data_filtered.append(write_data)
dictionary = corpora.Dictionary()
for w in write_data_filtered:
        dictionary.doc2bow(w, allow_update=True)

corpus=list(corpus_building(write_data_filtered,dictionary))
#print "CORPUS"
#print dictionary
model=models.TfidfModel(corpus)
corpus_tfidf=model[corpus]
#print corpus_tfidf
lsi=models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
corpus_lsi=lsi[corpus_tfidf]
similarityModel=similarities.MatrixSimilarity(corpus_lsi)
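Example #5 calls a corpus_building() helper that is not shown. Judging from how its result is wrapped in list() and fed to TfidfModel, it most likely just maps each token list to a bag-of-words vector; a plausible minimal version (an assumption, not the original code):

def corpus_building(token_lists, dictionary):
        # Yield one bag-of-words vector per tokenised document.
        for tokens in token_lists:
                yield dictionary.doc2bow(tokens)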
Example #6
    @property
    def entities(self):
        return self._entities

    #Entities Adder
    def addEntity(self, entity):
        self.entities.append(entity)

    def __repr__(self):
        return self.id


#%% Execution
doc_list = []
for doc in reuters.fileids():
    doc_ob = Document(doc, reuters.open(doc).read())
    for ent in doc_ob.model.ents:
        doc_ob.addEntity(Entity(ent))
    doc_list.append(doc_ob)

#Writing
with open('doc_list', 'wb') as f:
    pickle.dump(doc_list, f)

#Reading
# with open('doc_list', 'rb') as f:
#     doc_list = pickle.load(f)

#Runtime
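Example #6 shows only part of the Document class, and the snippet also needs import pickle. The attribute access doc_ob.model.ents suggests a spaCy pipeline; a rough sketch of the class shapes this implies (the spaCy model name and the Entity fields are assumptions, not taken from the original):

import pickle

import spacy

nlp = spacy.load('en_core_web_sm')   # assumed model name

class Entity:
    def __init__(self, ent):
        # Keep only plain strings so the objects pickle cleanly.
        self.text = ent.text
        self.label = ent.label_

class Document:
    def __init__(self, id, text):
        self.id = id
        self.model = nlp(text)       # parsed document exposing .ents
        self._entities = []
    # plus the entities property, addEntity and __repr__ shown above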
Example #7
while s and s != 'exit':

	# Convert input document into LSI vector space
	tokens = tokenizer.tokenize(s)
	bow_vector = dictionary.doc2bow(tokens)
	lsi_vector = model[bow_vector]

	# Compute similarity of input vector to all document vectors
	similarities = index[lsi_vector]
	similarities = sorted(enumerate(similarities), key=lambda item: -item[1])

	# Get contents of most similar documents
	(file_no, score) = similarities[0]
	fileid = fileids[file_no]
	contents = reuters.open(fileid).read()

	# Re-convert most similar document to LSI space
	#  to examine similarity
	match_tokens = tokenizer.tokenize(contents.strip())
	match_bow_vector = dictionary.doc2bow(match_tokens)
	match_lsi_vector = model[match_bow_vector]

	# Find the topic (LSI vector element) with the smallest difference
	#  between the corpus document and the query document - this should
	#  be the topic that contributed the most to the similarity
	lsi_values = np.array([e[1] for e in lsi_vector])
	match_lsi_values = np.array([e[1] for e in match_lsi_vector])
	deltas = np.absolute(lsi_values - match_lsi_values)
	
	# Sort to bring the most important topics to the start of the list
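Example #7 is truncated after the final comment. Based on the comments above and the parallel logic in Example #2, the loop body plausibly continues along these lines (sorted ascending so the smallest delta, i.e. the most shared topic, comes first; the print strings mirror Example #2 and the re-prompt at the end is a guess):

	topics = sorted(enumerate(deltas), key=lambda item: item[1])
	topic = model.show_topic(topics[0][0])
	words = [e[0] for e in topic]

	print "The popular stems were"
	for word in words:
		print word
	print "The closest document to the query is : %s \n" % fileid
	print contents, "\n"

	s = raw_input("Enter a query (or 'exit' to quit): ")   # prompt text is illustrative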