import codecs
import os
import time

from nltk.corpus import reuters
from reuters_nlp import Tokenizer, Paths


def preProcess():
    print 'PreProcess Reuters Corpus'
    start_time = time.time()
    docs = 0
    bad = 0
    tokenizer = Tokenizer()

    if not os.path.isdir(Paths.base):
        os.makedirs(Paths.base)

    with open(Paths.text_index, 'w') as fileid_out:
        with codecs.open(Paths.texts_clean, 'w', 'utf-8-sig') as out:
            with codecs.open(Paths.reuter_test, 'w', 'utf-8-sig') as test:
                for f in reuters.fileids():
                    contents = reuters.open(f).read()
                    try:
                        tokens = tokenizer.tokenize(contents)
                        docs += 1
                        if docs % 1000 == 0:
                            print "Normalised %d documents" % (docs)

                        out.write(' '.join(tokens) + "\n")
                        # if f.startswith("train"):
                        #
                        # else:
                        #     test.write(' '.join(tokens) + "\n")
                        fileid_out.write(f + "\n")
                    except UnicodeDecodeError:
                        bad += 1

    print "Normalised %d documents" % (docs)
    print "Skipped %d bad documents" % (bad)
    print 'Finished building train file ' + Paths.texts_clean

    end_time = time.time()
    print '(Time to preprocess Reuters Corpus: %s)' % (end_time - start_time)
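# The Tokenizer imported from reuters_nlp is not shown in this file; the class
# below is only a minimal sketch of what it could look like (lowercasing,
# stopword removal, Porter stemming). The stopword and stemming choices are
# assumptions for illustration, not the actual reuters_nlp implementation.
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


class SketchTokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords = set(stopwords.words('english'))

    def tokenize(self, text):
        # Keep lowercase alphabetic tokens, drop stopwords, stem what remains
        words = re.findall(r'[a-z]+', text.lower())
        return [self.stemmer.stem(w) for w in words if w not in self.stopwords]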
import numpy as np
from gensim import corpora, models, similarities
from nltk.corpus import reuters


def test(dictionary, model, index, test_document):
    write_data_c = []
    write_contents = []
    write_contents_tokens = []

    token = tokenize(test_document.strip())
    a = ' '.join(token) + "\n"
    print "ACTUAL SENTENCE: " + a

    # Project the query into the model's vector space and rank all documents
    test_model = model[dictionary.doc2bow(token)]
    similarities = index[test_model]
    similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
    # print "SIMILARITIES"
    # print similarities

    for i in range(0, 5):
        (file_no, score) = similarities[i]
        fileid = fileids[file_no]
        matched = reuters.open(fileid).read()
        # tokens_contents = wordpunct_tokenize(matched.strip())
        match_tokens = tokenize(matched)
        match_lsi = model[dictionary.doc2bow(match_tokens)]

        # Per-topic difference between the query vector and the matched document
        difference = np.absolute(np.array([e[1] for e in test_model]) -
                                 np.array([e[1] for e in match_lsi]))
        # deltas = np.absolute(deltas)
        # print deltas
        topics = sorted(enumerate(difference), key=lambda e: -e[1])
        topic = model.show_topic(topics[0][0])
        words = [e[0] for e in topic]
        # print token

        print "The popular stems were"
        for word in words:
            print word
        print "The closest document to the query is : %s \n" % fileid
        print matched, "\n"
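# tokenize() is used by test() and by the corpus-building code further down but
# is not defined in this snippet; the commented-out wordpunct_tokenize call above
# suggests something like the minimal sketch below - an assumption, not the
# original helper.
from nltk.tokenize import wordpunct_tokenize


def tokenize(text):
    # Lowercase, split on word/punctuation boundaries, keep alphabetic tokens
    return [t for t in wordpunct_tokenize(text.lower()) if t.isalpha()]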
def main():
    q = input("enter a query to be processed> ")
    while not q:
        q = input("no empty queries please> ")

    dp = DataProcessor()
    # list_doc = dp.process_texts(sys.argv[1:])

    reuters_texts = []
    # Working with the first 200 files from the Reuters corpus
    reuters_data = reuters.fileids()[:200]
    for data in reuters_data:
        file_str = ""
        # Concatenate the file contents into a single string
        file = reuters.open(data)
        for line in file:
            file_str = file_str + line
        file_str = file_str.replace('\n', '')
        file_str = file_str.replace("   ", " ")
        file_str = file_str.replace("  ", " ")
        reuters_texts.append(file_str)

    # for text in reuters_texts:
    #     print(str(text) + "\n")
    # print(reuters_texts)  # used for debugging purposes

    [document_frequency, term_frequency_document] = dp.inverted_index(reuters_texts)
    """Returns the document frequency and per-document term frequency,
    both of which are required when calculating TF-IDF."""

    term_weights = dp.compute_weights(term_frequency_document, reuters_texts)
    # Print the term weights
    # for term, weights in term_weights.items():
    #     print(term, " ", weights)

    print("document_frequency: ", document_frequency)

    [total_collection, total_distinct_terms] = dp.get_collection_lengths(reuters_texts)
    [similarity, sorted_doc_list] = dp.bm25(reuters_texts, document_frequency,
                                            term_frequency_document, q)
    document_lengths = dp.get_doc_length(reuters_texts)
    query_likelyhood_scores = dp.query_likelyhood(reuters_texts, document_lengths,
                                                  total_collection, total_distinct_terms, 0.5)
    modded_query_vector = dp.rocchioAlgorithm(reuters_texts, term_weights, q, 1, 1, 1)
    precision_score = dp.precision(q, reuters_texts)

    # Output statements
    # print("total_collection: ", total_collection)
    # print("document lengths: ", document_lengths)
    print("Query: ", q)
    print("using bm25 smoothing: ", similarity)
    # print("sorted_doc_list: ", sorted_doc_list)
    print("query_likelyhood_scores: ", query_likelyhood_scores)
    print("modded_query_vector taken from Rocchio's algorithm: ", modded_query_vector)
    print("precision score from precision function for the query " + q + ": ", precision_score)
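# DataProcessor.bm25 is not shown in this snippet; the function below is a generic
# sketch of Okapi BM25 scoring under standard parameter choices (k1=1.5, b=0.75),
# included only to illustrate the formula - it is not the project's implementation.
import math


def bm25_score(query_terms, doc_terms, doc_freq, num_docs, avg_doc_len,
               k1=1.5, b=0.75):
    """Score one document (a list of terms) against a query with Okapi BM25."""
    score = 0.0
    doc_len = len(doc_terms)
    for term in query_terms:
        tf = doc_terms.count(term)
        if tf == 0 or term not in doc_freq:
            continue
        # Robertson-Sparck Jones IDF with the usual +0.5 smoothing
        idf = math.log((num_docs - doc_freq[term] + 0.5) / (doc_freq[term] + 0.5) + 1)
        # Term-frequency saturation with document-length normalisation
        score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
    return score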
import codecs

from nltk.corpus import reuters
from reuters_nlp import Tokenizer, Paths

# Preprocess script - build a single text file with cleaned, normalised documents
# - tokenised, stemmed, one document per line.
# Track fileids to retrieve document text later.

docs = 0
bad = 0
tokenizer = Tokenizer()

with open(Paths.text_index, "w") as fileid_out:
    with codecs.open(Paths.texts_clean, "w", "utf-8-sig") as out:
        for f in reuters.fileids():
            contents = reuters.open(f).read()
            try:
                tokens = tokenizer.tokenize(contents)
                docs += 1
                if docs % 1000 == 0:
                    print "Normalised %d documents" % (docs)

                out.write(" ".join(tokens) + "\n")
                fileid_out.write(f + "\n")
            except UnicodeDecodeError:
                bad += 1

print "Normalised %d documents" % (docs)
print "Skipped %d bad documents" % (bad)
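# Paths (imported from reuters_nlp above) is not defined in this snippet; the
# class below only sketches the attributes the scripts rely on, with placeholder
# directory and file names that are assumptions rather than the project's real paths.
import os


class SketchPaths(object):
    base = os.path.join("data", "reuters")
    text_index = os.path.join(base, "fileids.txt")        # one Reuters fileid per line
    texts_clean = os.path.join(base, "texts_clean.txt")   # one normalised document per line
    reuter_test = os.path.join(base, "texts_test.txt")    # held-out documents (unused above)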
# Build the corpus: tokenise every Reuters document, then train TF-IDF and LSI
# models and a similarity index over the LSI space
documents = reuters.fileids()
fileids = []
write_data_filtered = []
for each_doc in documents:
    fileids.append(each_doc)
    doc = reuters.open(each_doc).read()
    write_data = tokenize(doc)
    write_data_filtered.append(write_data)

dictionary = corpora.Dictionary()
for w in write_data_filtered:
    dictionary.doc2bow(w, allow_update=True)

corpus = list(corpus_building(write_data_filtered, dictionary))
# print "CORPUS"
# print dictionary

model = models.TfidfModel(corpus)
corpus_tfidf = model[corpus]
# print corpus_tfidf

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
corpus_lsi = lsi[corpus_tfidf]
similarityModel = similarities.MatrixSimilarity(corpus_lsi)
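# corpus_building is referenced above but not defined in this snippet; given how
# its result is fed into TfidfModel, a minimal sketch would yield one bag-of-words
# vector per tokenised document. The test() call that follows is likewise only an
# assumed example query showing how the pieces above fit together.
def corpus_building(tokenised_docs, dictionary):
    for tokens in tokenised_docs:
        yield dictionary.doc2bow(tokens)


test(dictionary, lsi, similarityModel, "oil prices rose sharply after the opec meeting")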
    @property
    def entities(self):
        return self._entities

    # Entities adder
    def addEntity(self, entity):
        self.entities.append(entity)

    def __repr__(self):
        return self.id

#%% Execution
doc_list = []
for doc in reuters.fileids():
    doc_ob = Document(doc, reuters.open(doc).read())
    for ent in doc_ob.model.ents:
        doc_ob.addEntity(Entity(ent))
    doc_list.append(doc_ob)

# Writing
f = open('doc_list', 'wb')
pickle.dump(doc_list, f)
f.close()

# Reading
# f = open('doc_list', 'rb')
# doc_list = pickle.load(f)
# f.close()

# Runtime
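# The Document and Entity classes used above are only partially shown; judging by
# doc_ob.model.ents, the model attribute is probably a spaCy Doc. The sketch below
# is an assumed reconstruction for illustration (including the "en_core_web_sm"
# model name), not the original class definitions.
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model name


class Entity(object):
    def __init__(self, span):
        # Store the surface text and label of a spaCy entity span
        self.text = span.text
        self.label = span.label_

    def __repr__(self):
        return "%s (%s)" % (self.text, self.label)


class Document(object):
    def __init__(self, doc_id, text):
        self.id = doc_id
        self.model = nlp(text)   # parsed spaCy Doc for the raw Reuters text
        self._entities = []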
while s and s != 'exit':
    # Convert input document into LSI vector space
    tokens = tokenizer.tokenize(s)
    bow_vector = dictionary.doc2bow(tokens)
    lsi_vector = model[bow_vector]

    # Compute similarity of input vector to all document vectors
    similarities = index[lsi_vector]
    similarities = sorted(enumerate(similarities), key=lambda item: -item[1])

    # Get contents of the most similar document
    (file_no, score) = similarities[0]
    fileid = fileids[file_no]
    contents = reuters.open(fileid).read()

    # Re-convert most similar document to LSI space to examine the similarity
    match_tokens = tokenizer.tokenize(contents.strip())
    match_bow_vector = dictionary.doc2bow(match_tokens)
    match_lsi_vector = model[match_bow_vector]

    # Find the topic (LSI vector element) with the smallest difference
    # between the corpus document and the query document - this should
    # be the topic that contributed the most to the similarity
    lsi_values = np.array([e[1] for e in lsi_vector])
    match_lsi_values = np.array([e[1] for e in match_lsi_vector])
    deltas = np.absolute(lsi_values - match_lsi_values)

    # Sort to bring the most important topics to the start of the list
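    # A possible continuation (the original is not shown): sort the per-topic
    # deltas as described in the comment above, report the top topic's stems and
    # the matched document as the test() function earlier does, then read the
    # next query so the loop can terminate on 'exit'.
    ranked_topics = sorted(enumerate(deltas), key=lambda e: e[1])
    topic = model.show_topic(ranked_topics[0][0])
    words = [e[0] for e in topic]

    print "The popular stems were"
    for word in words:
        print word
    print "The closest document to the query is : %s \n" % fileid
    print contents, "\n"

    s = raw_input("enter another query (or 'exit')> ")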