import codecs

from nltk.corpus import reuters

from reuters_nlp import Tokenizer, Paths

# Preprocess script - build a single text file with cleaned, normalised
# documents - tokenised, stemmed, one document per line.

docs = 0  # documents successfully normalised
bad = 0   # documents skipped because of Unicode decoding errors
tokenizer = Tokenizer()

# Write cleaned text and the corresponding corpus fileids in lockstep, so
# line N of the index file names the source document of cleaned line N.
with open(Paths.text_index, "w") as fileid_out:
    with codecs.open(Paths.texts_clean, "w", "utf-8-sig") as out:
        for f in reuters.fileids():
            # reuters.open() returns a stream that must be closed explicitly;
            # the original leaked one handle per document.
            stream = reuters.open(f)
            try:
                contents = stream.read()
            finally:
                stream.close()

            try:
                tokens = tokenizer.tokenize(contents)
                docs += 1
                if docs % 1000 == 0:
                    # Progress indicator for long corpus runs.
                    print("Normalised %d documents" % docs)
                out.write(" ".join(tokens) + "\n")
                fileid_out.write(f + "\n")
            except UnicodeDecodeError:
                # Some Reuters documents contain undecodable bytes; count and
                # skip them rather than aborting the whole preprocessing run.
                bad += 1

# Final tally - the original counted `bad` but never reported it.
print("Normalised %d documents (%d skipped due to encoding errors)" % (docs, bad))
from reuters_nlp import Tokenizer, Paths # Query script - find the documents that most closely match an input document # and examine the top topics to see which were the most important # Similarity index index = similarities.MatrixSimilarity.load(Paths.similarity_index) # LSI model model = models.LsiModel.load(Paths.lsi_model) # Gensim dictionary (word <-> feature ID mapping) dictionary = corpora.Dictionary.load(Paths.dictionary) # Custom tokeniser/normaliser tokenizer = Tokenizer() # Recall NLTK corpus fileids to retrieve matching document fileids = [] with open(Paths.text_index) as f: fileids = [line.rstrip() for line in f] print "Enter a query document:" s = raw_input('> ') while s and s != 'exit': # Convert input document into LSI vector space tokens = tokenizer.tokenize(s) bow_vector = dictionary.doc2bow(tokens)