# Preprocess script - build a single text file with cleaned, normalised documents
# - tokenised, stemmed, one document per line.
# Track fileids to retrieve document text later
import codecs

from nltk.corpus import reuters

# Tokenizer and Paths are project-local helpers (the custom tokeniser/normaliser
# and shared file locations), assumed to be importable from elsewhere in the project.
docs = 0
bad = 0
tokenizer = Tokenizer()

with open(Paths.text_index, "w") as fileid_out:
    with codecs.open(Paths.texts_clean, "w", "utf-8-sig") as out:
        for f in reuters.fileids():
            contents = reuters.open(f).read()
            try:
                tokens = tokenizer.tokenize(contents)
                docs += 1
                if docs % 1000 == 0:
                    print "Normalised %d documents" % (docs)
                out.write(" ".join(tokens) + "\n")
                fileid_out.write(f + "\n")
            except UnicodeDecodeError:
                bad += 1

print "Normalised %d documents" % (docs)
print "Skipped %d bad documents" % (bad)
print "Finished building " + Paths.texts_clean
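The query script below assumes three gensim objects - a dictionary, an LSI model and a similarity index - built from the cleaned texts file produced above. That build step is not shown in this section; the following is only a minimal sketch of how it could look with gensim, where num_topics=200 and the choice of MatrixSimilarity are assumptions rather than the original settings.

# Sketch (assumed): build the dictionary, LSI model and similarity index
# from the cleaned, tokenised documents written by the preprocess script.
import codecs

from gensim import corpora, models, similarities

with codecs.open(Paths.texts_clean, "r", "utf-8-sig") as f:
    texts = [line.split() for line in f]              # one tokenised document per line

dictionary = corpora.Dictionary(texts)                # token <-> id mapping
corpus = [dictionary.doc2bow(text) for text in texts] # bag-of-words vectors
model = models.LsiModel(corpus, id2word=dictionary, num_topics=200)  # num_topics is a guess
index = similarities.MatrixSimilarity(model[corpus])  # dense cosine-similarity index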
# Custom tokeniser/normaliser
tokenizer = Tokenizer()

# Recall NLTK corpus fileids to retrieve matching documents
fileids = []
with open(Paths.text_index) as f:
    fileids = [line.rstrip() for line in f]

# dictionary, model and index are the gensim dictionary, LSI model and
# similarity index built/loaded earlier in the script (not shown here).
print "Enter a query document:"
s = raw_input('> ')
while s and s != 'exit':
    # Convert input document into LSI vector space
    tokens = tokenizer.tokenize(s)
    bow_vector = dictionary.doc2bow(tokens)
    lsi_vector = model[bow_vector]

    # Compute similarity of input vector to all document vectors
    similarities = index[lsi_vector]
    similarities = sorted(enumerate(similarities), key=lambda item: -item[1])

    # Get contents of the most similar document
    (file_no, score) = similarities[0]
    fileid = fileids[file_no]
    contents = reuters.open(fileid).read()

    # Re-convert most similar document to LSI space to examine similarity
    match_tokens = tokenizer.tokenize(contents.strip())
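Both scripts rely on the project's custom Tokenizer, which is not shown in this section. Going only by the comment in the preprocess script ("tokenised, stemmed"), a minimal NLTK-based sketch of what such a class might look like follows; the lower-casing, stopword filtering and choice of PorterStemmer are assumptions, not the original implementation.

# Sketch (assumed): a Tokenizer of the kind the scripts above expect -
# lower-case, drop stopwords and punctuation, then stem. Details are guesses.
import string

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

class Tokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords = set(stopwords.words("english"))

    def tokenize(self, text):
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens
                  if t not in self.stopwords and t not in string.punctuation]
        return [self.stemmer.stem(t) for t in tokens]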