Example #1
import codecs
from nltk.corpus import reuters
from reuters_nlp import Tokenizer, Paths

# Preprocess script - build a single text file with cleaned, normalised documents
#  - tokenised, stemmed, one document per line.
# Track fileids to retrieve document text later

docs = 0
bad = 0

tokenizer = Tokenizer()

with open(Paths.text_index, "w") as fileid_out:
    with codecs.open(Paths.texts_clean, "w", "utf-8-sig") as out:

        for f in reuters.fileids():

            try:
                # Read inside the try so badly-encoded documents are skipped too
                contents = reuters.open(f).read()
                tokens = tokenizer.tokenize(contents)
                docs += 1
                if docs % 1000 == 0:
                    print "Normalised %d documents" % (docs)

                out.write(" ".join(tokens) + "\n")
                fileid_out.write(f + "\n")

            except UnicodeDecodeError:
                bad += 1

print "Normalised %d documents (%d skipped due to encoding errors)" % (docs, bad)
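
Both examples import Tokenizer and Paths from a custom reuters_nlp module that is not shown on this page. The sketch below is a minimal, hypothetical version of it, assuming NLTK's word_tokenize, English stopword list and Porter stemmer for normalisation; the file locations in Paths are illustrative placeholders, not the author's actual paths.

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


class Paths:
    # Hypothetical artefact locations shared by the preprocess and query scripts
    text_index = "data/reuters_fileids.txt"    # one corpus fileid per line
    texts_clean = "data/reuters_clean.txt"     # one normalised document per line
    dictionary = "data/reuters.dict"           # gensim Dictionary
    lsi_model = "data/reuters.lsi"             # gensim LsiModel
    similarity_index = "data/reuters.index"    # gensim MatrixSimilarity


class Tokenizer:
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords = set(stopwords.words("english"))

    def tokenize(self, text):
        # Lowercase, keep alphabetic tokens, drop stopwords, then stem
        tokens = [t.lower() for t in word_tokenize(text) if t.isalpha()]
        return [self.stemmer.stem(t) for t in tokens if t not in self.stopwords]

Note that the gensim dictionary, LSI model and similarity index loaded in Example #2 are not built by either script shown here; they would be produced in a separate training step from the Paths.texts_clean file written by Example #1.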
Example #2
from gensim import corpora, models, similarities
from reuters_nlp import Tokenizer, Paths

# Query script - find the documents that most closely match an input document
#  and examine the top topics to see which were the most important

# Similarity index
index = similarities.MatrixSimilarity.load(Paths.similarity_index)

# LSI model
model = models.LsiModel.load(Paths.lsi_model)

# Gensim dictionary (word <-> feature ID mapping)
dictionary = corpora.Dictionary.load(Paths.dictionary)

# Custom tokeniser/normaliser
tokenizer = Tokenizer()

# Recall NLTK corpus fileids to retrieve matching document
fileids = []
with open(Paths.text_index) as f:
	fileids = [line.rstrip() for line in f]


print "Enter a query document:"
s = raw_input('> ')

while s and s != 'exit':

	# Convert input document into LSI vector space
	tokens = tokenizer.tokenize(s)
	bow_vector = dictionary.doc2bow(tokens)