-
Notifications
You must be signed in to change notification settings - Fork 0
/
query.py
74 lines (56 loc) · 2.4 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from gensim import corpora, models, similarities
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import reuters
import numpy as np
from reuters_nlp import Tokenizer, Paths
# Query script - find the documents that most closely match an input document
# and examine the top topics to see which were the most important
# Gensim dictionary (word <-> feature ID mapping), used to turn tokens
# into bag-of-words vectors
dictionary = corpora.Dictionary.load(Paths.dictionary)
# Trained LSI model that projects bag-of-words vectors into topic space
model = models.LsiModel.load(Paths.lsi_model)
# Similarity index queried with LSI vectors
index = similarities.MatrixSimilarity.load(Paths.similarity_index)
# Custom tokeniser/normaliser
tokenizer = Tokenizer()
# NLTK corpus fileids, one per line of the text index file; a document
# number returned by the similarity index is looked up in this list to
# retrieve the matching document
with open(Paths.text_index) as text_index_file:
    fileids = [entry.rstrip() for entry in text_index_file]
# Interactive query loop - for every query document typed at the prompt,
# print the closest corpus document and the LSI topic on which the query
# and that document agree most closely.

# raw_input only exists on Python 2; fall back to input so the prompt
# also works on Python 3
try:
    _read_line = raw_input
except NameError:
    _read_line = input


def _read_queries(prompt='> '):
    """Yield query lines until an empty line, 'exit' or end-of-input."""
    while True:
        try:
            line = _read_line(prompt)
        except EOFError:
            # Ctrl-D / closed stdin ends the session instead of a traceback
            return
        if not line or line == 'exit':
            return
        yield line


def _topic_words(topic):
    """Return the words from a list of topic pairs as produced by show_topic.

    Each pair holds one word and one numeric weight.  The position of the
    word inside the pair has differed between gensim releases, so the
    non-numeric element is picked instead of relying on a fixed index.
    """
    return [second if isinstance(first, (float, np.floating)) else first
            for first, second in topic]


print("Enter a query document:")
for s in _read_queries():
    # Convert input document into LSI vector space
    tokens = tokenizer.tokenize(s)
    bow_vector = dictionary.doc2bow(tokens)
    lsi_vector = model[bow_vector]
    if not lsi_vector:
        # An empty vector gives nothing to rank documents by, and the
        # topic comparison below would have nothing to align
        print("No known terms in the query - nothing to match.")
        print("-" * 80)
        continue

    # Compute similarity of input vector to all document vectors.
    # Deliberately not named `similarities`: that would rebind the gensim
    # module imported at the top of the file.
    scores = index[lsi_vector]
    # Only the best match is needed, so take the arg-max instead of
    # sorting every score (first maximum wins, as with a stable sort)
    file_no = int(np.argmax(scores))
    score = scores[file_no]

    # Get contents of most similar document, closing the corpus stream
    fileid = fileids[file_no]
    stream = reuters.open(fileid)
    try:
        contents = stream.read()
    finally:
        stream.close()

    # Re-convert most similar document to LSI space
    # to examine similarity
    match_tokens = tokenizer.tokenize(contents.strip())
    match_bow_vector = dictionary.doc2bow(match_tokens)
    match_lsi_vector = model[match_bow_vector]

    # Find the topic (LSI vector element) with the smallest difference
    # between the corpus document and the query document - this should
    # be the topic that contributed the most to the similarity.
    # The vectors are sparse (topic_id, weight) lists that may omit
    # entries, so align them by topic id rather than by list position.
    query_weights = dict(lsi_vector)
    match_weights = dict(match_lsi_vector)
    shared_topics = sorted(set(query_weights) & set(match_weights))

    print("The closest matching document:")
    print("%s - %.3f" % (fileid, score))
    print("%s\n" % contents)
    if shared_topics:
        # min() keeps the lowest topic id on ties, like a stable sort
        closest_topic = min(
            shared_topics,
            key=lambda topic_id: abs(query_weights[topic_id]
                                     - match_weights[topic_id]))
        top_topic_words = _topic_words(model.show_topic(closest_topic, 20))
        print("The most significant topic included these stems:")
        print("%s\n" % ', '.join(top_topic_words))
    else:
        print("The query and the match share no topics to compare.\n")
    print("-" * 80)