forked from mpercy/cs224u
-
Notifications
You must be signed in to change notification settings - Fork 0
/
esa.py
170 lines (130 loc) · 6.64 KB
/
esa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python
import heapq
import logging
import sys
from gensim import utils
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity, Similarity
from nltk.tokenize import wordpunct_tokenize
# This module only defines model classes; when executed directly as a
# script there is nothing to do, so exit immediately with success.
if __name__ == '__main__':
    sys.exit(0)

# Module-level logger shared by both model classes below.
logger = logging.getLogger("ESA")
class ESAModel(object):
    """
    Encapsulates an ESA model.

    Provide a model prefix, and the class takes care of loading everything else.
    Sharded files are mmapped and read in lazily, so the first time accessing a
    particular shard will display some slowdown until the caches get warmed up.
    """

    def __init__(self, model_prefix=None, num_best=None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")

    def num_features(self):
        """ Returns number of documents in the index. """
        # Features in the similarity model are actually documents.
        return len(self.similarity_index)

    def num_terms(self):
        """ Returns the number of words / features / terms used in the vocabulary. """
        return len(self.dictionary)

    def featurize(self, input_str, num_best=None, use_reverse_index=True):
        """
        Returns similar documents by cosine similarity based on TF-IDF score.

        If num_best is left as None, returns a numpy.array with a score for
        every document in the corpus. Otherwise, it returns the top-K scored
        items as a list of (doc_idx, score) tuples.

        If use_reverse_index is set to False, the forward index is used (and
        the full corpus is queried). This is only a good idea when the number
        of terms in the input string is big, such as the text of a long article.
        For short documents, using the reverse index is usually much faster.
        """
        logger.debug("input string: %s", input_str)

        # Normalize to unicode, tokenize, and lowercase every token.
        text = utils.to_utf8(input_str, errors='replace').decode("utf8")
        tokens = [token.lower() for token in wordpunct_tokenize(text)]

        # Map tokens onto (word_id, count) pairs from the model dictionary.
        bow = self.dictionary.doc2bow(tokens)

        # Weight the bag-of-words by TF-IDF (read-only; the model itself
        # is not updated by this lookup).
        weighted_doc = self.tfidf[bow]

        # Query the similarity index for cosine-similarity scores.
        self.similarity_index.use_reverse_index = use_reverse_index
        scores = self.similarity_index[weighted_doc]

        # Per-call num_best overrides the instance-level default.
        top_k = self.num_best if num_best is None else num_best
        if top_k is None:
            return scores
        return heapq.nlargest(top_k, enumerate(scores), key=lambda pair: pair[1])
class ClusteredESAModel(object):
    """
    Encapsulates an ESA model after running k-means clustering.

    Otherwise identical to ESAModel, except that similarity queries run
    against a dense matrix of cluster centroids (MatrixSimilarity) instead
    of the full sharded document index.
    """

    def __init__(self, model_prefix=None, num_best=None, num_clusters=2000):
        """
        Load all model files sharing the given prefix.

        model_prefix: path prefix of the model files (required).
        num_best: default top-K cutoff used by featurize() when the caller
            does not pass one; None returns scores for every centroid.
        num_clusters: number of k-means centroids in the saved index; used
            to build the centroids filename. Defaults to 2000, which was
            previously hard-coded.

        Raises ValueError if model_prefix is None.
        """
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        sim_fname = "%s.cluster.%d.centroids" % (model_prefix, num_clusters)
        self.similarity_index = MatrixSimilarity.load(sim_fname, mmap='r')

        logger.info("ESA: Finished loading model files.")

    def num_features(self):
        """ Returns number of documents (cluster centroids) in the index. """
        return len(self.similarity_index)

    def num_terms(self):
        """ Returns the number of words / features / terms used in the vocabulary. """
        return len(self.dictionary)

    def featurize(self, input_str, num_best=None, use_reverse_index=True):
        """
        Returns similar centroids by cosine similarity based on TF-IDF score.

        If num_best is left as None, returns a numpy.array with a score for
        every centroid in the index. Otherwise, it returns the top-K scored
        items as a list of (doc_idx, score) tuples.

        use_reverse_index is accepted only for interface compatibility with
        ESAModel.featurize and is ignored: MatrixSimilarity always queries
        the full (dense) centroid matrix.
        """
        logger.debug("input string: %s", input_str)

        # Normalize to unicode, tokenize, and lowercase every token.
        input_str = utils.to_utf8(input_str, errors='replace').decode("utf8")
        doc = wordpunct_tokenize(input_str)
        doc = [w.lower() for w in doc]

        # Convert from tokens to word ids from the model dictionary.
        doc_bow = self.dictionary.doc2bow(doc)

        # Get TF-IDF scores for the document words (this does not update
        # the TF-IDF model itself).
        doc_tfidf = self.tfidf[doc_bow]

        # Calculate similarity scores against the centroid matrix.
        similar_docs = self.similarity_index[doc_tfidf]

        # Fall back to self.num_best if it wasn't specified here.
        if num_best is None:
            num_best = self.num_best
        if num_best is None:
            return similar_docs

        # Return top-k if requested.
        return heapq.nlargest(num_best, enumerate(similar_docs), key=lambda item: item[1])