featurize.py
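"""Lda2VecFeaturizer: topic featurization with lda2vec.

Wraps preprocessing (spaCy tokenization via lda2vec.preprocess), training
of an LDA2Vec model with Chainer, and inference of topic distributions
for new documents, optionally initialized from pretrained word2vec
vectors loaded with gensim."""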
import time
import logging
from collections import defaultdict

import numpy as np
import chainer
from chainer import cuda
import chainer.optimizers as O
from gensim.models.word2vec import KeyedVectors

from lda2vec import utils
from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence
from lda2vec import preprocess
from lda2vec import Corpus
from model import LDA2Vec
class Lda2VecFeaturizer:
    def __init__(self,
                 clambda=200,
                 n_topics=10,
                 batchsize=4096,
                 power=0.75,
                 words_pretrained=True,
                 temperature=1,
                 max_length=1000,
                 min_count=0,
                 word2vec_path=None):
        # 'Strength' of the Dirichlet prior; 200.0 seems to work well
        self.clambda = clambda
        # Number of topics to fit
        self.n_topics = n_topics
        self.batchsize = batchsize
        # Power for negative sampling
        self.power = power
        # Initialize with pretrained word vectors
        self.words_pretrained = words_pretrained
        self.temp = temperature
        self.max_length = max_length
        self.min_count = min_count
        assert word2vec_path is not None, "word2vec_path to a binary word2vec model is required"
        self.word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    def preprocess(self, docs=None):
        """Uses spaCy to quickly tokenize text and return an array of indices.

        This method stores a global NLP directory in memory and takes up to
        a minute to run the first time. Later calls will have the tokenizer
        in memory already."""
        assert isinstance(docs, list), "expected a list of documents"
        assert all(isinstance(doc, unicode) for doc in docs), "expected unicode, got str"
        self.corpus = Corpus()
        tokens, self.vocab = preprocess.tokenize(docs, self.max_length, merge=False, n_threads=4)
        # Make a ranked list of rare vs frequent words
        self.corpus.update_word_count(tokens)
        self.corpus.finalize()
        # The tokenization uses spaCy indices, and so may have gaps
        # between indices for words that aren't present in our dataset.
        # This builds a new compact index.
        compact = self.corpus.to_compact(tokens)
        # Remove extremely rare words
        pruned = self.corpus.filter_count(compact, min_count=self.min_count)
        # Convert the compactified arrays into bag-of-words arrays
        bow = self.corpus.compact_to_bow(pruned)
        # Words tend to have power-law frequency, so selectively
        # downsample the most prevalent words
        clean = self.corpus.subsample_frequent(pruned)
        # Now flatten a 2D array of document per row and word position
        # per column to a 1D array of words. This will also remove skips
        # and OoV words.
        self.doc_ids = np.arange(pruned.shape[0])
        self.flattened, (self.doc_ids,) = self.corpus.compact_to_flat(pruned, self.doc_ids)
        self.vectors, s, f = self.corpus.compact_word_vectors(self.vocab, model=self.word2vec_model)
        # Model parameters
        # Number of documents
        self.n_docs = len(docs)
        # Number of unique words in the vocabulary
        self.n_vocab = self.flattened.max() + 1
        doc_idx, lengths = np.unique(self.doc_ids, return_counts=True)
        self.doc_lengths = np.zeros(self.doc_ids.max() + 1, dtype='int32')
        self.doc_lengths[doc_idx] = lengths
        # Count all token frequencies
        tok_idx, freq = np.unique(self.flattened, return_counts=True)
        self.term_frequency = np.zeros(self.n_vocab, dtype='int32')
        self.term_frequency[tok_idx] = freq
        self.fraction = self.batchsize * 1.0 / self.flattened.shape[0]
        # Get the string representation for every compact key
        self.words = self.corpus.word_list(self.vocab)[:self.n_vocab]
    def train(self, docs=None, epochs=200, update_words=False, update_topics=True):
        """Takes the training documents as a list, preprocesses them, and
        returns a dictionary ``data`` containing the topic distribution,
        vocabulary, document lengths, and topic-word distribution, along
        with a dictionary ``msgs`` of per-iteration training logs."""
        texts = docs
        docs = []
        for text in texts:
            docs.append(unicode(" ".join(word for word in text.split()
                                         if word in self.word2vec_model.vocab)))
        logging.info("preprocessing...")
        self.preprocess(docs)
        logging.info("preprocessed!")
        self.train_model = LDA2Vec(n_documents=self.n_docs,
                                   n_document_topics=self.n_topics,
                                   n_units=300,
                                   n_vocab=self.n_vocab,
                                   counts=self.term_frequency,
                                   n_samples=15,
                                   power=self.power,
                                   temperature=self.temp)
        if self.words_pretrained:
            self.train_model.sampler.W.data = self.vectors[:self.n_vocab, :]
        optimizer = O.Adam()
        optimizer.setup(self.train_model)
        clip = chainer.optimizer.GradientClipping(5.0)
        optimizer.add_hook(clip)
        j = 0
        msgs = defaultdict(list)
        for epoch in range(epochs):
            print "epoch:", epoch
            data = prepare_topics(cuda.to_cpu(self.train_model.mixture.weights.W.data).copy(),
                                  cuda.to_cpu(self.train_model.mixture.factors.W.data).copy(),
                                  cuda.to_cpu(self.train_model.sampler.W.data).copy(),
                                  self.words)
            top_words = print_top_words_per_topic(data)
            if j % 100 == 0 and j > 100:
                coherence = topic_coherence(top_words)
                # Use a separate loop variable so the batch counter j
                # is not clobbered
                for t in range(self.n_topics):
                    print t, coherence[(t, 'cv')]
            data['doc_lengths'] = self.doc_lengths
            data['term_frequency'] = self.term_frequency
            for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
                t0 = time.time()
                optimizer.zero_grads()
                l = self.train_model.fit_partial(d.copy(), f.copy(),
                                                 update_words=update_words,
                                                 update_topics=update_topics)
                prior = self.train_model.prior()
                loss = prior * self.fraction
                loss.backward()
                optimizer.update()
                msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                       "P:{prior:1.3e} R:{rate:1.3e}")
                prior.to_cpu()
                loss.to_cpu()
                t1 = time.time()
                dt = t1 - t0
                rate = self.batchsize / dt
                msgs["E"].append(epoch)
                msgs["L"].append(float(l))
                j += 1
                logs = dict(loss=float(l), epoch=epoch, j=j,
                            prior=float(prior.data), rate=rate)
                print msg.format(**logs)
            print "\n ================================= \n"
            msgs["loss_per_epoch"].append(float(l))
        return data, msgs
    def initialize_infer(self,
                         clambda=200,
                         batchsize=4096,
                         power=0.75,
                         words_pretrained=True,
                         temperature=1,
                         max_length=1000,
                         min_count=0):
        """Initializes parameters for inference, if needed.
        Usually not called."""
        # 'Strength' of the Dirichlet prior; 200.0 seems to work well
        self.clambda = clambda
        self.batchsize = batchsize
        # Power for negative sampling
        self.power = power
        # Initialize with pretrained word vectors
        self.words_pretrained = words_pretrained
        self.temp = temperature
        self.max_length = max_length
        self.min_count = min_count
        logging.info('Test parameters initialized!')
    def infer(self, docs=None, epochs=200, update_words=False, update_topics=False, topic_vectors=None):
        """Infers the features of new documents by running the lda2vec
        algorithm again, updating only the per-document topic
        distributions while keeping the topic vectors fixed."""
        texts = docs
        docs = []
        for text in texts:
            docs.append(unicode(" ".join(word for word in text.split()
                                         if word in self.word2vec_model.vocab)))
        logging.info("preprocessing...")
        self.preprocess(docs)
        logging.info("preprocessed!")
        self.infer_model = LDA2Vec(n_documents=self.n_docs,
                                   n_document_topics=self.n_topics,
                                   n_units=300,
                                   n_vocab=self.n_vocab,
                                   counts=self.term_frequency,
                                   n_samples=15,
                                   power=self.power,
                                   temperature=self.temp)
        if self.words_pretrained:
            self.infer_model.sampler.W.data = self.vectors[:self.n_vocab, :]
        # Reuse the topic vectors learned during training
        self.infer_model.mixture.factors.W.data = self.train_model.mixture.factors.W.data
        if topic_vectors is not None:
            assert topic_vectors.shape == self.infer_model.mixture.factors.W.data.shape, \
                "topic_vectors shape doesn't match"
            self.infer_model.mixture.factors.W.data = topic_vectors
        optimizer = O.Adam()
        optimizer.setup(self.infer_model)
        clip = chainer.optimizer.GradientClipping(5.0)
        optimizer.add_hook(clip)
        j = 0
        msgs = defaultdict(list)
        for epoch in range(epochs):
            print "epoch:", epoch
            data = prepare_topics(cuda.to_cpu(self.infer_model.mixture.weights.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.mixture.factors.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.sampler.W.data).copy(),
                                  self.words)
            top_words = print_top_words_per_topic(data)
            if j % 100 == 0 and j > 100:
                coherence = topic_coherence(top_words)
                # Use a separate loop variable so the batch counter j
                # is not clobbered
                for t in range(self.n_topics):
                    print t, coherence[(t, 'cv')]
            data['doc_lengths'] = self.doc_lengths
            data['term_frequency'] = self.term_frequency
            for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
                t0 = time.time()
                optimizer.zero_grads()
                l = self.infer_model.fit_partial(d.copy(), f.copy(),
                                                 update_words=update_words,
                                                 update_topics=update_topics)
                prior = self.infer_model.prior()
                loss = prior * self.fraction
                loss.backward()
                optimizer.update()
                msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                       "P:{prior:1.3e} R:{rate:1.3e}")
                prior.to_cpu()
                loss.to_cpu()
                t1 = time.time()
                dt = t1 - t0
                rate = self.batchsize / dt
                msgs["E"].append(epoch)
                msgs["L"].append(float(l))
                j += 1
                logs = dict(loss=float(l), epoch=epoch, j=j,
                            prior=float(prior.data), rate=rate)
                print msg.format(**logs)
            print "\n ================================= \n"
            msgs["loss_per_epoch"].append(float(l))
        return data, msgs
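
# A minimal usage sketch (illustrative only). The word2vec path and the
# example documents below are hypothetical placeholders; substitute your
# own binary word2vec model and corpus.
#
#     featurizer = Lda2VecFeaturizer(n_topics=10,
#                                    word2vec_path='GoogleNews-vectors-negative300.bin')
#     train_docs = [u"first training document ...",
#                   u"second training document ..."]
#     data, msgs = featurizer.train(train_docs, epochs=200)
#     # 'data' holds the topic and vocabulary distributions;
#     # 'msgs' holds per-iteration loss logs.
#     new_data, new_msgs = featurizer.infer([u"an unseen document ..."])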