model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    coherence = topic_coherence(top_words)
    for t in range(n_topics):  # 't', not 'j': don't clobber the iteration counter
        print t, coherence[(t, 'cv')]
    kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
    progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()  # apply the accumulated gradients
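# Why `loss = prior * fraction` above: model.prior() is a corpus-level
# Dirichlet term, but it is backpropagated once per minibatch. Scaling it
# by batchsize / n_corpus makes its total per-epoch contribution match
# evaluating it once over the whole corpus. Illustrative arithmetic only;
# the numbers below are made up:
n_corpus_demo, batchsize_demo = 1000000, 4096
fraction_demo = batchsize_demo * 1.0 / n_corpus_demo  # ~4.1e-3
batches_per_epoch = n_corpus_demo // batchsize_demo   # ~244
# prior * fraction applied ~244 times per epoch ~= prior applied once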
if os.path.exists('lda.hdf5'):
    six.print_("Reloading from saved")
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(50000000):  # effectively run until interrupted
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = prepare_topics(p, f, w, words)
        print_top_words_per_topic(d)
    for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow):
        t0 = time.time()
        optimizer.zero_grads()
        rec, ld = model.forward(ids, batch)
        l = rec + ld
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(j=j, epoch=epoch, rec=float(rec.data),
                    ld=float(ld.data), rate=rate)
        six.print_(msg.format(**logs))
        j += 1
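# Every loop here draws minibatches with utils.chunks. A minimal sketch of
# what that helper is assumed to do: yield aligned slices of the same rows
# from several parallel arrays, in shuffled order. Illustrative only, not
# the library's actual implementation.
import random

def chunks_sketch(n, *arrays):
    """Yield successive n-sized, shuffled chunks from parallel arrays."""
    bounds = [(i, i + n) for i in range(0, len(arrays[0]), n)]
    random.shuffle(bounds)  # new minibatch order on every pass
    for a, b in bounds:
        yield [arr[a:b] for arr in arrays]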
def infer(self, docs=None, epochs=200, update_words=False,
          update_topics=False, topic_vectors=None):
    """Infer the features of new documents by running the lda2vec
    algorithm again, updating only the topic distributions."""
    texts = docs
    docs = []
    for text in texts:
        # keep only tokens the pretrained word2vec model knows
        docs.append(unicode(" ".join(word for word in text.split()
                                     if word in self.word2vec_model.vocab)))
    logging.info("preprocessing")
    self.preprocess(docs)
    logging.info('preprocessed!')
    self.infer_model = LDA2Vec(n_documents=self.n_docs,
                               n_document_topics=self.n_topics,
                               n_units=300,
                               n_vocab=self.n_vocab,
                               counts=self.term_frequency,
                               n_samples=15,
                               power=self.power,
                               temperature=self.temp)
    if self.words_pretrained:
        self.infer_model.sampler.W.data = self.vectors[:self.n_vocab, :]
    # reuse the topic vectors learned during training
    self.infer_model.mixture.factors.W.data = \
        self.train_model.mixture.factors.W.data
    if topic_vectors is not None:
        assert topic_vectors.shape == \
            self.infer_model.mixture.factors.W.data.shape, \
            "topic vectors shape doesn't match"
        self.infer_model.mixture.factors.W.data = topic_vectors

    optimizer = O.Adam()
    optimizer.setup(self.infer_model)
    clip = chainer.optimizer.GradientClipping(5.0)
    optimizer.add_hook(clip)
    j = 0
    msgs = defaultdict(list)
    for epoch in range(epochs):
        print "epoch :", epoch
        data = prepare_topics(
            cuda.to_cpu(self.infer_model.mixture.weights.W.data).copy(),
            cuda.to_cpu(self.infer_model.mixture.factors.W.data).copy(),
            cuda.to_cpu(self.infer_model.sampler.W.data).copy(),
            self.words)
        top_words = print_top_words_per_topic(data)
        if j % 100 == 0 and j > 100:
            coherence = topic_coherence(top_words)
            for t in range(self.n_topics):  # 't', not 'j': don't clobber the counter
                print t, coherence[(t, 'cv')]
            kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
            # progress[str(epoch)] = pickle.dumps(kw)
        data['doc_lengths'] = self.doc_lengths
        data['term_frequency'] = self.term_frequency
        # np.savez('topics.pyldavis', **data)
        for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
            t0 = time.time()
            optimizer.zero_grads()
            l = self.infer_model.fit_partial(d.copy(), f.copy(),
                                             update_words=update_words,
                                             update_topics=update_topics)
            prior = self.infer_model.prior()
            loss = prior * self.fraction
            loss.backward()
            optimizer.update()
            msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                   "P:{prior:1.3e} R:{rate:1.3e}")
            prior.to_cpu()
            loss.to_cpu()
            t1 = time.time()
            dt = t1 - t0
            rate = self.batchsize / dt
            msgs["E"].append(epoch)
            msgs["L"].append(float(l))
            j += 1
            logs = dict(loss=float(l), epoch=epoch, j=j,
                        prior=float(prior.data), rate=rate)
            print msg.format(**logs)
        print "\n ================================= \n"
        # serializers.save_hdf5("lda2vec.hdf5", self.model)
        msgs["loss_per_epoch"].append(float(l))
    return data, msgs
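# A hypothetical call to infer() above. `lv` stands for a trained wrapper
# instance; the parameter names (docs, epochs, update_words, update_topics)
# come from the signature, everything else here is illustrative.
new_docs = [u"the quick brown fox jumps",
            u"topic models meet word vectors"]
data, msgs = lv.infer(docs=new_docs, epochs=50,
                      update_words=False,   # keep word vectors frozen
                      update_topics=False)  # keep topic vectors frozen
print(msgs["loss_per_epoch"][-1])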
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for t in range(n_topics):  # 't', not 'j': don't clobber the counter
            six.print_(t, coherence[(t, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
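# Reading back the coherence history that the shelve above accumulates; a
# small illustrative sketch whose key layout is taken from the writer code
# (one pickled dict per epoch, coherence keyed by (topic, 'cv')).
import shelve
import pickle

progress_db = shelve.open('progress.shelve')
for epoch_key in sorted(progress_db, key=int):
    kw = pickle.loads(progress_db[epoch_key])
    print("{} {}".format(kw['epoch'], kw['coherence'][(0, 'cv')]))
progress_db.close()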
serializers.load_hdf5("lda2vec.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(5000):
    ts = prepare_topics(cuda.to_cpu(model.mixture_sty.weights.W.data).copy(),
                        cuda.to_cpu(model.mixture_sty.factors.W.data).copy(),
                        cuda.to_cpu(model.sampler.W.data).copy(),
                        words)
    print_top_words_per_topic(ts)
    ts['doc_lengths'] = sty_len
    ts['term_frequency'] = term_frequency
    np.savez('topics.story.pyldavis', **ts)
    ta = prepare_topics(cuda.to_cpu(model.mixture_aut.weights.W.data).copy(),
                        cuda.to_cpu(model.mixture_aut.factors.W.data).copy(),
                        cuda.to_cpu(model.sampler.W.data).copy(),
                        words)
    print_top_words_per_topic(ta)
    ta['doc_lengths'] = aut_len
    ta['term_frequency'] = term_frequency
    np.savez('topics.author.pyldavis', **ta)
    for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(s.copy(), a.copy(), f.copy())
        # scale the corpus-level prior down to this minibatch,
        # as in the other training loops
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
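# The .npz files written above are shaped for pyLDAvis. A minimal sketch of
# loading one for visualization, assuming prepare_topics emitted the keys
# pyLDAvis.prepare expects (topic_term_dists, doc_topic_dists, doc_lengths,
# vocab, term_frequency):
import numpy as np
import pyLDAvis

npz = np.load('topics.story.pyldavis.npz')
dat = {name: npz[name] for name in npz.files}
dat['vocab'] = dat['vocab'].tolist()
vis = pyLDAvis.prepare(**dat)
pyLDAvis.save_html(vis, 'story_topics.html')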
def train(self,
          doc_ids,
          flattened,
          vocab,  # vocab & words added so topics can be saved (npz) during training
          words,
          max_epochs=np.inf,
          verbose=False,
          loss_switch_epochs=0,  # num epochs until LDA loss is switched on
          save=False,
          save_every=1000,
          outdir="./out",
          summarize=True,
          summarize_every=1000,
          metadata="metadata.tsv",
          metadata_docs="metadata.docs.tsv"):
    n_vocab = flattened.max() + 1

    # How many tokens are in each document
    doc_idx, lengths = np.unique(doc_ids, return_counts=True)
    doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
    doc_lengths[doc_idx] = lengths

    # Count all token frequencies
    tok_idx, freq = np.unique(flattened, return_counts=True)
    term_frequency = np.zeros(n_vocab, dtype='int32')
    term_frequency[tok_idx] = freq

    if save:
        try:
            os.mkdir(outdir)
        except OSError as e:  # for Python 2
            if e.errno == errno.EEXIST:
                pass
        saver = tf.train.Saver(tf.global_variables())
        outdir = os.path.abspath(self.log_dir)

    if summarize:
        try:
            self.logger.flush()
        except AttributeError:  # not yet logging
            self.logger = tf.summary.FileWriter(self.log_dir,
                                                self.sesh.graph)
        merged = self._addSummaries(metadata, metadata_docs)

    j = 0
    epoch = 0
    fraction = self.batch_size / len(flattened)  # == batch / n_corpus
    self.sesh.run(tf.assign(self.fraction, fraction))
    progress = shelve.open('progress.shelve')

    # Turn on the LDA loss after n iterations of training;
    # ceil covers the final partial batch of each epoch
    iters_per_epoch = int(np.ceil(len(flattened) / float(self.batch_size)))
    n = iters_per_epoch * loss_switch_epochs
    self.sesh.run(tf.assign(self.switch_loss, n))

    now = datetime.now().isoformat()[11:]
    print("------- Training begin: {} -------\n".format(now))

    while epoch < max_epochs:
        try:
            # doc_ids, word_idxs
            for d, f in utils.chunks(self.batch_size, doc_ids, flattened):
                t0 = datetime.now()
                feed_dict = self.make_feed_dict(d, f)
                # if len(feed_dict[self.pivot_idxs]) == 0:
                #     print("Empty batch. Skipping...")
                #     continue
                fetches = [self.loss_lda, self.loss_word2vec,
                           self.loss, self.train_op]
                loss_lda, loss_word2vec, loss, _ = self.sesh.run(
                    fetches, feed_dict=feed_dict)
                if j > 5:
                    print(loss_lda, loss_word2vec, loss)  # debug print of raw losses
                j += 1
                if verbose and j % 1000 == 0:
                    msg = ("J:{j:05d} E:{epoch:05d} L_nce:{l_word2vec:1.3e} "
                           "L_dirichlet:{l_lda:1.3e} R:{rate:1.3e}")
                    t1 = datetime.now()
                    dt = (t1 - t0).total_seconds()
                    rate = self.batch_size / dt
                    logs = dict(l_word2vec=loss_word2vec, epoch=epoch, j=j,
                                l_lda=loss_lda, rate=rate)
                    print(msg.format(**logs))
                if save and j % save_every == 0:
                    outfile = os.path.join(
                        outdir, "{}_lda2vec".format(self.datetime))
                    saver.save(self.sesh, outfile, global_step=self.step)
                if summarize and j % summarize_every == 0:
                    summary = self.sesh.run(merged, feed_dict=feed_dict)
                    self.logger.add_summary(summary, global_step=self.step)

            # if j % 100 == 0 and j > 100 and epoch > 1:
            #     coherence = topic_coherence(top_words)
            #     for t in range(n_topics):
            #         print(t, coherence[(t, 'cv')])
            #     kw = dict(top_words=top_words, coherence=coherence,
            #               epoch=epoch)
            #     progress[str(epoch)] = pickle.dumps(kw)

            epoch += 1
            a = self.mixture.W.eval(session=self.sesh)
            b = self.mixture.factors.eval(session=self.sesh)
            c = self.sampler.W.eval(session=self.sesh)
            data = prepare_topics(a, b, c, words)
            print("------- epoch: {} -------\n".format(epoch))
            top_words = print_top_words_per_topic(data)
            data['doc_lengths'] = doc_lengths
            data['term_frequency'] = term_frequency
            np.savez('topics.pyldavis', **data)
        except KeyboardInterrupt:
            break

    print("epoch", epoch)
    print("max", max_epochs)
    now = datetime.now().isoformat()[11:]
    print("------- Training end: {} -------\n".format(now))

    if save:
        outfile = os.path.join(outdir, "{}_lda2vec".format(self.datetime))
        saver.save(self.sesh, outfile, global_step=self.step)
    try:
        self.logger.flush()
        self.logger.close()
    except AttributeError:  # not logging
        pass
model = NSLDA(counts, n_docs, n_topics, n_units, n_vocab)
if os.path.exists('nslda.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("nslda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
j = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(50000000):  # effectively run until interrupted
    p = cuda.to_cpu(model.proportions.W.data).copy()
    f = cuda.to_cpu(model.factors.W.data).copy()
    w = cuda.to_cpu(model.loss_func.W.data).copy()
    d = prepare_topics(p, f, w, words)
    print_top_words_per_topic(d)
    for (doc_ids, flat) in utils.chunks(batchsize, doc_id, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        rec, ld = model.forward(doc_ids, flat)
        # total loss: reconstruction term plus the LDA prior, rescaled by
        # the minibatch fraction and weighted by `strength`
        l = rec + ld * fraction * strength
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(j=j, epoch=epoch, rec=float(rec.data),
                    ld=float(ld.data), rate=rate)
        print msg.format(**logs)
        j += 1
# (Pdb) model.mixture.weights.W.data.shape -> (11314, 20)  (weights)
# (Pdb) model.mixture.factors.W.data.shape -> (20, 300)    (factors -> factor_vector)
# (Pdb) model.sampler.W.data.shape         -> (5837, 300)  (word_vectors)
# (Pdb) len(words)                         -> 5837         (vocab)
# prepare_topics works on host (numpy) arrays, so copy to the CPU whatever
# gpu_id is; cuda.to_cpu is a no-op for arrays already on the host.
data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                      cuda.to_cpu(model.mixture.factors.W.data).copy(),
                      cuda.to_cpu(model.sampler.W.data).copy(),
                      words, normalize=False)
top_words = print_top_words_per_topic(data)
if j % 100 == 0 and j > 100:
    coherence = topic_coherence(top_words)
    for t in range(n_topics):  # 't', not 'j': don't clobber the counter
        print(t, coherence[(t, 'cv')])
    kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
    progress[str(epoch)] = pickle.dumps(kw)
data['doc_lengths'] = doc_lengths
data['term_frequency'] = term_frequency
np.savez('topics.pyldavis', **data)

window_size = 5
word2vec_only = False
# word2vec_only = epoch <= 5
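# The shapes above suggest what prepare_topics is assumed to compute:
# document-topic proportions from the (11314, 20) weights via a softmax,
# and topic-word distributions by scoring the (5837, 300) word vectors
# against each of the 20 factor vectors. A minimal numpy sketch with
# random stand-in data, illustrative only:
import numpy as np

def _softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

weights_demo = np.random.randn(11314, 20)       # doc-topic weights
factors_demo = np.random.randn(20, 300)         # topic (factor) vectors
word_vectors_demo = np.random.randn(5837, 300)  # word vectors

doc_topic_dists = _softmax(weights_demo)                            # (11314, 20)
topic_term_dists = _softmax(factors_demo.dot(word_vectors_demo.T))  # (20, 5837)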