def create_topics(self, do_print, epoch):
    """
    Prepare the topics for one epoch from the corpus word list and extract the top
    words per topic. Intermediate results are stored in a pyLDAvis file and the
    model is saved in HDF5 format.

    <<<< This is the LDA part >>>>

    :param do_print: print the top words of each topic for this epoch
    :param epoch: index of the epoch
    :return:
    """
    j = 0
    # Prepare the topic_term_distributions, document_topic_distributions and
    # term_frequencies using softmax
    data = prepare_topics(weights=cuda.to_cpu(self.model.mixture.weights.W.data).copy(),
                          topic_vectors=cuda.to_cpu(self.model.mixture.factors.W.data).copy(),
                          word_vectors=cuda.to_cpu(self.model.sampler.W.data).copy(),
                          vocab=self.words,
                          doprint=False)
    # top_words = print_top_words_per_topic(data, do_print=do_print)
    # if j % 100 == 0 and j > 100 and do_print:
    #     coherence = topic_coherence(top_words)
    #     for j in range(self.n_topics):
    #         print j, coherence[(j, 'cv')]
    data['doc_lengths'] = self.doc_lengths
    data['term_frequency'] = self.term_frequency
    np.savez('topics_' + self.modelid + '.pyldavis', **data)
    for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
        self.update_per_chunk(d, epoch, f)
        j += 1
    # Save the model parameters to a file in HDF5 format
    serializers.save_hdf5("lda2vec_" + self.modelid + ".hdf5", self.model)
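
# Hedged example (not part of the original project): one way the archive written by
# create_topics() above could be loaded into pyLDAvis, assuming prepare_topics()
# stored exactly the keys that pyLDAvis.prepare() expects (topic_term_dists,
# doc_topic_dists, doc_lengths, vocab, term_frequency). Note that np.savez appends
# '.npz' to the given filename; 'MODELID' below is a placeholder for self.modelid.
import numpy as np
import pyLDAvis

npz = np.load('topics_MODELID.pyldavis.npz', allow_pickle=True)
vis_data = {key: npz[key] for key in npz.files}
prepared = pyLDAvis.prepare(**vis_data)
pyLDAvis.save_html(prepared, 'lda2vec_topics.html')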
def _train(self):
    """
    Train the stacked denoising autoencoders.
    """
    if 'fold' in self.hyperparameters:
        current_fold = self.hyperparameters['fold'] + 1
    else:
        current_fold = 0
    term_freq = self.abstracts_preprocessor.get_term_frequency_sparse_matrix().todense()
    self.get_cnn()
    if self._verbose:
        print("CNN is constructed...")
    error = numpy.inf
    iterations = 0
    batchsize = 2048
    for epoch in range(1, 1 + self.n_iter):
        self.document_distribution = self.predict_sdae(term_freq)
        t0 = time.time()
        self.user_vecs = self.als_step(self.user_vecs, self.item_vecs, self.train_data, self._lambda, type='user')
        self.item_vecs = self.als_step(self.item_vecs, self.user_vecs, self.train_data, self._lambda, type='item')
        t1 = time.time()
        iterations += 1
        if self._verbose:
            error = self.evaluator.get_rmse(self.user_vecs.dot(self.item_vecs.T), self.train_data)
            if current_fold == 0:
                logs = dict(it=iterations, epoch=epoch, loss=error, time=(t1 - t0))
                print('Iteration:{it:05d} Epoch:{epoch:02d} Loss:{loss:1.4e} Time:{time:.3f}s'.format(**logs))
            else:
                logs = dict(fold=current_fold, it=iterations, epoch=epoch, loss=error, time=(t1 - t0))
                print('Fold:{fold:02d} Iteration:{it:05d} Epoch:{epoch:02d} Loss:{loss:1.4e} '
                      'Time:{time:.3f}s'.format(**logs))
        for inp_batch, item_batch in chunks(batchsize, term_freq, self.item_vecs):
            t0 = time.time()
            loss = self.train_sdae(inp_batch, item_batch)
            t1 = time.time()
            iterations += 1
            if self._verbose:
                if current_fold == 0:
                    msg = 'Iteration:{it:05d} Epoch:{epoch:02d} Loss:{loss:1.3e} Time:{tim:.3f}s'
                    logs = dict(loss=float(loss), epoch=epoch, it=iterations, tim=(t1 - t0))
                    print(msg.format(**logs))
                else:
                    msg = 'Fold:{fold:02d} Iteration:{it:05d} Epoch:{epoch:02d} Loss:{loss:1.3e} Time:{tim:.3f}s'
                    logs = dict(fold=current_fold, loss=float(loss), epoch=epoch, it=iterations, tim=(t1 - t0))
                    print(msg.format(**logs))
    error = self.evaluator.get_rmse(self.user_vecs.dot(self.item_vecs.T), self.train_data)
    self.document_distribution = self.predict_sdae(term_freq)
    rms = self.evaluate_sdae(term_freq, self.item_vecs)
    if self._verbose:
        print(rms)
    # Garbage collection for keras
    backend.clear_session()
    if self._verbose:
        print("SDAE trained...")
    return rms
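
# Hedged sketch (not the project's code): als_step() is defined elsewhere. For
# reference, a textbook ridge-regularized ALS update for a dense, fully observed
# rating matrix looks roughly like the helper below; the function name and the
# fully-observed assumption are mine.
import numpy as np

def als_update(fixed_vecs, ratings, reg):
    """Closed-form update X = R F (F^T F + reg I)^-1, solved row by row."""
    k = fixed_vecs.shape[1]
    gram = fixed_vecs.T.dot(fixed_vecs) + reg * np.eye(k)   # (k, k)
    rhs = fixed_vecs.T.dot(np.asarray(ratings).T)           # (k, n)
    return np.linalg.solve(gram, rhs).T                     # (n, k)

# e.g. user_vecs = als_update(item_vecs, train_data, _lambda)    # the 'user' step
#      item_vecs = als_update(user_vecs, train_data.T, _lambda)  # the 'item' step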
def _train(self):
    """
    Train the LDA2Vec model, and store the document_distribution matrix.
    """
    n_units = self.abstracts_preprocessor.get_num_units()
    # Two parallel lists of ('doc_id', 'word_id') pairs covering every word in every
    # document, where 'word_id' indexes the computed dictionary 'vocab'.
    doc_ids, flattened = zip(*self.abstracts_preprocessor.get_article_to_words())
    assert len(doc_ids) == len(flattened)

    flattened = numpy.array(flattened, dtype='int32')
    doc_ids = numpy.array(doc_ids, dtype='int32')

    # Word frequencies, for lda2vec_model
    n_vocab = self.abstracts_preprocessor.get_num_vocab()
    term_frequency = self.abstracts_preprocessor.get_term_frequencies()

    # Assuming that doc_ids are in the set {0, 1, ..., n - 1}
    assert doc_ids.max() + 1 == self.n_items

    # Initialize lda2vec model
    lda2v_model = LDA2Vec(n_documents=self.n_items, n_document_topics=self.n_factors,
                          n_units=n_units, n_vocab=n_vocab, counts=term_frequency)
    if self._verbose:
        print("Initialize LDA2Vec model..., Training LDA2Vec...")

    # Initialize optimizer
    optimizer = optimizers.Adam()
    optimizer.setup(lda2v_model)
    clip = chainer.optimizer.GradientClipping(5.0)
    optimizer.add_hook(clip)
    if self._verbose:
        print("Optimizer Initialized...")

    batchsize = 2048
    iterations = 0
    for epoch in range(1, self.n_iter + 1):
        for d, f in chunks(batchsize, doc_ids, flattened):
            t0 = time.time()
            if len(d) <= 10:
                continue
            optimizer.zero_grads()
            l = lda2v_model.fit_partial(d.copy(), f.copy())
            prior = lda2v_model.prior()
            loss = prior
            loss.backward()
            optimizer.update()
            iterations += 1
            t1 = time.time()
            if self._verbose:
                msg = "Iteration:{it:05d} Epoch:{epoch:02d} Loss:{loss:1.3e} Prior:{prior:1.3e} Time:{tim:.3f}s"
                logs = dict(loss=float(l), epoch=epoch, it=iterations,
                            prior=float(prior.data), tim=(t1 - t0))
                print(msg.format(**logs))

    # Get document distribution matrix.
    self.document_distribution = lda2v_model.mixture.proportions(numpy.unique(doc_ids), True).data
    if self._verbose:
        print("LDA2Vec trained...")
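
# Hedged sketch (an assumption about the helper, not its actual source): the
# chunks() generator used above is expected to yield aligned, batch-sized slices of
# its array arguments, e.g. chunks(batchsize, doc_ids, flattened) -> (d, f) pairs.
# A minimal version could look like this:
import random

def chunks(n, *arrays):
    """Yield aligned length-n slices of every array, in shuffled batch order."""
    starts = list(range(0, len(arrays[0]), n))
    random.shuffle(starts)
    for start in starts:
        yield [a[start:start + n] for a in arrays]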
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]

model = NVDM(n_vocab, n_units)
if os.path.exists('nvdm.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("nvdm.hdf5", model)
# model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(500):
    for (batch, ) in utils.chunks(batchsize, bow):
        t0 = time.time()
        rec, kl = model.observe(batch)
        optimizer.zero_grads()
        l = rec + kl
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{kl:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        kl.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(rec=float(rec.data),
six.print_("Reloading from saved") serializers.load_hdf5("lda.hdf5", model) model.to_gpu() optimizer = O.Adam() optimizer.setup(model) j = 0 fraction = batchsize * 1.0 / bow.shape[0] for epoch in range(50000000): if epoch % 100 == 0: p = cuda.to_cpu(model.proportions.W.data).copy() f = cuda.to_cpu(model.factors.W.data).copy() w = cuda.to_cpu(model.embedding.W.data).copy() d = prepare_topics(p, f, w, words) print_top_words_per_topic(d) for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow): t0 = time.time() optimizer.zero_grads() rec, ld = model.forward(ids, batch) l = rec + ld l.backward() optimizer.update() msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} " "P:{ld:1.3e} R:{rate:1.3e}") l.to_cpu() rec.to_cpu() ld.to_cpu() t1 = time.time() dt = t1 - t0 rate = batchsize / dt logs = dict(rec=float(rec.data),
progress = shelve.open('progress.shelve')
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    coherence = topic_coherence(top_words)
    for j in range(n_topics):
        print j, coherence[(j, 'cv')]
    kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
    progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
               "P:{prior:1.3e} R:{rate:1.3e}")
        prior.to_cpu()
        loss.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(loss=float(l), epoch=epoch, j=j,
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for j in range(n_topics):
            six.print_(j, coherence[(j, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
               "P:{prior:1.3e} R:{rate:1.3e}")
        prior.to_cpu()
        loss.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(loss=float(l), epoch=epoch, j=j,
def train(self, pivot_words, target_words, doc_ids, data_size, num_epochs,
          switch_loss_epoch=0, save_every=1, report_every=1, print_topics_every=5,
          idx_to_word=None):
    """Train the Lda2vec model. pivot_words, target_words, and doc_ids should be the same size.

    Args:
        pivot_words (np.array): Array of word idxs corresponding to pivot words
        target_words (np.array): Array of word idxs corresponding to target words
        doc_ids (np.array): Document IDs linking word idxs to their docs
        data_size (int): Length of pivot_words array
        num_epochs (int): Number of epochs to train model
        switch_loss_epoch (int, optional): Epoch to switch on LDA loss. LDA loss not learned until this epoch
        save_every (int, optional): Save model every "save_every" epochs
        report_every (int, optional): Report model metrics every "report_every" epochs
        print_topics_every (int, optional): Print top 10 words in each topic every "print_topics_every" epochs
        idx_to_word (dict, optional): Index-to-word mapping - required if you want to see word-topic membership
    """
    # Calculate the fraction used in the DL loss calculation
    temp_fraction = self.batch_size * 1.0 / data_size

    # Assign the fraction placeholder variable with the value we calculated
    self.sesh.run(tf.assign(self.fraction, temp_fraction))

    # Calculate the number of iterations per epoch so we can figure out when to switch the loss
    iters_per_epoch = int(data_size / self.batch_size) + np.ceil(data_size % self.batch_size)

    # Calculate what step we would be on at the switch loss epoch
    switch_loss_step = iters_per_epoch * switch_loss_epoch

    # Assign the switch loss variable with the step we just calculated
    self.sesh.run(tf.assign(self.switch_loss, switch_loss_step))

    if self.save_graph_def:
        # Initialize a tensorflow Saver object
        saver = tf.train.Saver()
        # Initialize a tensorflow summary writer so we can save logs
        writer = tf.summary.FileWriter(self.logdir + '/', graph=self.sesh.graph)

    # Iterate over the number of epochs we want to train for
    for e in range(num_epochs):
        print('\nEPOCH:', e + 1)

        # Get a batch worth of data
        for p, t, d in utils.chunks(self.batch_size, pivot_words, target_words, doc_ids):
            # Create the feed dict from the batched data
            feed_dict = {self.x: p, self.y: t, self.docs: d}

            # Values we want to fetch whenever we run the model
            fetches = [self.merged, self.optimizer, self.loss,
                       self.loss_word2vec, self.loss_lda, self.step]

            # Run a step of the model
            summary, _, l, lw2v, llda, step = self.sesh.run(fetches, feed_dict=feed_dict)

        # Prints log every "report_every" epoch
        if (e + 1) % report_every == 0:
            print('LOSS', l, 'w2v', lw2v, 'lda', llda)

        # Saves model every "save_every" epoch
        if (e + 1) % save_every == 0 and self.save_graph_def:
            writer.add_summary(summary, step)
            writer.flush()
            writer.close()
            save_path = saver.save(self.sesh, self.logdir + '/model.ckpt')
            writer = tf.summary.FileWriter(self.logdir + '/', graph=self.sesh.graph)

        # Prints out membership of words in each topic every "print_topics_every" epoch
        if e > 0 and (e + 1) % print_topics_every == 0:
            idxs = np.arange(self.num_topics)
            words, sims = self.get_k_closest(idxs, in_type='topic',
                                             idx_to_word=idx_to_word, k=10, verbose=True)

    # Save after all epochs are finished, but only if we didn't just save
    if self.save_graph_def and (e + 1) % save_every != 0:
        writer.add_summary(summary, step)
        writer.flush()
        writer.close()
        save_path = saver.save(self.sesh, self.logdir + '/model.ckpt')
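
# Hedged sketch (not the project's preprocessing): one plausible way to build the
# pivot_words / target_words / doc_ids arrays that train() above consumes, from a
# list of per-document token-id lists. The helper name, the window size and
# 'tokenized_docs' are assumptions, not names from the original code.
import numpy as np

def skipgram_pairs(doc_tokens, window=5):
    """Return parallel (pivot, target, doc_id) int arrays for a skipgram objective."""
    pivots, targets, doc_ids = [], [], []
    for doc_id, tokens in enumerate(doc_tokens):
        for i, pivot in enumerate(tokens):
            lo, hi = max(0, i - window), min(len(tokens), i + window + 1)
            for j in range(lo, hi):
                if j != i:
                    pivots.append(pivot)
                    targets.append(tokens[j])
                    doc_ids.append(doc_id)
    return (np.asarray(pivots, dtype=np.int64),
            np.asarray(targets, dtype=np.int64),
            np.asarray(doc_ids, dtype=np.int64))

# pivot_words, target_words, doc_ids = skipgram_pairs(tokenized_docs)
# model.train(pivot_words, target_words, doc_ids,
#             data_size=len(pivot_words), num_epochs=20,
#             switch_loss_epoch=5, idx_to_word=idx_to_word)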
def infer(self, docs=None, epochs=200, update_words=False, update_topics=False, topic_vectors=None):
    """
    Infer the features of new documents by running the Lda2vec algorithm again,
    updating only the topic distributions.
    """
    texts = docs
    docs = []
    for text in texts:
        docs.append(unicode(" ".join(word for word in text.split()
                                     if word in self.word2vec_model.vocab)))
    logging.info("preprocessing")
    self.preprocess(docs)
    logging.info('preprocessed!')

    self.infer_model = LDA2Vec(n_documents=self.n_docs,
                               n_document_topics=self.n_topics,
                               n_units=300,
                               n_vocab=self.n_vocab,
                               counts=self.term_frequency,
                               n_samples=15,
                               power=self.power,
                               temperature=self.temp)

    if self.words_pretrained:
        self.infer_model.sampler.W.data = self.vectors[:self.n_vocab, :]

    self.infer_model.mixture.factors.W.data = self.train_model.mixture.factors.W.data
    if topic_vectors is not None:
        assert topic_vectors.shape == self.infer_model.mixture.factors.W.data.shape, \
            "topic vectors shape doesn't match"
        self.infer_model.mixture.factors.W.data = topic_vectors

    optimizer = O.Adam()
    optimizer.setup(self.infer_model)
    clip = chainer.optimizer.GradientClipping(5.0)
    optimizer.add_hook(clip)

    j = 0
    msgs = defaultdict(list)
    for epoch in range(epochs):
        print "epoch : ", epoch
        data = prepare_topics(cuda.to_cpu(self.infer_model.mixture.weights.W.data).copy(),
                              cuda.to_cpu(self.infer_model.mixture.factors.W.data).copy(),
                              cuda.to_cpu(self.infer_model.sampler.W.data).copy(),
                              self.words)
        top_words = print_top_words_per_topic(data)
        if j % 100 == 0 and j > 100:
            coherence = topic_coherence(top_words)
            for j in range(self.n_topics):
                print j, coherence[(j, 'cv')]
            kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
            # progress[str(epoch)] = pickle.dumps(kw)
        data['doc_lengths'] = self.doc_lengths
        data['term_frequency'] = self.term_frequency
        # np.savez('topics.pyldavis', **data)
        for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
            t0 = time.time()
            optimizer.zero_grads()
            l = self.infer_model.fit_partial(d.copy(), f.copy(),
                                             update_words=update_words,
                                             update_topics=update_topics)
            prior = self.infer_model.prior()
            loss = prior * self.fraction
            loss.backward()
            optimizer.update()
            msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                   "P:{prior:1.3e} R:{rate:1.3e}")
            prior.to_cpu()
            loss.to_cpu()
            t1 = time.time()
            dt = t1 - t0
            rate = self.batchsize / dt
            msgs["E"].append(epoch)
            msgs["L"].append(float(l))
            j += 1
            logs = dict(loss=float(l), epoch=epoch, j=j,
                        prior=float(prior.data), rate=rate)
            print msg.format(**logs)
        print "\n ================================= \n"
        # serializers.save_hdf5("lda2vec.hdf5", self.model)
        msgs["loss_per_epoch"].append(float(l))
    return data, msgs
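
# Hedged usage sketch for infer() above: 'model' and the document strings are
# placeholders, and 'doc_topic_dists' is assumed to be the key under which
# prepare_topics() stores the per-document topic proportions.
new_docs = ["neural variational inference for topic models",
            "collaborative filtering with document embeddings"]
data, msgs = model.infer(docs=new_docs, epochs=50,
                         update_words=False, update_topics=False)
doc_topics = data['doc_topic_dists']
epoch_losses = msgs["loss_per_epoch"]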
                    cuda.to_cpu(model.mixture_sty.factors.W.data).copy(),
                    cuda.to_cpu(model.sampler.W.data).copy(), words)
print_top_words_per_topic(ts)
ts['doc_lengths'] = sty_len
ts['term_frequency'] = term_frequency
np.savez('topics.story.pyldavis', **ts)
ta = prepare_topics(cuda.to_cpu(model.mixture_aut.weights.W.data).copy(),
                    cuda.to_cpu(model.mixture_aut.factors.W.data).copy(),
                    cuda.to_cpu(model.sampler.W.data).copy(), words)
print_top_words_per_topic(ta)
ta['doc_lengths'] = aut_len
ta['term_frequency'] = term_frequency
np.savez('topics.author.pyldavis', **ta)
for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):
    t0 = time.time()
    optimizer.zero_grads()
    l = model.fit_partial(s.copy(), a.copy(), f.copy())
    prior = model.prior()
    loss = prior * fraction
    loss.backward()
    optimizer.update()
    msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
           "P:{prior:1.3e} R:{rate:1.3e}")
    prior.to_cpu()
    loss.to_cpu()
    t1 = time.time()
    dt = t1 - t0
    rate = batchsize / dt
    logs = dict(loss=float(l), epoch=epoch, j=j,
    cuda.to_cpu(model.mixture_sty.weights.W.data).copy(),
    cuda.to_cpu(model.mixture_sty.factors.W.data).copy(),
    cuda.to_cpu(model.sampler.W.data).copy(), words)
print_top_words_per_topic(ts)
ts['doc_lengths'] = sty_len
ts['term_frequency'] = term_frequency
np.savez('topics.story.pyldavis', **ts)
ta = prepare_topics(
    cuda.to_cpu(model.mixture_aut.weights.W.data).copy(),
    cuda.to_cpu(model.mixture_aut.factors.W.data).copy(),
    cuda.to_cpu(model.sampler.W.data).copy(), words)
print_top_words_per_topic(ta)
ta['doc_lengths'] = aut_len
ta['term_frequency'] = term_frequency
np.savez('topics.author.pyldavis', **ta)
for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):
    t0 = time.time()
    optimizer.zero_grads()
    l = model.fit_partial(s.copy(), a.copy(), f.copy())
    prior = model.prior()
    loss = prior * fraction
    loss.backward()
    optimizer.update()
    msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
           "P:{prior:1.3e} R:{rate:1.3e}")
    prior.to_cpu()
    loss.to_cpu()
    t1 = time.time()
    dt = t1 - t0
    rate = batchsize / dt
    logs = dict(loss=float(l),
def train(self, doc_ids, flattened,
          max_epochs=np.inf,
          verbose=False,
          loss_switch_epochs=0,  # num epochs until LDA loss switched on
          save=False,
          save_every=1000,
          outdir="./out",
          summarize=True,
          summarize_every=1000,
          metadata="metadata.tsv",
          metadata_docs="metadata.docs.tsv"):

    if save:
        try:
            os.mkdir(outdir)
        except FileExistsError:
            pass
        saver = tf.train.Saver(tf.global_variables())
        outdir = os.path.abspath(self.log_dir)

    if summarize:
        try:
            self.logger.flush()
        except AttributeError:  # not yet logging
            self.logger = tf.summary.FileWriter(self.log_dir, self.sesh.graph)
        merged = self._addSummaries(metadata, metadata_docs)

    j = 0
    epoch = 0
    fraction = self.batch_size / len(flattened)  # == batch / n_corpus
    self.sesh.run(tf.assign(self.fraction, fraction))

    # turn on LDA loss after n iters of training
    iters_per_epoch = (int(len(flattened) / self.batch_size) +
                       np.ceil(len(flattened) % self.batch_size))
    n = iters_per_epoch * loss_switch_epochs
    self.sesh.run(tf.assign(self.switch_loss, n))

    now = datetime.now().isoformat()[11:]
    print("------- Training begin: {} -------\n".format(now))

    while epoch < max_epochs:
        try:
            # doc_ids, word_idxs
            for d, f in utils.chunks(self.batch_size, doc_ids, flattened):
                t0 = datetime.now().timestamp()
                feed_dict = self.make_feed_dict(d, f)

                # if len(feed_dict[self.pivot_idxs]) == 0:
                #     print("Empty batch. Skipping...")
                #     continue

                fetches = [self.loss_lda, self.loss_word2vec,
                           self.loss, self.train_op]
                loss_lda, loss_word2vec, loss, _ = self.sesh.run(
                    fetches, feed_dict=feed_dict)

                j += 1

                if verbose and j % 1000 == 0:
                    msg = ("J:{j:05d} E:{epoch:05d} L_nce:{l_word2vec:1.3e} "
                           "L_dirichlet:{l_lda:1.3e} R:{rate:1.3e}")
                    t1 = datetime.now().timestamp()
                    dt = t1 - t0
                    rate = self.batch_size / dt
                    logs = dict(l_word2vec=loss_word2vec, epoch=epoch, j=j,
                                l_lda=loss_lda, rate=rate)
                    print(msg.format(**logs))

                if save and j % save_every == 0:
                    outfile = os.path.join(outdir,
                                           "{}_lda2vec".format(self.datetime))
                    saver.save(self.sesh, outfile, global_step=self.step)

                if summarize and j % summarize_every == 0:
                    summary = self.sesh.run(merged, feed_dict=feed_dict)
                    self.logger.add_summary(summary, global_step=self.step)

            epoch += 1

        except KeyboardInterrupt:
            break

    print("epoch", epoch)
    print("max", max_epochs)
    now = datetime.now().isoformat()[11:]
    print("------- Training end: {} -------\n".format(now))

    if save:
        outfile = os.path.join(outdir, "{}_lda2vec".format(self.datetime))
        saver.save(self.sesh, outfile, global_step=self.step)

    try:
        self.logger.flush()
        self.logger.close()
    except AttributeError:  # not logging
        pass

    sys.exit(0)
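
# Side note on the scheduling arithmetic in train() above (a worked example added
# for clarity, with made-up numbers): with len(flattened) == 1_000_000 tokens and
# batch_size == 500, fraction == 500 / 1_000_000 == 5e-4 (the Dirichlet prior is
# scaled to a per-batch contribution), iters_per_epoch == 2000, and
# loss_switch_epochs == 5 keeps the LDA loss off until step n == 10000.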
def train(self, doc_ids, flattened, vocab, words,
          # added vocab & words to support saving an npz snapshot during training
          max_epochs=np.inf,
          verbose=False,
          loss_switch_epochs=0,  # num epochs until LDA loss switched on
          save=False,
          save_every=1000,
          outdir="./out",
          summarize=True,
          summarize_every=1000,
          metadata="metadata.tsv",
          metadata_docs="metadata.docs.tsv"):

    n_vocab = flattened.max() + 1

    # How many tokens are in each document
    doc_idx, lengths = np.unique(doc_ids, return_counts=True)
    doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
    doc_lengths[doc_idx] = lengths

    # Count all token frequencies
    tok_idx, freq = np.unique(flattened, return_counts=True)
    term_frequency = np.zeros(n_vocab, dtype='int32')
    term_frequency[tok_idx] = freq

    if save:
        try:
            os.mkdir(outdir)
        except OSError as e:  # for Python 2
            if e.errno == errno.EEXIST:
                pass
        saver = tf.train.Saver(tf.global_variables())
        outdir = os.path.abspath(self.log_dir)

    if summarize:
        try:
            self.logger.flush()
        except AttributeError:  # not yet logging
            self.logger = tf.summary.FileWriter(self.log_dir, self.sesh.graph)
        merged = self._addSummaries(metadata, metadata_docs)

    j = 0
    epoch = 0
    fraction = self.batch_size / len(flattened)  # == batch / n_corpus
    self.sesh.run(tf.assign(self.fraction, fraction))

    progress = shelve.open('progress.shelve')

    # turn on LDA loss after n iters of training
    iters_per_epoch = (int(len(flattened) / self.batch_size) +
                       np.ceil(len(flattened) % self.batch_size))
    n = iters_per_epoch * loss_switch_epochs
    self.sesh.run(tf.assign(self.switch_loss, n))

    now = datetime.now().isoformat()[11:]
    print("------- Training begin: {} -------\n".format(now))

    while epoch < max_epochs:
        try:
            # doc_ids, word_idxs
            for d, f in utils.chunks(self.batch_size, doc_ids, flattened):
                t0 = datetime.now().timestamp()
                feed_dict = self.make_feed_dict(d, f)

                # if len(feed_dict[self.pivot_idxs]) == 0:
                #     print("Empty batch. Skipping...")
                #     continue

                fetches = [self.loss_lda, self.loss_word2vec,
                           self.loss, self.train_op]
                loss_lda, loss_word2vec, loss, _ = self.sesh.run(
                    fetches, feed_dict=feed_dict)
                if j > 5:
                    print(loss_lda, loss_word2vec, loss)  # py2

                j += 1

                if verbose and j % 1000 == 0:
                    msg = ("J:{j:05d} E:{epoch:05d} L_nce:{l_word2vec:1.3e} "
                           "L_dirichlet:{l_lda:1.3e} R:{rate:1.3e}")
                    t1 = datetime.now().timestamp()
                    dt = t1 - t0
                    rate = self.batch_size / dt
                    logs = dict(l_word2vec=loss_word2vec, epoch=epoch, j=j,
                                l_lda=loss_lda, rate=rate)
                    print(msg.format(**logs))

                if save and j % save_every == 0:
                    outfile = os.path.join(outdir,
                                           "{}_lda2vec".format(self.datetime))
                    saver.save(self.sesh, outfile, global_step=self.step)

                if summarize and j % summarize_every == 0:
                    summary = self.sesh.run(merged, feed_dict=feed_dict)
                    self.logger.add_summary(summary, global_step=self.step)

                # if j % 100 == 0 and j > 100 and epoch > 1:
                #     coherence = topic_coherence(top_words)
                #     for j in range(n_topics):
                #         print(j, coherence[(j, 'cv')])
                #     kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                #     progress[str(epoch)] = pickle.dumps(kw)

            epoch += 1

            a = self.mixture.W.eval(session=self.sesh)
            b = self.mixture.factors.eval(session=self.sesh)
            c = self.sampler.W.eval(session=self.sesh)
            data = prepare_topics(a, b, c, words)
            print("------- epoch: {}-------\n".format(epoch))
            top_words = print_top_words_per_topic(data)
            data['doc_lengths'] = doc_lengths
            data['term_frequency'] = term_frequency
            np.savez('topics.pyldavis', **data)

        except KeyboardInterrupt:
            break

    print("epoch", epoch)
    print("max", max_epochs)
    now = datetime.now().isoformat()[11:]
    print("------- Training end: {} -------\n".format(now))

    if save:
        outfile = os.path.join(outdir, "{}_lda2vec".format(self.datetime))
        saver.save(self.sesh, outfile, global_step=self.step)

    try:
        self.logger.flush()
        self.logger.close()
    except AttributeError:  # not logging
        pass
print "Reloading from saved" serializers.load_hdf5("lda.hdf5", model) model.to_gpu() optimizer = O.Adam() optimizer.setup(model) j = 0 fraction = batchsize * 1.0 / bow.shape[0] for epoch in range(50000000): if epoch % 100 == 0: p = cuda.to_cpu(model.proportions.W.data).copy() f = cuda.to_cpu(model.factors.W.data).copy() w = cuda.to_cpu(model.embedding.W.data).copy() d = prepare_topics(p, f, w, words) print_top_words_per_topic(d) for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow): t0 = time.time() optimizer.zero_grads() rec, ld = model.forward(ids, batch) l = rec + ld l.backward() optimizer.update() msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} " "P:{ld:1.3e} R:{rate:1.3e}") l.to_cpu() rec.to_cpu() ld.to_cpu() t1 = time.time() dt = t1 - t0 rate = batchsize / dt logs = dict(rec=float(rec.data), epoch=epoch, j=j,
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]

model = NVDM(n_vocab, n_units)
if os.path.exists('nvdm.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("nvdm.hdf5", model)
# model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(500):
    for (batch,) in utils.chunks(batchsize, bow):
        t0 = time.time()
        rec, kl = model.observe(batch)
        optimizer.zero_grads()
        l = rec + kl
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{kl:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        kl.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(rec=float(rec.data), epoch=epoch, j=j,