def _save_line_plot(x, y, ylabel, path):
    """Plot ``y`` against ``x`` as one line and save the figure to ``path``.

    The x-axis label and the title are fixed because both training curves
    written by ``train_model`` share them.
    """
    fig, ax = plt.subplots(nrows=1, ncols=1)
    ax.plot(x, y)
    ax.set_xlabel("Iterations")
    ax.set_ylabel(ylabel)
    ax.set_title("MG-LDA training")
    ax.grid(True)
    fig.savefig(path)
    # The figure is only needed on disk; release it once saved.
    plt.close(fig)


def train_model():
    """Train an MG-LDA model, save it with its training curves, return it.

    The corpus is parsed from ``./data/small/`` and ``MgLda.update()`` is
    called ``iterations`` times. ``iterations``, ``n_local_topics`` and
    ``n_global_topics`` are not defined here and must exist at module scope.

    After every update the model's ``updates`` and ``log_lik`` attributes and
    the wall-clock duration of the call are recorded. Everything is written
    to a fresh ``models/<YYYY-mm-dd_HH-MM-SS>/`` directory:

    * ``mglda_model_2.pkl`` -- the pickled model
    * ``updates.txt``, ``log_lik.txt``, ``times.txt`` -- per-iteration series
    * ``log_lik.png``, ``updates.png`` -- plots of the first two series

    Returns:
        The trained ``MgLda`` instance.

    Raises:
        OSError: if the output directory cannot be created, for example
            because one with the same timestamp already exists.
    """
    updates = []
    log_lik = []
    # Kept local, like the two lists above. The original appended to a
    # ``times`` name that this function never created.
    durations = []

    # Train model
    # Single-argument print(...) prints the same text under the Python 2
    # print statement and the print function.
    print("Parsing dir..")
    docs, vocab, word_idx, sent_idx, sentences = data.parse_dir("./data/small/")
    model = MgLda(n_local_topics, n_global_topics, docs, vocab, word_idx,
                  sentences, sent_idx)
    print("Done! Running %d iterations..." % iterations)
    for i in range(0, iterations):
        start = time.time()
        model.update()
        duration = time.time() - start
        updates.append(model.updates)
        log_lik.append(model.log_lik)
        durations.append(duration)
        print("Iteration %d, Duration: %f" % (i, duration))

    # Save model and data
    out_dir = os.path.join(
        "models/", datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(out_dir)
    with open(os.path.join(out_dir, 'mglda_model_2.pkl'), 'wb') as output:
        pickle.dump(model, output, pickle.HIGHEST_PROTOCOL)
    updates = np.asarray(updates)
    log_lik = np.asarray(log_lik)
    np.savetxt(os.path.join(out_dir, 'updates.txt'), updates)
    np.savetxt(os.path.join(out_dir, 'log_lik.txt'), log_lik)
    np.savetxt(os.path.join(out_dir, 'times.txt'), np.asarray(durations))
    print("Model and data saved!")

    # Add some plots
    print("Adding some plots..")
    x = np.arange(0, iterations)
    # Scaled by 1e5 to match the "x 10^5" in the axis label.
    _save_line_plot(x, log_lik / 1e5, "Log likelihood x 10^5",
                    os.path.join(out_dir, "log_lik.png"))
    _save_line_plot(x, updates, "Updates",
                    os.path.join(out_dir, "updates.png"))

    # Return the model
    return model
# NOTE(review): the line below has lost its line breaks and indentation, so it
# is not valid Python as written. It is left untouched because the nesting of
# its first part cannot be recovered from this view.
#
# Part 1 (up to `self.loglikelihood.append(loglikelihood)`) is the tail of a
# method whose header is outside this view. It takes `new_z` as the index of a
# single multinomial draw over `p_z`, adds log(p_z[new_z]) to the running
# `loglikelihood`, stores `new_z` in `self.assignment[i][w]`, and adds 1 to the
# matching entries of `self.nwz`, `self.ndz` and `self.nz`. It then prints the
# accumulated value and appends it to `self.loglikelihood`.
# TODO confirm which of these statements sit inside the enclosing loops.
#
# Part 2 (from `# EXAMPLE`) is a top-level demo script. It parses
# "./data/all/", builds `Lda(10, docs, vocab, word_idx)` and calls
# `l.update()` 200 times, printing a marker when `i % 10 == 0`. It then prints
# the top words via `l.top_words(10)` and divides every entry of
# `l.loglikelihood` by 100000.
# NOTE(review): under Python 2, `item / 100000` truncates if the items are
# ints -- presumably they are floats; confirm.
new_z = np.random.multinomial(1, p_z).argmax() loglikelihood += log(p_z[new_z]) self.assignment[i][w] = new_z self.nwz[cur_w, new_z] += 1 self.ndz[i, new_z] += 1 self.nz[new_z] += 1 print "Loglikelihood %f" % loglikelihood self.loglikelihood.append(loglikelihood) # EXAMPLE print "Parsing dir.." docs, vocab, word_idx, _, _ = data.parse_dir("./data/all/") # docs, vocab, word_idx, _, _ = data.parse_dir("/Users/jeisses/Documents/datasets/nlp/movie/review_polarity/txt_sentoken/all/") print "Done" print " ====== " print "Setting up LDA.." l = Lda(10, docs, vocab, word_idx) print "Training for 200 iterations..." for i in range(0, 200): l.update() if i % 10 == 0: print " -- iteration %d ---" % i print "Done. Top words for reach topic:" l.top_words(10) loglikelihood = [item / 100000 for item in l.loglikelihood]