def testPerplexityOnRealData(self): dtype = np.float64 # DTYPE rd.seed(0xBADB055) data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath) with open(AclDictPath, "rb") as f: d = pkl.load(f) data.convert_to_dtype(np.int32) data.prune_and_shuffle(min_doc_len=50, min_link_count=2) data.convert_to_undirected_graph() data.convert_to_binary_link_matrix() # IDF frequency for when we print out the vocab later freq = np.squeeze(np.asarray(data.words.sum(axis=0))) scale = np.reciprocal(1 + freq) # Initialise the model K = 10 model = lda.newModelAtRandom(data, K, dtype=dtype) queryState = lda.newQueryState(data, model) trainPlan = lda.newTrainPlan(iterations=300, logFrequency=50, fastButInaccurate=False, debug=True) # Train the model, and the immediately save the result to a file for subsequent inspection model, query, (bndItrs, bndVals, bndLikes) = lda.train (data, model, queryState, trainPlan) # with open(newModelFileFromModel(model), "wb") as f: # pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f) # Plot the evolution of the bound during training. fig, ax1 = plt.subplots() ax1.plot(bndItrs, bndVals, 'b-') ax1.set_xlabel('Iterations') ax1.set_ylabel('Bound', color='b') ax2 = ax1.twinx() ax2.plot(bndItrs, bndLikes, 'r-') ax2.set_ylabel('Likelihood', color='r') fig.show() plt.show() vocab = lda.wordDists(model) plt.imshow(vocab, interpolation="none", cmap = cm.Greys_r) plt.show() # Print out the most likely topic words topWordCount = 10 kTopWordInds = [self.topWordInds(vocab[k,:], topWordCount) for k in range(K)] like = lda.log_likelihood(data, model, query) perp = perplexity_from_like(like, data.word_count) print ("Prior %s" % (str(model.topicPrior))) print ("Perplexity: %f\n\n" % perp) for k in range(model.K): print ("\nTopic %d\n=============================" % k) print ("\n".join("%-20s\t%0.4f" % (d[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))
def testCrossValPerplexityOnRealDataWithLdaGibbsInc(self): ActiveFolds = 3 dtype = np.float64 # DTYPE rd.seed(0xBADB055) data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath) data.convert_to_dtype(np.int32) # Gibbs expects integers as input, regardless of model dtype data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount) # Training setup TrainSamplesPerTopic = 10 QuerySamplesPerTopic = 2 Thin = 2 Debug = False # Start running experiments topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50] for K in topicCounts: trainPlan = lda_gibbs.newTrainPlan(K * TrainSamplesPerTopic, thin=Thin, debug=Debug) queryPlan = lda_gibbs.newTrainPlan(K * QuerySamplesPerTopic, thin=Thin, debug=Debug) trainPerps = [] queryPerps = [] for fold in range(ActiveFolds): # range(NumFolds): trainData, queryData = data.cross_valid_split(fold, NumFolds) estData, evalData = queryData.doc_completion_split() model = lda_gibbs.newModelAtRandom(trainData, K, dtype=dtype) query = lda_gibbs.newQueryState(trainData, model) # Train the model, and the immediately save the result to a file for subsequent inspection model, trainResult, (_, _, _) = lda_gibbs.train (trainData, model, query, trainPlan) like = lda_gibbs.log_likelihood(trainData, model, trainResult) perp = perplexity_from_like(like, trainData.word_count) trainPerps.append(perp) query = lda_gibbs.newQueryState(estData, model) _, queryResult = lda_gibbs.query(estData, model, query, queryPlan) like = lda_gibbs.log_likelihood(evalData, model, queryResult) perp = perplexity_from_like(like, evalData.word_count) queryPerps.append(perp) trainPerps.append(sum(trainPerps) / ActiveFolds) queryPerps.append(sum(queryPerps) / ActiveFolds) print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps]))) print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))