def _testPerplexityOnRealData(self): dtype = np.float64 # DTYPE rd.seed(0xBADB055) data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath) with open(AclDictPath, "rb") as f: dic = pkl.load(f) data.convert_to_dtype(dtype) data.convert_to_undirected_graph() data.convert_to_binary_link_matrix() data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount) # IDF frequency for when we print out the vocab later freq = np.squeeze(np.asarray(data.words.sum(axis=0))) scale = np.reciprocal(1 + freq) # Initialise the model K = TopicCount model = rtm.newModelAtRandom(data, K, dtype=dtype) queryState = rtm.newQueryState(data, model) trainPlan = rtm.newTrainPlan(iterations=50, logFrequency=LogFreq, fastButInaccurate=False, debug=True) # Train the model, and the immediately save the result to a file for subsequent inspection model, query, (bndItrs, bndVals, bndLikes) = rtm.train (data, model, queryState, trainPlan) # with open(newModelFileFromModel(model), "wb") as f: # pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f) # Plot the evolution of the bound during training. fig, ax1 = plt.subplots() ax1.plot(bndItrs, bndVals, 'b-') ax1.set_xlabel('Iterations') ax1.set_ylabel('Bound', color='b') ax2 = ax1.twinx() ax2.plot(bndItrs, bndLikes, 'r-') ax2.set_ylabel('Likelihood', color='r') fig.show() plt.show() # Print out the most likely topic words # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0)))) vocab = rtm.wordDists(model) topWordCount = 10 kTopWordInds = [self.topWordInds(vocab[k,:], topWordCount) for k in range(K)] like = rtm.log_likelihood(data, model, query) perp = perplexity_from_like(like, data.word_count) print ("Prior %s" % (str(model.topicPrior))) print ("Perplexity: %f\n\n" % perp) for k in range(model.K): print("\nTopic %d\n=============================" % k) print("\n".join("%-20s\t%0.4f" % (dic[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))
def testMapOnRealData(self): dtype = np.float64 # DTYPE rd.seed(0xBADB055) data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath) with open(AclDictPath, "rb") as f: dic = pkl.load(f) data.convert_to_dtype(dtype) data.convert_to_undirected_graph() data.convert_to_binary_link_matrix() data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount) trainData, testData = data.doc_completion_split() for pseudoNegCount in (5, 10, 25, 50, 100): rd.seed(0xC0FFEE) # Initialise the model K = TopicCount model = rtm.newModelAtRandom(trainData, K, dtype=dtype, pseudoNegCount=data.doc_count * pseudoNegCount) queryState = rtm.newQueryState(trainData, model) trainPlan = rtm.newTrainPlan(iterations=50, logFrequency=LogFreq, fastButInaccurate=False, debug=True) # Train the model, and the immediately save the result to a file for subsequent inspection model, topics, (bndItrs, bndVals, bndLikes) = rtm.train(trainData, model, queryState, trainPlan) # with open(newModelFileFromModel(model), "wb") as f: # pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f) # Plot the evolution of the bound during training. fig, ax1 = plt.subplots() ax1.plot(bndItrs, bndVals, 'b-') ax1.set_xlabel('Iterations') ax1.set_ylabel('Bound', color='b') ax2 = ax1.twinx() ax2.plot(bndItrs, bndLikes, 'r-') ax2.set_ylabel('Likelihood', color='r') fig.show() plt.show() # Print out the most likely topic words # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0)))) vocab = rtm.wordDists(model) topWordCount = 10 kTopWordInds = [self.topWordInds(vocab[k, :], topWordCount) for k in range(K)] like = rtm.log_likelihood(trainData, model, topics) perp = perplexity_from_like(like, trainData.word_count) # print ("Prior %s" % (str(model.topicPrior))) print ("Pseudo Neg-Count: %d " % pseudoNegCount) print ("\tTrain Perplexity: %f\n\n" % perp) # for k in range(model.K): # print ("\nTopic %d\n=============================" % k) # print ("\n".join("%-20s\t%0.4f" % (dic[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount))) min_probs = rtm.min_link_probs(model, topics, testData.links) link_probs = rtm.link_probs(model, topics, min_probs) try: map = mean_average_prec(testData.links, link_probs) except: print ("Unexpected error") print("\tThe Mean-Average-Precision is %.3f" % map)