def testOnRealData(self):
    rd.seed(0xDAFF0D12)
    path = "/Users/bryanfeeney/Desktop/NIPS"
    with open(path + "/ar.pkl", "rb") as f:
        X, W, feats_dict, dic = pkl.load(f)
    if W.dtype != DTYPE:
        W = W.astype(DTYPE)
    if X.dtype != DTYPE:
        X = X.astype(DTYPE)
    D, T = W.shape
    _, F = X.shape

    # Down-weight very frequent words when ranking the top words per topic
    freq  = np.squeeze(np.asarray(W.sum(axis=0)))
    scale = np.reciprocal(1. + freq)

    K = 10
    P = 30
    model      = stm.newModelAtRandom(X, W, P, K, 0.1, 0.1, dtype=DTYPE)
    queryState = stm.newQueryState(W, model)
    trainPlan  = stm.newTrainPlan(iterations=100, logFrequency=1, fastButInaccurate=True, debug=True)

    model, query, (bndItrs, bndVals, bndLikes) = stm.train(W, X, model, queryState, trainPlan)
    with open(newModelFile("stm-yv-bou-nips-ar", K, None), "wb") as f:
        pkl.dump((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the evolution of the bound during training.
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    fig.show()
    plt.show()

    # Print the top topic words
    topWordCount = 100
    kTopWordInds = [self.topWordInds(dic, model.vocab[k, :] * scale, topWordCount)
                    for k in range(K)]

    print("Perplexity: %f\n\n" % stm.perplexity(W, model, query))
    print("\t\t".join("Topic " + str(k) for k in range(K)))
    print("\n".join(
        "\t".join(dic[kTopWordInds[k][c]] + "\t%0.4f" % model.vocab[k][kTopWordInds[k][c]]
                  for k in range(K))
        for c in range(topWordCount)))
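
# The test above calls self.topWordInds(...) and newModelFile(...), which are
# defined elsewhere in this test suite. As a rough sketch of what topWordInds
# could plausibly look like (an assumption, not the project's actual
# implementation), it need only return the indices of the highest-weighted
# words; dic is accepted purely to match the call site above:
def topWordInds(self, dic, wordWeights, count=10):
    # Indices of the `count` largest weights, in descending order of weight.
    return wordWeights.argsort()[-count:][::-1]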
def _testLikelihoodOnModelDerivedExample(self):
    print("Cross-validated likelihoods on model-derived example")

    rd.seed(0xBADB055)  # Global init for repeatable test
    D, T, K, F, P = 200, 100, 10, 12, 8
    tpcs, vocab, docLens, X, W = sampleFromModel(D, T, K, F, P)

    plt.imshow(vocab, interpolation="none", cmap=cm.Greys_r)
    plt.show()

    W = W.astype(DTYPE)
    X = X.astype(DTYPE)

    # Create the cross-validation folds
    folds     = 5
    foldSize  = ceil(D / folds)
    querySize = foldSize
    trainSize = D - querySize

    trainLikely    = []
    trainWordCount = []
    queryLikely    = []
    queryWordCount = []

    for fold in range(folds):
        # Split the datasets
        start = fold * foldSize
        end   = start + trainSize

        trainSet = np.arange(start, end) % D
        querySet = np.arange(end, end + querySize) % D

        X_train, W_train = X[trainSet, :], W[trainSet, :]
        X_query, W_query = X[querySet, :], W[querySet, :]

        # Train the model
        model      = stm.newModelAtRandom(X_train, W_train, P, K, 0.1, 0.1, dtype=DTYPE)
        queryState = stm.newQueryState(W_train, model)
        plan       = stm.newTrainPlan(iterations=1000, logFrequency=1)

        model, query, (bndItrs, bndVals, bndLikes) = stm.train(W_train, X_train, model, queryState, plan)

        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')

        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')

        fig.show()
        plt.show()

        # Plot the topic covariance
        self._plotCov(model)

        # Plot the vocab
        plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
        plt.show()

        # Calculate the training-set likelihood
        trainLikely.append(stm.log_likelihood(W_train, model, queryState))
        trainWordCount.append(W_train.data.sum())

        # Now query the model.
        plan       = stm.newTrainPlan(iterations=1000)
        queryState = stm.newQueryState(W_query, model)
        model, queryState = stm.query(W_query, X_query, model, queryState, plan)

        queryLikely.append(stm.log_likelihood(W_query, model, queryState))
        queryWordCount.append(W_query.data.sum())

    # Check and print results. Perplexity is the standard per-word measure,
    # exp(-log-likelihood / total word count), so lower is better.
    for fold in range(folds):
        trainPerp = np.exp(-trainLikely[fold] / trainWordCount[fold])
        queryPerp = np.exp(-queryLikely[fold] / queryWordCount[fold])

        print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
        print("          Perplexity: %12.2f \t Perplexity: %12.2f" % (trainPerp, queryPerp))

        self.assertTrue(queryPerp < 60.0)  # Maximum allowed perplexity
        self.assertTrue(trainPerp < 60.0)

    print("End of Test")