def _newQueryStateFromCtm(data, model):
    import model.ctm_bohning as ctm

    ctm_model = ctm.newModelAtRandom(data, model.K, VocabPrior, model.dtype)
    ctm_query = ctm.newQueryState(data, ctm_model)
    ctm_plan  = ctm.newTrainPlan(200, epsilon=1, logFrequency=100, debug=False)

    ctm_model, ctm_query, (_, _, _) = ctm.train(data, ctm_model, ctm_query, ctm_plan)

    model.vocab[:, :]    = ctm_model.vocab
    model.topicCov[:, :] = ctm_model.sigT
    model.topicMean[:]   = ctm_model.topicMean

    K, vocab, dtype = model.K, model.vocab, model.dtype

    D, T = data.words.shape
    assert T == vocab.shape[1], \
        "The number of terms in the document-term matrix (" + str(T) + \
        ") differs from that in the model's vocabulary parameter (" + str(vocab.shape[1]) + ")"
    docLens = np.squeeze(np.asarray(data.words.sum(axis=1)))

    outMeans = ctm_query.means
    outVarcs = np.ones((D, K), dtype=dtype)

    inMeans = np.ndarray(shape=(D, K), dtype=dtype)
    for d in range(D):
        inMeans[d, :] = rd.multivariate_normal(outMeans[d, :], model.topicCov)
    inVarcs = np.ones((D, K), dtype=dtype)

    inDocCov = np.ones((D,), dtype=dtype)

    return QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)
def testPerplexityOnRealDataWithCtm(self):
    dtype = np.float64  # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    with open(AclDictPath, "rb") as f:
        d = pkl.load(f)

    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # IDF frequency for when we print out the vocab later
    freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
    scale = np.reciprocal(1 + freq)

    # Initialise the model
    K = 10  # TopicCount
    model = ctm.newModelAtRandom(data, K, dtype=dtype)
    queryState = ctm.newQueryState(data, model)
    trainPlan = ctm.newTrainPlan(iterations=200, logFrequency=10, fastButInaccurate=False, debug=False)

    # Train the model, and then immediately save the result to a file for subsequent inspection
    model, query, (bndItrs, bndVals, bndLikes) = ctm.train(data, model, queryState, trainPlan)
    # with open(newModelFileFromModel(model), "wb") as f:
    #     pkl.dump((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the evolution of the bound during training
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    fig.show()
    plt.show()

    # Plot the topic covariance
    fig, ax1 = plt.subplots()
    ax1.imshow(model.sigT, interpolation="nearest", cmap=cm.Greys_r)
    fig.show()
    plt.show()

    # Print out the most likely topic words
    # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
    vocab = ctm.wordDists(model)
    topWordCount = 10
    kTopWordInds = [self.topWordInds(vocab[k, :], topWordCount) for k in range(K)]

    like = ctm.log_likelihood(data, model, query)
    perp = perplexity_from_like(like, data.word_count)

    print("Perplexity: %f\n\n" % perp)

    for k in range(model.K):
        print("\nTopic %d\n=============================" % k)
        print("\n".join("%-20s\t%0.4f" % (d[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))
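# The helper self.topWordInds(...) used above (and, with an extra dictionary
# argument, in testOnRealData below) is defined elsewhere in the test class.
# The sketch below is an illustrative assumption of what it does: return the
# indices of the `count` highest-weighted terms for a topic. The name
# _topWordIndsSketch is hypothetical and is not the original implementation.
def _topWordIndsSketch(self, wordDists, count=10):
    # argsort is ascending, so take the last `count` indices and reverse them
    return np.asarray(wordDists).argsort()[-count:][::-1]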
def testOnRealData(self):
    print("CTM/Bohning")
    rd.seed(0xC0FFEE)
    dtype = np.float64

    path = "/Users/bryanfeeney/Desktop/NIPS"
    with open(path + "/ar.pkl", 'rb') as f:
        _, W, _, d = pkl.load(f)
    if len(d) == 1:
        d = d[0]

    if W.dtype != dtype:
        W = W.astype(dtype)

    docLens = np.squeeze(np.asarray(W.sum(axis=1)))
    good_rows = (np.where(docLens > 0.5))[0]
    if len(good_rows) < W.shape[0]:
        print("Some rows in the doc-term matrix are empty. These have been removed.")
        W = W[good_rows, :]

    # IDF frequency for when we print out the vocab later
    freq = np.squeeze(np.asarray(W.sum(axis=0)))
    scale = np.reciprocal(1 + freq)

    # Initialise the model
    K = 20
    model = ctm.newModelAtRandom(W, K, dtype=dtype)
    queryState = ctm.newQueryState(W, model)
    trainPlan = ctm.newTrainPlan(iterations=750, logFrequency=10, fastButInaccurate=False, debug=True)

    # Train the model, and then immediately save the result to a file for subsequent inspection
    model, query, (bndItrs, bndVals, bndLikes) = ctm.train(W, None, model, queryState, trainPlan)
    with open(newModelFile("ctm-bohn-nips-ar", K, None), "wb") as f:
        pkl.dump((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the bound
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    fig.suptitle("CTM/Bohning (Identity Cov) on NIPS")
    fig.show()
    plt.show()

    topWordCount = 100
    kTopWordInds = [self.topWordInds(d, model.vocab[k, :] * scale, topWordCount)
                    for k in range(K)]

    print("Perplexity: %f\n\n" % ctm.perplexity(W, model, query))
    print("\t\t".join(["Topic " + str(k) for k in range(K)]))
    print("\n".join("\t".join(d[kTopWordInds[k][c]] + "\t%0.4f" % model.vocab[k][kTopWordInds[k][c]] for k in range(K)) for c in range(topWordCount)))
def testCrossValPerplexityOnRealDataWithCtmInc(self):
    dtype = np.float64  # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # Initialise the training and query plans
    trainPlan = ctm.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
    queryPlan = ctm.newTrainPlan(iterations=100, logFrequency=10, fastButInaccurate=False, debug=False)

    topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    for K in topicCounts:
        trainPerps = []
        queryPerps = []
        for fold in range(1):  # range(NumFolds):
            trainData, queryData = data.cross_valid_split(fold, NumFolds)

            model = ctm.newModelAtRandom(trainData, K, dtype=dtype)
            query = ctm.newQueryState(trainData, model)

            # Train the model and evaluate perplexity on the training fold
            model, trainResult, (_, _, _) = ctm.train(trainData, model, query, trainPlan)

            like = ctm.log_likelihood(trainData, model, trainResult)
            perp = perplexity_from_like(like, trainData.word_count)
            trainPerps.append(perp)

            # Query the held-out fold and evaluate its perplexity
            query = ctm.newQueryState(queryData, model)
            model, queryResult = ctm.query(queryData, model, query, queryPlan)

            like = ctm.log_likelihood(queryData, model, queryResult)
            perp = perplexity_from_like(like, queryData.word_count)
            queryPerps.append(perp)

        # Append the cross-fold averages, then print per-fold and average perplexities
        trainPerps.append(sum(trainPerps) / NumFolds)
        queryPerps.append(sum(queryPerps) / NumFolds)
        print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps])))
        print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))
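# perplexity_from_like(...) used above is imported at module level (not shown in
# this section). A minimal sketch of the assumed definition is given below:
# perplexity as the exponentiated negative per-word log-likelihood, which matches
# the manual computation in _testOnModelDerivedExample below. The name is
# hypothetical, chosen so as not to shadow the real import.
def _perplexity_from_like_sketch(log_likelihood, word_count):
    # perp = exp(-L / N), where L is the total log-likelihood and N the token count
    return np.exp(-log_likelihood / word_count)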
def _testOnModelHandcraftedData(self):
    #
    # Create the vocab
    #
    T = 3 * 3
    K = 5

    # Horizontal bars
    vocab1 = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])), shape=(3, 3)).todense()
    # vocab2 = ssp.coo_matrix(([1, 1, 1], ([1, 1, 1], [0, 1, 2])), shape=(3, 3)).todense()
    vocab3 = ssp.coo_matrix(([1, 1, 1], ([2, 2, 2], [0, 1, 2])), shape=(3, 3)).todense()

    # Vertical bars
    vocab4 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 0, 0])), shape=(3, 3)).todense()
    # vocab5 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [1, 1, 1])), shape=(3, 3)).todense()
    vocab6 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [2, 2, 2])), shape=(3, 3)).todense()

    # Diagonals
    vocab7 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 1, 2])), shape=(3, 3)).todense()
    # vocab8 = ssp.coo_matrix(([1, 1, 1], ([2, 1, 0], [0, 1, 2])), shape=(3, 3)).todense()

    # Put together
    T = vocab1.shape[0] * vocab1.shape[1]
    vocabs = [vocab1, vocab3, vocab4, vocab6, vocab7]

    # Create a single matrix with the flattened vocabularies
    vocabVectors = []
    for vocab in vocabs:
        vocabVectors.append(np.squeeze(np.asarray(vocab.reshape((1, T)))))

    vocab = normalizerows_ip(np.array(vocabVectors, dtype=DTYPE))

    # Plot the vocab
    ones = np.ones(vocabs[0].shape)
    for k in range(K):
        plt.subplot(2, 3, k + 1)  # subplot indices are 1-based
        plt.imshow(ones - vocabs[k], interpolation="none", cmap=cm.Greys_r)
    plt.show()

    #
    # Create the corpus
    #
    rd.seed(0xC0FFEE)
    D = 1000

    # Make sense (of a sort) of this by assuming that these correspond to
    #   Kittens  Omelettes  Puppies  Oranges  Tomatoes  Dutch People  Basketball  Football
    # topicMean = np.array([10, 25, 5, 15, 5, 5, 10, 25])
    # topicCovar = np.array(
    #     [[ 100,    5,   55,   20,    5,   15,    4,    0],
    #      [   5,  100,    5,   10,   70,    5,    0,    0],
    #      [  55,    5,  100,    5,    5,   10,    0,    5],
    #      [  20,   10,    5,  100,   30,   30,   20,   10],
    #      [   5,   70,    5,   30,  100,    0,    0,    0],
    #      [  15,    5,   10,   30,    0,  100,   10,   40],
    #      [   4,    0,    0,   20,    0,   10,  100,   20],
    #      [   0,    0,    5,   10,    0,   40,   20,  100]], dtype=DTYPE) / 100.0

    topicMean = np.array([25, 15, 40, 5, 15])
    self.assertEqual(100, topicMean.sum())
    topicCovar = np.array(
        [[ 100,    5,   55,   20,    5],
         [   5,  100,    5,   10,   70],
         [  55,    5,  100,    5,    5],
         [  20,   10,    5,  100,   30],
         [   5,   70,    5,   30,  100],
        ], dtype=DTYPE) / 100.0

    meanWordCount = 80
    wordCounts = rd.poisson(meanWordCount, size=D)
    topicDists = rd.multivariate_normal(topicMean, topicCovar, size=D)
    W = topicDists.dot(vocab) * wordCounts[:, np.newaxis]
    W = ssp.csr_matrix(W.astype(DTYPE))

    #
    # Train the model
    #
    model = ctm.newModelAtRandom(W, K, dtype=DTYPE)
    queryState = ctm.newQueryState(W, model)
    trainPlan = ctm.newTrainPlan(iterations=65, plot=True, logFrequency=1)

    self.assertTrue(0.99 < np.sum(model.topicMean) < 1.01)

    return self._doTest(W, model, queryState, trainPlan)
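# self._doTest(...) called above is defined elsewhere in the test class. The
# sketch below is a hedged guess at its shape: train the model with the given
# plan and check that the variational bound did not deteriorate. The name
# _doTestSketch and the assertion are illustrative assumptions, not the original.
def _doTestSketch(self, W, model, queryState, trainPlan):
    # Train on the handcrafted corpus, keeping the bound trace
    model, queryState, (bndItrs, bndVals) = ctm.train(W, None, model, queryState, trainPlan)
    # The bound should be at least as good at the end of training as at the start
    self.assertTrue(bndVals[-1] >= bndVals[0])
    return model, queryState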
def _testOnModelDerivedExample(self):
    print("Cross-validated likelihoods on model-derived example")

    rd.seed(0xBADB055)  # Global init for repeatable test
    D, T, K = 1000, 100, 7  # Document count, vocabulary size ("term count") and topic count
    tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)

    W = W.astype(DTYPE)

    plt.imshow(vocab, interpolation="none", cmap=cm.Greys_r)
    plt.show()

    # Create the cross-validation folds
    folds = 5
    foldSize = ceil(D / folds)
    querySize = foldSize
    trainSize = D - querySize

    for useDiagonalPriorCov in [False, True]:
        trainLikely = []
        trainWordCount = []
        queryLikely = []
        queryWordCount = []

        for fold in range(folds):
            # Split the datasets
            start = fold * foldSize
            end = start + trainSize

            trainSet = np.arange(start, end) % D
            querySet = np.arange(end, end + querySize) % D

            W_train = W[trainSet, :]
            W_query = W[querySet, :]

            # Train the model
            model = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
            queryState = ctm.newQueryState(W_train, model)

            plan = ctm.newTrainPlan(iterations=20, logFrequency=1, fastButInaccurate=useDiagonalPriorCov)
            model, queryState, (bndItrs, bndVals) = ctm.train(W_train, None, model, queryState, plan)

            # Plot the evolution of the bound during training
            plt.plot(bndItrs[5:], bndVals[5:])
            plt.xlabel("Iterations")
            plt.ylabel("Variational Bound")
            plt.show()

            # Plot the topic covariance
            self._plotCov(model)

            # Plot the vocab
            plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
            plt.show()

            # Calculate the training-set likelihood
            trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
            trainWordCount.append(W_train.data.sum())

            # Now query the model
            plan = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
            queryState = ctm.newQueryState(W_query, model)
            model, queryState = ctm.query(W_query, None, model, queryState, plan)

            queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
            queryWordCount.append(W_query.data.sum())

        # Print out the likelihood and perplexity for each fold
        print("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
        for fold in range(folds):
            trainPerp = np.exp(-trainLikely[fold] / trainWordCount[fold])
            queryPerp = np.exp(-queryLikely[fold] / queryWordCount[fold])

            print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
            print("          Perplexity:           %12.2f \t Perplexity:           %12.2f" % (trainPerp, queryPerp))

            self.assertTrue(queryPerp < 60.0)  # Maximum acceptable perplexity
            self.assertTrue(trainPerp < 60.0)
        print("\n\n")

    print("End of Test")
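# self._sampleFromModel(D, T, K) used above is defined elsewhere in the test
# class. The sketch below shows one plausible generative process consistent with
# how the handcrafted test above builds its corpus: random topic-word
# distributions, logistic-normal document-topic proportions, Poisson document
# lengths and multinomial word counts. All names and parameter choices are
# illustrative assumptions, not the original implementation.
def _sampleFromModelSketch(self, D=1000, T=100, K=7, avgWordsPerDoc=250):
    # Random topic-word distributions; each row sums to one
    vocab = rd.dirichlet(np.ones(T) * 0.1, size=K).astype(DTYPE)

    # Logistic-normal document-topic proportions
    raw = rd.multivariate_normal(np.zeros(K), np.eye(K), size=D)
    tpcs = np.exp(raw - raw.max(axis=1, keepdims=True))
    tpcs /= tpcs.sum(axis=1, keepdims=True)

    # Poisson document lengths, multinomial word counts per document
    docLens = rd.poisson(avgWordsPerDoc, size=D)
    W = np.zeros((D, T), dtype=DTYPE)
    for d in range(D):
        p = tpcs[d, :].dot(vocab)
        W[d, :] = rd.multinomial(docLens[d], p / p.sum())

    return tpcs, vocab, docLens, ssp.csr_matrix(W)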