Example #1
def _newQueryStateFromCtm(data, model):
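    # Train a standalone CTM (Bohning bound) model on the data, copy its
    # vocabulary, topic covariance and topic mean into `model`, then use the
    # trained per-document means to seed the query state returned below.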
    import model.ctm_bohning as ctm

    ctm_model = ctm.newModelAtRandom(data, model.K, VocabPrior, model.dtype)
    ctm_query = ctm.newQueryState(data, model)
    ctm_plan  = ctm.newTrainPlan(200, epsilon=1, logFrequency=100, debug=False)

    ctm_model, ctm_query, (_, _, _) = ctm.train(data, ctm_model, ctm_query, ctm_plan)

    model.vocab[:,:]    = ctm_model.vocab
    model.topicCov[:,:] = ctm_model.sigT
    model.topicMean[:]  = ctm_model.topicMean

    K, vocab, dtype = model.K, model.vocab, model.dtype

    D,T = data.words.shape
    assert T == vocab.shape[1], "The number of terms in the document-term matrix (%d) differs from the number in the model's vocabulary (%d)" % (T, vocab.shape[1])
    docLens = np.squeeze(np.asarray(data.words.sum(axis=1)))

    outMeans = ctm_query.means
    outVarcs = np.ones((D,K), dtype=dtype)

    inMeans = np.empty((D,K), dtype=dtype)
    for d in range(D):
        inMeans[d,:] = rd.multivariate_normal(outMeans[d,:], model.topicCov)
    inVarcs = np.ones((D,K), dtype=dtype)

    inDocCov  = np.ones((D,), dtype=dtype)

    return QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)
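
The per-document loop above draws each row of inMeans with a separate call; because the covariance is shared across documents, all D draws can be made at once. A minimal numpy sketch of that vectorisation (the function name sample_doc_means is illustrative, not part of the codebase):

import numpy as np
import numpy.random as rd

def sample_doc_means(outMeans, topicCov):
    # One zero-mean draw of shape (D, K) from N(0, topicCov), shifted row-wise
    # by each document's mean -- equivalent to the per-document loop above.
    D, K = outMeans.shape
    noise = rd.multivariate_normal(np.zeros(K), topicCov, size=D)
    return outMeans + noise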
Example #2
    def testPerplexityOnRealDataWithCtm(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            d = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # IDF-style scale for when we print out the vocab later
        freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
        scale = np.reciprocal(1 + freq)

        # Initialise the model
        K = 10 # TopicCount
        model      = ctm.newModelAtRandom(data, K, dtype=dtype)
        queryState = ctm.newQueryState(data, model)
        trainPlan  = ctm.newTrainPlan(iterations=200, logFrequency=10, fastButInaccurate=False, debug=False)

        # Train the model, then immediately save the result to a file for subsequent inspection
        model, query, (bndItrs, bndVals, bndLikes) = ctm.train (data, model, queryState, trainPlan)
#        with open(newModelFileFromModel(model), "wb") as f:
#            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')

        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')

        fig.show()
        plt.show()

        fig, ax1 = plt.subplots()
        ax1.imshow(model.sigT, interpolation="nearest", cmap=cm.Greys_r)
        fig.show()
        plt.show()

        # Print out the most likely topic words
        # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
        vocab = ctm.wordDists(model)
        topWordCount = 10
        kTopWordInds = [self.topWordInds(vocab[k,:], topWordCount) for k in range(K)]

        like = ctm.log_likelihood(data, model, query)
        perp = perplexity_from_like(like, data.word_count)

        print ("Perplexity: %f\n\n" % perp)

        for k in range(model.K):
            print("\nTopic %d\n=============================" % k)
            print("\n".join("%-20s\t%0.4f" % (d[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))
Example #3
    def testOnRealData(self):
        print ("CTM/Bohning")
        rd.seed(0xC0FFEE)
        dtype = np.float64

        path = "/Users/bryanfeeney/Desktop/NIPS"
        with open(path + "/ar.pkl", 'rb') as f:
            _, W, _, d = pkl.load(f)
        if len(d) == 1:
            d = d[0]

        if W.dtype != dtype:
            W = W.astype(dtype)

        docLens   = np.squeeze(np.asarray(W.sum(axis=1)))
        good_rows = (np.where(docLens > 0.5))[0]
        if len(good_rows) < W.shape[0]:
            print ("Some rows in the doc-term matrix are empty. These have been removed.")
        W = W[good_rows, :]

        # IDF-style scale for when we print out the vocab later
        freq = np.squeeze(np.asarray(W.sum(axis=0)))
        scale = np.reciprocal(1 + freq)

        # Initialise the model
        K = 20
        model      = ctm.newModelAtRandom(W, K, dtype=dtype)
        queryState = ctm.newQueryState(W, model)
        trainPlan  = ctm.newTrainPlan(iterations=750, logFrequency=10, fastButInaccurate=False, debug=True)

        # Train the model, then immediately save the result to a file for subsequent inspection
        model, query, (bndItrs, bndVals, bndLikes) = ctm.train (W, None, model, queryState, trainPlan)
        with open(newModelFile("ctm-bohn-nips-ar", K, None), "wb") as f:
            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

        # Plot the bound
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')

        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')

        # Set the title before showing the figure, not after
        fig.suptitle("CTM/Bohning (Identity Cov) on NIPS")
        fig.show()
        plt.show()

        topWordCount = 100
        kTopWordInds = [self.topWordInds(d, model.vocab[k,:] * scale, topWordCount) \
                        for k in range(K)]

        print ("Perplexity: %f\n\n" % ctm.perplexity(W, model, query))
        print ("\t\t".join (["Topic " + str(k) for k in range(K)]))
        print ("\n".join ("\t".join (d[kTopWordInds[k][c]] + "\t%0.4f" % model.vocab[k][kTopWordInds[k][c]] for k in range(K)) for c in range(topWordCount)))
Example #4
    def testCrossValPerplexityOnRealDataWithCtmInc(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Initialise the model
        trainPlan = ctm.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
        queryPlan = ctm.newTrainPlan(iterations=100, logFrequency=10, fastButInaccurate=False, debug=False)

        topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        for K in topicCounts:
            trainPerps = []
            queryPerps = []
            for fold in range(1): # range(NumFolds):
                trainData, queryData = data.cross_valid_split(fold, NumFolds)

                model = ctm.newModelAtRandom(trainData, K, dtype=dtype)
                query = ctm.newQueryState(trainData, model)

                # Train the model, then immediately save the result to a file for subsequent inspection
                model, trainResult, (_, _, _) = ctm.train (trainData, model, query, trainPlan)

                like = ctm.log_likelihood(trainData, model, trainResult)
                perp = perplexity_from_like(like, trainData.word_count)
                trainPerps.append(perp)

                query = ctm.newQueryState(queryData, model)
                model, queryResult = ctm.query(queryData, model, query, queryPlan)

                like = ctm.log_likelihood(queryData, model, queryResult)
                perp = perplexity_from_like(like, queryData.word_count)
                queryPerps.append(perp)

            trainPerps.append(sum(trainPerps) / len(trainPerps))  # average over the folds actually run
            queryPerps.append(sum(queryPerps) / len(queryPerps))
            print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps])))
            print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))
Example #5
    def _testOnModelHandcraftedData(self):
        #
        # Create the vocab
        #
        T = 3 * 3
        K = 5
        
        # Horizontal bars
        vocab1 = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])), shape=(3,3)).todense()
        #vocab2 = ssp.coo_matrix(([1, 1, 1], ([1, 1, 1], [0, 1, 2])), shape=(3,3)).todense()
        vocab3 = ssp.coo_matrix(([1, 1, 1], ([2, 2, 2], [0, 1, 2])), shape=(3,3)).todense()
        
        # Vertical bars
        vocab4 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 0, 0])), shape=(3,3)).todense()
        #vocab5 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [1, 1, 1])), shape=(3,3)).todense()
        vocab6 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [2, 2, 2])), shape=(3,3)).todense()
        
        # Diagonals
        vocab7 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 1, 2])), shape=(3,3)).todense()
        #vocab8 = ssp.coo_matrix(([1, 1, 1], ([2, 1, 0], [0, 1, 2])), shape=(3,3)).todense()
        
        # Put together
        T = vocab1.shape[0] * vocab1.shape[1]
        vocabs = [vocab1, vocab3, vocab4, vocab6, vocab7]
        
        # Create a single matrix with the flattened vocabularies
        vocabVectors = []
        for vocab in vocabs:
            vocabVectors.append (np.squeeze(np.asarray (vocab.reshape((1,T)))))
        
        vocab = normalizerows_ip(np.array(vocabVectors, dtype=DTYPE))
        
        # Plot the vocab
        ones = np.ones(vocabs[0].shape)
        for k in range(K):
            plt.subplot(2, 3, k + 1)  # subplot indices are 1-based
            plt.imshow(ones - vocabs[k], interpolation="none", cmap = cm.Greys_r)
        plt.show()
        
        #
        # Create the corpus
        #
        rd.seed(0xC0FFEE)
        D = 1000

        # Make sense (of a sort) of this by assuming that these correspond to
        # Kittens    Omelettes    Puppies    Oranges    Tomatoes    Dutch People    Basketball    Football
        #topicMean = np.array([10, 25, 5, 15, 5, 5, 10, 25])
#        topicCovar = np.array(\
#            [[ 100,    5,     55,      20,     5,     15,      4,      0], \
#             [ 5,    100,      5,      10,    70,      5,      0,      0], \
#             [ 55,     5,    100,       5,     5,     10,      0,      5], \
#             [ 20,    10,      5,     100,    30,     30,     20,     10], \
#             [ 5,     70,      5,     30,    100,      0,      0,      0], \
#             [ 15,     5,     10,     30,      0,    100,     10,     40], \
#             [ 4,      0,      0,     20,      0,     10,    100,     20], \
#             [ 0,      0,      5,     10,      0,     40,     20,    100]], dtype=DTYPE) / 100.0

        topicMean = np.array([25, 15, 40, 5, 15])
        self.assertEqual(100, topicMean.sum())
        topicCovar = np.array(\
            [[ 100,    5,     55,      20,     5     ], \
             [ 5,    100,      5,      10,    70     ], \
             [ 55,     5,    100,       5,     5     ], \
             [ 20,    10,      5,     100,    30     ], \
             [ 5,     70,      5,     30,    100     ], \
             ], dtype=DTYPE) / 100.0
 
        
        meanWordCount = 80
        wordCounts = rd.poisson(meanWordCount, size=D)
        topicDists = rd.multivariate_normal(topicMean, topicCovar, size=D)
        W = topicDists.dot(vocab) * wordCounts[:, np.newaxis]
        W = ssp.csr_matrix (W.astype(DTYPE))
        
        #
        # Train the model
        #
        model      = ctm.newModelAtRandom(W, K, dtype=DTYPE)
        queryState = ctm.newQueryState(W, model)
        trainPlan  = ctm.newTrainPlan(iterations=65, plot=True, logFrequency=1)
        
        self.assertTrue (0.99 < np.sum(model.topicMean) < 1.01)
        
        return self._doTest (W, model, queryState, trainPlan)
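
normalizerows_ip, used above to turn the stacked bar patterns into per-topic word distributions, by its name normalises each row in place so it sums to one. A minimal sketch under that assumption:

import numpy as np

def normalizerows_ip(matrix):
    # Divide each row by its sum, in place; rows become distributions.
    matrix /= matrix.sum(axis=1, keepdims=True)
    return matrix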
Example #6
    def _testOnModelDerivedExample(self):
        print("Cross-validated likelihoods on model-derived example")

        rd.seed(0xBADB055) # Global init for repeatable test
        D, T, K = 1000, 100, 7 # Document count, vocabulary size ("term count") and topic count
        tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)

        W = W.astype(DTYPE)

        plt.imshow(vocab, interpolation="none", cmap = cm.Greys_r)
        plt.show()

        # Create the cross-validation folds
        folds     = 5
        foldSize  = ceil(D / folds)
        querySize = foldSize
        trainSize = D - querySize

        for useDiagonalPriorCov in [False, True]:
            trainLikely = []
            trainWordCount = []
            queryLikely = []
            queryWordCount = []

            for fold in range(folds):
                # Split the datasets
                start = fold * foldSize
                end   = start + trainSize

                trainSet = np.arange(start,end) % D
                querySet = np.arange(end, end + querySize) % D

                W_train = W[trainSet,:]
                W_query = W[querySet,:]

                # Train the model
                model = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
                queryState = ctm.newQueryState(W_train, model)

                plan  = ctm.newTrainPlan(iterations=20, logFrequency=1, fastButInaccurate=useDiagonalPriorCov)
                model, queryState, (bndItrs, bndVals) = ctm.train (W_train, None, model, queryState, plan)

                # Plot the evolution of the bound during training.
                plt.plot(bndItrs[5:], bndVals[5:])
                plt.xlabel("Iterations")
                plt.ylabel("Variational Bound")
                plt.show()

                # Plot the topic covariance
                self._plotCov(model)

                # Plot the vocab
                plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
                plt.show()

                # Calculate the training-set likelihood
                trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
                trainWordCount.append(W_train.data.sum())

                # Now query the model.
                plan       = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
                queryState = ctm.newQueryState(W_query, model)
                model, queryState = ctm.query(W_query, None, model, queryState, plan)

                queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
                queryWordCount.append(W_query.data.sum())

            # Print out the likelihood and perplexity for each fold.
            print ("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
            for fold in range(folds):
                trainPerp = np.exp(-trainLikely[fold]/trainWordCount[fold])
                queryPerp = np.exp(-queryLikely[fold]/queryWordCount[fold])

                print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
                print("                    Perplexity: %12.2f \t           Perplexity: %12.2f" % (trainPerp, queryPerp))

                self.assertTrue(queryPerp < 60.0) # Maximum perplexity.
                self.assertTrue(trainPerp < 60.0)
            print ("\n\n")

        print("End of Test")