# Imports assumed by this listing; `ctm` (and, in the later test, `stm`,
# `DTYPE` and `newModelFile`) are project-local, and their definitions are
# not shown in the original.
import pickle as pkl

import numpy as np
import numpy.random as rd
import matplotlib.pyplot as plt


def testOnRealData(self):
    print("CTM/Bohning")
    rd.seed(0xC0FFEE)
    dtype = np.float64

    path = "/Users/bryanfeeney/Desktop/NIPS"
    with open(path + "/ar.pkl", 'rb') as f:
        _, W, _, d = pkl.load(f)
    if len(d) == 1:
        d = d[0]

    if W.dtype != dtype:
        W = W.astype(dtype)

    # Remove any empty documents (rows of the doc-term matrix with no words)
    docLens   = np.squeeze(np.asarray(W.sum(axis=1)))
    good_rows = np.where(docLens > 0.5)[0]
    if len(good_rows) < W.shape[0]:
        print("Some rows in the doc-term matrix are empty. These have been removed.")
    W = W[good_rows, :]

    # Inverse-frequency scaling (IDF-style) used when printing the vocabulary later
    freq  = np.squeeze(np.asarray(W.sum(axis=0)))
    scale = np.reciprocal(1. + freq)

    # Initialise the model
    K = 20
    model      = ctm.newModelAtRandom(W, K, dtype=dtype)
    queryState = ctm.newQueryState(W, model)
    trainPlan  = ctm.newTrainPlan(iterations=750, logFrequency=10, fastButInaccurate=False, debug=True)

    # Train the model, then immediately save the result to a file for subsequent inspection
    model, query, (bndItrs, bndVals, bndLikes) = ctm.train(W, None, model, queryState, trainPlan)
    with open(newModelFile("ctm-bohn-nips-ar", K, None), "wb") as f:
        pkl.dump((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the evolution of the bound and the likelihood during training
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    # Set the title before showing the figure so that it actually appears
    fig.suptitle("CTM/Bohning (Identity Cov) on NIPS")
    plt.show()

    # Print the top words in each topic, scaled by inverse document frequency
    topWordCount = 100
    kTopWordInds = [self.topWordInds(d, model.vocab[k, :] * scale, topWordCount)
                    for k in range(K)]

    print("Perplexity: %f\n\n" % ctm.perplexity(W, model, query))
    print("\t\t".join("Topic " + str(k) for k in range(K)))
    print("\n".join(
        "\t".join(d[kTopWordInds[k][c]] + "\t%0.4f" % model.vocab[k][kTopWordInds[k][c]]
                  for k in range(K))
        for c in range(topWordCount)))
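
# Note: `self.topWordInds` is a helper defined elsewhere on the test class and
# not shown in this listing. A minimal sketch of what it presumably does (an
# assumption, not the project's actual code): rank the words by the given
# weights and return the indices of the `count` largest.
def topWordInds(self, dic, weights, count):
    # `dic` matches the call sites above but isn't needed just to rank indices;
    # np.argsort is ascending, so take the last `count` entries and reverse
    return np.argsort(weights)[-count:][::-1]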

def testOnRealData(self):
    rd.seed(0xDAFF0D12)

#    path = "/Users/bryanfeeney/Desktop/NIPS"
#    with open(path + "/ar.pkl", "rb") as f:
#        X, W, _, dic = pkl.load(f)

    path = "/Users/bryanfeeney/Desktop/SmallerDB-NoCJK-WithFeats-Fixed"
    with open(path + "/all-in-one.pkl", "rb") as f:
        W, X, dic = pkl.load(f)

    if W.dtype != DTYPE:
        W = W.astype(DTYPE)
    if X.dtype != DTYPE:
        X = X.astype(DTYPE)

    D, T = W.shape
    _, F = X.shape

    # Inverse-frequency scaling (IDF-style) used when printing the vocabulary later
    freq  = np.squeeze(np.asarray(W.sum(axis=0)))
    scale = np.reciprocal(1. + freq)

    # Initialise the model: K topics, P latent factors over the F features
    K = 10
    P = 5
    model      = stm.newModelAtRandom(X, W, P, K, 0.1, 0.1, dtype=DTYPE)
    queryState = stm.newQueryState(W, model)
    trainPlan  = stm.newTrainPlan(iterations=50, logFrequency=1, debug=True)

    # Train the model, then immediately save the result to a file for subsequent inspection
    model, query, (bndItrs, bndVals, bndLikes) = stm.train(W, X, model, queryState, trainPlan)
    with open(newModelFile("stm-yv-bohn-nips-ar", K, None), "wb") as f:
        pkl.dump((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the evolution of the bound and the likelihood during training
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    plt.show()

    # Print the top words in each topic, scaled by inverse document frequency
    topWordCount = 100
    kTopWordInds = [self.topWordInds(dic, model.vocab[k, :] * scale, topWordCount)
                    for k in range(K)]

    print("Perplexity: %f\n\n" % stm.perplexity(W, model, query))
    print("\t\t".join("Topic " + str(k) for k in range(K)))
    print("\n".join(
        "\t".join(dic[kTopWordInds[k][c]] + "\t%0.4f" % model.vocab[k][kTopWordInds[k][c]]
                  for k in range(K))
        for c in range(topWordCount)))
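
# `newModelFile` is likewise a project helper that isn't shown in these
# listings. A plausible sketch under assumed conventions: the directory,
# name pattern and timestamp format below are guesses, not the project's
# actual implementation.
def newModelFile(prefix, K, fold, base="/tmp"):
    from datetime import datetime
    # Builds a path such as /tmp/ctm-bohn-nips-ar-k-20-20240101-1200.pkl,
    # inserting "-fold-N" when a fold number is given
    stamp    = datetime.now().strftime("%Y%m%d-%H%M")
    foldPart = "" if fold is None else "-fold-%d" % fold
    return "%s/%s-k-%d%s-%s.pkl" % (base, prefix, K, foldPart, stamp)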