Ejemplo n.º 1
0
    def testCrossValPerplexityOnRealDataWithLdaOldInc(self):
        """Cross-validated perplexity sweep over topic counts with the old LDA impl.

        For each topic count K, trains on ActiveFolds cross-validation folds,
        collects train-set and query-set perplexities, then prints each fold's
        value plus the fold average as one CSV-style line per segment.
        """
        ActiveFolds = 3
        dtype = np.float64  # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Build the training and query plans once; they are reused for every fold.
        trainPlan = lda_old.newTrainPlan(iterations=800, logFrequency=200, fastButInaccurate=False, debug=False)
        queryPlan = lda_old.newTrainPlan(iterations=24, logFrequency=12, fastButInaccurate=False, debug=False)

        for K in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
            trainPerps = []
            queryPerps = []
            for fold in range(ActiveFolds):  # NOTE(review): a full run would iterate range(NumFolds)
                trainData, queryData = data.cross_valid_split(fold, NumFolds)

                # Fit a fresh model on this fold's training split.
                model = lda_old.newModelAtRandom(trainData, K, dtype=dtype)
                query = lda_old.newQueryState(trainData, model)
                model, trainResult, (_, _, _) = lda_old.train(trainData, model, query, trainPlan)

                trainLike = lda_old.log_likelihood(trainData, model, trainResult)
                trainPerps.append(perplexity_from_like(trainLike, trainData.word_count))

                # Evaluate the trained model on the held-out split.
                query = lda_old.newQueryState(queryData, model)
                model, queryResult = lda_old.query(queryData, model, query, queryPlan)

                queryLike = lda_old.log_likelihood(queryData, model, queryResult)
                queryPerps.append(perplexity_from_like(queryLike, queryData.word_count))

            # Tack the fold-average onto the end of each list so it prints last.
            trainPerps.append(sum(trainPerps) / ActiveFolds)
            queryPerps.append(sum(queryPerps) / ActiveFolds)
            print("K=%d,Segment=Train,%s" % (K, ",".join(str(p) for p in trainPerps)))
            print("K=%d,Segment=Query,%s" % (K, ",".join(str(p) for p in queryPerps)))
Ejemplo n.º 2
0
    def testOnRealData(self):
        """Train LDA on a pickled NIPS doc-term matrix, plot the bound/likelihood
        traces, show the topic-word matrix, and print the top words per topic."""
        dtype = np.float64  # DTYPE

        rd.seed(0xBADB055)
        path = "/Users/bryanfeeney/Desktop/NIPS"
        with open(path + "/ar.pkl", 'rb') as f:
            _, W, _, d = pkl.load(f)

        # Unwrap a singleton container around the vocabulary, if present.
        if len(d) == 1:
            d = d[0]

        if W.dtype != dtype:
            W = W.astype(dtype)

        # Drop documents that contain no words at all.
        doc_lens = np.squeeze(np.asarray(W.sum(axis=1)))
        nonempty = np.where(doc_lens > 0.5)[0]
        if len(nonempty) < W.shape[0]:
            print("Some rows in the doc-term matrix are empty. These have been removed.")
        W = W[nonempty, :]

        # Reciprocal document-frequency weights, used later to rank vocab words.
        term_freq = np.squeeze(np.asarray(W.sum(axis=0)))
        scale = np.reciprocal(1 + term_freq)

        # Initialise the model at random.
        K = 10
        model = lda.newModelAtRandom(W, K, dtype=dtype)
        queryState = lda.newQueryState(W, model)
        trainPlan = lda.newTrainPlan(iterations=40, logFrequency=10, fastButInaccurate=False, debug=True)

        # Train, keeping the per-iteration bound/likelihood trace for plotting.
        model, query, (bndItrs, bndVals, bndLikes) = lda.train(W, None, model, queryState, trainPlan)

        # Bound on the left axis, likelihood on the right, both against iteration.
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')

        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')

        fig.show()
        plt.show()

        # Visualise the topic-word matrix as a grey-scale image.
        vocab = lda.vocab(model)
        plt.imshow(vocab, interpolation="nearest", cmap=cm.Greys_r)
        plt.show()

        # Rank each topic's words (frequency-scaled) and print the top ones.
        topWordCount = 100
        kTopWordInds = [
            self.topWordInds(d, vocab[k, :] * scale, topWordCount)
            for k in range(K)
        ]

        print("Prior %s" % (str(model.topicPrior)))
        print("Perplexity: %f\n\n" % lda.perplexity(W, model, query))
        print("\t\t".join(["Topic " + str(k) for k in range(K)]))
        print("\n".join("\t".join(d[kTopWordInds[k][c]] + "\t%0.4f" % vocab[k][kTopWordInds[k][c]] for k in range(K)) for c in range(topWordCount)))