Example #1
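This method is excerpted from a test class, so its imports and path constants are not shown. The header below is a plausible reconstruction (module aliases are inferred from usage; the package paths and the Nips*Path values are assumptions, not the project's actual code):

import pickle as pkl

import numpy as np
import numpy.random as rd                  # assumption: could equally be the stdlib random module
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import model.lda_vb as lda                 # hypothetical module path for the LDA implementation
from model.common import DataSet           # hypothetical
from model.evals import word_perplexity    # hypothetical helper used at the end of the test

# Placeholder paths -- the real values are not in the source
NipsWordsPath = "/path/to/nips/words.pkl"
NipsCitePath  = "/path/to/nips/cites.pkl"
NipsDictPath  = "/path/to/nips/dict.pkl"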
    def testOnRealData(self):
        dtype = np.float64  # all model arrays use double precision
        
        rd.seed(0xBADB055)  # fixed seed so the test run is repeatable
        data = DataSet.from_files(words_file=NipsWordsPath, links_file=NipsCitePath)
        with open(NipsDictPath, "rb") as f:
            d = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=50, min_link_count=0)
        
        # Inverse-frequency scale, used later to down-weight ubiquitous words
        # when printing each topic's most likely vocabulary
        freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
        scale = np.reciprocal(1 + freq)
       
        # Initialise the model  
        K = 10
        model      = lda.newModelAtRandom(data, K, dtype=dtype)
        queryState = lda.newQueryState(data, model)
        trainPlan  = lda.newTrainPlan(iterations=30, logFrequency=2, debug=False,
                                      batchSize=50, rate_retardation=1, forgetting_rate=0.75)
        
        # Train the model, then optionally save the result to a file for subsequent
        # inspection (the save is left commented out)
        model, query, (bndItrs, bndVals, bndLikes) = lda.train(data, model, queryState, trainPlan)
#        with open(newModelFileFromModel(model), "wb") as f:
#            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)
        
        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')
        
        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')
        
        plt.show()
        
        # Visualise the topic-over-word distributions as a heatmap
        vocab = lda.wordDists(model)
        plt.imshow(vocab, interpolation="nearest", cmap=cm.Greys_r)
        plt.show()
            
        # Find the indices of the most probable words in each topic,
        # down-weighting ubiquitous words via the inverse-frequency scale
        topWordCount = 100
        kTopWordInds = [topWordIndices(vocab[k, :] * scale, topWordCount)
                        for k in range(K)]

        # Print out the most likely topic words
        print("Prior %s" % (str(model.topicPrior)))
        print("Perplexity: %f\n\n" % word_perplexity(lda.log_likelihood, model, query, data))
        print("")
        printWordDists(K, lda.wordDists(model), d)
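The test also calls two helpers that are not shown, topWordIndices and printWordDists. A minimal sketch of what they plausibly do follows (both bodies are reconstructions from the call sites, not the project's actual code):

def topWordIndices(wordScores, count):
    # Indices of the `count` highest-scoring words, best first (assumed behaviour)
    return np.argsort(-wordScores)[:count]

def printWordDists(K, vocab, wordDict, topWordCount=10):
    # wordDict is assumed to map word index -> word string (loaded from NipsDictPath)
    for k in range(K):
        inds = topWordIndices(vocab[k, :], topWordCount)
        print("Topic %2d: %s" % (k, " ".join(str(wordDict[i]) for i in inds)))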
Example #2
def lda_vocab(ldaModel):
    """Return the topic-over-word distributions, dispatching on the model's implementation."""
    if ldaModel.name == lda_gibbs.MODEL_NAME:
        return lda_gibbs.wordDists(ldaModel)
    elif ldaModel.name == lda_vb.MODEL_NAME:
        return lda_vb.wordDists(ldaModel)
    else:
        raise ValueError("Unknown LDA implementation: %s" % ldaModel.name)
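As a design note, the elif chain can be replaced by a dispatch table, which makes adding a third implementation a one-line change. A minimal sketch, using the same module names the example already assumes:

_WORD_DISTS = {
    lda_gibbs.MODEL_NAME: lda_gibbs.wordDists,
    lda_vb.MODEL_NAME:    lda_vb.wordDists,
}

def lda_vocab_table(ldaModel):
    try:
        return _WORD_DISTS[ldaModel.name](ldaModel)
    except KeyError:
        raise ValueError("Unknown LDA implementation: %s" % ldaModel.name)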
Example #3
def wordDists(model):
    """Delegate to the embedded LDA model's topic-over-word distributions."""
    return lda.wordDists(model.ldaModel)
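This wrapper implies a composite model object carrying a trained LDA sub-model in its ldaModel attribute. A minimal sketch of that shape, with every name hypothetical:

from collections import namedtuple

# Hypothetical composite: an outer model embedding a trained LDA sub-model
CompositeModel = namedtuple("CompositeModel", "ldaModel otherParams")

composite = CompositeModel(ldaModel=trainedLda, otherParams=None)  # trainedLda: assumed output of lda.train
vocab = wordDists(composite)  # same (topics x vocabulary) matrix as lda.wordDists(trainedLda)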