Ejemplo n.º 1
0
def newModelAtRandom(data, K, noiseVar=9, predVar=None, topicPrior=None, vocabPrior=lda.VocabPrior, ldaModel=None, dtype=DTYPE):
    '''
    Builds a fresh ModelState for the given data set and topic count.

    Unless an existing LDA model is passed in, one is obtained from
    lda.newModelAtRandom() using the data, topic count, priors and
    dtype supplied here.

    :param data: the DataSet; both data.words and data.links are read. The
                 link matrix must be square with as many rows as the
                 word matrix.
    :param K:    the number of topics; must satisfy 1 < K < 255
    :param noiseVar: the noise variance determining offset size
    :param predVar:  the variance around predictions, a two element vector,
                 the first being the prediction noise when links are not
                 observed, the second when links are observed. When None,
                 np.array([0.01, 1]) is used.
    :param topicPrior: the prior over topics, either a scalar or a K-dimensional vector
    :param vocabPrior: the prior over vocabs, either a scalar or a T-dimensional vector
    :param ldaModel: an already constructed LDA model to wrap; when None a
                 new one is created via lda.newModelAtRandom()
    :param dtype: the datatype to be used throughout.

    Return:
    A ModelState object
    '''
    assert K > 1, "There must be at least two topics"
    # NOTE(review): this rejects K == 255 even though the message suggests
    # 255 itself is permitted -- confirm which of the two is intended.
    assert K < 255, "There can be no more than 255 topics"

    docCount, _ = data.words.shape
    linkRows, linkCols = data.links.shape
    assert docCount == linkRows == linkCols, "Link matrix must be square and have same row-count as word-matrix"

    # Reuse the caller's LDA model when one is given, otherwise create one.
    baseModel = ldaModel
    if baseModel is None:
        baseModel = lda.newModelAtRandom(data, K, topicPrior, vocabPrior, dtype)

    variances = np.array([0.01, 1]) if predVar is None else predVar
    assert len(variances) == 2

    # The scale argument always starts out as one.
    return ModelState(baseModel, K, noiseVar, variances, 1, dtype, MODEL_NAME)
Ejemplo n.º 2
0
def newModelAtRandom(data, K, method=TF_IDF, topicPrior=None, vocabPrior=lda.VocabPrior, ldaModel=None, dtype=DTYPE):
    '''
    Builds a fresh ModelState for the given data set and topic count.

    Unless an existing LDA model is passed in, one is obtained from
    lda.newModelAtRandom() using the data, topic count, priors and
    dtype supplied here. The model's name is MODEL_NAME_PREFIX followed
    by the chosen method.

    :param data: the DataSet; both data.words and data.links are read. The
                 link matrix must be square with as many rows as the
                 word matrix.
    :param K:    the number of topics; must satisfy 1 < K < 255
    :param method: the method by which the documents will be compared, either
                 their LDA topic distribution (LDA) or their TF_IDF scores
                 (TF_IDF)
    :param topicPrior: the prior over topics, either a scalar or a K-dimensional vector
    :param vocabPrior: the prior over vocabs, either a scalar or a T-dimensional vector
    :param ldaModel: an already constructed LDA model to wrap; when None a
                 new one is created via lda.newModelAtRandom()
    :param dtype: the datatype to be used throughout.

    Return:
    A ModelState object

    Raises:
    ValueError if method is neither TF_IDF nor LDA
    '''
    assert K > 1, "There must be at least two topics"
    # NOTE(review): this rejects K == 255 even though the message suggests
    # 255 itself is permitted -- confirm which of the two is intended.
    assert K < 255, "There can be no more than 255 topics"

    docCount, _ = data.words.shape
    linkRows, linkCols = data.links.shape
    assert docCount == linkRows == linkCols, "Link matrix must be square and have same row-count as word-matrix"

    # Reuse the caller's LDA model when one is given, otherwise create one.
    baseModel = ldaModel
    if baseModel is None:
        baseModel = lda.newModelAtRandom(data, K, topicPrior, vocabPrior, dtype)

    # Pick the name suffix matching the comparison method; anything other
    # than the two known methods is rejected.
    if method == TF_IDF:
        nameSuffix = TF_IDF
    elif method == LDA:
        nameSuffix = LDA
    else:
        raise ValueError("Incorrect method name")
    modelName = MODEL_NAME_PREFIX + nameSuffix

    return ModelState(baseModel, K, method, dtype, modelName)
Ejemplo n.º 3
0
    def testOnRealData(self):
        '''
        Trains an LDA model on the data set read from NipsWordsPath and
        NipsCitePath, plots the bound and likelihood recorded during
        training, shows the topic-word matrix as an image, and finally
        prints the topic prior, the word perplexity and the per-topic
        word distributions.
        '''
        dtype = np.float64 # DTYPE

        # Seed with a fixed value before loading and shuffling the data.
        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=NipsWordsPath, links_file=NipsCitePath)
        with open(NipsDictPath, "rb") as dictFile:
            dictionary = pkl.load(dictFile)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=50, min_link_count=0)

        # Reciprocal of one plus the column sums of the word matrix, used
        # below to re-weight each topic's word distribution.
        columnTotals = np.squeeze(np.asarray(data.words.sum(axis=0)))
        inverseFreq = np.reciprocal(1 + columnTotals)

        # Set up the model, its query state and the training plan.
        topicCount = 10
        model = lda.newModelAtRandom(data, topicCount, dtype=dtype)
        initialQuery = lda.newQueryState(data, model)
        plan = lda.newTrainPlan(iterations=30, logFrequency=2, debug=False, batchSize=50, rate_retardation=1, forgetting_rate=0.75)

        # Train; the third result carries the iteration numbers with the
        # bound and likelihood values that are plotted below.
        model, query, (iters, bounds, likes) = lda.train(data, model, initialQuery, plan)

        # Bound (blue, left axis) and likelihood (red, right axis) against
        # the iteration number.
        fig, boundAxis = plt.subplots()
        boundAxis.plot(iters, bounds, 'b-')
        boundAxis.set_xlabel('Iterations')
        boundAxis.set_ylabel('Bound', color='b')

        likeAxis = boundAxis.twinx()
        likeAxis.plot(iters, likes, 'r-')
        likeAxis.set_ylabel('Likelihood', color='r')

        fig.show()
        plt.show()

        # Show the topic-by-word matrix as a greyscale image.
        vocab = lda.wordDists(model)
        plt.imshow(vocab, interpolation="nearest", cmap=cm.Greys_r)
        plt.show()

        # Indices of the top words of every topic after re-weighting.
        # NOTE(review): the resulting list is not used by anything below.
        topWordCount = 100
        kTopWordInds = []
        for k in range(topicCount):
            kTopWordInds.append(topWordIndices(vocab[k, :] * inverseFreq, topWordCount))

        # Report the prior, the perplexity and the word distributions.
        print("Prior %s" % (str(model.topicPrior)))
        print("Perplexity: %f\n\n" % word_perplexity(lda.log_likelihood, model, query, data))
        print("")
        printWordDists(topicCount, lda.wordDists(model), dictionary)
Ejemplo n.º 4
0
    def testPerplexityOnRealDataWithLdaInc(self):
        '''
        For every topic count from 5 to 50 in steps of 5, trains an LDA
        model on the data set read from AclWordPath and AclCitePath,
        derives the perplexity from the resulting log-likelihood and
        prints it. Once all topic counts are done, plots perplexity
        against topic count.
        '''
        dtype = np.float64 # DTYPE

        # Seed with a fixed value before loading and shuffling the data.
        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as dictFile:
            # NOTE(review): the loaded dictionary is not used below.
            dictionary = pkl.load(dictFile)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Reciprocal of one plus the column sums of the word matrix.
        # NOTE(review): neither value is used below.
        columnTotals = np.squeeze(np.asarray(data.words.sum(axis=0)))
        inverseFreq = np.reciprocal(1 + columnTotals)

        # Topic counts to evaluate: 5, 10, ..., 50.
        topicCounts = list(range(5, 51, 5))
        perps = []
        for topicCount in topicCounts:
            model = lda.newModelAtRandom(data, topicCount, dtype=dtype)
            initialQuery = lda.newQueryState(data, model)
            plan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)

            # Train the model; the recorded bound history is not needed here.
            model, query, (bndItrs, bndVals, bndLikes) = lda.train(data, model, initialQuery, plan)

            # Perplexity of the training data under the trained model.
            logLike = lda.log_likelihood(data, model, query)
            perp = perplexity_from_like(logLike, data.word_count)
            perps.append(perp)

            print ("K = %2d : Perplexity = %f\n\n" % (topicCount, perp))

        # Plot perplexity as a function of the number of topics.
        fig, axis = plt.subplots()
        axis.plot(topicCounts, perps, 'b-')
        axis.set_xlabel('Topic Count')
        axis.set_ylabel('Perplexity', color='b')

        fig.show()
        plt.show()
Ejemplo n.º 5
0
    def testCrossValPerplexityOnRealDataWithLdaInc(self):
        '''
        Cross-validated perplexity for several topic counts on the data
        set read from AclWordPath and AclCitePath.

        For each topic count, the first three of the NumFolds
        cross-validation folds are run. In every fold a model is trained
        on the training split and its training perplexity recorded; the
        query split is then divided by doc_completion_split(), the model
        is queried on the first part and the perplexity of the second
        part is recorded. For each topic count two comma-separated lines
        are printed (train and query) listing the per-fold perplexities
        followed by their mean.
        '''
        activeFolds = 3
        dtype = np.float64 # DTYPE

        # Seed with a fixed value before loading and shuffling the data.
        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # One plan for training and a shorter one for querying; both are
        # shared by every fold and topic count.
        trainPlan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
        queryPlan = lda.newTrainPlan(iterations=50, logFrequency=5, fastButInaccurate=False, debug=False)

        for topicCount in [30, 35, 40, 45, 50]:
            trainPerps, queryPerps = [], []
            for fold in range(activeFolds):
                trainData, queryData = data.cross_valid_split(fold, NumFolds)

                # Train on this fold's training split.
                model = lda.newModelAtRandom(trainData, topicCount, dtype=dtype)
                trainQuery = lda.newQueryState(trainData, model)
                model, trainResult, (_, _, _) = lda.train(trainData, model, trainQuery, trainPlan)

                trainLike = lda.log_likelihood(trainData, model, trainResult)
                trainPerps.append(perplexity_from_like(trainLike, trainData.word_count))

                # Query on one part of the query split and score the other.
                estData, evalData = queryData.doc_completion_split()
                estQuery = lda.newQueryState(estData, model)
                model, queryResult = lda.query(estData, model, estQuery, queryPlan)

                evalLike = lda.log_likelihood(evalData, model, queryResult)
                queryPerps.append(perplexity_from_like(evalLike, evalData.word_count))

            # Append the mean over the folds as the final entry of each list.
            trainPerps.append(sum(trainPerps) / activeFolds)
            queryPerps.append(sum(queryPerps) / activeFolds)
            print("K=%d,Segment=Train,%s" % (topicCount, ",".join(map(str, trainPerps))))
            print("K=%d,Segment=Query,%s" % (topicCount, ",".join(map(str, queryPerps))))