def train(data, model, query, trainPlan, isQuery=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    :param data: the dataset, must contain both words and links
    :param model: the actual model, which is modified in-place
    :param query: the query results - essentially all the "local" variables
            matched to the given observations
    :param trainPlan: how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model state and a new query state object with the learnt parameters,
    and a tuple of iterations, VB-bound measurements and log-likelihood
    measurements
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    ldaModel, method, K, dtype, modelName = \
        model.ldaModel, model.method, model.K, model.dtype, model.name
    ldaTopics = query.ldaTopics

    D, K = data.doc_count, ldaModel.K

    # Step 1: Learn the topics using vanilla LDA
    if method == TF_IDF:
        # First do TF
        docLens = np.squeeze(np.array(data.words.sum(axis=1)))
        reps = data.words.copy()
        # reps /= docLens[:, np.newaxis] -- replaced with the sparse diagonal
        # multiply below to retain sparsity (see the sketch after this function)
        reps = ssp.diags(np.reciprocal(docLens), 0).dot(reps)

        occ  = data.words.astype(bool).astype(dtype)
        docCount = np.squeeze(np.array(occ.sum(axis=0)))
        docCount += 1   # add-one smoothing avoids division by zero for unseen words
        idf = np.log(D / docCount)

        # reps *= idf[np.newaxis, :] -- again as a sparse diagonal multiply
        reps = reps.dot(ssp.diags(idf, 0))
    elif method == LDA:
        plan = lda.newTrainPlan(iterations, logFrequency=logFrequency, debug=debug)
        if isQuery:
            _, ldaTopics = lda.query(data, ldaModel, lda.newQueryState(data, ldaModel), plan)
        elif ldaTopics is None or not ldaTopics.processed:
            ldaModel, ldaTopics, (_, _, _) = lda.train(data, ldaModel, lda.newQueryState(data, ldaModel), plan)
        # Represent each document by the square root of its topic distribution
        reps = np.sqrt(lda.topicDists(ldaTopics))
    else:
        raise ValueError("Unknown method %s" % method)


    return ModelState(ldaModel, K, method, dtype, modelName), \
           QueryState(reps, ldaTopics), \
           ([0], [0], [0])
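
The commented-out dense expressions above (reps /= docLens[:, np.newaxis] and reps *= idf[np.newaxis, :]) are replaced by sparse diagonal multiplications: left-multiplying by ssp.diags(v) scales rows, right-multiplying scales columns, and neither densifies the matrix. A minimal self-contained sketch of the trick, with made-up toy counts and IDF weights:

import numpy as np
import scipy.sparse as ssp

X = ssp.csr_matrix(np.array([[2., 0., 2.],
                             [0., 3., 3.]]))
rowSums = np.squeeze(np.array(X.sum(axis=1)))

tf    = ssp.diags(np.reciprocal(rowSums), 0).dot(X)  # rows scaled: X / rowSums[:, np.newaxis]
idf   = np.log(np.array([2.0, 2.0, 1.0]))            # toy per-column IDF weights
tfidf = tf.dot(ssp.diags(idf, 0))                    # columns scaled: tf * idf[np.newaxis, :]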
Example #2
    def testCrossValPerplexityOnRealDataWithLdaInc(self):
        ActiveFolds = 3
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Initialise the model
        trainPlan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
        queryPlan = lda.newTrainPlan(iterations=50, logFrequency=5, fastButInaccurate=False, debug=False)

        topicCounts = [30, 35, 40, 45, 50] # [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        for K in topicCounts:
            trainPerps = []
            queryPerps = []
            for fold in range(ActiveFolds): # range(NumFolds):
                trainData, queryData = data.cross_valid_split(fold, NumFolds)

                model = lda.newModelAtRandom(trainData, K, dtype=dtype)
                query = lda.newQueryState(trainData, model)

                # Train the model on this fold's training split
                model, trainResult, (_, _, _) = lda.train(trainData, model, query, trainPlan)

                like = lda.log_likelihood(trainData, model, trainResult)
                perp = perplexity_from_like(like, trainData.word_count)  # see the sketch after this test
                trainPerps.append(perp)

                estData, evalData = queryData.doc_completion_split()
                query = lda.newQueryState(estData, model)
                model, queryResult = lda.query(estData, model, query, queryPlan)

                like = lda.log_likelihood(evalData, model, queryResult)
                perp = perplexity_from_like(like, evalData.word_count)
                queryPerps.append(perp)

            # Append the cross-fold average as the final entry in each list
            trainPerps.append(sum(trainPerps) / ActiveFolds)
            queryPerps.append(sum(queryPerps) / ActiveFolds)
            print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps])))
            print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))
    def testOnRealData(self):
        dtype = np.float64 # DTYPE
        
        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=NipsWordsPath, links_file=NipsCitePath)
        with open(NipsDictPath, "rb") as f:
            d = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=50, min_link_count=0)
        
        # Inverse-frequency scaling, used when we print out the vocab later
        freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
        scale = np.reciprocal(1 + freq)
       
        # Initialise the model  
        K = 10
        model      = lda.newModelAtRandom(data, K, dtype=dtype)
        queryState = lda.newQueryState(data, model)
        trainPlan  = lda.newTrainPlan(iterations=30, logFrequency=2, debug=False, batchSize=50, rate_retardation=1, forgetting_rate=0.75)
        
        # Train the model, then (optionally) save the result to a file for subsequent inspection
        model, query, (bndItrs, bndVals, bndLikes) = lda.train(data, model, queryState, trainPlan)
#        with open(newModelFileFromModel(model), "wb") as f:
#            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)
        
        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')
        
        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')
        
        fig.show()
        plt.show()
        
        vocab = lda.wordDists(model)
        plt.imshow(vocab, interpolation="nearest", cmap=cm.Greys_r)
        plt.show()
            
        # Identify the most likely words for each topic (see the sketch after this test)
        topWordCount = 100
        kTopWordInds = [topWordIndices(vocab[k, :] * scale, topWordCount)
                        for k in range(K)]

        # Print the topic prior, the perplexity, and the per-topic word distributions
        print("Prior %s" % (str(model.topicPrior)))
        print("Perplexity: %f\n\n" % word_perplexity(lda.log_likelihood, model, query, data))
        print("")
        printWordDists(K, lda.wordDists(model), d)
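
topWordIndices is likewise not shown here; the call sites above suggest it returns the indices of the count highest-scoring words. A sketch under that assumption:

import numpy as np

def topWordIndices(scores, count):
    # Indices of the count largest entries of scores, highest first.
    return np.argsort(-scores)[:count]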
Example #4
    def testPerplexityOnRealDataWithLdaInc(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            d = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Inverse-frequency scaling, used when we print out the vocab later
        freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
        scale = np.reciprocal(1 + freq)

        # Sweep over a range of topic counts, training a fresh model for each
        topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        perps = []
        for K in topicCounts:
            model      = lda.newModelAtRandom(data, K, dtype=dtype)
            queryState = lda.newQueryState(data, model)
            trainPlan  = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)

            # Train the model, then (optionally) save the result to a file for subsequent inspection
            model, query, (bndItrs, bndVals, bndLikes) = lda.train(data, model, queryState, trainPlan)
    #        with open(newModelFileFromModel(model), "wb") as f:
    #            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

            # Print out the most likely topic words
            # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
            # vocab = lda.wordDists(model)
            # topWordCount = 10
            # kTopWordInds = [self.topWordInds(vocab[k,:], topWordCount) for k in range(K)]

            like = lda.log_likelihood(data, model, query)
            perp = perplexity_from_like(like, data.word_count)

            perps.append(perp)

            print ("K = %2d : Perplexity = %f\n\n" % (K, perp))
            #
            # for k in range(model.K):
            #     print("\nTopic %d\n=============================" % k)
            #     print("\n".join("%-20s\t%0.4f" % (d[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))

        # Plot perplexity as a function of the topic count
        fig, ax1 = plt.subplots()
        ax1.plot(topicCounts, perps, 'b-')
        ax1.set_xlabel('Topic Count')
        ax1.set_ylabel('Perplexity', color='b')

        fig.show()
        plt.show()
Example #5
def newQueryState(data, model, withLdaTopics=None):
    '''
    Creates a new LRO QueryState object. This contains all
    parameters and random variables tied to individual
    datapoints.

    Params:
    :param data:  the dataset, must contain words and links.
    :param model: the model state object
    :param withLdaTopics: optional pre-computed LDA topics for the data;
            if None, a fresh LDA query state is created

    Return:
    A QueryState object
    '''
    if withLdaTopics is None:
        withLdaTopics = lda.newQueryState(data, model.ldaModel)
    offsets  = np.zeros((data.doc_count, model.K))

    return QueryState(withLdaTopics, offsets)
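
A hypothetical usage sketch: the construction of the LRO model state is not shown in these examples, so model below is assumed to come from the module's own constructor, and previousLdaTopics is a placeholder for topics inferred in an earlier run.

query = newQueryState(data, model)                                   # infer LDA topics from scratch
query = newQueryState(data, model, withLdaTopics=previousLdaTopics)  # reuse existing topics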
Example #6
def train(data, model, query, trainPlan, isQuery=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    :param data: the dataset, must contain both words and links
    :param model: the actual model, which is modified in-place
    :param query: the query results - essentially all the "local" variables
            matched to the given observations
    :param trainPlan: how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model state and a new query state object with the learnt parameters,
    and a tuple of iterations, VB-bound measurements and log-likelihood
    measurements
    '''
    ldaPlan, iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.ldaPlan, trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    ldaModel, noiseVar, predVar, scale, dtype = \
        model.ldaModel, model.noiseVar, model.predVar, model.scale, model.dtype
    ldaQuery, offsetTopicDists = \
        query.ldaQuery, query.offsetTopicDists

    D, K = data.doc_count, ldaModel.K
    epsilon = 0.01 * D * K if epsilon is None else epsilon
    tau = [predVar[0], predVar[1]]

    # Step 1: Learn the topics using vanilla LDA
    print (time.strftime('%X') + " Beginning Topic Inference")
    if isQuery:
        _, ldaQuery = lda.query(data, ldaModel, lda.newQueryState(data, ldaModel), ldaPlan)
    elif not ldaModel.processed:
        ldaModel, ldaQuery, (_, _, _) = lda.train(data, ldaModel, ldaQuery, ldaPlan)
    print (time.strftime('%X') + " Topic Inference Completed")

    tops = lda.topicDists(ldaQuery)
    offs = tops.copy()
    topsSum = tops.T.dot(tops)   # K x K Gram matrix of the topic proportions

    # Step 2: transpose the links matrix so rows index incoming (rather than outgoing) links
    inlinks = data.links.T.tocsr()

    # Step 3: Learn the scaling factor and offsets for each link's target-doc till converged
    print ("Learning Offsets")
    for itr in range(iterations):
        print ("Iteration " + str(itr), end=": ")

        # Record the current scale of the offsets
        before = la.norm(offs / scale)

        # Update the scale: accumulate the quadratic (lhs) and linear (rhs)
        # coefficients of the objective over all documents, then take their ratio
        lhs, rhs = 0, 0
        for p in range(data.doc_count):
            lhs += (tau[1] - tau[0]) * (tops[inlinks[p,:].indices,:].dot(offs[p,:]) ** 2).sum()
            lhs += tau[0] * (offs[p,:].dot(topsSum).dot(offs[p,:]) - offs[p,:].dot(np.outer(tops[p,:],tops[p,:])).dot(offs[p,:]))
            rhs += tau[1] * tops[inlinks[p,:].indices,:].dot(offs[p,:]).sum()

        scale = rhs / lhs

        # Update the offset for every target doc by solving a K x K linear
        # system (see the sketch after this function)
        for p in range(data.doc_count):
            lhs  = (tau[1] - tau[0]) * np.einsum("dj,k->jk", tops[inlinks[p,:].indices,:], tops[p,:])
            lhs += tau[0] * (np.einsum("dj,k->jk", tops, tops[p,:]) - np.outer(tops[p,:], tops[p,:]))
            lhs *= (scale * scale)
            lhs[np.diag_indices_from(lhs)] += noiseVar

            rhs  = tops[p,:] + scale * tau[1] * tops[inlinks[p,:].indices,:].sum(axis=0)

            offs[p,:] = la.solve(lhs, rhs)   # equivalent to la.inv(lhs).dot(rhs), but more stable

        # Check whether the offsets have changed significantly; stop early if not
        after = la.norm(offs / scale)
        print("%f --> %f. scale=%f" % (before, after, scale))
        if abs(before - after) < epsilon:
            break

    return ModelState(ldaModel, K, noiseVar, predVar, scale, dtype, MODEL_NAME), \
           QueryState(ldaQuery, offs), \
           ([0], [0], [0])
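
For reference, each offset update above solves a regularised K x K system of the form (scale^2 * A + noiseVar * I) offs[p] = rhs, much like ridge regression. A minimal self-contained sketch with made-up stand-ins for the accumulated quantities:

import numpy as np
import numpy.linalg as la

K = 4
rng = np.random.RandomState(0)
A = rng.rand(K, K)
A = A.T.dot(A)                     # symmetric positive semi-definite, like the lhs above
A[np.diag_indices_from(A)] += 0.5  # the noiseVar term regularises the diagonal
b = rng.rand(K)                    # stands in for the rhs vector
off = la.solve(A, b)               # stable alternative to la.inv(A).dot(b)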