Example #1
0
def train (data, model, query, trainPlan, isQuery=False):
    '''
    Builds a representation of every document, using either TF-IDF
    weighted word counts or the square root of the documents' LDA topic
    distributions, depending on model.method.

    Params:
    :param data: the dataset. The TF-IDF path reads data.words and
            data.doc_count; the LDA path hands the dataset to the lda module
    :param model: the model state, supplying ldaModel, method, dtype and name
    :param query: the query state, supplying any previously inferred
            ldaTopics
    :param trainPlan: how to execute the training process. Only iterations,
            logFrequency and debug are used, and only to build the plan
            handed to the lda module
    :param isQuery: if True, the LDA path infers topics for data against the
            existing ldaModel via lda.query() instead of training it

    Return:
    A new ModelState and a new QueryState holding the representations, and
    a placeholder tuple ([0], [0], [0]) standing in for the iteration,
    vb-bound and log-likelihood measurements, which are not computed here.

    Raises:
    ValueError if model.method is neither TF_IDF nor LDA
    '''
    iterations, logFrequency, debug = \
        trainPlan.iterations, trainPlan.logFrequency, trainPlan.debug
    ldaModel, method, dtype, modelName = \
        model.ldaModel, model.method, model.dtype, model.name
    ldaTopics = query.ldaTopics

    # K is taken from the wrapped LDA model, as that is the value passed on
    # to the returned ModelState
    D, K = data.doc_count, ldaModel.K

    if method == TF_IDF:
        # Term frequency: divide every row by its document length. The cast
        # to dtype matters because np.reciprocal truncates to zero on
        # integer input. Scaling via a diagonal matrix product (rather than
        # broadcasting) retains sparsity.
        docLens = np.squeeze(np.array(data.words.sum(axis=1))).astype(dtype)
        reps = ssp.diags(np.reciprocal(docLens), 0).dot(data.words)

        # Inverse document frequency, with add-one smoothing of the number
        # of documents each word occurs in. The builtin bool is used as the
        # np.bool alias is not available in every NumPy release.
        occ  = data.words.astype(bool).astype(dtype)
        docCount = np.squeeze(np.array(occ.sum(axis=0)))
        docCount += 1
        idf = np.log(D / docCount)

        # Scale every column by its IDF weight, again retaining sparsity
        reps = reps.dot(ssp.diags(idf, 0))
    elif method == LDA:
        plan = lda.newTrainPlan(iterations, logFrequency=logFrequency, debug=debug)
        if isQuery:
            _, ldaTopics = lda.query(data, ldaModel, lda.newQueryState(data, ldaModel), plan)
        elif ldaTopics is None or not ldaTopics.processed:
            # Only train when no already-processed topics were handed in
            ldaModel, ldaTopics, (_, _, _) = lda.train(data, ldaModel, lda.newQueryState(data, ldaModel), plan)
        reps = np.sqrt(lda.topicDists(ldaTopics))
    else:
        raise ValueError("Unknown method %s" % method)

    return ModelState(ldaModel, K, method, dtype, modelName), \
           QueryState(reps, ldaTopics), \
           ([0], [0], [0])
Example #2
0
    def testCrossValPerplexityOnRealDataWithLdaInc(self):
        '''
        Cross-validates LDA over a range of topic counts on the dataset
        loaded from AclWordPath and AclCitePath, printing the training and
        query perplexities.

        For every topic count, the first three of the NumFolds folds are
        run. Each fold trains a fresh randomly initialised model, measures
        perplexity on the training split, then queries the held-out split
        via a document-completion split and measures perplexity on its
        evaluation part. One comma-separated line is printed per segment,
        listing the per-fold perplexities followed by their mean.
        '''
        foldsToRun = 3
        dtype = np.float64 # DTYPE

        # Fixed seed so that initialisation and shuffling are repeatable
        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Training gets a long schedule, querying a short one
        trainPlan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
        queryPlan = lda.newTrainPlan(iterations=50, logFrequency=5, fastButInaccurate=False, debug=False)

        for K in (30, 35, 40, 45, 50):
            trainPerps, queryPerps = [], []

            for fold in range(foldsToRun):
                trainData, queryData = data.cross_valid_split(fold, NumFolds)

                # Fit a fresh model on the training split
                model = lda.newModelAtRandom(trainData, K, dtype=dtype)
                model, trainResult, (_, _, _) = lda.train(
                    trainData, model, lda.newQueryState(trainData, model), trainPlan)

                trainLike = lda.log_likelihood(trainData, model, trainResult)
                trainPerps.append(perplexity_from_like(trainLike, trainData.word_count))

                # Infer topics on one part of the held-out split, then
                # score the other part
                estData, evalData = queryData.doc_completion_split()
                model, queryResult = lda.query(
                    estData, model, lda.newQueryState(estData, model), queryPlan)

                queryLike = lda.log_likelihood(evalData, model, queryResult)
                queryPerps.append(perplexity_from_like(queryLike, evalData.word_count))

            # The mean over the folds becomes the last column of each line
            trainPerps.append(sum(trainPerps) / foldsToRun)
            queryPerps.append(sum(queryPerps) / foldsToRun)

            trainRow = ",".join(map(str, trainPerps))
            queryRow = ",".join(map(str, queryPerps))
            print("K=%d,Segment=Train,%s" % (K, trainRow))
            print("K=%d,Segment=Query,%s" % (K, queryRow))
Example #3
0
def train (data, model, query, trainPlan, isQuery=False):
    '''
    Infers LDA topic distributions for the documents, and then learns a
    global scale and a per-document offset vector from those distributions
    and the links between documents.

    Params:
    :param data: the dataset. data.doc_count and data.links are read here;
            the dataset is also handed to the lda module
    :param model: the model state, supplying ldaModel, noiseVar, predVar,
            scale and dtype
    :param query: the query state, supplying ldaQuery. Any offsets it
            already holds are ignored: the offsets always start out as a
            copy of the topic distributions
    :param trainPlan: how to execute the training process, supplying
            ldaPlan (passed on to the lda module), iterations (the maximum
            number of update passes) and epsilon (the convergence
            threshold, defaulting to 0.01 * doc_count * K when None)
    :param isQuery: if True, topics are inferred for data against the
            existing ldaModel via lda.query(), instead of training it

    Return:
    A new ModelState and a new QueryState object with the learnt scale and
    offsets, and a placeholder tuple ([0], [0], [0]) standing in for the
    iteration, vb-bound and log-likelihood measurements, which are not
    computed here.
    '''
    ldaPlan, iterations, epsilon = \
        trainPlan.ldaPlan, trainPlan.iterations, trainPlan.epsilon
    ldaModel, noiseVar, predVar, scale, dtype = \
        model.ldaModel, model.noiseVar, model.predVar, model.scale, model.dtype
    ldaQuery = query.ldaQuery

    D, K = data.doc_count, ldaModel.K
    epsilon = 0.01 * D * K if epsilon is None else epsilon
    tau = [predVar[0], predVar[1]]

    # Step 1: Learn the topics using vanilla LDA
    print (time.strftime('%X') + " Beginning Topic Inference")
    if isQuery:
        _, ldaQuery = lda.query(data, ldaModel, lda.newQueryState(data, ldaModel), ldaPlan)
    elif not ldaModel.processed:
        ldaModel, ldaQuery, (_, _, _) = lda.train(data, ldaModel, ldaQuery, ldaPlan)
    print (time.strftime('%X') + " Topic Inference Completed")

    tops = lda.topicDists(ldaQuery)
    offs = tops.copy()

    # tops never changes below, so both of these summaries are computed once
    topsSum    = tops.T.dot(tops)
    topsColSum = tops.sum(axis=0)

    # Step 2: reverse the links matrix so we can talk about the origin (not target) of links
    inlinks = data.links.T.tocsr()

    # Step 3: Learn the scaling factor and offsets for each link's target-doc till converged
    print ("Learning Offsets")
    for itr in range(iterations):
        print ("Iteration " + str(itr), end=": ")

        # Record the current scale of the offsets
        before = la.norm(offs / scale)

        # Update the scale
        lhs, rhs = 0, 0
        for p in range(D):
            # Projection onto this doc's offset of every doc linking to it
            linkProj = tops[inlinks[p,:].indices,:].dot(offs[p,:])
            # offs.dot(outer(t, t)).dot(offs) is just (t.dot(offs)) ** 2
            selfProj = tops[p,:].dot(offs[p,:])

            lhs += (tau[1] - tau[0]) * (linkProj ** 2).sum()
            lhs += tau[0] * (offs[p,:].dot(topsSum).dot(offs[p,:]) - selfProj ** 2)
            rhs += tau[1] * linkProj.sum()

        scale = rhs / lhs

        # Update the offset for every target doc
        for p in range(D):
            linkTopsSum = tops[inlinks[p,:].indices,:].sum(axis=0)

            # einsum("dj,k->jk", A, b) sums A over its rows and takes the
            # outer product with b, so the row sums are taken directly.
            # NOTE(review): this matrix is not symmetric, and differs from
            # the tops.T.dot(tops) quadratic form used in the scale update
            # above - confirm against the derivation that "dj,dk->jk" was
            # not intended. The formula is kept as it was written.
            lhsMat  = (tau[1] - tau[0]) * np.outer(linkTopsSum, tops[p,:])
            lhsMat += tau[0] * np.outer(topsColSum - tops[p,:], tops[p,:])
            lhsMat *= (scale * scale)
            lhsMat[np.diag_indices_from(lhsMat)] += noiseVar

            rhsVec = tops[p,:] + scale * tau[1] * linkTopsSum

            # Solve the linear system directly instead of forming the inverse
            offs[p,:] = la.solve(lhsMat, rhsVec)

        # Check has the offsets changed significantly
        after = la.norm(offs / scale)
        print ("%f --> %f. scale=%f" % (before, after, scale))
        if abs(before - after) < epsilon:
            break

    return ModelState(ldaModel, K, noiseVar, predVar, scale, dtype, MODEL_NAME), \
           QueryState(ldaQuery, offs), \
           ([0], [0], [0])