    def testLikelihoodOnModelDerivedExample(self):
        print("Cross-validated likelihoods on model-derived example")

        rd.seed(0xBADB055)  # Global init for repeatable test
        modelState, _, _, _, X, W = sampleFromModel()
        D, T, K, Q, F, P = X.shape[0], modelState.T, modelState.K, modelState.Q, modelState.F, modelState.P

        # Create the cross-validation folds
        folds = 5
        foldSize = int(ceil(D / folds))  # derive from the fold count rather than hard-coding 5
        querySize = foldSize
        trainSize = D - querySize

        for fold in range(folds):
            start = fold * foldSize
            end = start + trainSize

            trainSet = np.arange(start, end) % D
            querySet = np.arange(end, end + querySize) % D

            X_train, W_train = X[trainSet, :], W[trainSet, :]
            X_query, W_query = X[querySet, :], W[querySet, :]

            modelState = newVbModelState(K, Q, F, P, T)
            modelState, queryState = train(modelState,
                                           X_train,
                                           W_train,
                                           iterations=100,
                                           logInterval=10,
                                           plotInterval=100)
            trainSetLikely = log_likelihood(modelState, X_train, W_train,
                                            queryState)

            queryState = query(modelState,
                               X_query,
                               W_query,
                               iterations=50,
                               epsilon=0.001,
                               logInterval=10,
                               plotInterval=100)
            querySetLikely = log_likelihood(modelState, X_query, W_query,
                                            queryState)

            print(
                "Fold %d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f"
                % (fold, trainSetLikely, querySetLikely))

        print("End of Test")
Example #3
def train(modelState, X, W, plan):
    """
    Updates a model state object for a topic model based on side-information, and
    creates a query state object with topic assignments for each document in the train set.
    
    The parameters are
    
    modelState - the model state with all the model parameters
    X          - the D x F matrix of side information vectors
    W          - the D x V matrix of word **count** vectors.
    plan       - how we should execute the inference procedure (iterations, logging
                 etc). See newInferencePlan() in sidetopics_uyv
    
    This returns a tuple of new model-state and query-state. The latter object will
    contain X and W and also
    
    s      - A D-dimensional vector describing the offset in our bound on the true value of ln sum_k e^theta_dk 
    lxi    - A DxK matrix used in the above bound, containing the negative Jaakkola function applied to the
             quadratic term xi
    lambda - the topics we've inferred for the current batch of documents
    nu     - the variance of the topics we've inferred (element-wise, i.e. assuming independence)
    """
    # Unpack the model state tuple for ease of use and maybe speed improvements
    K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = (
        modelState.K,
        modelState.Q,
        modelState.F,
        modelState.P,
        modelState.T,
        modelState.A,
        modelState.varA,
        modelState.Y,
        modelState.omY,
        modelState.sigY,
        modelState.sigT,
        modelState.U,
        modelState.V,
        modelState.vocab,
        modelState.topicVar,
        modelState.featVar,
        modelState.lowTopicVar,
        modelState.lowFeatVar,
    )
    iterations, epsilon, logCount, plot, plotFile, plotIncremental, fastButInaccurate = (
        plan.iterations,
        plan.epsilon,
        plan.logFrequency,
        plan.plot,
        plan.plotFile,
        plan.plotIncremental,
        plan.fastButInaccurate,
    )
    queryPlan = newInferencePlan(1, epsilon, logFrequency=0, plot=False)

    if W.dtype.kind == "i":  # for the sparseScalorQuotientOfDot() method to work
        W = W.astype(DTYPE)

    # Get ready to plot the evolution of the likelihood, with multiplicative updates (e.g. 1, 2, 4, 8, 16, 32, ...)
    # Initialise the logs unconditionally, so plot_bound() below never sees
    # unbound names when logCount is zero but plotting is requested
    elbos, likes, iters = [], [], []
    if logCount > 0:
        multiStepSize = np.power(iterations, 1.0 / logCount)
        logIter = 1
    else:
        logIter = iterations + 1  # i.e. never log
    lastVarBoundValue = -sys.float_info.max
    verify_and_log = _quickPrintElbo if DEBUG else _doNothing

    # Reciprocals of the prior variances, i.e. the prior precisions
    overSsq, overAsq, overKsq, overTsq = 1.0 / sigmaSq, 1.0 / alphaSq, 1.0 / kappaSq, 1.0 / tauSq

    # We'll need the total word count per doc, and total count of docs
    docLen = np.squeeze(np.asarray(W.sum(axis=1)))  # Force to a one-dimensional array for np.newaxis trick to work
    D = len(docLen)
    print(
        "Training %d-topic model with %d x %d word matrix W, %d x %d feature matrix X, and latent feature and topic spaces of size %d and %d respectively"
        % (K, D, T, D, F, P, Q)
    )

    #  No need to recompute X'X every time
    if X.dtype != DTYPE:
        X = X.astype(DTYPE)
    XTX = X.T.dot(X)

    # Identity matrices that recur in the updates below
    I_P = ssp.eye(P, P, 0, DTYPE)
    I_F = ssp.eye(F, F, 0, DTYPE, "csc")  # X is CSR, XTX is consequently CSC, sparse inverse requires CSC

    # Assign initial values to the query parameters
    expLmda = np.exp(rd.random((D, K)).astype(DTYPE))
    nu = np.ones((D, K), DTYPE)
    s = np.zeros((D,), DTYPE)
    lxi = negJakkola(np.ones((D, K), DTYPE))
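    # For reference, the Jaakkola & Jordan bound on the logistic sigmoid uses
    # lambda(xi) = tanh(xi/2) / (4*xi) = (sigmoid(xi) - 0.5) / (2*xi); negJakkola
    # is presumably the negated form (an assumption from the name -- see its
    # definition in the containing module).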

    # the variance of A is an unchanging function of X, assuming
    # that alphaSq is also unchanging.
    print("Inverting gram matrix")
    aI_XTX = (overAsq * I_F + XTX).todense()
    varA = la.inv(aI_XTX)

    # Scaled word counts is W / expLmda.dot(vocab). It's going to be exactly
    # as sparse as W, which is why we initialise it in this manner.
    scaledWordCounts = W.copy()
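    # lmda and expLmda deliberately alias the SAME buffer: np.log(..., out=expLmda)
    # transforms it in place and returns that very array, so only one D x K matrix
    # is ever allocated. The paired np.exp(expLmda, out=expLmda) and
    # np.log(expLmda, out=expLmda) calls below flip it between log- and exp-space.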
    lmda = np.log(expLmda, out=expLmda)

    print("Launching inference")
    for iteration in range(iterations):

        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|Y) q(Y) and q(Z)....
        #   Where lambda is the posterior mean of theta.
        # =============================================================

        # Y, sigY, omY
        #
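        # Posterior covariance of Y: sigY = (tau^-2 I_P + alpha^-2 U'U)^-1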
        UTU = U.T.dot(U)
        sigY = la.inv(overTsq * I_P + overAsq * UTU)
        verify_and_log("E-Step: q(Y) [sigY]", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT,
                       U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, expLmda, None, nu, lxi, s, docLen)

        Y = A.dot(U).dot(sigY)
        verify_and_log("E-Step: q(Y) [Mean]", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT,
                       U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, expLmda, None, nu, lxi, s, docLen)

        # A
        #
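        # Ridge-regression-style mean update: A' = (alpha^-2 I_F + X'X)^-1 (X' lmda + U Y'),
        # combining the data fit through X with the prior mean reconstructed from Y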
        A = varA.dot(X.T.dot(lmda) + U.dot(Y.T)).T
        np.exp(expLmda, out=expLmda)  # from here on in we assume we're working with exp(lmda)
        verify_and_log("E-Step: q(A)", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT,
                       U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # lmda_dk, nu_dk, s_d, and xi_dk
        #
        XAT = X.dot(A.T)
        # Update the per-document parameters (lmda, nu, s, xi) with a single
        # query pass (queryPlan was built above with exactly one iteration)
        query(VbSideTopicModelState(K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq),
              X, W,
              queryPlan,
              VbSideTopicQueryState(expLmda, nu, lxi, s, docLen),
              scaledWordCounts=scaledWordCounts,
              XAT=XAT)

        # =============================================================
        #  M-Step
        #    The projection used for A: U
        #    The vocabulary : vocab
        #    The topic correlation: sigT
        # =============================================================

        # U
        #
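        # Regularised least-squares fit of the projection mapping Y onto A:
        # solve (tr(sigT) I_P + Y'Y) U' = Y'A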
        U = la.solve(np.trace(sigT) * I_P + Y.T.dot(Y), Y.T.dot(A)).T
        verify_and_log("M-Step: U", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT,
                       U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # vocab
        #
        factor = (scaledWordCounts.T.dot(expLmda)).T  # Gets materialized as a dense matrix...
        vocab *= factor
        normalizerows_ip(vocab)

        # A hack to work around the fact that we've got no prior, and thus no
        # pseudo-counts, so some values will collapse to zero
        vocab[vocab < sys.float_info.min] = sys.float_info.min

        verify_and_log("M-Step: vocab", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT,
                       U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # sigT
        #
        lmda = np.log(expLmda, out=expLmda)
        A_from_U_Y = Y.dot(U.T)
        topic_from_A_X = X.dot(A.T)

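        # sigT pools three K x K residual covariances -- Y about zero, A about its
        # reconstruction Y U', and the topic means lmda about their reconstruction
        # X A' -- and below adds the mean posterior variance nu to the diagonal.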
        sigT = (
            1.0 / P * (Y.dot(Y.T))
            + 1.0 / F * (A - A_from_U_Y).dot((A - A_from_U_Y).T)
            + 1.0 / D * (lmda - topic_from_A_X).T.dot(lmda - topic_from_A_X)
        )
        sigT.flat[:: K + 1] += 1.0 / D * nu.sum(axis=0, dtype=DTYPE)

        verify_and_log("M-Step: sigT", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT,
                       U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # =============================================================
        #  Handle logging of variational bound, likelihood, etc.
        # =============================================================
        if iteration == logIter:
            np.exp(expLmda, out=expLmda)
            modelState = VbSideTopicModelState(
                K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq
            )
            queryState = VbSideTopicQueryState(expLmda, nu, lxi, s, docLen)

            elbo = varBound(modelState, queryState, X, W, None, XAT, XTX)
            likely = log_likelihood(modelState, X, W, queryState)  # recons_error(modelState, X, W, queryState)

            np.log(expLmda, out=expLmda)

            elbos.append(elbo)
            iters.append(iteration)
            likes.append(likely)
            print("\nIteration %5d  ELBO %15f   Log-Likelihood %15f" % (iteration, elbo, likely))

            logIter = min(np.ceil(logIter * multiStepSize), iterations - 1)

            if abs(elbo - lastVarBoundValue) < epsilon:
                break
            else:
                lastVarBoundValue = elbo

            if plot and plotIncremental:
                plot_bound(plotFile + "-iter-" + str(iteration), np.array(iters), np.array(elbos), np.array(likes))
        else:
            print(".", end="")
            sys.stdout.flush()

    # Right before we end, plot the evolution of the bound and likelihood
    # if we've been asked to do so.
    if plot:
        plot_bound(plotFile, np.array(iters), np.array(elbos), np.array(likes))

    return (
        VbSideTopicModelState(
            K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq
        ),
        VbSideTopicQueryState(expLmda, nu, lxi, s, docLen),
    )
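
A minimal usage sketch for the train() function above. The newVbModelState and newInferencePlan calls mirror how they are invoked elsewhere in this file (positional dimensions K, Q, F, P, T; positional iterations and epsilon); the dimensions and the random data here are hypothetical stand-ins, not a working configuration from the source.

import numpy as np
import scipy.sparse as ssp

D, T, F = 500, 1000, 20   # hypothetical: docs, vocabulary size, features
K, Q, P = 10, 5, 5        # topics and the two low-dimensional projection sizes

X = ssp.random(D, F, density=0.1, format="csr")   # side information, D x F
W = ssp.random(D, T, density=0.05, format="csr")  # word counts, D x T

modelState = newVbModelState(K, Q, F, P, T)
plan = newInferencePlan(100, 0.01, logFrequency=10, plot=False)

modelState, queryState = train(modelState, X, W, plan)
print(log_likelihood(modelState, X, W, queryState))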