# Standard-library and third-party imports used in this section. Project-internal
# names (sampleFromModel, newVbModelState, train, query, log_likelihood,
# newInferencePlan, negJakkola, varBound, plot_bound, VbSideTopicModelState,
# VbSideTopicQueryState, DTYPE, DEBUG, _quickPrintElbo, _doNothing) are assumed
# to be defined or imported elsewhere in this module.
import sys
from math import ceil

import numpy as np
import numpy.random as rd
import numpy.linalg as la
import scipy.sparse as ssp


def testLikelihoodOnModelDerivedExample(self):
    print("Cross-validated likelihoods on model-derived example")

    rd.seed(0xBADB055)  # Global init for repeatable test

    modelState, _, _, _, X, W = sampleFromModel()
    D, T, K, Q, F, P = (
        X.shape[0],
        modelState.T,
        modelState.K,
        modelState.Q,
        modelState.F,
        modelState.P,
    )

    # Create the cross-validation folds, wrapping around the end of the
    # corpus (see the _demoFoldConstruction sketch below)
    folds = 5
    foldSize = ceil(D / folds)
    querySize = foldSize
    trainSize = D - querySize

    for fold in range(folds):
        start = fold * foldSize
        end = start + trainSize

        trainSet = np.arange(start, end) % D
        querySet = np.arange(end, end + querySize) % D

        X_train, W_train = X[trainSet, :], W[trainSet, :]
        X_query, W_query = X[querySet, :], W[querySet, :]

        modelState = newVbModelState(K, Q, F, P, T)
        modelState, queryState = train(
            modelState, X_train, W_train, iterations=100, logInterval=10, plotInterval=100
        )
        trainSetLikely = log_likelihood(modelState, X_train, W_train, queryState)

        queryState = query(
            modelState, X_query, W_query, iterations=50, epsilon=0.001, logInterval=10, plotInterval=100
        )
        querySetLikely = log_likelihood(modelState, X_query, W_query, queryState)

        print("Fold %d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainSetLikely, querySetLikely))

    print("End of Test")
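# --------------------------------------------------------------------------
# A minimal, self-contained sketch (not part of the original test suite)
# illustrating the wrap-around fold construction used above: each fold's
# training block is a contiguous range of indices taken modulo D, so folds
# that run past the end of the corpus wrap back to the start. The document
# count D and fold count used here are purely illustrative.
def _demoFoldConstruction(D=12, folds=5):
    foldSize = ceil(D / folds)
    trainSize = D - foldSize
    for fold in range(folds):
        start = fold * foldSize
        trainSet = np.arange(start, start + trainSize) % D
        querySet = np.arange(start + trainSize, start + D) % D
        print("Fold %d: train=%s query=%s" % (fold, trainSet, querySet))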
def train(modelState, X, W, plan):
    """
    Updates a model-state object for a topic model based on side-information,
    and creates a query-state object with topic assignments for each document
    in the training set.

    The parameters are:
    modelState - the model state with all the model parameters
    X          - the D x F matrix of side-information vectors
    W          - the D x V matrix of word **count** vectors
    plan       - how we should execute the inference procedure (iterations,
                 logging etc.). See newInferencePlan() in sidetopics_uyv

    This returns a tuple of new model-state and query-state. The latter
    object will contain X and W and also:

    s      - a D-dimensional vector describing the offset in our bound on the
             true value of ln sum_k e^theta_dk
    lxi    - a D x K matrix used in the above bound, containing the negative
             Jakkola function applied to the quadratic term xi
    lambda - the topics we've inferred for the current batch of documents
    nu     - the variance of the topics we've inferred (independent)
    """
    # Unpack the model state tuple for ease of use and maybe speed improvements
    K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = (
        modelState.K,
        modelState.Q,
        modelState.F,
        modelState.P,
        modelState.T,
        modelState.A,
        modelState.varA,
        modelState.Y,
        modelState.omY,
        modelState.sigY,
        modelState.sigT,
        modelState.U,
        modelState.V,
        modelState.vocab,
        modelState.topicVar,
        modelState.featVar,
        modelState.lowTopicVar,
        modelState.lowFeatVar,
    )
    iterations, epsilon, logCount, plot, plotFile, plotIncremental, fastButInaccurate = (
        plan.iterations,
        plan.epsilon,
        plan.logFrequency,
        plan.plot,
        plan.plotFile,
        plan.plotIncremental,
        plan.fastButInaccurate,
    )
    queryPlan = newInferencePlan(1, epsilon, logFrequency=0, plot=False)

    if W.dtype.kind == "i":  # for the sparseScalorQuotientOfDot() method to work
        W = W.astype(DTYPE)

    # Get ready to plot the evolution of the likelihood, with multiplicative
    # updates (e.g. 1, 2, 4, 8, 16, 32, ...)
    if logCount > 0:
        multiStepSize = np.power(iterations, 1.0 / logCount)
        logIter = 1
        elbos = []
        likes = []
        iters = []
    else:
        logIter = iterations + 1
    lastVarBoundValue = -sys.float_info.max
    verify_and_log = _quickPrintElbo if DEBUG else _doNothing

    # Prior covariances and mean
    overSsq, overAsq, overKsq, overTsq = 1.0 / sigmaSq, 1.0 / alphaSq, 1.0 / kappaSq, 1.0 / tauSq

    # We'll need the total word count per doc, and total count of docs
    docLen = np.squeeze(np.asarray(W.sum(axis=1)))  # Force to a one-dimensional array for np.newaxis trick to work
    D = len(docLen)

    print(
        "Training %d-topic model with a %d x %d word matrix W, a %d x %d feature matrix X, and latent feature and topic spaces of size %d and %d respectively"
        % (K, D, T, D, F, P, Q)
    )

    # No need to recompute X'X every time
    if X.dtype != DTYPE:
        X = X.astype(DTYPE)
    XTX = X.T.dot(X)

    # Identity matrices that occur
    I_P = ssp.eye(P, P, 0, DTYPE)
    I_F = ssp.eye(F, F, 0, DTYPE, "csc")  # X is CSR, XTX is consequently CSC, sparse inverse requires CSC

    # Assign initial values to the query parameters
    expLmda = np.exp(rd.random((D, K)).astype(DTYPE))
    nu = np.ones((D, K), DTYPE)
    s = np.zeros((D,), DTYPE)
    lxi = negJakkola(np.ones((D, K), DTYPE))

    # The variance of A is an unchanging function of X, assuming
    # that alphaSq is also unchanging.
    print("Inverting gram matrix")
    aI_XTX = (overAsq * I_F + XTX).todense()
    varA = la.inv(aI_XTX)

    # Scaled word counts is W / expLmda.dot(vocab). It's going to be exactly
    # as sparse as W, which is why we initialise it in this manner.
    scaledWordCounts = W.copy()

    # np.log with out= returns its output array, so lmda aliases expLmda:
    # the same buffer is switched between log-space and exp-space in place.
    lmda = np.log(expLmda, out=expLmda)

    print("Launching inference")
    for iteration in range(iterations):

        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|Y) q(Y) and q(Z)....
        #   Where lambda is the posterior mean of theta.
        # =============================================================

        # Y, sigY, omY
        #
        UTU = U.T.dot(U)
        sigY = la.inv(overTsq * I_P + overAsq * UTU)
        verify_and_log("E-Step: q(Y) [sigY]", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, expLmda, None, nu, lxi, s, docLen)

        Y = A.dot(U).dot(sigY)
        verify_and_log("E-Step: q(Y) [Mean]", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, expLmda, None, nu, lxi, s, docLen)

        # A
        #
        A = varA.dot(X.T.dot(lmda) + U.dot(Y.T)).T
        np.exp(expLmda, out=expLmda)  # from here on in we assume we're working with exp(lmda)
        verify_and_log("E-Step: q(A)", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # lmda_dk, nu_dk, s_d, and xi_dk
        #
        XAT = X.dot(A.T)
        # query (VbSideTopicModelState (K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq), \
        #        X, W, \
        #        queryPlan, \
        #        VbSideTopicQueryState(expLmda, nu, lxi, s, docLen), \
        #        scaledWordCounts=scaledWordCounts, \
        #        XAT = XAT)

        # =============================================================
        # M-Step
        #   The projection used for A: U
        #   The vocabulary           : vocab
        #   The topic correlation    : sigT
        # =============================================================

        # U
        #
        U = la.solve(np.trace(sigT) * I_P + Y.T.dot(Y), Y.T.dot(A)).T
        verify_and_log("M-Step: U", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # vocab
        #
        # factor = (scaledWordCounts.T.dot(expLmda)).T  # Gets materialized as a dense matrix...
        # vocab *= factor
        # normalizerows_ip(vocab)

        # A hack to work around the fact that we've got no prior, and thus no
        # pseudo counts, so some values will collapse to zero
        # vocab[vocab < sys.float_info.min] = sys.float_info.min

        # verify_and_log ("M-Step: vocab", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # sigT
        #
        lmda = np.log(expLmda, out=expLmda)  # switch the shared buffer back to log-space
        A_from_U_Y = Y.dot(U.T)
        topic_from_A_X = X.dot(A.T)
        sigT = (
            1.0 / P * (Y.dot(Y.T))
            + 1.0 / F * (A - A_from_U_Y).dot((A - A_from_U_Y).T)
            + 1.0 / D * (lmda - topic_from_A_X).T.dot(lmda - topic_from_A_X)
        )
        sigT.flat[:: K + 1] += 1.0 / D * nu.sum(axis=0, dtype=DTYPE)  # add the mean per-document topic variance to the diagonal
        verify_and_log("M-Step: sigT", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # =============================================================
        # Handle logging of variational bound, likelihood, etc.
        # =============================================================
        if iteration == logIter:
            np.exp(expLmda, out=expLmda)
            modelState = VbSideTopicModelState(K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq)
            queryState = VbSideTopicQueryState(expLmda, nu, lxi, s, docLen)

            elbo = varBound(modelState, queryState, X, W, None, XAT, XTX)
            likely = log_likelihood(modelState, X, W, queryState)  # recons_error(modelState, X, W, queryState)
            np.log(expLmda, out=expLmda)

            elbos.append(elbo)
            iters.append(iteration)
            likes.append(likely)

            print("\nIteration %5d  ELBO %15f  Log-Likelihood %15f" % (iteration, elbo, likely))

            logIter = min(np.ceil(logIter * multiStepSize), iterations - 1)

            if abs(elbo - lastVarBoundValue) < epsilon:
                break
            else:
                lastVarBoundValue = elbo

            if plot and plotIncremental:
                plot_bound(plotFile + "-iter-" + str(iteration), np.array(iters), np.array(elbos), np.array(likes))
        else:
            print(".", end="")
            sys.stdout.flush()

    # Right before we end, plot the evolution of the bound and likelihood
    # if we've been asked to do so.
    if plot:
        plot_bound(plotFile, iters, elbos, likes)

    return (
        VbSideTopicModelState(K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq),
        VbSideTopicQueryState(expLmda, nu, lxi, s, docLen),
    )
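# --------------------------------------------------------------------------
# A minimal usage sketch of train(), not a definitive driver. It assumes the
# positional argument order of newInferencePlan() matches the queryPlan call
# inside train() above (iterations first, then epsilon), and that X_train and
# W_train are side-information and word-count matrices the caller has already
# prepared. The dimensions K, Q, F, P, T below are purely illustrative.
def _demoTrainAndEvaluate(X_train, W_train, K=10, Q=5, F=100, P=5, T=1000):
    plan = newInferencePlan(100, 0.001, logFrequency=10, plot=False)
    modelState = newVbModelState(K, Q, F, P, T)
    modelState, queryState = train(modelState, X_train, W_train, plan)
    return log_likelihood(modelState, X_train, W_train, queryState)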