def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
    data       - the dataset of words, features and links, of which only words
                 and features are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx

    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query state is updated.
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, lxi, s, n = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, \
        modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, \
        modelState.vocab, modelState.vocabPrior, modelState.dtype

    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)

    W, X = data.words, data.feats

    # Enable logging or not. If enabled, we need the inner product of the feat matrix
    if debug:
        XTX = X.T.dot(X)
        debugFn = _debug_with_bound
        _debug_with_bound.old_bound = 0
    else:
        XTX = None
        debugFn = _debug_with_nothing

    # Iterate over parameters
    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Estimate Z_dvk
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)

        # Update the Means
        vMat   = (2 * s[:, np.newaxis] * lxi - 0.5) * n[:, np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT)  # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis, :] + n[:, np.newaxis] * 2 * lxi)  # inverse of D diagonal matrices...
        means  = lhsMat * rhsMat  # as LHS is a diagonal matrix for all d, it's equivalent
                                  # to doing a Hadamard product for all d

        debugFn(itr, means, "query-means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the Variances
        varcs = 1. / (2 * n[:, np.newaxis] * lxi + isigT.flat[::K + 1])
        debugFn(itr, varcs, "query-varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the approximation parameters
        lxi = ctm.negJakkolaOfDerivedXi(means, varcs, s)
        debugFn(itr, lxi, "query-lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
        # s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        # debugFn(itr, s, "query-s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, lxi, s, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, QueryState(means, expMeans, varcs, lxi, s, n)
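
# Illustrative sketch (not part of the original module): the "Hadamard product"
# comment in the mean updates of query() above and train() below relies on the
# fact that, for each document d, the LHS precision matrix
#     D_d = diag(isigT) + c * n_d * lxi_d
# is diagonal, so solving the K x K linear system reduces to an elementwise
# product of the reciprocal diagonal with the RHS, which is exactly what
# `means = lhsMat * rhsMat` computes row by row. The helper name below is
# hypothetical and only demonstrates that equivalence.
def _illustrate_diagonal_mean_update(diag_precision, rhs):
    from numpy import linalg as npla
    explicit = npla.solve(np.diag(diag_precision), rhs)  # full K x K solve
    hadamard = np.reciprocal(diag_precision) * rhs       # what lhsMat * rhsMat does per row
    return np.allclose(explicit, hadamard)
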
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data       - the dataset of words, features and links, of which only words
                 and features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    assert W.dtype == modelState.dtype
    assert X.dtype == modelState.dtype
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, lxi, s, n = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, \
        modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, \
        modelState.vocab, modelState.vocabPrior, modelState.dtype

    # Book-keeping for logs
    boundIters  = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    likeValues  = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0

    _debug_with_bound.old_bound = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()

    sigT_regularizer = 0.001

    aI_P = 1. / lfv * ssp.eye(P, dtype=dtype)
    tI_F = 1. / fv  * ssp.eye(F, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    if ssp.issparse(R_A):
        R_A = R_A.todense()       # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F + 1] += 1. / fv  # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    s.fill(0)

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A - Y.dot(V))
        diff_m_xa = (means - X.dot(A.T))

        sigT  = 1. / lfv * (Y.dot(Y.T))
        sigT += 1. / fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += diff_m_xa.T.dot(diff_m_xa)
        sigT.flat[::K + 1] += varcs.sum(axis=0)
        sigT /= (P + F + D)
        sigT.flat[::K + 1] += sigT_regularizer

        # Diagonalize it
        sigT = np.diag(sigT.flat[::K + 1])
        # and invert it.
        isigT = np.diag(np.reciprocal(sigT.flat[::K + 1]))
        debugFn(itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        S = expMeans * R.dot(vocab.T)

        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        # Reset the means to their original form, and log effect of vocab update
        debugFn(itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Finally update the parameter V
        V = la.inv(R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn(itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the distribution on the latent space
        R_Y_base = aI_P + 1 / fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn(itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        Y = 1. / fv * A.dot(V.T).dot(R_Y)
        debugFn(itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the mapping from the features to topics
        A = (1. / fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn(itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the Means
        vMat   = (s[:, np.newaxis] * lxi - 0.5) * n[:, np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT)  # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis, :] + n[:, np.newaxis] * lxi)  # inverse of D diagonal matrices...
        means = lhsMat * rhsMat  # as LHS is a diagonal matrix for all d, it's equivalent
                                 # to doing a Hadamard product for all d
        debugFn(itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the Variances
        varcs = 1. / (n[:, np.newaxis] * lxi + isigT.flat[::K + 1])
        debugFn(itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the approximation parameters
        lxi = 2 * ctm.negJakkolaOfDerivedXi(means, varcs, s)
        debugFn(itr, lxi, "lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
        # s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        # debugFn(itr, s, "s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, lxi, s, n)

            boundValues[bvIdx] = var_bound(data, modelState, queryState, XTX)
            likeValues[bvIdx]  = log_likelihood(data, modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(likeValues[bvIdx], n.sum())
            print(time.strftime('%X') + " : Iteration %d: Perplexity %4.2f bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
            # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 50:
                lastPerp = perplexity_from_like(likeValues[bvIdx - 1], n.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likeValues = clamp(boundIters, boundValues, likeValues, bvIdx)
                    return modelState, queryState, (boundIters, boundValues, likeValues)
            bvIdx += 1

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, lxi, s, n), \
        (boundIters, boundValues, likeValues)
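
# Illustrative sketch (not part of the original module): based on its name and
# its use in query() and train() above, sparseScalarQuotientOfDot(W, expMeans, vocab)
# is assumed to return a sparse matrix R with W's sparsity pattern, where
# R[d, v] = W[d, v] / (expMeans @ vocab)[d, v]; S = expMeans * R.dot(vocab.T)
# then yields the per-document expected topic assignment counts used in the
# mean updates. A dense NumPy reference of that assumed behaviour follows; the
# helper name is hypothetical.
def _dense_scalar_quotient_of_dot_reference(W_dense, expMeans, vocab):
    denom = expMeans.dot(vocab)   # D x V matrix of un-normalised word probabilities
    R = np.zeros_like(denom)
    nz = W_dense != 0             # only divide where a word was actually observed
    R[nz] = W_dense[nz] / denom[nz]
    return R
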