def _sampleFromModel(self, D=200, T=100, K=10, avgWordsPerDoc=500):
    '''
    Create a test dataset according to the model

    Params:
        D - Sample documents
        T - Vocabulary size, the number of "terms". Must be a square number
        K - Observed topics
        avgWordsPerDoc - average number of words per document generated (Poisson)

    Returns:
        tpcs    - the matrix of per-document topic distributions
        vocab   - the matrix of per-topic word distributions
        docLens - the vector of document lengths
        W       - the DxT word matrix
    '''

    # Generate vocab
    beta = 0.1
    betaVec = np.ndarray((T,))
    betaVec.fill(beta)
    vocab = rd.dirichlet(betaVec, size=K)

    # Generate the shared covariance matrix
    # ...no real structure in this.
    sigT = rd.random((K, K))
    sigT = sigT.dot(sigT)

    # Generate topic mean
    alpha = 1
    alphaVec = np.ndarray((K,))
    alphaVec.fill(alpha)
    topicMean = rd.dirichlet(alphaVec)

    # Generate the actual topics.
    tpcs = rd.multivariate_normal(topicMean, sigT, size=D)
    tpcs = rowwise_softmax(tpcs)

    # Generate the corpus
    docLens = rd.poisson(avgWordsPerDoc, (D,)).astype(np.float32)
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32)  # truncate word counts to integers
    W = ssp.csr_matrix(W)

    # Return the true parameter values and the generated observations
    return tpcs, vocab, docLens, W
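# rowwise_softmax, used throughout this module, is assumed to map each row of
# a matrix onto the probability simplex, subtracting the row maximum first
# for numerical stability. A minimal numpy sketch consistent with that usage
# (the real implementation lives elsewhere in this package):
def _rowwise_softmax_sketch(x, out=None):
    if out is None:
        out = np.empty_like(x)
    np.subtract(x, x.max(axis=1)[:, np.newaxis], out=out)  # stabilise
    np.exp(out, out=out)
    out /= out.sum(axis=1)[:, np.newaxis]  # normalise each row to sum to one
    return out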
def selfSoftDot(matrix):
    '''
    Considers the given matrix to be a collection of stacked row-vectors.
    Returns the sum of the dot products of each row-vector and its soft-max
    form.

    This works on DENSE matrices only, and it appears in this module simply
    for convenience.

    Uses fast, memory-efficient operations for matrices of single- and
    double-precision numbers, and falls back to fast-ish numpy code
    otherwise, at the cost of creating a copy of the matrix.
    '''
    assert not np.isfortran(matrix), "Matrix is not stored in row-major form"
    if matrix.dtype == np.float64:
        return compiled.selfSoftDot_f8(matrix)
    elif matrix.dtype == np.float32:
        return compiled.selfSoftDot_f4(matrix)

    if WarnIfSlow:
        sys.stderr.write("WARNING: Slow code path triggered (selfSoftDot)\n")
    return np.sum(matrix * rowwise_softmax(matrix))
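# The compiled fast paths above should agree with the numpy fallback. A quick
# consistency check, using the _rowwise_softmax_sketch defined earlier (the
# tolerance is an assumption about the compiled code's accuracy, not a
# guarantee):
def _check_selfSoftDot(D=5, K=3, seed=0):
    rng = np.random.RandomState(seed)
    m = rng.randn(D, K)  # float64, so this exercises the compiled path
    slow = np.sum(m * _rowwise_softmax_sketch(m))
    assert abs(selfSoftDot(m) - slow) < 1e-6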
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
        data       - the dataset, supplying the DxT document-term matrix W;
                     the DxF document-feature matrix X is IGNORED in this model
        modelState - the actual CTM model
        queryState - the query results - essentially all the "local" variables
                     matched to the given observations
        trainPlan  - how to execute the training process (e.g. iterations,
                     log-interval etc.)

    Return:
        A new model object with the updated model (note parameters are
        updated in place, so make a defensive copy if you want it)
        A new query object with the updated query parameters
    '''
    W = data.words
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = \
        modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill(NIW_PSI)

    # Iterate over parameters
    for itr in range(iterations):
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis=0) / (D + pseudoObsMeans) \
            if USE_NIW_PRIOR \
            else means.mean(axis=0)
        debugFn(itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis, :]
            sigT = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            sigT += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            sigT /= (D + pseudoObsVar - K)
        else:
            sigT = np.cov(means.T) if sigT.dtype == np.float64 else np.cov(means.T).astype(dtype)
            sigT += np.diag(varcs.mean(axis=0))

        if diagonalPriorCov:
            diag = np.diag(sigT)
            sigT = np.diag(diag)
            isigT = np.diag(1. / diag)
        else:
            isigT = la.inv(sigT)

        # FIXME Undo debug
        sigT = np.eye(K)
        isigT = la.inv(sigT)

        debugFn(itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        # print("     sigT.det = " + str(la.det(sigT)))

        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        # Reset the means to their original form, and log effect of vocab update
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)
        debugFn(itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + isigT)^{-1}
        varcs = np.reciprocal(docLens[:, np.newaxis] * (K - 1.) / K + np.diagonal(sigT))
        debugFn(itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        # Update the Means
        rhs = V.copy()
        rhs += docLens[:, np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(isigT + docLens[d] * A).dot(rhs[d, :])

        # means -= (means[:,0])[:,np.newaxis]
        debugFn(itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug:
                        printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                        and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
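# perplexity_from_like, used in the logging above, converts a log-likelihood
# into a perplexity. Assuming the usual definition (the real helper lives
# elsewhere in this package), it reduces to a one-liner:
def _perplexity_from_like_sketch(log_like, token_count):
    # perplexity = exp(-ln p(W) / N), where N is the total number of tokens
    return np.exp(-log_like / token_count)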
def recons_error(modelState, X, W, queryState):
    '''
    Returns the mean squared error, per document, between the observed word
    counts W and the counts reconstructed from the inferred topics and the
    model's vocabulary.
    '''
    tpcs_inf = rowwise_softmax(queryState.lmda)
    W_inf = np.array(tpcs_inf.dot(modelState.vocab) * queryState.docLen[:, np.newaxis], dtype=np.int32)
    return np.sum(np.square(W - W_inf)) / X.shape[0]
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
        data       - the dataset of words, features and links of which only
                     words are used in this model
        modelState - the _trained_ model
        queryState - the query state generated for the query dataset
        queryPlan  - used in this case as we need to tighten up the approx

    Returns:
        The model state and query state, in that order. The model state is
        unchanged, the query is.
    '''
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = \
        queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, n = queryState.means, queryState.expMeans, queryState.docLens  # n: document lengths
    K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = \
        modelState.K, modelState.A, modelState.U, modelState.Y, modelState.V, modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype

    debugFn = _debug_with_bound if debug else _debug_with_nothing
    W = data.words
    D = W.shape[0]

    # Necessary temp variables (notably the count of topic-to-word assignments
    # per topic per doc). Note: sigT and topicMean are assumed to be available
    # on the model state, matching the training code that uses them below.
    sigT, topicMean = modelState.sigT, modelState.topicMean
    isigT = la.inv(sigT)

    # Update the Variances
    varcs = 1. / ((n * (K - 1.) / K)[:, np.newaxis] + isigT.flat[::K + 1])
    debugFn(0, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    R = W.copy()
    for itr in range(iterations):
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        # Update the Means
        rhs = V.copy()
        rhs += n[:, np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= n[:, np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(isigT + n[d] * A).dot(rhs[d, :])

        debugFn(itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState
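# sparseScalarQuotientOfDot(W, A, B, out=...) is used throughout to compute
# W_dt / (A.dot(B))_dt at the non-zero entries of W only, keeping the result
# sparse so the dense D x T product is never materialised. A minimal scipy
# sketch consistent with that usage (it assumes W is in canonical CSR form,
# with no explicitly-stored zeros):
def _sparse_scalar_quotient_of_dot_sketch(W, A, B, out=None):
    if out is None:
        out = W.copy()
    rows, cols = W.nonzero()
    denom = np.einsum('ij,ji->i', A[rows, :], B[:, cols])  # (A @ B) at the nnz positions
    out.data = np.asarray(W[rows, cols]).ravel() / denom
    return out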
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
        data       - the dataset, supplying the DxT document-term matrix W
                     and the DxF document-feature matrix X
        modelState - the actual CTM model
        queryState - the query results - essentially all the "local" variables
                     matched to the given observations
        trainPlan  - how to execute the training process (e.g. iterations,
                     log-interval etc.)

    Return:
        A new model object with the updated model (note parameters are
        updated in place, so make a defensive copy if you want it)
        A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    D, T = W.shape
    F = X.shape[1]

    # tmpNumDense = np.array([
    #     4,  8,  2,  0,  0,
    #     0,  6,  0, 17,  0,
    #     12, 13, 1,  7,  8,
    #     0,  5,  0,  0,  0,
    #     0,  6,  0,  0, 44,
    #     0,  7,  2,  0,  0], dtype=np.float64).reshape((6, 5))
    # tmpNum = ssp.csr_matrix(tmpNumDense)
    #
    # tmpDenomLeft = (rd.random((tmpNum.shape[0], 12)) * 5).astype(np.int32).astype(np.float64) / 10
    # tmpDenomRight = (rd.random((12, tmpNum.shape[1])) * 5).astype(np.int32).astype(np.float64)
    #
    # tmpResult = tmpNum.copy()
    # tmpResult = sparseScalarQuotientOfDot(tmpNum, tmpDenomLeft, tmpDenomRight)
    #
    # print(str(tmpNum.todense()))
    # print(str(tmpDenomLeft.dot(tmpDenomRight)))
    # print(str(tmpResult.todense()))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = \
        modelState.K, modelState.A, modelState.U, modelState.Y, modelState.V, modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype

    tp, fp, ltp, lfp = 1. / tv, 1. / fv, 1. / ltv, 1. / lfv  # turn variances into precisions

    # FIXME Use passed-in hypers
    print("tp = %f tv=%f" % (tp, tv))
    vocabPrior = np.ones(shape=(T,), dtype=modelState.dtype)

    # FIXME undo truncation
    F = 363
    A = A[:F, :]
    X = X[:, :F]
    U = U[:F, :]
    data = DataSet(words=W, feats=X)

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    if covA is None:
        precA = (fp * ssp.eye(F) + X.T.dot(X)).todense()  # As the inverse is almost always dense
        covA = la.inv(precA, overwrite_a=True)            # it's faster to densify in advance
    uniqLens = np.unique(docLens)

    debugFn(-1, covA, "covA", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)

    expMeans = means.copy()
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=W.copy())

    lhs = H.copy()
    rhs = expMeans.copy()
    Y_rhs = Y.copy()

    # Iterate over parameters
    for itr in range(iterations):
        # Update U, V given A
        V = try_solve_sym_pos(Y.T.dot(U.T).dot(U).dot(Y), A.T.dot(U).dot(Y).T).T
        V /= V[0, 0]
        U = try_solve_sym_pos(Y.dot(V.T).dot(V).dot(Y.T), A.dot(V).dot(Y.T).T).T

        # Update Y given U, V, A
        Y_rhs[:, :] = U.T.dot(A).dot(V)

        Sv, Uv = la.eigh(V.T.dot(V), overwrite_a=True)
        Su, Uu = la.eigh(U.T.dot(U), overwrite_a=True)

        s = np.outer(Sv, Su).flatten()
        s += ltv * lfv
        np.reciprocal(s, out=s)

        M = Uu.T.dot(Y_rhs).dot(Uv)
        M *= unvec(s, row_count=M.shape[0])

        Y = Uu.dot(M).dot(Uv.T)
        debugFn(itr, Y, "Y", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        A = covA.dot(fp * U.dot(Y).dot(V.T) + X.T.dot(means))
        debugFn(itr, A, "A", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # TODO One big sort by size, plus batch it.

        # Update the Means
        rhs[:, :] = expMeans
        rhs *= R.dot(vocab.T)
        rhs += X.dot(A) * tp
        rhs += docLens[:, np.newaxis] * means.dot(H)
        rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)
        for l in uniqLens:
            inds = np.where(docLens == l)[0]
            lhs[:, :] = l * H
            lhs[np.diag_indices_from(lhs)] += tp
            lhs[:, :] = la.inv(lhs)
            means[inds, :] = rhs[inds, :].dot(lhs)  # left and right got switched going from vectors to matrices :-/
        debugFn(itr, means, "means", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        # Standard deviation
        # DK = means.shape[0] * means.shape[1]
        # newTp = np.sum(means)
        # newTp = (-newTp * newTp)
        # rhs[:,:] = means
        # rhs *= means
        # newTp = DK * np.sum(rhs) - newTp
        # newTp /= DK * (DK - 1)
        # newTp = min(max(newTp, 1E-36), 1E+36)
        # tp = 1 / newTp
        # if itr % logFrequency == 0:
        #     print ("Iter %3d stdev = %f, prec = %f, np.std^2=%f, np.mean=%f" % (itr, sqrt(newTp), tp, np.std(means.reshape((D*K,))) ** 2, np.mean(means.reshape((D*K,)))))

        # Update the vocabulary
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        debugFn(itr, vocab, "vocab", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)
        # print ("Iter %3d Vocab.min = %f" % (itr, vocab.min()))

        # Update the vocab prior
        # vocabPrior = estimate_dirichlet_param (vocab, vocabPrior)
        # print ("Iter %3d VocabPrior.(min, max) = (%f, %f) VocabPrior.mean=%f" % (itr, vocabPrior.min(), vocabPrior.max(), vocabPrior.mean()))

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name)
            queryState = QueryState(means, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug:
                        printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                        and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name), \
        QueryState(means, expMeans, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
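# Neither unvec nor try_solve_sym_pos is defined in this excerpt. Sketches
# consistent with how they are used above (the column-major vec convention in
# _unvec_sketch is an assumption; if the package's vec helper stacks rows
# instead, drop the order='F'):
def _unvec_sketch(v, row_count):
    # Reverses vectorisation: folds a vector back into a row_count-row matrix
    return v.reshape((row_count, -1), order='F')

def _try_solve_sym_pos_sketch(A, b):
    # Solve A x = b assuming A is symmetric positive-definite, falling back
    # to least squares when the factorisation fails
    try:
        return la.solve(A, b, assume_a='pos')
    except la.LinAlgError:
        return la.lstsq(A, b)[0]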
def varBound(modelState, queryState, X, W, Z=None, lnVocab=None, varA_U=None, XA=None, XTX=None):
    '''
    For the current state of the model and of the query, and for the given
    inputs, computes the variational lower-bound.

    Params
        modelState - the state of the model currently
        queryState - the state of the query currently
        X          - the DxF matrix of features we're querying on, where D
                     is the number of documents
        W          - the DxT matrix of words ("terms") we're querying on
        Z          - if this has already been calculated, it can be passed in.
                     If not, we recalculate it from the model and query states.
                     Z is the DxKxT tensor which for each document D and term T
                     gives the proportion of those terms assigned to topic K
        lnVocab    - the KxT matrix of the natural log applied to the
                     vocabulary. Recalculated if not provided
        varA_U     - the product of the column variance matrix and the matrix
                     U. Recalculated if not provided
        XA         - dot product of X and A, recalculated if not provided
        XTX        - dot product of X-transpose and X, recalculated if not
                     provided.

    Returns
        The (positive) variational lower bound
    '''

    # Unpack the model and query state tuples for ease of use and maybe speed improvements
    (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab) = \
        (modelState.K, modelState.F, modelState.T, modelState.P, modelState.A, modelState.varA, modelState.V, modelState.varV, modelState.U, modelState.sigma, modelState.tau, modelState.vocab)
    (lmda, nu, lxi, s, docLen) = \
        (queryState.lmda, queryState.nu, queryState.lxi, queryState.s, queryState.docLen)

    # Get the number of samples from the shape. Ensure that the shapes are
    # consistent with the model parameters.
    (D, Tcheck) = W.shape
    if Tcheck != T:
        raise ValueError("The shape of the document matrix W is invalid, T is %d but the matrix W has shape (%d, %d)" % (T, D, Tcheck))

    (Dcheck, Fcheck) = X.shape
    if Dcheck != D:
        raise ValueError("Inconsistent sizes between the matrices X and W, X has %d rows but W has %d" % (Dcheck, D))
    if Fcheck != F:
        raise ValueError("The shape of the feature matrix X is invalid. F is %d but the matrix X has shape (%d, %d)" % (F, Dcheck, Fcheck))

    # We'll need the original xi for this, and also Z, the 3D tensor which for
    # each document D and term T gives the strength of topic K. We'll also
    # need the log of the vocab dist
    xi = deriveXi(lmda, nu, s)

    if lnVocab is None:
        lnVocab = safe_log(vocab)
    if Z is None:
        Z = rowwise_softmax(lmda[:, :, np.newaxis] + lnVocab[np.newaxis, :, :])  # Z is DxKxT

    # lnProb1 is the bound on E[p(W|Theta)]. This is a bound, not an equality,
    # as we're using Bouchard's softmax bound (NIPS 2007) here. That said,
    # most of the subsequent terms will discard additive constants, so
    # strictly speaking none of them are equalities.
    docLenLmdaLxi = docLen[:, np.newaxis] * lmda * lxi

    lnProb1 = 0.0
    lnProb1 -= np.sum(docLenLmdaLxi * lmda)
    lnProb1 -= np.sum(docLen[:, np.newaxis] * nu * nu * lxi)
    lnProb1 += 2 * np.sum(s[:, np.newaxis] * docLenLmdaLxi)
    lnProb1 -= 0.5 * np.sum(docLen[:, np.newaxis] * lmda)
    lnProb1 += np.sum(lmda * np.einsum('dt,dkt->dk', W, Z))

    lnProb1 += np.sum(lnVocab * np.einsum('dt,dkt->kt', W, Z))
    lnProb1 -= np.sum(W * np.einsum('dkt->dt', safe_x_log_x(Z)))

    lnProb1 -= np.sum(docLen[:, np.newaxis] * lxi * ((s ** 2)[:, np.newaxis] - xi ** 2))
    lnProb1 += 0.5 * np.sum(docLen[:, np.newaxis] * (s[:, np.newaxis] + xi))
    lnProb1 -= np.sum(docLen[:, np.newaxis] * safe_log_one_plus_exp_of(xi))

    # lnProb2 is E[p(Theta|A)]
    if XA is None:
        XA = X.dot(A)
    if XTX is None:
        XTX = X.T.dot(X)
    sig2 = sigma * sigma
    tau2 = tau * tau

    lnProb2 = -0.5 * D * K * log(sig2) \
              - 0.5 / sig2 * (np.sum(nu) + D * K * tau2 * np.sum(XTX * varA) + np.sum((lmda - XA) ** 2))

    # lnProb3 is E[p(A|V)]
    if varA_U is None:
        varA_U = varA.dot(U)
    lnProb3 = -0.5 * K * F * log(2 * pi) \
              - 0.5 * K * F * log(tau2) \
              - 0.5 / tau2 * \
              ( \
                  np.trace(varA) * K * tau2 \
                  + np.sum(varA_U * U) * K * tau2 \
                  + np.sum((A - U.dot(V)) ** 2) \
              )

    # lnProb4 is E[p(V)]
    lnProb4 = -0.5 * (np.trace(varV) * K * tau2 + np.sum(V * V))

    # ent1 is H[q(Theta)]
    ent1 = 0.5 * np.sum(np.log(nu * nu))

    # ent2 is H[q(A|V)]  (the first term follows the same pattern as ent3;
    # the original had "+" where "*" was clearly intended)
    ent2 = 0.5 * F * K * log(2 * pi * e) + 0.5 * K * log(la.det(varA)) + 0.5 * F * K * log(tau2)

    # ent3 is H[q(V)]
    ent3 = 0.5 * P * K * log(2 * pi * e) + 0.5 * K * log(la.det(varV)) + 0.5 * P * K * log(tau2)

    result = lnProb1 + lnProb2 + lnProb3 + lnProb4 + ent1 + ent2 + ent3

    # if (lnProb1 > 0) or (lnProb2 > 0) or (lnProb3 > 0) or (lnProb4 > 0):
    #     print ("Whoopsie - lnProb > 0")

    # if result > 100:
    #     print ("Well this is just ridiculous")

    return result
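# deriveXi is assumed to return the optimal variational parameter of
# Bouchard's quadratic softmax bound: xi_dk^2 is the second moment of
# (theta_dk - s_d) under q(theta). A sketch under that assumption:
def _derive_xi_sketch(lmda, nu, s):
    # xi_dk = sqrt( lmda_dk^2 + nu_dk^2 - 2 s_d lmda_dk + s_d^2 )
    return np.sqrt(lmda ** 2 + nu ** 2 - 2 * lmda * s[:, np.newaxis] + (s ** 2)[:, np.newaxis])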
def train(modelState, X, W, iterations=10000, epsilon=0.001, logInterval=0):
    '''
    Creates a new query state object for a topic model based on side-information.
    This contains all those estimated parameters that are specific to the actual
    data being queried - this must be used in conjunction with a model state.

    The parameters are
        modelState  - the model state with all the model parameters
        X           - the D x F matrix of side information vectors
        W           - the D x T matrix of word **count** vectors.
        iterations  - how long to iterate for
        epsilon     - currently ignored; in future, allows us to stop early.
        logInterval - how often to compute and log the bound (0 disables it)

    This returns a tuple of new model-state and query-state. The latter
    object will contain X and W and also

        s    - A D-dimensional vector describing the offset in our bound on
               the true value of ln sum_k e^theta_dk
        lxi  - A DxK matrix used in the above bound, containing the negative
               Jaakkola function applied to the quadratic term xi
        lmda - the topics we've inferred for the current batch of documents
        nu   - the variance of topics we've inferred (independent)
    '''
    # Unpack the model state tuple for ease of use and maybe speed improvements
    (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab) = \
        (modelState.K, modelState.F, modelState.T, modelState.P, modelState.A, modelState.varA, modelState.V, modelState.varV, modelState.U, modelState.sigma, modelState.tau, modelState.vocab)

    # Get ready to plot the evolution of the likelihood
    if logInterval > 0:
        elbos = np.zeros((iterations // logInterval,))
        iters = np.zeros((iterations // logInterval,))

    # We'll need the total word count per doc, and total count of docs
    docLen = W.sum(axis=1)
    D = len(docLen)

    # No need to recompute this every time
    XTX = X.T.dot(X)

    # Assign initial values to the query parameters
    lmda = rd.random((D, K))
    nu = np.ones((D, K), np.float64)
    s = np.zeros((D,))
    lxi = negJakkola(np.ones((D, K), np.float64))

    XA = X.dot(A)
    for iteration in range(iterations):
        # Save repeated computation
        tsq = tau * tau
        tsqIP = tsq * np.eye(P)
        trTsqIK = K * tsq  # trace of the matrix tau * tau * np.eye(K)
        halfSig2 = 1. / (sigma * sigma)
        tau2sig2 = (tau * tau) / (sigma * sigma)

        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|V) q(V)
        #   Where lambda is the posterior mean of theta.
        # =============================================================

        # V, varV
        varV = la.inv(tsqIP + U.T.dot(U))
        V = varV.dot(U.T).dot(A)
        _quickPrintElbo("E-Step: q(V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        # A, varA
        # TODO, since only tau2sig2 changes at each step, would it be possible
        # just to amend the old inverse?
        # TODO Use sparse inverse
        varA = la.inv(tau2sig2 * XTX + np.eye(F))
        A = varA.dot(U.dot(V) + X.T.dot(lmda))
        XA = X.dot(A)
        _quickPrintElbo("E-Step: q(A|V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        # lmda_dk
        lnVocab = safe_log(vocab)
        Z = rowwise_softmax(lmda[:, :, np.newaxis] + lnVocab[np.newaxis, :, :])  # Z is DxKxT

        rho = 2 * s[:, np.newaxis] * lxi - 0.5 \
            + np.einsum('dt,dkt->dk', W, Z) / docLen[:, np.newaxis]
        rhs = docLen[:, np.newaxis] * rho + halfSig2 * X.dot(A)
        lmda = rhs / (docLen[:, np.newaxis] * 2 * lxi + halfSig2)
        _quickPrintElbo("E-Step: q(Theta|A;lmda)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        # nu_dk
        # TODO Double check this again...
        nu = 1. / np.sqrt(2. * docLen[:, np.newaxis] * lxi + halfSig2)
        _quickPrintElbo("E-Step: q(Theta|A;nu)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        # =============================================================
        # M-Step
        #   Parameters for the softmax bound: lxi and s
        #   The projection used for A: U
        #   The vocabulary: vocab
        #   The variances: tau, sigma
        # =============================================================

        # s_d
        # s = (K/4. + (lxi * lmda).sum(axis = 1)) / lxi.sum(axis=1)
        # _quickPrintElbo ("M-Step: max s", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        # xi_dk
        lxi = negJakkolaOfDerivedXi(lmda, nu, s)
        _quickPrintElbo("M-Step: max xi", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        # vocab
        # TODO, since vocab is in the RHS, is there any way to optimize this?
        Z = rowwise_softmax(lmda[:, :, np.newaxis] + lnVocab[np.newaxis, :, :])  # Z is DxKxT
        vocab = normalizerows_ip(np.einsum('dt,dkt->kt', W, Z))
        _quickPrintElbo("M-Step: max vocab", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        # U
        U = A.dot(V.T).dot(la.inv(trTsqIK * varV + V.dot(V.T)))
        _quickPrintElbo("M-Step: max U", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        # sigma
        #   Equivalent to \frac{1}{DK} \left( \sum_d (\sum_k nu_{dk}) + tr(\Omega_A) x_d^{T} \Sigma_A x_d + (\lambda - A^{T} x_d)^{T}(\lambda - A^{T} x_d) \right)
        # sigma = 1./(D*K) * (np.sum(nu) + D*K * tsq * np.sum(XTX * varA) + np.sum((lmda - XA)**2))

        # tau
        #   Equivalent to \frac{1}{KF} \left( tr(\Sigma_A)tr(\Omega_A) + tr(\Sigma_V U U^{T})tr(\Omega_V) + tr((M_A - U M_V)^{T} (M_A - U M_V)) \right)
        # varA_U = varA.dot(U)
        # tau_term1 = np.trace(varA)*K*tsq
        # tau_term2 = sum(varA_U[p,:].dot(U[p,:]) for p in range(P)) * K * tsq
        # tau_term3 = np.sum((A - U.dot(V)) ** 2)
        #
        # tau = 1./(K*F) * (tau_term1 + tau_term2 + tau_term3)

        if (logInterval > 0) and (iteration % logInterval == 0):
            varA_U = varA.dot(U)  # needed by varBound; otherwise only defined in the commented-out tau update above
            elbo = varBound(
                VbSideTopicModelState(K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab),
                VbSideTopicQueryState(lmda, nu, lxi, s, docLen),
                X, W, Z, lnVocab, varA_U, XA, XTX)
            elbos[iteration // logInterval] = elbo
            iters[iteration // logInterval] = iteration
            print("Iteration %5d  ELBO %f" % (iteration, elbo))

    if logInterval > 0:
        plot_bound(iters, elbos)

    return (VbSideTopicModelState(K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab),
            VbSideTopicQueryState(lmda, nu, lxi, s, docLen))
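# negJakkola and negJakkolaOfDerivedXi are assumed to implement the Jaakkola
# & Jordan bound coefficient lambda(xi) = tanh(xi/2) / (4 xi), either applied
# directly or evaluated at the derived xi. The sign convention below is a
# guess from the docstring above ("the negative Jakkola function"); flip it
# if the package defines the coefficient with the opposite sign:
def _neg_jakkola_sketch(xi):
    return np.tanh(xi / 2.) / (4. * xi)

def _neg_jakkola_of_derived_xi_sketch(lmda, nu, s):
    return _neg_jakkola_sketch(_derive_xi_sketch(lmda, nu, s))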
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
        data       - the dataset of words, features and links of which only
                     words and features are used in this model
        modelState - the _trained_ model
        queryState - the query state generated for the query dataset
        queryPlan  - used in this case as we need to tighten up the approx

    Returns:
        The model state and query state, in that order. The model state is
        unchanged, the query is.
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # TODO Get rid of this via a command-line param
    iterations = max(iterations, 100)

    # Debugging
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # Necessary values
    isigT = la.inv(sigT)

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Counts of topic assignments
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)

        # the variance
        varcs[:] = 1. / ((n * (K - 1.) / K)[:, np.newaxis] + isigT.flat[::K + 1])
        debugFn(itr, varcs, "query-varcs", W, X, None, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n)

        # Update the Means
        rhs = X.dot(A.T).dot(isigT)
        rhs += S
        rhs += n[:, np.newaxis] * means.dot(Ab)
        rhs -= n[:, np.newaxis] * rowwise_softmax(means, out=means)

        # Long version: cache one inverse per distinct document length
        inverses = dict()
        for d in range(D):
            if not n[d] in inverses:
                inverses[n[d]] = la.inv(isigT + n[d] * Ab)
            lhs = inverses[n[d]]
            means[d, :] = lhs.dot(rhs[d, :])
        debugFn(itr, means, "query-means", W, X, None, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState  # query vars altered in-place
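# normalizerows_ip, used by the vocabulary updates in this module, is assumed
# to normalise each row of a dense matrix to sum to one, in place. A minimal
# sketch consistent with that usage:
def _normalizerows_ip_sketch(matrix):
    matrix /= matrix.sum(axis=1)[:, np.newaxis]
    return matrix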
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
        data       - the dataset of words, features and links of which only
                     words and features are used in this model
        modelState - the actual CTM model
        queryState - the query results - essentially all the "local" variables
                     matched to the given observations
        trainPlan  - how to execute the training process (e.g. iterations,
                     log-interval etc.)

    Return:
        A new model object with the updated model (note parameters are
        updated in place, so make a defensive copy if you want it)
        A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # Book-keeping for logs
    boundIters, boundValues, boundLikes = [], [], []
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending terms of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG)  # sort needs to be stable in order to be reversible
    W = W[sortIdx, :]  # deep sorted copy
    X = X[sortIdx, :]
    means, varcs = means[sortIdx, :], varcs[sortIdx, :]
    docLens = originalDocLens[sortIdx]

    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])

    # Initialize some working variables
    R = W.copy()

    aI_P = 1. / lfv * ssp.eye(P, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    leastSquares = lambda feats, targets: la.lstsq(feats, targets, lapack_driver="gelsy")[0].T
    if ssp.issparse(R_A):          # dense inverse typically as fast or faster than sparse
        R_A = to_dense_array(R_A)  # inverse, and the result is usually dense in any case
        leastSquares = lambda feats, targets: np.array(
            [ssp.linalg.lsqr(feats, targets[:, k])[0] for k in range(K)])
    R_A.flat[::F + 1] += 1. / fv
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    priorSigt_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigt_diag.fill(0.001)

    # Iterate over parameters
    for itr in range(iterations):
        A = leastSquares(X, means)
        diff_a_yv = (A - Y.dot(V))

        for _ in range(10):  # (50 if itr == 0 else 1):
            # Update the covariance of the prior
            diff_m_xa = (means - X.dot(A.T))
            sigT = 1. / lfv * (Y.dot(Y.T))
            sigT += 1. / fv * diff_a_yv.dot(diff_a_yv.T)
            sigT += diff_m_xa.T.dot(diff_m_xa)
            sigT.flat[::K + 1] += varcs.sum(axis=0)

            # As small numbers lead to unstable inverse estimates, we use the
            # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and apply
            # these scales whenever we use the inverse of the unscaled
            # covariance
            sigScale = 1. / (P + D + F)
            isigScale = 1. / sigScale

            isigT = la.inv(sigT)
            debugFn(itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

            # Update the vocabulary
            vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
            vocab += vocabPrior
            vocab = normalizerows_ip(vocab)

            # Reset the means to their original form, and log effect of vocab update
            R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
            S = expMeans * R.dot(vocab.T)
            debugFn(itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

            # Update the Variances
            varcs = 1. / ((docLens * (K - 1.) / K)[:, np.newaxis] + isigScale * isigT.flat[::K + 1])
            debugFn(itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

            # Update the Means
            rhs = X.dot(A.T).dot(isigT) * isigScale
            rhs += S
            rhs += docLens[:, np.newaxis] * means.dot(Ab)
            rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)

            # Faster version?
            for lenIdx in range(len(lens)):
                nd = lens[lenIdx]
                start, end = inds[lenIdx], inds[lenIdx + 1]
                lhs = la.inv(isigT + sigScale * nd * Ab) * sigScale

                means[start:end, :] = rhs[start:end, :].dot(lhs)  # huh?! Left and right refer to the eqn for a single mean: once we're talking a DxK matrix it gets swapped

            # print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
            debugFn(itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

            expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)

        # for _ in range(150):
        #     # Finally update the parameter V
        #     V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        #     debugFn(itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        #
        #     # Update the distribution on the latent space
        #     R_Y_base = aI_P + 1 / fv * V.dot(V.T)
        #     R_Y = la.inv(R_Y_base)
        #     debugFn(itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        #
        #     Y = 1. / fv * A.dot(V.T).dot(R_Y)
        #     debugFn(itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        #
        #     # Update the mapping from the features to topics
        #     A = (1. / fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        #     debugFn(itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues.append(var_bound(DataSet(W, feats=X), modelState, queryState, XTX))
            boundLikes.append(log_likelihood(DataSet(W, feats=X), modelState, queryState))
            boundIters.append(itr)
            perp = perplexity_from_like(boundLikes[-1], docLens.sum())
            print(time.strftime('%X') + " : Iteration %d: Perplexity %4.0f  bound %f" % (itr, perp, boundValues[-1]))
            if len(boundIters) >= 2 and boundValues[-2] > boundValues[-1]:
                printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))
            # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if len(boundIters) > 2 and boundIters[-1] > 20:
                lastPerp = perplexity_from_like(boundLikes[-2], docLens.sum())
                if lastPerp - perp < 1:
                    break

    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means = means[revert_sort, :]
    varcs = varcs[revert_sort, :]
    docLens = docLens[revert_sort]

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
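# The sort-by-length trick above relies on the argsort of the sort index
# being the inverse permutation, which is guaranteed when the sort is stable.
# A tiny self-contained demonstration:
def _demo_revert_sort():
    lens = np.array([30, 10, 20, 10])
    sortIdx = np.argsort(lens, kind='mergesort')  # stable, like STABLE_SORT_ALG
    sortedLens = lens[sortIdx]
    revert = np.argsort(sortIdx, kind='mergesort')
    assert (sortedLens[revert] == lens).all()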
def _sampleFromModel(self, D=200, T=100, K=10, F=12, P=8, avgWordsPerDoc=500):
    '''
    Create a test dataset according to the model

    Params:
        D - Sample documents (each with associated features)
        T - Vocabulary size, the number of "terms". Must be a square number
        K - Observed topics
        P - Latent features
        F - Observed features
        avgWordsPerDoc - average number of words per document generated (Poisson)

    Returns:
        tpcs    - the matrix of per-document topic distributions
        vocab   - the matrix of per-topic word distributions
        docLens - the vector of document lengths
        X       - the DxF side information matrix
        W       - the DxT word matrix
    '''

    # Generate vocab
    beta = 0.1
    betaVec = np.ndarray((T,))
    betaVec.fill(beta)
    vocab = np.zeros((K, T))
    for k in range(K):
        vocab[k, :] = rd.dirichlet(betaVec)

    # Generate the shared covariance matrix
    sigT = rd.random((K, K))
    sigT = sigT.dot(sigT)
    sigT.flat[::K + 1] += rd.random((K,)) * 4

    # Just link two topics
    sigT[K // 2, K // 3] = 3
    sigT[K // 3, K // 2] = 3

    sigT[4 * K // 5, K // 5] = 4
    sigT[K // 5, 4 * K // 5] = 4

    # Generate Y, then V, then A
    lfv = 0.1  # latent feature variance (for Y)
    fv = 0.1   # feature variance (for A)

    Y = matrix_normal(np.zeros((K, P)), lfv * np.eye(P), sigT)
    V = matrix_normal(np.zeros((P, F)), fv * np.eye(F), lfv * np.eye(P))
    A = matrix_normal(Y.dot(V), fv * np.eye(F), sigT)

    # Generate the input features. Assume the features are multinomial and sparse
    # (not quite a perfect match for the twitter example: twitter is binary, this
    # may not be)
    featuresDist = [1. / F] * F
    maxNonZeroFeatures = 3

    X = np.zeros((D, F), dtype=np.float32)
    for d in range(D):
        X[d, :] = rd.multinomial(maxNonZeroFeatures, featuresDist)
    X = ssp.csr_matrix(X)

    # Use the features and the matrix A to generate the topics and documents
    tpcs = rowwise_softmax(X.dot(A.T))

    docLens = rd.poisson(avgWordsPerDoc, (D,)).astype(np.float32)
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32)  # truncate word counts to integers
    W = ssp.csr_matrix(W)

    # Return the true parameter values and the generated observations
    return tpcs, vocab, docLens, X, W
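# matrix_normal is assumed to draw one sample from a matrix-normal
# distribution. The argument order below (mean, then the covariance matching
# the mean's columns, then the one matching its rows) mirrors the calls
# above but is an assumption; sampling via Cholesky factors is the standard
# construction:
def _matrix_normal_sketch(M, colCov, rowCov):
    rowChol = la.cholesky(rowCov, lower=True)
    colChol = la.cholesky(colCov, lower=True)
    Z = np.random.randn(M.shape[0], M.shape[1])
    # If vec(X) ~ N(vec(M), colCov (x) rowCov) then X = M + L_row Z L_col^T
    return M + rowChol.dot(Z).dot(colChol.T)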
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
        data       - the dataset, supplying the DxT document-term matrix, the
                     DxD link matrix, and the DxF feature matrix (the features
                     are IGNORED in this model)
        modelState - the actual CTM model
        queryState - the query results - essentially all the "local" variables
                     matched to the given observations
        trainPlan  - how to execute the training process (e.g. iterations,
                     log-interval etc.)

    Return:
        A new model object with the updated model (note parameters are
        updated in place, so make a defensive copy if you want it)
        A new query object with the updated query parameters
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats
    D, _ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A, dtype = \
        modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound
        initLikely = log_likelihood(data, modelState, queryState)
        initPerp = perplexity_from_like(initLikely, data.word_count)
        print("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight = W.copy()
    L_weight = L.copy()
    LT_weight = LT.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill(NIW_PSI)

    # Iterate over parameters
    for itr in range(iterations):
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis=0) / (D + pseudoObsMeans) \
            if USE_NIW_PRIOR \
            else means.mean(axis=0)
        debugFn(itr, topicMean, "topicMean", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis, :]
            topicCov = diff.T.dot(diff) \
                     + pseudoObsVar * np.outer(topicMean, topicMean)
            topicCov += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            topicCov /= (D + pseudoObsVar - K)
        else:
            topicCov = np.cov(means.T) if topicCov.dtype == np.float64 else np.cov(means.T).astype(dtype)
            topicCov += np.diag(varcs.mean(axis=0))

        if diagonalPriorCov:
            diag = np.diag(topicCov)
            topicCov = np.diag(diag)
            itopicCov = np.diag(1. / diag)
        else:
            itopicCov = la.inv(topicCov)

        debugFn(itr, topicCov, "topicCov", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)
        # print("     topicCov.det = " + str(la.det(topicCov)))

        # Building Blocks - temporarily replaces means with exp(means)
        expMeansCol = np.exp(means - means.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * means \
          - (1. / (2 * D + 2)) * means.sum(axis=0) \
          - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(means - means.max(axis=1)[:, np.newaxis])
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)

        # Update the vocabularies
        vocab *= (W_weight.T.dot(expMeansRow)).T  # Awkward order to maintain sparsity (W_weight is sparse, expMeansRow is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (expMeansCol / lse_at_k[np.newaxis, :]).T  # FIXME Dupes line in definition of F

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn(itr, vocab, "vocab", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        # Now do likewise for the links, do it twice to model in-counts (first)
        # and out-counts (second). The difference is the transpose
        LT_weight = sparseScalarQuotientOfDot(LT, expMeansRow, docVocab, out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts = l_intop_sums.sum(axis=0)

        L_weight = sparseScalarQuotientOfDot(L, expMeansRow, docVocab, out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow

        # Reset the means and use them to calculate the weighted sum of means
        meanSum = means.sum(axis=0) * in_counts

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + itopicCov)^{-1}
        varcs = np.reciprocal(docLens[:, np.newaxis] * (0.5 - 1. / K) + np.diagonal(topicCov))
        debugFn(itr, varcs, "varcs", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        # Update the Means
        rhs = w_top_sums.copy()
        rhs += l_intop_sums
        rhs += l_outtop_sums
        rhs += itopicCov.dot(topicMean)
        rhs += emit_counts[:, np.newaxis] * (means.dot(A) - rowwise_softmax(means))
        rhs += in_counts[np.newaxis, :] * F
        if diagonalPriorCov:
            raise ValueError("Not implemented")
        else:
            for d in range(D):
                rhs_ = rhs[d, :] + (1. / (4 * D + 4)) * (meanSum - in_counts * means[d, :])
                means[d, :] = la.inv(itopicCov + emit_counts[d] * A + np.diag(D * in_counts / (2 * D + 2))).dot(rhs_)
                if np.any(np.isnan(means[d, :])) or np.any(np.isinf(means[d, :])):
                    pass  # likely a debugger breakpoint anchor: the update produced non-finite means

                if np.any(np.isnan(np.exp(means[d, :] - means[d, :].max()))) or np.any(np.isinf(np.exp(means[d, :] - means[d, :].max()))):
                    pass  # likely a debugger breakpoint anchor: exponentiation overflowed

        debugFn(itr, means, "means", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME)
            queryState = QueryState(means, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if False and itr > 100 and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME), \
        QueryState(means, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
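# The mean/covariance update at the top of the training loop is a MAP-style
# estimate under a Normal-Inverse-Wishart prior, controlled by the NIW_*
# pseudo-observation constants. Factored out as a standalone sketch (the
# normalisation follows the code above, not a textbook presentation):
def _niw_map_update_sketch(means, varcs, K, pseudoObsMeans, pseudoObsVar, priorPsiDiag):
    D = means.shape[0]
    topicMean = means.sum(axis=0) / (D + pseudoObsMeans)
    diff = means - topicMean[np.newaxis, :]
    topicCov = diff.T.dot(diff) + pseudoObsVar * np.outer(topicMean, topicMean)
    topicCov += np.diag(varcs.mean(axis=0) + priorPsiDiag)
    topicCov /= (D + pseudoObsVar - K)
    return topicMean, topicCov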