def _doTest (self, W, model, queryState, trainPlan):
    D,_ = W.shape
    recons = rowwise_softmax(queryState.means).dot(model.vocab)
    reconsErr = 1./D * np.sum((np.asarray(W.todense()) - recons) * (np.asarray(W.todense()) - recons))

    print ("Initial bound is %f\n\n" % ctm.var_bound(W, model, queryState))
    print ("Initial reconstruction error is %f\n\n" % reconsErr)

    model, query, (bndItrs, bndVals, likelies) = ctm.train (W, None, model, queryState, trainPlan)

    # Plot the bound
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, likelies, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    fig.show()
    plt.show()

    # Plot the inferred vocab
    plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
    plt.show()

    recons = rowwise_softmax(queryState.means).dot(model.vocab)
    reconsErr = 1./D * np.sum((np.asarray(W.todense()) - recons) * (np.asarray(W.todense()) - recons))
    print ("Final reconstruction error is %f\n\n" % reconsErr)
def log_likelihood (data, modelState, queryState):
    '''
    Return the log-likelihood of the given data W according to the model
    and the parameters inferred for the entries in W stored in the
    queryState object.
    '''
    probs    = rowwise_softmax(queryState.outMeans)
    doc_dist = colwise_softmax(queryState.inMeans)

    word_likely = np.sum( \
        sparseScalarProductOfSafeLnDot(\
            data.words, \
            probs, \
            modelState.vocab \
        ).data \
    )

    link_likely = np.sum( \
        sparseScalarProductOfSafeLnDot(\
            data.links, \
            probs, \
            doc_dist \
        ).data \
    )

    return word_likely + link_likely
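# The link term above relies on a column-wise softmax over the in-means. The
# helper below is a minimal, illustrative sketch of that operation (stable
# against overflow); the real colwise_softmax in this module may differ, e.g.
# by offering an `out` argument, so treat the name and signature as assumptions.
import numpy as np

def colwise_softmax_sketch(x):
    shifted = x - x.max(axis=0)[np.newaxis, :]   # subtract per-column max for stability
    ex = np.exp(shifted)
    return ex / ex.sum(axis=0)[np.newaxis, :]    # each column now sums to one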
def log_likelihood(data, modelState, queryState):
    """
    Return the log-likelihood of the given data W according to the model
    and the parameters inferred for the entries in W stored in the
    queryState object.
    """
    return np.sum(sparseScalarProductOfSafeLnDot(data.words, rowwise_softmax(queryState.means), modelState.vocab).data)
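# Both likelihood functions above delegate to sparseScalarProductOfSafeLnDot.
# The sketch below shows the intended computation: for every non-zero W[d,t] it
# evaluates W[d,t] * ln((A B)[d,t]), clamping the argument of the log away from
# zero. This is an illustrative re-implementation, not the compiled helper used
# by the module; the clamp value is an assumption.
import numpy as np
import scipy.sparse as ssp

def sparse_scalar_product_of_safe_ln_dot_sketch(W, A, B, tiny=1e-300):
    out = W.tocsr().copy()
    out.sum_duplicates()
    out.eliminate_zeros()                                    # so .data lines up with .nonzero()
    rows, cols = out.nonzero()
    dots = np.einsum("ij,ji->i", A[rows, :], B[:, cols])     # (A B)[d, t] at the non-zeros only
    out.data = out.data * np.log(np.maximum(dots, tiny))     # "safe" log
    return out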
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx

    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype

    debugFn = _debug_with_bound if debug else _debug_with_nothing
    W = data.words
    D = W.shape[0]

    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)

    # Update the Variances
    varcs = 1./((n * (K-1.)/K)[:,np.newaxis] + isigT.flat[::K+1])
    debugFn (0, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    R = W.copy()
    for itr in range(iterations):
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        # Update the Means
        rhs = V.copy()
        rhs += n[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= n[:,np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d,:] = la.inv(isigT + n[d] * A).dot(rhs[d,:])

        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState
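# The early-stopping test above compares perplexities rather than raw log
# likelihoods. A one-line sketch of perplexity_from_like, assuming the usual
# definition perplexity = exp(-log_likelihood / word_count); the module's own
# helper may guard against overflow differently.
import numpy as np

def perplexity_from_like_sketch(log_likelihood, word_count):
    return np.exp(-log_likelihood / word_count)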
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx

    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, varcs, n = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A, modelState.dtype

    debugFn = _debug_with_bound if debug else _debug_with_nothing
    W = data.words
    D = W.shape[0]

    expMeansOut = np.exp(means - means.max(axis=1)[:, np.newaxis])
    expMeansIn  = np.exp(means - means.max(axis=0)[np.newaxis, :])
    lse_at_k    = expMeansIn.sum(axis=0)

    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    itopicCov = la.inv(topicCov)

    # Update the Variances
    varcs = 1./((n * (K-1.)/K)[:,np.newaxis] + itopicCov.flat[::K+1])
    debugFn (0, varcs, "varcs", W, K, topicMean, topicCov, vocab, dtype, means, varcs, A, n)

    R = W.copy()
    for itr in range(iterations):
        R = sparseScalarQuotientOfDot(W, expMeansOut, vocab, out=R)
        V = expMeansOut * R.dot(vocab.T)

        # Update the Means
        rhs = V.copy()
        rhs += n[:, np.newaxis] * means.dot(A) + itopicCov.dot(topicMean)
        rhs -= n[:, np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(itopicCov + n[d] * A).dot(rhs[d, :])

        debugFn (itr, means, "means", W, K, topicMean, topicCov, vocab, dtype, means, varcs, A, n)

    return modelState, queryState
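# The E-steps above repeatedly call sparseScalarQuotientOfDot. The sketch below
# shows the intended behaviour: for every non-zero W[d,t] compute
# W[d,t] / (A B)[d,t], leaving the zeros untouched, so the result stays sparse.
# It is illustrative only; the compiled helper used above also accepts an `out`
# argument to avoid re-allocating the result each iteration.
import numpy as np
import scipy.sparse as ssp

def sparse_scalar_quotient_of_dot_sketch(W, A, B):
    out = W.tocsr().copy()
    out.sum_duplicates()
    out.eliminate_zeros()
    rows, cols = out.nonzero()
    dots = np.einsum("ij,ji->i", A[rows, :], B[:, cols])   # (A B) evaluated at W's non-zeros
    out.data = out.data / dots
    return out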
def _sampleFromModel(self, D=200, T=100, K=10, avgWordsPerDoc = 500):
    '''
    Create a test dataset according to the model

    Params:
        D - Sample documents (each with associated features)
        T - Vocabulary size, the number of "terms". Must be a square number
        K - Observed topics
        avgWordsPerDoc - average number of words per document generated (Poisson)

    Returns:
        modelState - a model state object configured for training
        tpcs       - the matrix of per-document topic distribution
        vocab      - the matrix of per-topic word distributions
        docLens    - the vector of document lengths
        X          - the DxF side information matrix
        W          - The DxW word matrix
    '''

    # Generate vocab
    beta = 0.1
    betaVec = np.ndarray((T,))
    betaVec.fill(beta)
    vocab = rd.dirichlet(betaVec, size=K)

    # Generate the shared covariance matrix
    # ...no real structure in this.
    sigT = rd.random((K,K))
    sigT = sigT.dot(sigT)

    # Generate topic mean
    alpha = 1
    alphaVec = np.ndarray((K,))
    alphaVec.fill(alpha)
    topicMean = rd.dirichlet(alphaVec)

    # Generate the actual topics.
    tpcs = rd.multivariate_normal(topicMean, sigT, size=D)
    tpcs = rowwise_softmax(tpcs)

    # Generate the corpus
    docLens = rd.poisson(avgWordsPerDoc, (D,)).astype(np.float32)

    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32) # truncate word counts to integers
    W = ssp.csr_matrix(W)

    # Return the initialised model, the true parameter values, and the
    # generated observations
    return tpcs, vocab, docLens, W
def _doTest (self, W, model, queryState, trainPlan):
    D,_ = W.shape
    recons = rowwise_softmax(queryState.means).dot(model.vocab)
    reconsErr = 1./D * np.sum((np.asarray(W.todense()) - recons) * (np.asarray(W.todense()) - recons))

    print ("Initial bound is %f\n\n" % ctm.var_bound(W, model, queryState))
    print ("Initial reconstruction error is %f\n\n" % reconsErr)

    model, query, (bndItrs, bndVals) = ctm.train (W, None, model, queryState, trainPlan)

    # Plot the bound
    plt.plot(bndItrs[5:], bndVals[5:])
    plt.xlabel("Iterations")
    plt.ylabel("Variational Bound")
    plt.show()

    # Plot the inferred vocab
    plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
    plt.show()

    recons = rowwise_softmax(queryState.means).dot(model.vocab)
    reconsErr = 1./D * np.sum((np.asarray(W.todense()) - recons) * (np.asarray(W.todense()) - recons))
    print ("Final reconstruction error is %f\n\n" % reconsErr)
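# Both test harnesses above compute the same mean (per-document) squared
# reconstruction error inline. The sketch below factors that computation out,
# with the row-wise softmax written explicitly; the helper name is not part of
# the original module.
import numpy as np

def reconstruction_error_sketch(W, means, vocab):
    D = W.shape[0]
    ex = np.exp(means - means.max(axis=1)[:, np.newaxis])
    recons = (ex / ex.sum(axis=1)[:, np.newaxis]).dot(vocab)   # softmax(means) . vocab
    diff = np.asarray(W.todense()) - recons
    return np.sum(diff * diff) / D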
def selfSoftDot(matrix):
    '''
    Considers the given matrix to be a collection of stacked row-vectors.
    Returns the sum of the dot products of each row-vector and its
    soft-max form.

    This works on DENSE matrices only, and it appears in this module simply
    for convenience.

    Uses fast, memory-efficient operations for matrices of single and
    double-precision numbers, and uses fast-ish numpy code as a fallback,
    at the cost of creating a copy of the matrix.
    '''
    assert not np.isfortran(matrix), "Matrix is not stored in row-major form"
    if matrix.dtype == np.float64:
        return compiled.selfSoftDot_f8(matrix)
    elif matrix.dtype == np.float32:
        return compiled.selfSoftDot_f4(matrix)

    if WarnIfSlow:
        sys.stderr.write("WARNING: Slow code path triggered (selfSoftDot)")
    return np.sum(matrix * rowwise_softmax(matrix))
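# A small sanity check for selfSoftDot: the compiled double-precision path
# should agree with an explicit numpy computation of sum_d row_d . softmax(row_d).
# This test is a sketch and simply assumes selfSoftDot, as defined above, is
# available in the current scope.
import numpy as np

def _check_self_soft_dot(rows=5, cols=7, seed=0):
    rng = np.random.RandomState(seed)
    m = np.ascontiguousarray(rng.randn(rows, cols))             # row-major, float64
    ex = np.exp(m - m.max(axis=1)[:, np.newaxis])
    naive = np.sum(m * (ex / ex.sum(axis=1)[:, np.newaxis]))
    assert np.isclose(selfSoftDot(m), naive)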
def train(data, modelState, queryState, trainPlan): """ Infers the topic distributions in general, and specifically for each individual datapoint. Params: W - the DxT document-term matrix X - The DxF document-feature matrix, which is IGNORED in this case modelState - the actual CTM model queryState - the query results - essentially all the "local" variables matched to the given observations trainPlan - how to execute the training process (e.g. iterations, log-interval etc.) Return: A new model object with the updated model (note parameters are updated in place, so make a defensive copy if you want itr) A new query object with the update query parameters """ W, X = data.words, data.feats D, T = W.shape F = X.shape[1] # tmpNumDense = np.array([ # 4 , 8 , 2 , 0 , 0, # 0 , 6 , 0 , 17, 0, # 12 , 13 , 1 , 7 , 8, # 0 , 5 , 0 , 0 , 0, # 0 , 6 , 0 , 0 , 44, # 0 , 7 , 2 , 0 , 0], dtype=np.float64).reshape((6,5)) # tmpNum = ssp.csr_matrix(tmpNumDense) # # tmpDenomleft = (rd.random((tmpNum.shape[0], 12)) * 5).astype(np.int32).astype(np.float64) / 10 # tmpDenomRight = (rd.random((12, tmpNum.shape[1])) * 5).astype(np.int32).astype(np.float64) # # tmpResult = tmpNum.copy() # tmpResult = sparseScalarQuotientOfDot(tmpNum, tmpDenomleft, tmpDenomRight) # # print (str(tmpNum.todense())) # print (str(tmpDenomleft.dot(tmpDenomRight))) # print (str(tmpResult.todense())) # Unpack the the structs, for ease of access and efficiency iterations, epsilon, logFrequency, diagonalPriorCov, debug = ( trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug, ) means, docLens = queryState.means, queryState.docLens K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = ( modelState.K, modelState.A, modelState.U, modelState.Y, modelState.V, modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype, ) tp, fp, ltp, lfp = 1.0 / tv, 1.0 / fv, 1.0 / ltv, 1.0 / lfv # turn variances into precisions # FIXME Use passed in hypers print("tp = %f tv=%f" % (tp, tv)) vocabPrior = np.ones(shape=(T,), dtype=modelState.dtype) # FIXME undo truncation F = 363 A = A[:F, :] X = X[:, :F] U = U[:F, :] data = DataSet(words=W, feats=X) # Book-keeping for logs boundIters, boundValues, likelyValues = [], [], [] debugFn = _debug_with_bound if debug else _debug_with_nothing # Initialize some working variables if covA is None: precA = (fp * ssp.eye(F) + X.T.dot(X)).todense() # As the inverse is almost always dense covA = la.inv(precA, overwrite_a=True) # it's faster to densify in advance uniqLens = np.unique(docLens) debugFn(-1, covA, "covA", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior) H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K) expMeans = means.copy() expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans) R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=W.copy()) lhs = H.copy() rhs = expMeans.copy() Y_rhs = Y.copy() # Iterate over parameters for itr in range(iterations): # Update U, V given A V = try_solve_sym_pos(Y.T.dot(U.T).dot(U).dot(Y), A.T.dot(U).dot(Y).T).T V /= V[0, 0] U = try_solve_sym_pos(Y.dot(V.T).dot(V).dot(Y.T), A.dot(V).dot(Y.T).T).T # Update Y given U, V, A Y_rhs[:, :] = U.T.dot(A).dot(V) Sv, Uv = la.eigh(V.T.dot(V), overwrite_a=True) Su, Uu = la.eigh(U.T.dot(U), overwrite_a=True) s = np.outer(Sv, Su).flatten() s += ltv * lfv np.reciprocal(s, out=s) M = Uu.T.dot(Y_rhs).dot(Uv) M *= unvec(s, row_count=M.shape[0]) Y = 
Uu.dot(M).dot(Uv.T) debugFn(itr, Y, "Y", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior) A = covA.dot(fp * U.dot(Y).dot(V.T) + X.T.dot(means)) debugFn(itr, A, "A", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior) # And now this is the E-Step, though itr's followed by updates for the # parameters also that handle the log-sum-exp approximation. # TODO One big sort by size, plus batch it. # Update the Means rhs[:, :] = expMeans rhs *= R.dot(vocab.T) rhs += X.dot(A) * tp rhs += docLens[:, np.newaxis] * means.dot(H) rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means) for l in uniqLens: inds = np.where(docLens == l)[0] lhs[:, :] = l * H lhs[np.diag_indices_from(lhs)] += tp lhs[:, :] = la.inv(lhs) means[inds, :] = rhs[inds, :].dot(lhs) # left and right got switched going from vectors to matrices :-/ debugFn(itr, means, "means", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior) # Standard deviation # DK = means.shape[0] * means.shape[1] # newTp = np.sum(means) # newTp = (-newTp * newTp) # rhs[:,:] = means # rhs *= means # newTp = DK * np.sum(rhs) - newTp # newTp /= DK * (DK - 1) # newTp = min(max(newTp, 1E-36), 1E+36) # tp = 1 / newTp # if itr % logFrequency == 0: # print ("Iter %3d stdev = %f, prec = %f, np.std^2=%f, np.mean=%f" % (itr, sqrt(newTp), tp, np.std(means.reshape((D*K,))) ** 2, np.mean(means.reshape((D*K,))))) # Update the vocabulary expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans) R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R) vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense) vocab += vocabPrior vocab = normalizerows_ip(vocab) debugFn(itr, vocab, "vocab", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior) # print ("Iter %3d Vocab.min = %f" % (itr, vocab.min())) # Update the vocab prior # vocabPrior = estimate_dirichlet_param (vocab, vocabPrior) # print ("Iter %3d VocabPrior.(min, max) = (%f, %f) VocabPrior.mean=%f" % (itr, vocabPrior.min(), vocabPrior.max(), vocabPrior.mean())) if logFrequency > 0 and itr % logFrequency == 0: modelState = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name) queryState = QueryState(means, docLens) boundValues.append(var_bound(data, modelState, queryState)) likelyValues.append(log_likelihood(data, modelState, queryState)) boundIters.append(itr) print( time.strftime("%X") + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())) ) if len(boundValues) > 1: if boundValues[-2] > boundValues[-1]: if debug: printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1])) # Check to see if the improvement in the bound has fallen below the threshold if ( itr > 100 and len(likelyValues) > 3 and abs( perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum()) ) < 1.0 ): break return ( ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name), QueryState(means, expMeans, docLens), (np.array(boundIters), np.array(boundValues), np.array(likelyValues)), )
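# The Y update in the training loop above solves the Sylvester-like system
#   (U'U) Y (V'V) + ltv*lfv * Y = U' A V
# using the eigendecompositions of U'U and V'V instead of forming the full
# Kronecker product. The self-contained demo below checks that equivalence on a
# tiny problem; every name here is local to the demo, only the technique is
# taken from the code above.
import numpy as np

def _check_eig_sylvester(k=3, p=2, reg=0.5, seed=1):
    rng = np.random.RandomState(seed)
    G = rng.randn(k, k); G = G.dot(G.T) + np.eye(k)     # plays the role of U'U
    H = rng.randn(p, p); H = H.dot(H.T) + np.eye(p)     # plays the role of V'V
    B = rng.randn(k, p)                                 # plays the role of U'AV

    # Direct route: (H^T kron G + reg*I) vec(Y) = vec(B), with column-stacking vec
    Y_direct = np.linalg.solve(np.kron(H.T, G) + reg * np.eye(k * p),
                               B.flatten(order="F")).reshape((k, p), order="F")

    # Eigendecomposition route, as used for the Y update above
    sg, Qg = np.linalg.eigh(G)
    sh, Qh = np.linalg.eigh(H)
    M = Qg.T.dot(B).dot(Qh) / (np.outer(sg, sh) + reg)
    Y_eig = Qg.dot(M).dot(Qh.T)

    assert np.allclose(Y_direct, Y_eig)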
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    The assumption is that there are no out-links associated with the
    documents, and that no documents in the training set link to any of
    these documents in the query set.

    The word and link vocabularies are kept fixed. Due to the assumption
    of no in-links, we don't learn the prior in-document covariance, nor
    the posterior distribution over in-links. Also, we don't modify the
    model itself.

    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx

    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats
    D,_ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens = queryState.outMeans, queryState.outVarcs, queryState.inMeans, queryState.inVarcs, queryState.inDocCov, queryState.docLens
    K, topicMean, topicCov, outDocCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.outDocCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Initialize some working variables
    W_weight = W.copy()

    outDocPre = 1./outDocCov
    inDocPre  = np.reciprocal(inDocCov)
    itopicCov = la.inv(topicCov)

    # Iterate over parameters
    for itr in range(iterations):
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        expMeansRow = np.exp(outMeans - outMeans.max(axis=1)[:, np.newaxis])
        W_weight    = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums  = W_weight.dot(vocab.T) * expMeansRow

        # Update the posterior variances
        outVarcs = np.reciprocal(emit_counts[:, np.newaxis] * (K-1)/(2*K) + (outDocPre + inDocPre[:,np.newaxis]) * np.diagonal(itopicCov)[np.newaxis,:])

        # Update the out-means and in-means
        out_rhs  = w_top_sums.copy() # No link outputs to model.
        out_rhs += itopicCov.dot(topicMean) / outDocCov
        out_rhs += emit_counts[:, np.newaxis] * (outMeans.dot(A) - rowwise_softmax(outMeans))

        for d in range(D):
            outCov = la.inv(outDocPre * itopicCov + emit_counts[d] * A)
            outMeans[d, :] = outCov.dot(out_rhs[d,:])

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME)
            queryState = QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)

            boundValues.append(0)
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > MinItersBeforeEarlyStop and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \
        QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)
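# Several of the loops in this module stop early once the change in perplexity
# between successive log points drops below one point. The same test written as
# a tiny helper, for illustration only; the warm-up threshold below is an
# assumption, not the module's MinItersBeforeEarlyStop constant.
def _should_stop_early_sketch(perplexities, itr, min_iters=50, min_delta=1.0):
    if itr <= min_iters or len(perplexities) < 2:
        return False
    return abs(perplexities[-1] - perplexities[-2]) < min_delta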
def train (data, modelState, queryState, trainPlan): ''' Infers the topic distributions in general, and specifically for each individual datapoint. Params: W - the DxT document-term matrix X - The DxF document-feature matrix, which is IGNORED in this case modelState - the actual CTM model queryState - the query results - essentially all the "local" variables matched to the given observations trainPlan - how to execute the training process (e.g. iterations, log-interval etc.) Return: A new model object with the updated model (note parameters are updated in place, so make a defensive copy if you want itr) A new query object with the update query parameters ''' W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats D,_ = W.shape out_links = np.squeeze(np.asarray(data.links.sum(axis=1))) # Unpack the the structs, for ease of access and efficiency iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens = queryState.outMeans, queryState.outVarcs, queryState.inMeans, queryState.inVarcs, queryState.inDocCov, queryState.docLens K, topicMean, topicCov, outDocCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.outDocCov, modelState.vocab, modelState.A, modelState.dtype emit_counts = docLens + out_links # Book-keeping for logs boundIters, boundValues, likelyValues = [], [], [] if debug: debugFn = _debug_with_bound initLikely = log_likelihood(data, modelState, queryState) initPerp = perplexity_from_like(initLikely, data.word_count) print ("Initial perplexity is: %.2f" % initPerp) else: debugFn = _debug_with_nothing # Initialize some working variables W_weight = W.copy() L_weight = L.copy() LT_weight = LT.copy() inDocCov, inDocPre = np.ones((D,)), np.ones((D,)) # Interestingly, outDocCov trades off good perplexity fits # with good ranking fits. > 10 gives better perplexity and # worse ranking. At 10 both are good. Below 10 both get # worse. Below 0.5, convergence stalls after the first iter. 
outDocCov, outDocPre = 10, 1./10 # Iterate over parameters for itr in range(iterations): # We start with the M-Step, so the parameters are consistent with our # initialisation of the RVs when we do the E-Step # Update the mean and covariance of the prior over out-topics topicMean = outMeans.mean(axis=0) debugFn (itr, topicMean, "topicMean", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens) outDiff = outMeans - topicMean[np.newaxis, :] inDiff = inMeans - outMeans for _ in range(5): # It typically takes three iterations for the three dependant covariances - # outDocCov, inDocCov and topicCov - to become consistent w.r.t each other topicCov = (outDocPre * outDiff).T.dot(outDiff) topicCov += (inDocPre[:,np.newaxis] * inDiff).T.dot(inDiff) topicCov += np.diag(outVarcs.sum(axis=0)) topicCov += np.diag(inVarcs.sum(axis=0)) topicCov += IWISH_S_SCALE * np.eye(K) topicCov /= (2 * D + IWISH_DENOM) itopicCov = la.inv(topicCov) debugFn (itr, topicMean, "topicCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens) diffSig = inDiff.dot(itopicCov) diffSig *= inDiff inDocCov = diffSig.sum(axis=1) inDocCov += (outVarcs * np.diagonal(itopicCov)[np.newaxis, :]).sum(axis=1) inDocCov += (inVarcs * np.diagonal(itopicCov)[np.newaxis, :]).sum(axis=1) inDocCov += IGAMMA_B inDocCov /= (IGAMMA_A - 1 + K) inDocPre = np.reciprocal(inDocCov) debugFn (itr, inDocCov, "inDocCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens) diffSig = outDiff.dot(itopicCov) diffSig *= outDiff # outDocCov = (IGAMMA_B + diffSig.sum() + (np.diagonal(itopicCov) * outVarcs).sum()) / (IGAMMA_A - 1 + (D * K)) # outDocPre = 1./outDocCov debugFn (itr, outDocCov, "outDocCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens) # Apply the exp function to get the (unnormalised) softmaxes in both directions. expMeansCol = np.exp(inMeans - inMeans.max(axis=0)[np.newaxis, :]) lse_at_k = np.sum(expMeansCol, axis=0) F = 0.5 * inMeans \ - (0.5/ D) * inMeans.sum(axis=0) \ - expMeansCol / lse_at_k[np.newaxis, :] expMeansRow = np.exp(outMeans - outMeans.max(axis=1)[:, np.newaxis]) W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight) # Update the vocabularies vocab *= (W_weight.T.dot(expMeansRow)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense) vocab += VocabPrior vocab = normalizerows_ip(vocab) docVocab = (expMeansCol / lse_at_k[np.newaxis, :]).T.copy() # FIXME Dupes line in definition of F # Recalculate w_top_sums with the new vocab and log vocab improvement W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight) w_top_sums = W_weight.dot(vocab.T) * expMeansRow debugFn (itr, vocab, "vocab", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens) # Now do likewise for the links, do it twice to model in-counts (first) and # out-counts (Second). 
The difference is the transpose LT_weight = sparseScalarQuotientOfDot(LT, expMeansRow, docVocab, out=LT_weight) l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow in_counts = l_intop_sums.sum(axis=0) L_weight = sparseScalarQuotientOfDot(L, expMeansRow, docVocab, out=L_weight) l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow # Update the posterior variances outVarcs = np.reciprocal(emit_counts[:, np.newaxis] * (K-1)/(2*K) + (outDocPre + inDocPre[:,np.newaxis]) * np.diagonal(itopicCov)[np.newaxis,:]) debugFn (itr, outVarcs, "outVarcs", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens) inVarcs = np.reciprocal(in_counts[np.newaxis,:] * (D-1)/(2*D) + inDocPre[:,np.newaxis] * np.diagonal(itopicCov)[np.newaxis,:]) debugFn (itr, inVarcs, "inVarcs", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens) # Update the out-means and in-means out_rhs = w_top_sums.copy() out_rhs += l_outtop_sums out_rhs += itopicCov.dot(topicMean) / outDocCov out_rhs += inMeans.dot(itopicCov) / inDocCov[:,np.newaxis] out_rhs += emit_counts[:, np.newaxis] * (outMeans.dot(A) - rowwise_softmax(outMeans)) scaled_n_in = ((D-1.)/(2*D)) * ssp.diags(in_counts, 0) in_rhs = (inDocPre[:, np.newaxis] * outMeans).dot(itopicCov) in_rhs += ((-inMeans.sum(axis=0) * in_counts) / (4*D))[np.newaxis,:] in_rhs += l_intop_sums in_rhs += in_counts[np.newaxis, :] * F for d in range(D): in_rhs[d, :] += in_counts * inMeans[d, :] / (4*D) inMeans[d, :] = la.inv(inDocPre[d] * itopicCov + scaled_n_in).dot(in_rhs[d, :]) in_rhs[d,:] -= in_counts * inMeans[d, :] / (4*D) try: outCov = la.inv((outDocPre + inDocPre[d]) * itopicCov + emit_counts[d] * A) outMeans[d, :] = outCov.dot(out_rhs[d,:]) except la.LinAlgError as err: print ("ABORTING: " + str(err)) return \ ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \ QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens), \ (np.array(boundIters), np.array(boundValues), np.array(likelyValues)) debugFn (itr, outMeans, "inMeans/outMeans", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens) # debugFn (itr, inMeans, "inMeans", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens) if logFrequency > 0 and itr % logFrequency == 0: modelState = ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME) queryState = QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens) boundValues.append(var_bound(data, modelState, queryState)) likelyValues.append(log_likelihood(data, modelState, queryState)) boundIters.append(itr) print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum()))) if len(boundValues) > 1: if boundValues[-2] > boundValues[-1]: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1])) # Check to see if the improvement in the bound has fallen below the threshold if itr > MinItersBeforeEarlyStop and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0: break # if True or debug or itr % logFrequency == 0: # print(" Sigma %6.1f \t %9.3g, %9.3g, %9.3g" % (np.log(la.det(topicCov)), topicCov.min(), topicCov.mean(), topicCov.max()), end=" |") # print(" rho %6.1f \t %9.3g, %9.3g, %9.3g" % 
(sum(log(inDocCov[d]) for d in range(D)), inDocCov.min(), inDocCov.mean(), inDocCov.max()), end=" |") # print(" alpha %6.1f \t %9.3g" % (np.log(la.det(np.eye(K,) * outDocCov)), outDocCov), end=" |") # print(" inMeans %9.3g, %9.3g, %9.3g" % (inMeans.min(), inMeans.mean(), inMeans.max()), end=" |") # print(" outMeans %9.3g, %9.3g, %9.3g" % (outMeans.min(), outMeans.mean(), outMeans.max()), end=" |") # print(" inVarcs %6.1f \t %9.3g, %9.3g, %9.3g" % (sum(safe_log_det(np.diag(inVarcs[d])) for d in range(D)) / D, inVarcs.min(), inVarcs.mean(), inVarcs.max()), end=" |") # print(" outVarcs %6.1f \t %9.3g, %9.3g, %9.3g" % (sum(safe_log_det(np.diag(outVarcs[d])) for d in range(D)) / D, outVarcs.min(), outVarcs.mean(), outVarcs.max())) return \ ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \ QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens), \ (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
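# The vocabulary updates above rely on normalizerows_ip to turn each row of the
# unnormalised topic-word weights into a distribution. A minimal sketch of that
# behaviour, assuming straightforward in-place row normalisation; the real
# helper may treat zero rows or non-contiguous arrays differently.
import numpy as np

def normalizerows_ip_sketch(matrix):
    row_sums = matrix.sum(axis=1)
    matrix /= row_sums[:, np.newaxis]
    return matrix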
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx

    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # Debugging
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # Necessary values
    isigT = la.inv(sigT)

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Counts of topic assignments
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)

        # the variance
        varcs[:] = 1./((n * (K-1.)/K)[:,np.newaxis] + isigT.flat[::K+1])
        debugFn (itr, varcs, "query-varcs", W, X, None, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n)

        # Update the Means
        rhs = X.dot(A.T).dot(isigT)
        rhs += S
        rhs += n[:,np.newaxis] * means.dot(Ab)
        rhs -= n[:,np.newaxis] * rowwise_softmax(means, out=means)

        # Long version
        inverses = dict()
        for d in range(D):
            if not n[d] in inverses:
                inverses[n[d]] = la.inv(isigT + n[d] * Ab)
            lhs = inverses[n[d]]
            means[d,:] = lhs.dot(rhs[d,:])

        debugFn (itr, means, "query-means", W, X, None, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState # query vars altered in-place
def train (data, modelState, queryState, trainPlan): ''' Infers the topic distributions in general, and specifically for each individual datapoint. Params: data - the dataset of words, features and links of which only words and features are used in this model modelState - the actual CTM model queryState - the query results - essentially all the "local" variables matched to the given observations trainPlan - how to execute the training process (e.g. iterations, log-interval etc.) Return: A new model object with the updated model (note parameters are updated in place, so make a defensive copy if you want itr) A new query object with the update query parameters ''' W, X = data.words, data.feats D, _ = W.shape # Unpack the the structs, for ease of access and efficiency iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype # Book-keeping for logs boundIters = np.zeros(shape=(iterations // logFrequency,)) boundValues = np.zeros(shape=(iterations // logFrequency,)) boundLikes = np.zeros(shape=(iterations // logFrequency,)) bvIdx = 0 debugFn = _debug_with_bound if debug else _debug_with_nothing _debug_with_bound.old_bound = 0 # For efficient inference, we need a separate covariance for every unique # document length. For products to execute quickly, the doc-term matrix # therefore needs to be ordered in ascending terms of document length originalDocLens = docLens sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG) # sort needs to be stable in order to be reversible W = W[sortIdx,:] # deep sorted copy X = X[sortIdx,:] means, varcs = means[sortIdx,:], varcs[sortIdx,:] docLens = originalDocLens[sortIdx] data = DataSet(W, feats=X) lens, inds = np.unique(docLens, return_index=True) inds = np.append(inds, [W.shape[0]]) # Initialize some working variables R = W.copy() aI_P = 1./lfv * ssp.eye(P, dtype=dtype) print("Creating posterior covariance of A, this will take some time...") XTX = X.T.dot(X) R_A = XTX R_A = R_A.todense() # dense inverse typically as fast or faster than sparse inverse R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case R_A = la.inv(R_A) print("Covariance matrix calculated, launching inference") priorSigt_diag = np.ndarray(shape=(K,), dtype=dtype) priorSigt_diag.fill (0.001) # Iterate over parameters for itr in range(iterations): # We start with the M-Step, so the parameters are consistent with our # initialisation of the RVs when we do the E-Step # Update the covariance of the prior diff_a_yv = (A-Y.dot(V)) diff_m_xa = (means-X.dot(A.T)) sigT = 1./lfv * (Y.dot(Y.T)) sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T) sigT += diff_m_xa.T.dot(diff_m_xa) sigT.flat[::K+1] += varcs.sum(axis=0) # As small numbers lead to instable inverse estimates, we use the # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these # scales whenever we use the inverse of the unscaled covariance sigScale = 1. / (P+D+F) isigScale = 1. 
/ sigScale isigT = la.inv(sigT) debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Building Blocks - termporarily replaces means with exp(means) expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans) if np.isnan(expMeans).any() or np.isinf(expMeans).any(): print ("Yoinks, Scoob..!") R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R) # S = expMeans * R.dot(vocab.T) # Update the vocabulary vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense) vocab += vocabPrior vocab = normalizerows_ip(vocab) # Reset the means to their original form, and log effect of vocab update R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R) S = expMeans * R.dot(vocab.T) debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Finally update the parameter V V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A)) debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # # And now this is the E-Step # # Update the distribution on the latent space R_Y_base = aI_P + 1/fv * V.dot(V.T) R_Y = la.inv(R_Y_base) debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) Y = 1./fv * A.dot(V.T).dot(R_Y) debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Update the mapping from the features to topics A = (1./fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A) debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Update the Variances varcs = 1./((docLens * (K-1.)/K)[:,np.newaxis] + isigScale * isigT.flat[::K+1]) debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Update the Means rhs = X.dot(A.T).dot(isigT) * isigScale rhs += S rhs += docLens[:,np.newaxis] * means.dot(Ab) rhs -= docLens[:,np.newaxis] * rowwise_softmax(means, out=means) # Long version # inverses = dict() # sca_means = means.copy() # for d in range(D): # if not n[d] in inverses: # inverses[n[d]] = la.inv(isigT + n[d] * Ab) # lhs = inverses[n[d]] # sca_means[d,:] = lhs.dot(rhs[d,:]) # print("Sca-Means: %f, %f, %f, %f" % (sca_means.min(), sca_means.mean(), sca_means.std(), sca_means.max())) # Faster version? for lenIdx in range(len(lens)): nd = lens[lenIdx] start, end = inds[lenIdx], inds[lenIdx + 1] lhs = la.inv(isigT + sigScale * nd * Ab) * sigScale means[start:end,:] = rhs[start:end,:].dot(lhs) # huh?! 
Left and right refer to eqn for a single mean: once we're talking a DxK matrix it gets swapped # print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max())) debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) if logFrequency > 0 and itr % logFrequency == 0: modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME) queryState = QueryState(means, expMeans, varcs, docLens) boundValues[bvIdx] = var_bound(DataSet(W, feats=X), modelState, queryState, XTX) boundLikes[bvIdx] = log_likelihood(DataSet(W, feats=X), modelState, queryState) boundIters[bvIdx] = itr perp = perplexity_from_like(boundLikes[bvIdx], docLens.sum()) print (time.strftime('%X') + " : Iteration %d: Perplexity %4.0f bound %f" % (itr, perp, boundValues[bvIdx])) if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx])) # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max())) # Check to see if the improvement in the likelihood has fallen below the threshold if bvIdx > 1 and boundIters[bvIdx] > 20: lastPerp = perplexity_from_like(boundLikes[bvIdx - 1], docLens.sum()) if lastPerp - perp < 1: boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, boundLikes, bvIdx) break bvIdx += 1 revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG) means = means[revert_sort,:] varcs = varcs[revert_sort,:] docLens = docLens[revert_sort] return \ ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \ QueryState(means, expMeans, varcs, docLens), \ (boundIters, boundValues, boundLikes)
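# The mean updates above avoid a per-document matrix inverse by sorting the
# corpus by document length and re-using one inverse per *unique* length. The
# sketch below shows that grouping on its own; the names are illustrative and
# not part of the module.
import numpy as np

def _unique_length_blocks(doc_lens):
    sort_idx = np.argsort(doc_lens, kind="mergesort")        # stable, so the sort is reversible
    sorted_lens = doc_lens[sort_idx]
    lens, starts = np.unique(sorted_lens, return_index=True)
    ends = np.append(starts[1:], [len(doc_lens)])
    # Each (length, start, end) triple marks one contiguous block of documents
    # that can share a single precomputed inverse.
    return sort_idx, list(zip(lens, starts, ends))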
def cross_val_and_eval_hashtag_prec_at_m(data, mdl, sample_model, train_plan, word_dict,
                                         num_folds, fold_run_count=-1, model_dir=None):
    '''
    Evaluate the precision at M for the top 50 hash-tags. In the held-out set,
    the hashtags are deleted. We train on all, both training and held-out, then
    evaluate the precision at M for the hashtags.

    For values of M we use 10, 50, 100, 150, 200, 250, 1000, 1500, 3000, 5000
    and 10000.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. function
    :param sample_model: a preconfigured model which is cloned at the start of each
    cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param word_dict: the word dictionary, used to identify hashtags and print them
    out when the run is completed.
    :param num_folds:  the number of folds for cross-validation
    :param fold_run_count: for debugging, stop early after processing this number of folds
    :param model_dir: if not None, the models are stored in this directory.
    :return: the list of model files stored
    '''
    MS = [10, 50, 100, 150, 200, 250, 1000, 1500, 3000, 5000, 10000]
    Precision, Recall = "precision", "recall"

    model_files = []
    if fold_run_count < 1:
        fold_run_count = num_folds
    if num_folds <= 1:
        raise ValueError ("Number of folds must be greater than 1")

    hashtag_indices = popular_hashtag_indices (data, word_dict, 50)

    folds_finished = 0 # count of folds that finished successfully
    fold = 0
    while fold < num_folds and folds_finished < fold_run_count:
        try:
            train_range, query_range = data.cross_valid_split_indices(fold, num_folds)

            segment_with_htags             = data.words[train_range, :]
            held_out_segment_with_htags    = data.words[query_range, :]
            held_out_segment_without_htags = data.words[query_range, :]
            held_out_segment_without_htags[:, hashtag_indices] = 0

            train_words = ssp.vstack((segment_with_htags, held_out_segment_without_htags))
            train_data  = data.copy_with_changes(words=train_words)

            # Train the model
            print ("Duplicating model template... ", end="")
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

            print ("Starting training")
            model, train_tops, (train_itrs, train_vbs, train_likes) \
                = mdl.train(train_data, model, train_tops, train_plan)

            # Predict hashtags
            dist = rowwise_softmax(train_tops.means)

            # For each hash-tag, for each value of M, evaluate the precision
            results = {Recall : dict(), Precision : dict()}
            for hi in hashtag_indices:
                h_probs = dist[query_range,:].dot(model.vocab[:,hi])
                h_count = held_out_segment_with_htags[:, hi].sum()

                results[Recall][word_dict[hi]]    = { -1 : h_count }
                results[Precision][word_dict[hi]] = { -1 : h_count }
                for m in MS:
                    top_m = h_probs.argsort()[-m:][::-1]

                    true_pos  = held_out_segment_with_htags[top_m, hi].sum()
                    rec_denom = min(m, h_count)
                    results[Precision][word_dict[hi]][m] = true_pos / m
                    results[Recall][word_dict[hi]][m]    = true_pos / rec_denom

            print ("%10s\t%20s\t%6s\t" % ("Metric", "Hashtag", "Count") + "\t".join("%5d" % m for m in MS))
            for htag, prec_results in results[Precision].items():
                print ("%10s\t%20s\t%6d\t%s" % ("Precision", htag, prec_results[-1], "\t".join(("%0.3f" % prec_results[m] for m in MS))))
            for htag, prec_results in results[Recall].items():
                print ("%10s\t%20s\t%6d\t%s" % ("Recall", htag, prec_results[-1], "\t".join(("%0.3f" % prec_results[m] for m in MS))))

            # Save the model
            model_files = save_if_necessary(model_files, model_dir, model, data, fold,
                                            train_itrs, train_vbs, train_likes,
                                            train_tops, None, mdl)
            folds_finished += 1 # this fold completed successfully
        except Exception as e:
            traceback.print_exc()
            print("Abandoning fold %d due to the error : %s" % (fold, str(e)))
        finally:
            fold += 1

    return model_files
def train (data, modelState, queryState, trainPlan): ''' Infers the topic distributions in general, and specifically for each individual datapoint. Params: W - the DxT document-term matrix X - The DxF document-feature matrix, which is IGNORED in this case modelState - the actual CTM model queryState - the query results - essentially all the "local" variables matched to the given observations trainPlan - how to execute the training process (e.g. iterations, log-interval etc.) Return: A new model object with the updated model (note parameters are updated in place, so make a defensive copy if you want itr) A new query object with the update query parameters ''' W = data.words D,_ = W.shape # Unpack the the structs, for ease of access and efficiency iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype # Book-keeping for logs boundIters, boundValues, likelyValues = [], [], [] debugFn = _debug_with_bound if debug else _debug_with_nothing # Initialize some working variables isigT = la.inv(sigT) R = W.copy() pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN pseudoObsVar = K + NIW_PSEUDO_OBS_VAR priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype) priorSigT_diag.fill (NIW_PSI) # Iterate over parameters for itr in range(iterations): # We start with the M-Step, so the parameters are consistent with our # initialisation of the RVs when we do the E-Step # Update the mean and covariance of the prior topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \ if USE_NIW_PRIOR \ else means.mean(axis=0) debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens) if USE_NIW_PRIOR: diff = means - topicMean[np.newaxis,:] sigT = diff.T.dot(diff) \ + pseudoObsVar * np.outer(topicMean, topicMean) sigT += np.diag(varcs.mean(axis=0) + priorSigT_diag) sigT /= (D + pseudoObsVar - K) else: sigT = np.cov(means.T) if sigT.dtype == np.float64 else np.cov(means.T).astype(dtype) sigT += np.diag(varcs.mean(axis=0)) if diagonalPriorCov: diag = np.diag(sigT) sigT = np.diag(diag) isigT = np.diag(1./ diag) else: isigT = la.inv(sigT) # FIXME Undo debug sigT = np.eye(K) isigT = la.inv(sigT) debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens) # print(" sigT.det = " + str(la.det(sigT))) # Building Blocks - temporarily replaces means with exp(means) expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans) R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R) # Update the vocabulary vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense) vocab += vocabPrior vocab = normalizerows_ip(vocab) # Reset the means to their original form, and log effect of vocab update R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R) V = expMeans * R.dot(vocab.T) debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens) # And now this is the E-Step, though itr's followed by updates for the # parameters also that handle the log-sum-exp approximation. 
# Update the Variances: var_d = (2 N_d * A + isigT)^{-1} varcs = np.reciprocal(docLens[:,np.newaxis] * (K-1.)/K + np.diagonal(sigT)) debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens) # Update the Means rhs = V.copy() rhs += docLens[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean) rhs -= docLens[:,np.newaxis] * rowwise_softmax(means, out=means) if diagonalPriorCov: means = varcs * rhs else: for d in range(D): means[d, :] = la.inv(isigT + docLens[d] * A).dot(rhs[d, :]) # means -= (means[:,0])[:,np.newaxis] debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens) if logFrequency > 0 and itr % logFrequency == 0: modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME) queryState = QueryState(means, expMeans, varcs, docLens) boundValues.append(var_bound(data, modelState, queryState)) likelyValues.append(log_likelihood(data, modelState, queryState)) boundIters.append(itr) print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum()))) if len(boundValues) > 1: if boundValues[-2] > boundValues[-1]: if debug: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1])) # Check to see if the improvement in the bound has fallen below the threshold if itr > 100 and len(likelyValues) > 3 \ and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0: break return \ ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), \ QueryState(means, expMeans, varcs, docLens), \ (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
def _sampleFromModel(self, D=200, T=100, K=10, F=12, P=8, avgWordsPerDoc = 500):
    '''
    Create a test dataset according to the model

    Params:
        T - Vocabulary size, the number of "terms". Must be a square number
        K - Observed topics
        P - Latent features
        F - Observed features
        D - Sample documents (each with associated features)
        avgWordsPerDoc - average number of words per document generated (Poisson)

    Returns:
        modelState - a model state object configured for training
        tpcs       - the matrix of per-document topic distribution
        vocab      - the matrix of per-topic word distributions
        docLens    - the vector of document lengths
        X          - the DxF side information matrix
        W          - The DxW word matrix
    '''

    # Generate vocab
    beta = 0.1
    betaVec = np.ndarray((T,))
    betaVec.fill(beta)
    vocab = np.zeros((K,T))
    for k in range(K):
        vocab[k,:] = rd.dirichlet(betaVec)

    # Generate the shared covariance matrix
    sigT = rd.random((K,K))
    sigT = sigT.dot(sigT)
    sigT.flat[::K+1] += rd.random((K,)) * 4

    # Just link two topics
    sigT[K//2, K//3] = 3
    sigT[K//3, K//2] = 3

    sigT[4 * K//5, K//5] = 4
    sigT[K//5, 4 * K//5] = 4

    # Generate Y, then V, then A
    lfv = 0.1 # latent feature variance (for Y)
    fv  = 0.1 # feature variance (for A)

    Y = matrix_normal(np.zeros((K,P)), lfv * np.eye(P), sigT)
    V = matrix_normal(np.zeros((P,F)), fv * np.eye(F),  lfv * np.eye(P))
    A = matrix_normal(Y.dot(V),        fv * np.eye(F),  sigT)

    # Generate the input features. Assume the features are multinomial and sparse
    # (not quite a perfect match for the twitter example: twitter is binary, this
    # may not be)
    featuresDist = [1. / F] * F
    maxNonZeroFeatures = 3

    X = np.zeros((D,F), dtype=np.float32)
    for d in range(D):
        X[d,:] = rd.multinomial(maxNonZeroFeatures, featuresDist)
    X = ssp.csr_matrix(X)

    # Use the features and the matrix A to generate the topics and documents
    tpcs = rowwise_softmax (X.dot(A.T))

    docLens = rd.poisson(avgWordsPerDoc, (D,)).astype(np.float32)

    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32) # truncate word counts to integers
    W = ssp.csr_matrix(W)

    # Return the initialised model, the true parameter values, and the
    # generated observations
    return tpcs, vocab, docLens, X, W
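# matrix_normal, used above to sample Y, V and A, is not a numpy builtin. Below
# is a minimal sketch consistent with how it is called here, i.e.
# matrix_normal(mean, colCov, rowCov) with mean of shape (n, p), colCov (p, p)
# and rowCov (n, n), sampled via Cholesky factors. The argument order is
# inferred from the calls above and should be treated as an assumption.
import numpy as np
import numpy.random as rd

def matrix_normal_sketch(mean, col_cov, row_cov):
    n, p = mean.shape
    z = rd.standard_normal((n, p))
    l_row = np.linalg.cholesky(row_cov)     # if Z is i.i.d. N(0,1), L_r Z L_c' has the
    l_col = np.linalg.cholesky(col_cov)     # desired row and column covariances
    return mean + l_row.dot(z).dot(l_col.T)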
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    W - the DxT document-term matrix
    X - The DxF document-feature matrix, which is IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats
    D,_ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound
        initLikely = log_likelihood(data, modelState, queryState)
        initPerp   = perplexity_from_like(initLikely, data.word_count)
        print ("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight  = W.copy()
    L_weight  = L.copy()
    LT_weight = LT.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill (NIW_PSI)

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
                    if USE_NIW_PRIOR \
                    else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis,:]
            topicCov = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            topicCov += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            topicCov /= (D + pseudoObsVar - K)
        else:
            topicCov = np.cov(means.T) if topicCov.dtype == np.float64 else np.cov(means.T).astype(dtype)
            topicCov += np.diag(varcs.mean(axis=0))

        if diagonalPriorCov:
            diag = np.diag(topicCov)
            topicCov  = np.diag(diag)
            itopicCov = np.diag(1./ diag)
        else:
            itopicCov = la.inv(topicCov)

        debugFn (itr, topicCov, "topicCov", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)
        # print("                topicCov.det = " + str(la.det(topicCov)))

        # Building Blocks - temporarily replaces means with exp(means)
        expMeansCol = np.exp(means - means.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * means \
          - (1. / (2*D + 2)) * means.sum(axis=0) \
          - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(means - means.max(axis=1)[:, np.newaxis])
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)

        # Update the vocabularies
        vocab *= (W_weight.T.dot(expMeansRow)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (expMeansCol / lse_at_k[np.newaxis, :]).T # FIXME Dupes line in definition of F

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn (itr, vocab, "vocab", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        # Now do likewise for the links, do it twice to model in-counts (first) and
        # out-counts (second). The difference is the transpose
        LT_weight    = sparseScalarQuotientOfDot(LT, expMeansRow, docVocab, out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts    = l_intop_sums.sum(axis=0)

        L_weight      = sparseScalarQuotientOfDot(L, expMeansRow, docVocab, out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow

        # Reset the means and use them to calculate the weighted sum of means
        meanSum = means.sum(axis=0) * in_counts

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + itopicCov)^{-1}
        varcs = np.reciprocal(docLens[:, np.newaxis] * (0.5 - 1./K) + np.diagonal(topicCov))
        debugFn (itr, varcs, "varcs", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        # Update the Means
        rhs  = w_top_sums.copy()
        rhs += l_intop_sums
        rhs += l_outtop_sums
        rhs += itopicCov.dot(topicMean)
        rhs += emit_counts[:, np.newaxis] * (means.dot(A) - rowwise_softmax(means))
        rhs += in_counts[np.newaxis, :] * F
        if diagonalPriorCov:
            raise ValueError("Not implemented")
        else:
            for d in range(D):
                rhs_ = rhs[d, :] + (1. / (4 * D + 4)) * (meanSum - in_counts * means[d, :])
                means[d, :] = la.inv(itopicCov + emit_counts[d] * A + np.diag(D * in_counts / (2 * D + 2))).dot(rhs_)
                if np.any(np.isnan(means[d, :])) or np.any (np.isinf(means[d, :])):
                    pass

                if np.any(np.isnan(np.exp(means[d, :] - means[d, :].max()))) or np.any (np.isinf(np.exp(means[d, :] - means[d, :].max()))):
                    pass

        debugFn (itr, means, "means", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME)
            queryState = QueryState(means, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if False and itr > 100 and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME), \
        QueryState(means, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))