def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words are
           used in this model
    modelState - the actual LDA model. This will be mutated in place, and
                 then returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated
                 in-place and then returned.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so
    make a defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.wordDists, modelState.dtype

    W   = data.words
    D,T = W.shape

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")

    # Book-keeping for logs
    logPoints    = 1 if logFrequency == 0 else iterations // logFrequency
    boundIters   = np.zeros(shape=(logPoints,))
    boundValues  = np.zeros(shape=(logPoints,))
    likelyValues = np.zeros(shape=(logPoints,))
    bvIdx = 0

    # Instead of storing the full topic assignments for every individual word,
    # we re-estimate from scratch. I.e. for the memberships z, which are DxNxK
    # in dimension, we only store a 1xNxK = NxK part.
    z_dnk = np.empty((docLens.max(), K), dtype=dtype, order='F')

    # Timer (note: despite the name, this has only second-level granularity)
    current_micro_time = lambda: int(time.time())

    # Select the training iterations function appropriate for the dtype
    do_iterations = compiled.iterate_f32 \
                    if modelState.dtype == np.float32 \
                    else compiled.iterate_f64
    # do_iterations = iterate # pure Python

    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters  = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    totalItrs = 0
    for segment in range(logPoints - 1):
        start = current_micro_time()
        totalItrs += do_iterations (segIters, D, K, T,
                                    W_list, docLens,
                                    topicPrior, vocabPrior,
                                    z_dnk, topicDists, wordDists)
        duration = current_micro_time() - start

        boundIters[bvIdx]   = segment * segIters
        boundValues[bvIdx]  = var_bound(data, modelState, queryState)
        likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)
        perp = perplexity_from_like(likelyValues[bvIdx], W.sum())
        bvIdx += 1

        if converged (boundIters, boundValues, bvIdx, epsilon, minIters=20):
            boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likelyValues, bvIdx)
            return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
                   QueryState(W_list, docLens, topicDists), \
                   (boundIters, boundValues, likelyValues)

        print ("Segment %d/%d Total Iterations %d Duration %d Perplexity %4.0f Bound %10.2f Likelihood %10.2f" \
               % (segment, logPoints, totalItrs, duration, perp, boundValues[bvIdx - 1], likelyValues[bvIdx - 1]))

    # Final batch of iterations.
    do_iterations (remainder, D, K, T,
                   W_list, docLens,
                   topicPrior, vocabPrior,
                   z_dnk, topicDists, wordDists)

    boundIters[bvIdx]   = iterations - 1
    boundValues[bvIdx]  = var_bound(data, modelState, queryState)
    likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)

    return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
           QueryState(W_list, docLens, topicDists), \
           (boundIters, boundValues, likelyValues)
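# The segment loop above reports progress via perplexity_from_like. As an
# illustrative sketch (not the project's own code), that helper is assumed to
# implement the standard relation perplexity = exp(-log_likelihood / N), where
# N is the total number of word tokens; the hypothetical function below shows
# that relation for comparison.
import numpy as np

def _perplexity_sketch(log_likelihood, word_count):
    # Lower is better: a model assigning probability 1/V to every token in a
    # vocabulary of size V has a perplexity of exactly V.
    return np.exp(-log_likelihood / word_count)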
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # Book-keeping for logs
    boundIters  = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    boundLikes  = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0

    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending terms of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG) # sort needs to be stable in order to be reversible
    W = W[sortIdx,:] # deep sorted copy
    X = X[sortIdx,:]
    means, varcs = means[sortIdx,:], varcs[sortIdx,:]
    docLens = originalDocLens[sortIdx]

    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])

    # Initialize some working variables
    R = W.copy()

    aI_P = 1./lfv * ssp.eye(P, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    R_A = R_A.todense()      # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    diff_m_xa = (means-X.dot(A.T))
    means_cov_with_x_a = diff_m_xa.T.dot(diff_m_xa)

    expMeans   = np.zeros((BatchSize, K), dtype=dtype)
    R          = np.zeros((BatchSize, K), dtype=dtype)
    S          = np.zeros((BatchSize, K), dtype=dtype)
    vocabScale = np.ones(vocab.shape, dtype=dtype)

    # Iterate over parameters
    batchIter = 0
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        sigT  = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += means_cov_with_x_a
        sigT.flat[::K+1] += varcs.sum(axis=0)

        # As small numbers lead to unstable inverse estimates, we use the
        # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and apply these
        # scales whenever we use the inverse of the unscaled covariance
        sigScale  = 1. / (P+D+F)
        isigScale = 1. / sigScale

        isigT = la.inv(sigT)
        debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the vocabulary
        # vocab *= vocabScale
        # vocab += vocabPrior
        # vocab = normalizerows_ip(vocab)
        # debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Finally update the parameter V
        V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        #
        # And now this is the E-Step
        #

        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        Y = 1./fv * A.dot(V.T).dot(R_Y)
        debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the mapping from the features to topics
        A = (1./fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the Variances
        varcs = 1./((docLens * (K-1.)/K)[:,np.newaxis] + isigScale * isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the means, batching documents by length so that a single lhs
        # matrix can be shared by every document in a batch
        vocabScale[:,:] = 0
        means_cov_with_x_a[:,:] = 0
        for lenIdx in range(len(lens)):
            nd         = lens[lenIdx]
            start, end = inds[lenIdx], inds[lenIdx + 1]
            lhs        = la.inv(isigT + sigScale * nd * Ab) * sigScale

            for d in range(start, end, BatchSize):
                end_d = min(d + BatchSize, end)
                span  = end_d - d

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab)
                S[:span,:] = expMeans[:span,:] * R.dot(vocab.T)

                # Convert expMeans to a softmax(means)
                expMeans[:span,:] /= expMeans[:span,:].sum(axis=1)[:span,np.newaxis]

                mu   = X[d:end_d,:].dot(A.T)
                rhs  = mu.dot(isigT) * isigScale
                rhs += S[:span,:]
                rhs += docLens[d:end_d,np.newaxis] * means[d:end_d,:].dot(Ab)
                rhs -= docLens[d:end_d,np.newaxis] * expMeans[:span,:] # here expMeans is actually softmax(means)

                # NB: "left" and "right" refer to the equation for a single mean;
                # once we're working with a whole DxK matrix the order is swapped
                means[d:end_d,:] = rhs.dot(lhs)

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab, out=R)

                stepSize = (Tau + batchIter) ** -Kappa # Robbins-Monro style decaying step-size
                batchIter += 1

                # Do a gradient update of the vocab
                vocabScale  = (R.T.dot(expMeans[:span,:])).T
                vocabScale *= vocab
                normalizerows_ip(vocabScale)
                # vocabScale += vocabPrior
                vocabScale *= stepSize
                vocab *= (1 - stepSize)
                vocab += vocabScale

                diff = (means[d:end_d,:] - mu)
                means_cov_with_x_a += diff.T.dot(diff)

        # print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
        debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues[bvIdx] = var_bound(DataSet(W, feats=X), modelState, queryState, XTX)
            boundLikes[bvIdx]  = log_likelihood(DataSet(W, feats=X), modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(boundLikes[bvIdx], docLens.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.0f bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
            # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 20:
                lastPerp = perplexity_from_like(boundLikes[bvIdx - 1], docLens.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, boundLikes = clamp (boundIters, boundValues, boundLikes, bvIdx)
                    break
            bvIdx += 1

    # Undo the sort-by-length, so results line up with the input ordering
    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means   = means[revert_sort,:]
    varcs   = varcs[revert_sort,:]
    docLens = docLens[revert_sort]

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
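# The revert_sort step above relies on a standard trick: taking argsort of a
# permutation yields its inverse permutation. A minimal self-contained
# demonstration with made-up data (here 'mergesort' stands in for whatever
# stable algorithm STABLE_SORT_ALG names):
import numpy as np

def _demo_sort_and_revert():
    docLens = np.array([5, 2, 9, 2])
    sortIdx = np.argsort(docLens, kind='mergesort')  # stable, hence reversible
    sorted_lens = docLens[sortIdx]                   # [2, 2, 5, 9]
    revert_sort = np.argsort(sortIdx, kind='mergesort')
    assert (sorted_lens[revert_sort] == docLens).all()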
def train (data, modelState, queryState, trainPlan, query=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words are
           used in this model
    modelState - the actual LDA model. In a training run (query = False) this
                 will be mutated in place, and then returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated
                 in-place and then returned.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
    query      - if True, the learnt model counts are used but left unchanged,
                 and only the query statistics are updated

    Return:
    The updated model object (note parameters are updated in place, so
    make a defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    W_list, docLens, q_n_dk, q_n_kt, q_n_k, z_dnk = \
        queryState.W_list, queryState.docLens, queryState.n_dk, queryState.n_kt, queryState.n_k, queryState.z_dnk
    K, topicPrior, vocabPrior, m_n_dk, m_n_kt, m_n_k = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.n_dk, modelState.n_kt, modelState.n_k

    D_train = 0 if m_n_dk is None else m_n_dk.shape[0]
    D_query = q_n_dk.shape[0]
    W = data.words
    T = W.shape[1]

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")

    # Book-keeping for logs
    logPoints    = 1 if logFrequency == 0 else iterations // logFrequency
    boundIters   = np.zeros(shape=(logPoints,))
    boundValues  = np.zeros(shape=(logPoints,))
    likelyValues = np.zeros(shape=(logPoints,))
    bvIdx = 0

    # Early stopping check
    finishedTraining = False

    # Add the model counts (essentially the learnt model parameters) to those for
    # the query, assuming the model has been trained previously
    if m_n_dk is not None:
        np.add (q_n_kt, m_n_kt, out=q_n_kt) # q_n_kt += m_n_kt
        np.add (q_n_k,  m_n_k,  out=q_n_k)  # q_n_k  += m_n_k

    # print ("Topic prior : " + str(topicPrior))

    # Select the training iterations function appropriate for the dtype
    if debug: print ("Starting Training")
    do_iterations = compiled.iterate_f32 \
                    if modelState.dtype == np.float32 \
                    else compiled.iterate_f64

    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters  = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    for segment in range(logPoints - 1):
        do_iterations (segIters, D_query, D_train, K, T,
                       W_list, docLens,
                       q_n_dk, q_n_kt, q_n_k, z_dnk,
                       topicPrior, vocabPrior)

        # Measure and record the improvement to the bound and log-likelihood
        boundIters[bvIdx]   = segment * segIters
        boundValues[bvIdx]  = var_bound_intermediate(data, modelState, queryState, q_n_kt, q_n_k)
        likelyValues[bvIdx] = log_likely_intermediate(data, modelState, queryState, q_n_kt, q_n_k)
        bvIdx += 1

        # Check to see if the improvement in the bound has fallen below the threshold
        if converged (boundIters, boundValues, bvIdx, epsilon, minIters=20):
            boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likelyValues, bvIdx)
            finishedTraining = True
            break

        if debug: print ("Segment %d/%d Total Iterations %d Duration %d" % (segment, logPoints, -1, -1))

    # Final scheduled batch of iterations if we haven't already converged.
    if not finishedTraining:
        do_iterations (remainder, D_query, D_train, K, T,
                       W_list, docLens,
                       q_n_dk, q_n_kt, q_n_k, z_dnk,
                       topicPrior, vocabPrior)

        boundIters[bvIdx]   = iterations - 1
        boundValues[bvIdx]  = var_bound_intermediate(data, modelState, queryState, q_n_kt, q_n_k)
        likelyValues[bvIdx] = log_likely_intermediate(data, modelState, queryState, q_n_kt, q_n_k)

    # Now return the results
    if query: # Model is unchanged, query is changed
        if m_n_dk is not None:
            np.subtract(q_n_kt, m_n_kt, out=q_n_kt) # q_n_kt -= m_n_kt
            np.subtract(q_n_k,  m_n_k,  out=q_n_k)  # q_n_k  -= m_n_k
    else: # train: model is changed (or flat-out created). Query is changed
        if m_n_dk is not None: # Amend existing
            m_n_dk = np.vstack((m_n_dk, q_n_dk))
            m_n_kt[:,:] = q_n_kt
            m_n_k[:]    = q_n_k
        else: # Create from scratch
            m_n_dk = q_n_dk.copy()
            m_n_kt = q_n_kt.copy()
            m_n_k  = q_n_k.copy()

    return ModelState(K, topicPrior, vocabPrior, m_n_dk, m_n_kt, m_n_k, modelState.dtype, modelState.name), \
           QueryState(W_list, docLens, q_n_dk, q_n_kt, q_n_k, z_dnk), \
           (boundIters, boundValues, likelyValues)
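# The compiled iterate_f32 / iterate_f64 kernels are opaque here. Judging by
# the statistics they receive (per-document counts n_dk, topic-word counts
# n_kt, topic totals n_k, and soft assignments z_dnk), they belong to the
# CVB0 / collapsed-variational family of LDA updates. The function below is an
# illustrative sketch of one such token update under that assumption; it is a
# hypothetical helper, not the compiled kernel itself.
import numpy as np

def _cvb0_token_update_sketch(d, t, n_dk, n_kt, n_k, z_old,
                              topicPrior, vocabPrior, T):
    # Remove the token's previous soft assignment from all counts...
    n_dk[d, :] -= z_old; n_kt[:, t] -= z_old; n_k -= z_old
    # ...re-estimate it from the remaining counts...
    z_new = (n_dk[d, :] + topicPrior) * (n_kt[:, t] + vocabPrior) \
            / (n_k + T * vocabPrior)
    z_new /= z_new.sum()
    # ...and add the new assignment back in.
    n_dk[d, :] += z_new; n_kt[:, t] += z_new; n_k += z_new
    return z_new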
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    assert W.dtype == modelState.dtype
    assert X.dtype == modelState.dtype

    D,_ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, lxi, s, n = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype

    # Book-keeping for logs
    boundIters  = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    likeValues  = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0

    _debug_with_bound.old_bound = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()

    sigT_regularizer = 0.001

    aI_P = 1./lfv * ssp.eye(P, dtype=dtype)
    tI_F = 1./fv  * ssp.eye(F, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    if ssp.issparse(R_A):
        R_A = R_A.todense()  # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    s.fill(0)

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        diff_m_xa = (means-X.dot(A.T))

        sigT  = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += diff_m_xa.T.dot(diff_m_xa)
        sigT.flat[::K+1] += varcs.sum(axis=0)
        sigT /= (P+F+D)
        sigT.flat[::K+1] += sigT_regularizer

        # Diagonalize it, and invert it
        sigT  = np.diag(sigT.flat[::K+1])
        isigT = np.diag(np.reciprocal(sigT.flat[::K+1]))
        debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        S = expMeans * R.dot(vocab.T)

        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        # Reset the means to their original form, and log effect of vocab update
        debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Finally update the parameter V
        V = la.inv(R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        Y = 1./fv * A.dot(V.T).dot(R_Y)
        debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the mapping from the features to topics
        A = (1./fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the Means
        vMat   = (s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT) # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis,:] + n[:,np.newaxis] * lxi) # inverse of D diagonal matrices
        means  = lhsMat * rhsMat # as the LHS is a diagonal matrix for every d, this is
                                 # equivalent to a Hadamard product for every d
        debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the Variances
        varcs = 1./(n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the approximation parameters
        lxi = 2 * ctm.negJakkolaOfDerivedXi(means, varcs, s)
        debugFn (itr, lxi, "lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
        # s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        # debugFn (itr, s, "s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, lxi, s, n)

            boundValues[bvIdx] = var_bound(data, modelState, queryState, XTX)
            likeValues[bvIdx]  = log_likelihood(data, modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(likeValues[bvIdx], n.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.2f bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
            # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 50:
                lastPerp = perplexity_from_like(likeValues[bvIdx - 1], n.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likeValues = clamp (boundIters, boundValues, likeValues, bvIdx)
                    return modelState, queryState, (boundIters, boundValues, likeValues)
            bvIdx += 1

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, lxi, s, n), \
        (boundIters, boundValues, likeValues)
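# lxi above comes from the Jaakkola & Jordan quadratic bound on the logistic
# function, whose characteristic coefficient is
# lambda(xi) = (sigmoid(xi) - 0.5) / (2 xi) = tanh(xi / 2) / (4 xi).
# The sketch below computes that textbook coefficient; the project's own
# negJakkolaOfDerivedXi (which additionally derives xi from means, varcs and s)
# is assumed to be built around the same quantity.
import numpy as np

def _jaakkola_lambda_sketch(xi):
    xi = np.asarray(xi, dtype=float)
    # Guard the removable singularity at xi == 0, where the limit is 1/8
    safe = np.where(np.abs(xi) < 1e-8, 1., xi)
    lam  = np.tanh(safe / 2.) / (4. * safe)
    return np.where(np.abs(xi) < 1e-8, 0.125, lam)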
def train (dataset, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words are
           used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W = dataset.words
    D,_ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, lxi, s, n = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, dtype = \
        modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype

    # Book-keeping for logs
    boundIters   = np.zeros(shape=(iterations // logFrequency,))
    boundValues  = np.zeros(shape=(iterations // logFrequency,))
    likelyValues = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()

    s.fill(0)

    priorSigt_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigt_diag.fill (0.1)
    kappa = K + 2

    expMeans = means.copy()

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        # topicMean = means.mean(axis = 0)
        topicMean = means.sum(axis=0) / (D + kappa) \
                    if USE_NIW_PRIOR \
                    else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # diff = means - topicMean
        # sigT = diff.T.dot(diff) / D
        sigT, _ = oas(means, assume_centered=False)
        if dtype is not np.float64:
            sigT = sigT.astype(dtype)
        sigT += np.diag(varcs.mean(axis=0))

        if USE_NIW_PRIOR:
            sigT.flat[::K+1] += priorSigt_diag
            sigT += (kappa * D)/(kappa + D) * np.outer(topicMean, topicMean)

        # Building blocks...
        # 1/4 Create the precision matrix from the covariance
        if True or diagonalPriorCov: # currently we always diagonalize
            diag  = np.diag(sigT)
            sigT  = np.diag(diag)
            isigT = np.diag(1. / diag)
        else:
            isigT = la.inv(sigT)

        debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        # print ("         Det sigT = " + str(la.det(sigT)))

        # 2/4 temporarily replace means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        # S = expMeans * R.dot(vocab.T)

        # 3/4 Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        S = expMeans * R.dot(vocab.T)

        # 4/4 Reset the means to their original form, and log effect of vocab update
        # means = np.log(expMeans, out=expMeans)
        debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the Variances
        varcs = np.reciprocal(n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the Means
        vMat   = (s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = vMat + isigT.dot(topicMean)
        # for d in range(D):
        #     means[d,:] = la.inv(isigT + ssp.diags(n[d] * lxi[d,:], 0)).dot(rhsMat[d,:])
        means  = varcs * rhsMat
        means -= (means[:,0])[:,np.newaxis]
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the approximation parameters
        lxi = 2 * negJakkolaOfDerivedXi(means, varcs, s)
        debugFn (itr, lxi, "lxi", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # s can sometimes grow unboundedly
        # If so, follow Bouchard's suggested approach of fixing it at zero
        #
        # s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        debugFn (itr, s, "s", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, lxi, s, n)

            boundValues[bvIdx]  = var_bound(dataset, modelState, queryState)
            likelyValues[bvIdx] = log_likelihood(dataset, modelState, queryState)
            boundIters[bvIdx]   = itr
            perp = perplexity_from_like(likelyValues[bvIdx], n.sum())
            print (time.strftime('%X') + " : Iteration %5d: Perplexity %4.2f Bound %10.2f " % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
            # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] >= 30:
                lastPerp = perplexity_from_like(likelyValues[bvIdx - 1], n.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likelyValues, bvIdx)
                    return modelState, queryState, (boundIters, boundValues, likelyValues)
            bvIdx += 1

    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, lxi, s, n), \
        (boundIters, boundValues, likelyValues)
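# All the CTM variants above exponentiate means only after subtracting the
# row-wise maximum. This is the usual overflow guard for softmax: the result
# is invariant to a constant shift per row, so subtracting the max keeps every
# exponent <= 0 without changing the normalized distribution. A minimal
# self-contained sketch of the identity:
import numpy as np

def _stable_softmax_sketch(means):
    shifted  = means - means.max(axis=1)[:, np.newaxis] # largest exponent is 0
    expMeans = np.exp(shifted)
    return expMeans / expMeans.sum(axis=1)[:, np.newaxis]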