def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint, and additionally learns the weights
    needed to predict new links.

    Params:
    data  - the dataset, providing the DxT document-term matrix (data.words)
            and the DxD document-document matrix (data.links)
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations, log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, weights, negCount, reg, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.weights, \
        model.pseudoNegCount, model.regularizer, model.dtype

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior)

    W = data.words
    D, T = W.shape
    X = data.links

    iters, bnds, likes = [], [], []

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate from scratch. I.e. for the memberships z, which is DxNxT in dimension,
    # we only store a 1xNxT = NxT part.
    z = np.empty((K,), dtype=dtype, order='F')
    diWordDistSums = np.empty((K,), dtype=dtype)
    diWordDists = np.empty(wordDists.shape, dtype=dtype)

    for itr in range(iterations):
        if debug:
            printAndFlushNoNewLine("\n %4d: " % itr)

        # Expected log word probabilities under the current variational word distributions
        diWordDistSums[:] = wordDists.sum(axis=1)
        fns.digamma(diWordDistSums, out=diWordDistSums)
        fns.digamma(wordDists, out=diWordDists)

        if updateVocab:
            # Perform inference, updating the vocab
            wordDists[:, :] = vocabPrior
            for d in range(D):
                if debug and d % 100 == 0:
                    printAndFlushNoNewLine(".")
                wordIdx, z = _update_topics_at_d(d, data, weights, docLens, topicMeans,
                                                 topicPrior, diWordDists, diWordDistSums)
                wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

            _infer_weights(data, weights, topicMeans, topicPrior, negCount, reg)

            # Log the bound and determine whether we can stop early
            if itr % logFrequency == 0:
                iters.append(itr)
                bnds.append(_var_bound_internal(data, model, query))
                likes.append(_log_likelihood_internal(data, model, query))

                if debug:
                    print("%.3f < %.3f" % (bnds[-1], likes[-1]))
                if converged(iters, bnds, len(bnds) - 1, minIters=5):
                    break

            # Update hyperparameters (do this after the bound, to make sure the bound
            # calculation is internally consistent)
            if itr > 0 and itr % HyperParamUpdateInterval == 0:
                if debug:
                    print("Topic Prior was " + str(topicPrior))
                _updateTopicHyperParamsFromMeans(model, query)
                if debug:
                    print("Topic Prior is now " + str(topicPrior))
        else:
            # Infer topics only, keeping the vocabulary fixed (arguments match the
            # _update_topics_at_d call above)
            for d in range(D):
                _ = _update_topics_at_d(d, data, weights, docLens, topicMeans,
                                        topicPrior, diWordDists, diWordDistSums)

    topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return ModelState(K, topicPrior, vocabPrior, wordDists, weights, negCount, reg, dtype, model.name), \
           QueryState(docLens, topicMeans), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
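
# A minimal, self-contained sketch of the kind of per-document responsibility update
# that the digamma tables above (diWordDists, diWordDistSums) support. The real
# _update_topics_at_d is defined elsewhere in this module and also incorporates the
# link weights; the shapes, the prior handling and the helper name below are
# assumptions for illustration only.
def _sketch_update_topics_at_d(d, W, docLens, topicMeans, topicPrior,
                               diWordDists, diWordDistSums):
    wordIdx = W[d, :].indices                    # distinct terms occurring in document d
    counts = W[d, :].data                        # and their counts

    # E[log phi_kt] = digamma(lambda_kt) - digamma(sum_t lambda_kt)
    logPhi = diWordDists[:, wordIdx] - diWordDistSums[:, np.newaxis]

    # Unnormalised log responsibilities: document-topic term plus expected word log-likelihood
    logZ = np.log(topicMeans[d, :] + topicPrior)[:, np.newaxis] + logPhi
    logZ -= logZ.max(axis=0)                     # subtract the max for numerical stability
    z = np.exp(logZ)
    z /= z.sum(axis=0)                           # normalise over topics, one column per word

    # Updated per-document topic means: expected topic counts over the document length
    topicMeans[d, :] = (z * counts[np.newaxis, :]).sum(axis=1) / docLens[d]
    return wordIdx, z                            # z is K x (number of distinct words in d)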
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data       - the dataset of words, features and links, of which only words
                 are used in this model
    modelState - the actual LDA model. In a training run (query = False) this
                 will be mutated in place, and then returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated in-place
                 and then returned.
    trainPlan  - how to execute the training process (e.g. iterations, log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug, batchSize, rate_retardation, forgetting_rate = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug, \
        trainPlan.batchSize, trainPlan.rate_retardation, trainPlan.forgetting_rate
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.wordDists, modelState.dtype

    W = data.words
    D, T = W.shape

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")

    # Book-keeping for logs
    logPoints = 1 if logFrequency == 0 else iterations // logFrequency
    boundIters = np.zeros(shape=(logPoints,))
    boundValues = np.zeros(shape=(logPoints,))
    likelyValues = np.zeros(shape=(logPoints,))
    bvIdx = 0

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate from scratch. I.e. for the memberships z, which is DxNxT in dimension,
    # we only store a 1xNxT = NxT part.
    z_dnk = np.empty((docLens.max(), K), dtype=dtype, order='F')

    # Select the training iterations function appropriate for the dtype
    current_micro_time = lambda: int(time.time())  # NB: despite the name, this measures whole seconds
    do_iterations = compiled.iterate_f32 \
                    if modelState.dtype == np.float32 \
                    else compiled.iterate_f64
    # do_iterations = iterate  # pure Python

    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    totalItrs = 0
    for segment in range(logPoints - 1):
        start = current_micro_time()
        totalItrs += do_iterations(segIters, batchSize, segment * segIters,
                                   rate_retardation, forgetting_rate,
                                   D, K, T,
                                   W_list, docLens,
                                   topicPrior, vocabPrior,
                                   z_dnk, topicDists, wordDists)
        duration = current_micro_time() - start

        boundIters[bvIdx] = segment * segIters
        boundValues[bvIdx] = var_bound(data, modelState, queryState)
        likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)
        perp = perplexity_from_like(likelyValues[bvIdx], W.sum())
        bvIdx += 1

        if converged(boundIters, boundValues, bvIdx, epsilon, minIters=20):
            boundIters, boundValues, likelyValues = clamp(boundIters, boundValues, likelyValues, bvIdx)
            return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
                   QueryState(W_list, docLens, topicDists), \
                   (boundIters, boundValues, likelyValues)

        print("Segment %d/%d Total Iterations %d Duration %d Perplexity %4.0f Bound %10.2f Likelihood %10.2f"
              % (segment, logPoints, totalItrs, duration, perp, boundValues[bvIdx - 1], likelyValues[bvIdx - 1]))

    # Final batch of iterations.
    # Run the remaining iterations with the same compiled kernel and argument order as
    # the segment loop above.
    do_iterations(remainder, batchSize, (logPoints - 1) * segIters,
                  rate_retardation, forgetting_rate,
                  D, K, T,
                  W_list, docLens,
                  topicPrior, vocabPrior,
                  z_dnk, topicDists, wordDists)

    boundIters[bvIdx] = iterations - 1
    boundValues[bvIdx] = var_bound(data, modelState, queryState)
    likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)

    return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
           QueryState(W_list, docLens, topicDists), \
           (boundIters, boundValues, likelyValues)
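
# The compiled iterate_f32 / iterate_f64 kernels above take a batch size, an iteration
# offset, a rate_retardation and a forgetting_rate, which is the parameterisation of the
# usual online (stochastic) variational Bayes schedule. The sketch below illustrates that
# standard schedule only; it is not a transcription of the compiled code, and the names
# _sketch_online_vocab_step and batchStats are assumptions.
def _sketch_online_vocab_step(wordDists, batchStats, itr, D, batchSize,
                              rate_retardation, forgetting_rate, vocabPrior):
    # Step size rho_t = (tau_0 + t)^(-kappa), decaying as iterations proceed
    rho = (rate_retardation + itr) ** (-forgetting_rate)

    # Scale the mini-batch sufficient statistics up to corpus size, then blend them
    # into the current variational word distributions
    scaled = vocabPrior + (float(D) / batchSize) * batchStats
    wordDists *= (1.0 - rho)
    wordDists += rho * scaled
    return wordDists


# Perplexity as reported in the training loops: exp of the negative per-token
# log-likelihood, matching the perplexity_from_like(likelyValues[bvIdx], W.sum())
# call above.
def _sketch_perplexity_from_like(log_likelihood, token_count):
    return np.exp(-log_likelihood / token_count)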
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint, and additionally learns the weights
    needed to predict new links.

    Params:
    data  - the dataset, providing the DxT document-term matrix (data.words),
            the DxD document-document matrix (data.links) and the document
            features (data.feats)
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations, log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug
    docLens, topics, postTopicCov, U, V, tsums_bydoc, tsums_bytop, exp_tsums_bydoc, exp_tsums_bytop, lse_at_k, out_counts, in_counts = \
        query.docLens, query.topics, query.postTopicCov, query.U, query.V, query.tsums_bydoc, query.tsums_bytop, \
        query.exp_tsums_bydoc, query.exp_tsums_bytop, query.lse_at_k, query.out_counts, query.in_counts
    K, Q, topicPrior, vocabPrior, wordDists, topicCov, dtype, name = \
        model.K, model.Q, model.topicPrior, model.vocabPrior, model.wordDists, model.topicCov, model.dtype, model.name

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    W, L, X_fixme = data.words, data.links, data.feats
    D, T = W.shape

    iters, bnds, likes = [], [], []

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate from scratch. I.e. for the memberships z, which is DxNxT in dimension,
    # we only store a 1xNxT = NxT part.
    diWordDistSums = np.empty((K,), dtype=dtype)
    diWordDists = np.empty(wordDists.shape, dtype=dtype)

    new_in_counts = in_counts.copy()

    newCov = np.ndarray(shape=(K, K), dtype=model.dtype)
    newCov.fill(0)
    invCov = la.inv(topicCov)
    S = topicCov.copy()

    rhs = np.ndarray(shape=(K,), dtype=model.dtype)
    b, f = rhs.copy(), rhs.copy()  # linear coefficients of the Bohning quadratic bounds

    new_maxes_bytop = np.ndarray(shape=(K,), dtype=model.dtype)
    maxes_bytop = topics.max(axis=0)

    for itr in range(iterations):
        if itr % logFrequency == 0:
            iters.append(itr)
            bnds.append(_var_bound_internal(data, model, query))
            likes.append(_log_likelihood_internal(data, model, query))

            if debug:
                print("Bound : %f \t Likelihood %f \t Perplexity %.2f"
                      % (bnds[-1], likes[-1], np.exp(-likes[-1] / docLens.sum())))
            if converged(iters, bnds, len(bnds) - 1, minIters=5):
                break

        if debug:
            printAndFlushNoNewLine("\n %4d: " % itr)

        newCov[:, :] = 0
        new_in_counts[:] = 0

        # U and V FIXME DEBUG
        # U[:, :] = la.lstsq(V.T, topics.T)[0].T
        # V[:, :] = la.lstsq(U, topics)[0]

        # Expected log word probabilities under the current variational word distributions
        diWordDistSums[:] = wordDists.sum(axis=1)
        fns.digamma(diWordDistSums, out=diWordDistSums)
        fns.digamma(wordDists, out=diWordDists)
        wordDists[:, :] = vocabPrior

        new_maxes_bytop.fill(1E-300)

        for d in range(D):
            if d % 100 == 0:
                printAndFlushNoNewLine(".")
            wordIdx, z, linkIdx, y = _update_topics_at_d(d, data, docLens, topics, topicPrior,
                                                         lse_at_k, diWordDists, diWordDistSums)

            # Update the word distributions
            wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

            # Determine the topic distribution
            # Step 1, the covariance
            S[:, :] = invCov
            S[np.diag_indices_from(S)] += docLens[d] + out_counts[d]
            S[:, :] -= 1. / (K + 1)
            S[np.diag_indices_from(S)] += (K - 1.) / K * in_counts
            S = la.inv(S)

            # Topics Step 2, the actual right-hand side
            rhs[:] = invCov.dot(U[d, :].dot(V))
            rhs += (z * W[d, :].data[np.newaxis, :]).sum(axis=1)

            ysum = (y * L[d, :].data[:, np.newaxis]).sum(axis=0)
            # rhs += ysum

            b[:] = topics[d, :] - 1. / (K + 1) * topics[d, :].sum() - softmax(topics[d, :])
            b *= docLens[d]
            rhs += b

            f[:] = topics[d, :] - 1. / (D + 1) * tsums_bytop - np.exp(topics[d, :] - maxes_bytop) / exp_tsums_bytop
            f *= in_counts
            rhs += f

            rhs[:] += (D - 1) / (2 * D + 2) * (in_counts * (tsums_bytop - topics[d, :]))

            # Topics Step 3: solve
            new_topics = S.dot(rhs)

            # Topics Step 4: update the running counts and covariance, then
            # assign the new topics to "topics"
            tsums_bytop -= topics[d, :]
            tsums_bytop += new_topics

            new_maxes_bytop = np.maximum(new_maxes_bytop, new_topics)
            new_in_counts += ysum

            vec = new_topics - U[d, :].dot(V)
            newCov += np.outer(vec, vec)
            newCov += np.diag(S)

            topics[d, :] = new_topics

            # Next step is the posterior covariance
            postTopicCov[d, :] = np.diag(S)

        # The covariance hyper-parameter
        topicCov[:, :] = newCov
        invCov[:, :] = la.inv(topicCov)

        # The remaining running counts, and the column-wise softmax adjustment
        maxes_bytop[:] = new_maxes_bytop
        in_counts[:] = new_in_counts
        exp_tsums_bytop[:] = np.sum(np.exp(topics - maxes_bytop[np.newaxis, :]), axis=0)

    return ModelState(K, Q, topicPrior, vocabPrior, wordDists, topicCov, dtype, model.name), \
           QueryState(docLens, topics, postTopicCov, U, V, tsums_bydoc, tsums_bytop, exp_tsums_bydoc,
                      exp_tsums_bytop, lse_at_k, out_counts, in_counts), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
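
# The running maxes_bytop / exp_tsums_bytop pair above implements the standard
# max-subtraction trick for a numerically stable column-wise softmax normaliser over
# the DxK topics matrix. A minimal standalone sketch, assuming a dense numpy array:
def _sketch_columnwise_softmax(topics):
    maxes = topics.max(axis=0)                          # per-topic (column) maximum
    exps = np.exp(topics - maxes[np.newaxis, :])        # shift before exponentiating
    return exps / exps.sum(axis=0)[np.newaxis, :]       # normalise each column to sum to one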