def var_bound(data, modelState, queryState):
    '''
    Determines the variational bound for the given data, model and query
    state.

    Note that queryState.expMeans is used as a scratch buffer: it is
    overwritten in place with exp(means - rowmax(means)) and this function
    does not restore it afterwards.

    Params:
    data       - the dataset; only data.words is read here
    modelState - provides K, topicMean, sigT, vocab and A
    queryState - provides means, expMeans, varcs and docLens

    Return:
    The accumulated value of the bound
    '''

    # Unpack the structs, for ease of access and efficiency
    W = data.words
    D, _ = W.shape
    means, expMeans, varcs, docLens = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, A = \
        modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.A

    # Calculate some implicit variables. The log-determinant is needed by
    # several terms below, so it is evaluated once.
    isigT = la.inv(sigT)
    lnDetSigT = safe_log_det(sigT)

    bound = 0

    if USE_NIW_PRIOR:
        pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
        pseudoObsVar = K + NIW_PSEUDO_OBS_VAR

        # distribution over topic covariance
        bound -= 0.5 * K * pseudoObsVar * log(NIW_PSI)
        bound -= 0.5 * K * pseudoObsVar * log(2)
        bound -= fns.multigammaln(pseudoObsVar / 2., K)
        bound -= 0.5 * (pseudoObsVar + K - 1) * lnDetSigT
        bound += 0.5 * NIW_PSI * np.trace(isigT)

        # and its entropy
        # is a constant which we skip

        # distribution over means
        bound -= 0.5 * K * log(1. / pseudoObsMeans) * lnDetSigT
        bound -= 0.5 / pseudoObsMeans * (topicMean).T.dot(isigT).dot(topicMean)

        # and its entropy
        bound += 0.5 * lnDetSigT  # + a constant

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    # The Gaussian normaliser needs ln|sigT|, not the raw determinant
    bound -= D / 2. * lnDetSigT
    diff = means - topicMean[np.newaxis, :]
    bound -= 0.5 * np.sum(diff.dot(isigT) * diff)
    # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.
    bound -= 0.5 * np.sum(varcs * np.diag(isigT)[np.newaxis, :])

    # NOTE(review): the matching entropy term,
    #   0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs)),
    # is not added to the bound here - confirm that this is intended.

    # Distribution over word-topic assignments and words and the formers
    # entropy. This is somewhat jumbled to avoid repeatedly taking the
    # exp and log of the means
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    # D x V [W / TB] is the quotient of the original over the reconstructed
    # doc-term matrix
    R = sparseScalarQuotientOfDot(W, expMeans, vocab)
    V = expMeans * (R.dot(vocab.T))  # D x K

    bound += np.sum(docLens * np.log(np.sum(expMeans, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeans, vocab).data)

    # The +sum(means * V) here and the -sum(means * V) at the end cancel
    # algebraically; both are kept so the terms stay individually visible.
    bound += np.sum(means * V)
    bound += np.sum(2 * ssp.diags(docLens, 0) * means.dot(A) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:, np.newaxis] * V * (np.diag(A))[np.newaxis, :])
    bound -= np.sum(means * V)

    return bound
def var_bound(data, modelState, queryState, XTX=None):
    '''
    Determines the variational bound for the given data, model and query
    state.

    Note that queryState.expMeans is used as a scratch buffer: it is
    overwritten in place with exp(means - rowmax(means)) and this function
    does not restore it afterwards.

    Params:
    data       - the dataset; data.words and data.feats are read
    modelState - provides F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab
                 and Ab
    queryState - provides means, expMeans, varcs and docLens
    XTX        - optional precomputed X.T.dot(X) for X = data.feats; it is
                 computed here when None

    Return:
    The accumulated value of the bound
    '''

    # Unpack the structs, for ease of access and efficiency
    W, X = data.words, data.feats
    D, _ = W.shape
    means, expMeans, varcs, docLens = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, Ab = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, \
        modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, \
        modelState.sigT, modelState.vocab, modelState.Ab

    # Calculate some implicit variables
    isigT = la.inv(sigT)
    lnDetSigT = lnDetOfDiagMat(sigT)
    verifyProper(lnDetSigT, "lnDetSigT")

    if XTX is None:
        XTX = X.T.dot(X)

    bound = 0

    # Distribution over latent space
    # NOTE(review): the lnDetSigT and log(lfv) terms carry no factor of 1/2,
    # unlike the document-topic term further down - confirm.
    bound -= (P * K) / 2. * LN_OF_2_PI
    bound -= P * lnDetSigT
    bound -= K * P * log(lfv)
    bound -= 0.5 * np.sum(1. / lfv * isigT.dot(Y) * Y)
    bound -= 0.5 * K * np.trace(R_Y)

    # And its entropy
    detR_Y = safeDet(R_Y, "R_Y")
    bound += 0.5 * LN_OF_2_PI_E + P / 2. * lnDetSigT + K / 2. * log(detR_Y)

    # Distribution over mapping from features to topics. This mirrors the
    # block above with F in place of P and fv in place of lfv.
    # NOTE(review): as above, no factor of 1/2 on the lnDetSigT and log(fv)
    # terms - confirm.
    diff = (A - Y.dot(V))
    bound -= (F * K) / 2. * LN_OF_2_PI
    bound -= F * lnDetSigT
    bound -= K * F * log(fv)
    bound -= 0.5 * np.sum(1. / fv * isigT.dot(diff) * diff)
    bound -= 0.5 * K * np.trace(R_A)

    # And its entropy
    detR_A = safeDet(R_A, "R_A")
    bound += 0.5 * LN_OF_2_PI_E + F / 2. * lnDetSigT + K / 2. * log(detR_A)

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    bound -= D / 2. * lnDetSigT
    diff = means - X.dot(A.T)
    bound -= 0.5 * np.sum(diff.dot(isigT) * diff)
    # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.
    bound -= 0.5 * np.sum(varcs * np.diag(isigT)[np.newaxis, :])
    bound -= 0.5 * K * np.trace(XTX.dot(R_A))

    # And its entropy
    bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs))

    # Distribution over word-topic assignments, and their entropy
    # and distribution over words. This is re-arranged as we need
    # means for some parts, and exp(means) for other parts
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    # D x V [W / TB] is the quotient of the original over the reconstructed
    # doc-term matrix
    R = sparseScalarQuotientOfDot(W, expMeans, vocab)
    S = expMeans * (R.dot(vocab.T))  # D x K

    bound += np.sum(docLens * np.log(np.sum(expMeans, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeans, vocab).data)

    # The +sum(means * S) here and the -sum(means * S) at the end cancel
    # algebraically; both are kept so the terms stay individually visible.
    bound += np.sum(means * S)
    bound += np.sum(2 * ssp.diags(docLens, 0) * means.dot(Ab) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:, np.newaxis] * S * (np.diag(Ab))[np.newaxis, :])
    bound -= np.sum(means * S)

    return bound
def var_bound(data, modelState, queryState):
    '''
    Evaluates the variational bound, accumulating one group of terms at a
    time: first those for Y, then for A, then for the per-document topic
    means, and finally the word / word-topic-assignment terms.

    The two per-document stages walk over the documents in batches of at
    most BatchSize rows, writing into locally allocated scratch buffers, so
    this function itself does not write into the queryState arrays.

    Params:
    data       - the dataset; data.words and data.feats are read
    modelState - provides K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab,
                 vocabPrior and dtype
    queryState - provides means and docLens

    Return:
    The accumulated value of the bound
    '''

    # Pull everything we need out of the three structs
    W, X = data.words, data.feats
    D, T, F = W.shape[0], W.shape[1], X.shape[1]
    means = queryState.means
    docLens = queryState.docLens
    K, A, U, Y, V = \
        modelState.K, modelState.A, modelState.U, modelState.Y, modelState.V
    covA, tv, ltv, fv, lfv = \
        modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv
    vocab, vocabPrior, dtype = \
        modelState.vocab, modelState.vocabPrior, modelState.dtype

    halfCentering = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)
    ln2Pi = log(2 * pi)

    bound = 0

    # U and V are parameters with no distribution

    #
    # Y has a normal distribution; its covariance is unfortunately an
    # expensive computation
    #
    P, Q = U.shape[1], V.shape[1]
    latentVar = lfv * ltv
    yCov = np.eye(P * Q) * latentVar
    yCov += np.kron(V.T.dot(V), U.T.dot(U))
    yCov = la.inv(yCov, overwrite_a=True)

    # The expected likelihood of Y
    bound -= 0.5 * P * Q * ln2Pi
    bound -= 0.5 * P * Q * log(latentVar)
    # summing Y * Y is much cheaper than np.trace(Y.dot(Y.T))
    bound -= 0.5 / latentVar * np.sum(Y * Y)
    # the traces of the posterior+prior covariance products cancel out
    # across likelihoods
    bound -= 0.5 * np.trace(yCov) * latentVar

    # The entropy of Y
    bound += 0.5 * P * Q * (ln2Pi + 1) + 0.5 * safe_log_det(yCov)

    #
    # A has a normal distribution
    #
    F, K = A.shape[0], A.shape[1]
    resid = A - U.dot(Y).dot(V.T)

    # The expected likelihood of A
    bound -= 0.5 * K * F * ln2Pi
    bound -= 0.5 * K * F * log(tv * fv)
    bound -= 0.5 / (fv * tv) * np.sum(resid * resid)

    # The entropy of A
    bound += 0.5 * F * K * (ln2Pi + 1) + 0.5 * K * safe_log_det(covA)

    #
    # Theta, the matrix of means, has a normal distribution. Its
    # row-covariance is diagonal (i.e. it's several independent multi-var
    # normal distros). The posterior is made up of D K-dimensional normals
    # with diagonal covariances
    #
    # We iterate through the documents in batches, to control memory use
    batchSize = min(BatchSize, D)
    batchCount = ceil(D / batchSize)
    featBuf = np.ndarray(shape=(batchSize, F), dtype=dtype)
    topicBuf = np.ndarray(shape=(batchSize, K), dtype=dtype)
    sqErrSum = 0
    for batchIdx in range(batchCount):
        start = batchIdx * batchSize
        end = min(start + batchSize, D)
        rows = end - start  # only the final batch can be short

        featBuf[:rows, :] = X[start:end, :].toarray()
        np.dot(featBuf[:rows, :], A, out=topicBuf[:rows, :])
        topicBuf[:rows, :] -= means[start:end, :]
        topicBuf[:rows, :] *= topicBuf[:rows, :]
        sqErrSum += np.sum(topicBuf[:rows, :])
    featBuf = None  # drop the reference to the dense feature buffer

    # The expected likelihood of the topic-assignments
    bound -= 0.5 * D * K * ln2Pi
    bound -= 0.5 * D * K * log(tv)
    bound -= 0.5 / tv * sqErrSum
    # this trace doesn't cancel as we don't have a posterior on tv
    bound -= 0.5 * tv * np.sum(covA)

    # The entropy of the topic-assignments
    bound += 0.5 * D * K * (ln2Pi + 1) + 0.5 * np.sum(covA)

    # Distribution over word-topic assignments and words and the formers
    # entropy. This is somewhat jumbled to avoid repeatedly taking the
    # exp and log of the means
    # Again we batch this for safety
    batchSize = min(BatchSize, D)
    batchCount = ceil(D / batchSize)
    assignBuf = np.ndarray(shape=(batchSize, K), dtype=dtype)
    for batchIdx in range(batchCount):
        start = batchIdx * batchSize
        end = min(start + batchSize, D)
        rows = end - start

        meansBatch = means[start:end, :]
        docLensBatch = docLens[start:end]

        # topicBuf is reused to hold the shifted exponentials of the means
        expMeansBatch = topicBuf[:rows, :]
        np.exp(meansBatch - meansBatch.max(axis=1)[:, np.newaxis], out=expMeansBatch)

        # BatchSize x V: [W / TB] is the quotient of the original over the
        # reconstructed doc-term matrix
        R = sparseScalarQuotientOfDot(W, expMeansBatch, vocab, start=start, end=end)
        assignBuf[:rows, :] = expMeansBatch * (R[:rows, :].dot(vocab.T))  # BatchSize x K
        VBatch = assignBuf[:rows, :]

        bound += np.sum(docLensBatch * np.log(np.sum(expMeansBatch, axis=1)))
        bound += np.sum(
            sparseScalarProductOfSafeLnDot(W, expMeansBatch, vocab, start=start, end=end).data)

        bound += np.sum(meansBatch * VBatch)
        bound += np.sum(2 * ssp.diags(docLensBatch, 0) * meansBatch.dot(halfCentering) * meansBatch)
        bound -= 2. * scaledSelfSoftDot(meansBatch, docLensBatch)
        bound -= 0.5 * np.sum(
            docLensBatch[:, np.newaxis] * VBatch * (np.diag(halfCentering))[np.newaxis, :])
        bound -= np.sum(meansBatch * VBatch)

    return bound
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bound for the given data, model and query
    state.

    The exponentiated means are computed into fresh arrays, so this
    function itself does not write into the queryState arrays.

    The link quantities derived from data.links are computed, but nothing
    derived from them is currently added to the returned bound.

    Params:
    data       - the dataset; data.words and data.links are read
    modelState - provides K, topicMean, topicCov, vocab and A
    queryState - provides means, varcs and docLens

    Return:
    The accumulated value of the bound
    '''

    # Unpack the structs, for ease of access and efficiency
    W, L = data.words, data.links
    D, _ = W.shape
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A = \
        modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A

    # Calculate some implicit variables. The log-determinant is needed by
    # several terms below, so it is evaluated once.
    itopicCov = la.inv(topicCov)
    lnDetTopicCov = safe_log_det(topicCov)

    bound = 0

    # Exponentiated means, shifted by the row maximum (out) and by the
    # column maximum (in) before exponentiating
    expMeansOut = np.exp(means - means.max(axis=1)[:, np.newaxis])
    expMeansIn = np.exp(means - means.max(axis=0)[np.newaxis, :])
    lse_at_k = expMeansIn.sum(axis=0)

    if USE_NIW_PRIOR:
        pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
        pseudoObsVar = K + NIW_PSEUDO_OBS_VAR

        # distribution over topic covariance
        bound -= 0.5 * K * pseudoObsVar * log(NIW_PSI)
        bound -= 0.5 * K * pseudoObsVar * log(2)
        bound -= fns.multigammaln(pseudoObsVar / 2., K)
        bound -= 0.5 * (pseudoObsVar + K - 1) * lnDetTopicCov
        bound += 0.5 * NIW_PSI * np.trace(itopicCov)

        # and its entropy
        # is a constant which we skip

        # distribution over means
        bound -= 0.5 * K * log(1. / pseudoObsMeans) * lnDetTopicCov
        bound -= 0.5 / pseudoObsMeans * (topicMean).T.dot(itopicCov).dot(topicMean)

        # and its entropy
        bound += 0.5 * lnDetTopicCov  # + a constant

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    # The Gaussian normaliser needs ln|topicCov|, not the raw determinant
    bound -= D / 2. * lnDetTopicCov
    diff = means - topicMean[np.newaxis, :]
    bound -= 0.5 * np.sum(diff.dot(itopicCov) * diff)
    # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.
    bound -= 0.5 * np.sum(varcs * np.diag(itopicCov)[np.newaxis, :])

    # NOTE(review): the matching entropy term,
    #   0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs)),
    # is not added to the bound here - confirm that this is intended.

    # Distribution over word-topic assignments and words and the formers
    # entropy, and similarly for out-links. This is somewhat jumbled to
    # avoid repeatedly taking the exp and log of the means
    # D x V [W / TB] is the quotient of the original over the reconstructed
    # doc-term matrix
    W_weights = sparseScalarQuotientOfDot(W, expMeansOut, vocab)
    w_top_sums = expMeansOut * (W_weights.dot(vocab.T))  # D x K

    # NOTE(review): l_top_sums is computed but never contributes to the
    # bound below - the link terms appear to be unfinished; confirm.
    L_weights = sparseScalarQuotientOfNormedDot(L, expMeansOut, expMeansIn, lse_at_k)
    l_top_sums = L_weights.dot(expMeansIn) / lse_at_k[np.newaxis, :] * expMeansOut

    bound += np.sum(docLens * np.log(np.sum(expMeansOut, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeansOut, vocab).data)

    # The +sum(means * w_top_sums) here and the matching subtraction at the
    # end cancel algebraically; both are kept so the terms stay visible.
    bound += np.sum(means * w_top_sums)
    bound += np.sum(2 * ssp.diags(docLens, 0) * means.dot(A) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:, np.newaxis] * w_top_sums * (np.diag(A))[np.newaxis, :])
    bound -= np.sum(means * w_top_sums)

    return bound