def var_bound(data, model, query, topicDistOverride=None):
    '''
    Determines the variational bounds.

    The value returned is

        sum((W . ln(wordDists)' + corpusTopicDist) * ln(topicMeans))
            + sum(-topicMeans * ln(topicMeans))

    where W is data.words, wordDists and corpusTopicDist are read from the
    model, topicMeans is query.topicDists and ln is safe_log.

    Params
    data  - object exposing the document/term matrix as "words"
    model - object exposing "wordDists" and "corpusTopicDist"
    query - object exposing "topicDists"
    topicDistOverride - accepted so existing callers keep working; it does
            not influence the returned value.

    Returns
    The bound as a scalar.

    NOTE(review): earlier revisions also built "wordLikely" and
    "topicLikely" terms that were never added to the result. The first of
    them called wordDists(model) after the name wordDists had been rebound
    to model.wordDists inside this function, so the call could not succeed.
    Those dead computations (and the topic override that only fed them)
    have been dropped; confirm whether they were meant to be part of the
    bound.
    '''
    topicMeans = query.topicDists
    wordDists = model.wordDists
    corpusTopicDist = model.corpusTopicDist
    W = data.words

    lnTopicMeans = safe_log(topicMeans)

    # Expected joint
    like = W.dot(safe_log(wordDists).T)  # D*K
    # NOTE(review): corpusTopicDist is added without a log here - confirm
    like += corpusTopicDist[np.newaxis, :]
    like *= lnTopicMeans

    # Entropy
    ent = (-topicMeans * lnTopicMeans).sum()

    return like.sum() + ent
def var_bound(data, model, query):
    '''
    A total nonsense in this case which we retain just so all the other
    functions continue to work.

    Computes sum((W . ln(wordDists)') * ln(topicMeans)) plus the entropy
    term sum(-topicMeans * ln(topicMeans)), where W is data.words,
    wordDists is model.wordDists, topicMeans is query.topicDists and ln is
    safe_log.

    Params
    data  - object exposing the document/term matrix as "words"
    model - object exposing "wordDists"
    query - object exposing "topicDists"

    Returns
    The bound as a scalar.
    '''
    topicMeans = query.topicDists
    wordDists = model.wordDists
    W = data.words

    # Evaluated once and shared by the joint and the entropy terms
    lnTopicMeans = safe_log(topicMeans)

    # ln p(x,z) >= sum_k p(z=k|x) * (ln p(x|z=k, phi) + p(z=k)) + H[q]

    # Expected joint
    like = W.dot(safe_log(wordDists).T)  # D*K
    like *= lnTopicMeans

    # Entropy
    ent = (-topicMeans * lnTopicMeans).sum()

    return like.sum() + ent
def _sparseScalarProductOfSafeLnDot_py(A, B, C, out=None):
    '''
    Slow fallback that scales the stored entries of the sparse matrix A by
    safe_log(B.dot(C)), keeping the sparsity pattern of A in the result.

    The dense product B.dot(C) is floored at sys.float_info.min before the
    log is taken.

    Params
    A   - sparse matrix exposing data, indptr and indices
          (presumably CSR - confirm against callers)
    B,C - dense factors whose product is aligned with A
    out - optional sparse matrix that receives the result; when omitted a
          copy of A is created and returned

    Returns
    The matrix holding the result (out if given, else the new copy). Its
    type is the type of A, so integral types give integer multiplication.

    Raises
    ValueError if A, B, C (and out, when given) do not share one dtype.
    '''
    if WarnIfSlow:
        sys.stderr.write("WARNING: Slow code path triggered (_sparseScalarProductOfSafeLnDot_py)")

    inputs_disagree = A.dtype != B.dtype or B.dtype != C.dtype
    if inputs_disagree or (out is not None and C.dtype != out.dtype):
        raise ValueError("Inconsistent dtypes in the three matrices and possibly the out-param")

    if out is not None:
        out.data[:] = A.data
    else:
        out = A.copy()

    floor = sys.float_info.min
    dense = B.dot(C)
    dense[dense < floor] = floor

    # Pick the dense log-values at the positions A actually stores
    stored = csr_indices(out.indptr, out.indices)
    out.data *= safe_log(dense)[stored]

    return out
def log_likelihood_point(data: DataSet, model: ModelState, query: QueryState = None) -> float:
    """
    Return the corpus-total log likelihood under a point estimate.

    For every document and topic k the score
    ln p(x|topic=k, word_dist) + ln p(topic=k) is formed, the topics are
    marginalised with a max-shifted log-sum-exp, and the per-document
    values are summed.

    The topic term comes from topicDists(query) when a query is supplied,
    otherwise from corpusTopicDist(model) broadcast over documents.
    """
    word_dist = wordDists(model)

    # One row per document, one column per topic
    scores = data.words @ np.log(word_dist.T)

    if query is None:
        log_topic_term = safe_log(corpusTopicDist(model))[np.newaxis, :]
    else:
        log_topic_term = safe_log(topicDists(query))
    scores += log_topic_term

    # Log-sum-exp over topics, shifted by the row maximum to avoid underflow
    row_peak = scores.max(axis=1)
    scores -= row_peak[:, np.newaxis]
    np.exp(scores, out=scores)
    per_document = row_peak + np.log(scores.sum(axis=1))

    return per_document.sum()
def iterate (iterations, D, K, T, \
        W_list, docLens, \
        topicPrior, vocabPrior, \
        z_dnk, topicDists, wordDists):
    '''
    Retired entry point: this implementation is no longer supported.

    The signature is kept so that existing call sites fail with the same
    explicit error instead of an AttributeError or NameError. The former
    body sat after the unconditional raise and could never run, so it has
    been removed.

    Raises
    ValueError - always.
    '''
    raise ValueError("This implementation no longer supported")
def sample_memberships(W, alpha, wordDists, memberships):
    '''
    Resamples, one document at a time and in place, the row of memberships
    belonging to each document.

    For document d the current row is removed from the running column
    totals (memberships summed over documents, plus alpha - 1), the totals
    are normalised into a prior, and that prior is combined in log-space
    with row d of W.dot(safe_log(wordDists).T). The normalised result is
    used for a single multinomial draw, which is written back to
    memberships[d, :] and added to the totals again.

    Params
    W           - document/term matrix, one row per document
    alpha       - added (minus one) to the column totals of memberships
    wordDists   - per-topic word distributions (topics along rows)
    memberships - 2-D array with one row per document, updated in place

    Returns
    The same memberships array that was passed in.
    '''
    _, _numTopics = memberships.shape  # unpacking requires a 2-D array

    totals = memberships.sum(axis=0) + alpha - 1
    prior = totals.copy()
    scores = W.dot(safe_log(wordDists).T)  # d x k

    numDocs = W.shape[0]
    for d in range(numDocs):
        # Leave document d out of the counts while it is being resampled
        totals -= memberships[d, :]
        np.divide(totals, totals.sum(), out=prior)

        row = scores[d, :]
        row += safe_log(prior)
        row -= row.max()
        row -= fns.logsumexp(row)
        np.exp(row, out=row)

        memberships[d, :] = rd.multinomial(1, row, size=1)
        totals += memberships[d, :]

    return memberships
def var_bound(data, modelState, queryState, z_dnk=None):
    '''
    Determines the variational bounds.

    Params
    data       - object exposing the document/term matrix as "words"
    modelState - object exposing K, topicPrior, vocabPrior and dtype, and
                 accepted by the module-level wordDists() helper
    queryState - object exposing W_list, docLens and topicDists
    z_dnk      - optional scratch matrix with at least docLens.max() rows
                 and K columns. Allocated here when omitted. For each
                 document only its first docLens[d] rows are overwritten.

    Returns
    The bound as a scalar.
    '''
    # Unpack the structs, for ease of access and efficiency
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.dtype

    W = data.words
    D, T = W.shape
    maxN = docLens.max()
    # Identity test: "== None" on an ndarray is evaluated element-wise and
    # cannot be used as a condition.
    if z_dnk is None:
        z_dnk = np.empty(shape=(maxN, K), dtype=dtype)

    wordDistsMatrix = wordDists(modelState)
    diWordDists = fns.digamma(wordDistsMatrix) \
        - fns.digamma(wordDistsMatrix.sum(axis=1))[:, np.newaxis]
    lnWordDists = np.log(wordDistsMatrix)

    bound = 0

    # Expected Probablity
    #
    # P(topics|topicPrior)
    diTopicDists = fns.digamma(topicDists) \
        - fns.digamma(topicDists.sum(axis=1))[:, np.newaxis]
    ln_b_topic = fns.gammaln(topicPrior.sum()) - fns.gammaln(topicPrior).sum()
    bound += D * ln_b_topic \
        + np.sum((topicPrior - 1) * diTopicDists)

    # and its entropy
    ent = fns.gammaln(topicDists.sum(axis=1)).sum() - fns.gammaln(topicDists).sum() \
        + np.sum((topicDists - 1) * diTopicDists)
    bound -= ent

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    # NOTE COPY AND PASTED FROM iterate_f32 / iterate_f64 (-ish)
    for d in range(D):
        docLen = docLens[d]
        wordIds = W_list[d, 0:docLen]

        # Work on a view of just the rows that belong to this document, so
        # that padding rows (stale from a longer document, or never
        # initialised) cannot leak into any term of the bound.
        z = z_dnk[0:docLen, :]

        diTopic = fns.digamma(topicDists[d, :])
        z[:] = lnWordDists[:, wordIds].T + diTopic[np.newaxis, :]

        # We've been working in log-space till now, before we go to true
        # probability space rescale so we don't underflow everywhere
        z -= z.max(axis=1)[:, np.newaxis]
        np.exp(z, out=z)

        # Now normalize so probabilities sum to one
        z /= z.sum(axis=1)[:, np.newaxis]

        # Now use to calculate E[ln p(Z|topics), E[ln p(W|Z) and H[Z] in that order
        diTopic -= fns.digamma(np.sum(topicDists[d, :]))
        bound += np.sum(z * diTopic[np.newaxis, :])
        bound += np.sum(z.T * diWordDists[:, wordIds])
        bound -= np.sum(z * safe_log(z))

    # p(vocabDists|vocabPrior)
    ln_b_vocab = fns.gammaln(T * vocabPrior) - T * fns.gammaln(vocabPrior)
    bound += K * ln_b_vocab \
        + (vocabPrior - 1) * np.sum(diWordDists)

    # and its entropy
    ent = fns.gammaln(wordDistsMatrix.sum(axis=1)).sum() - fns.gammaln(wordDistsMatrix).sum() \
        + np.sum((wordDistsMatrix - 1) * diWordDists)
    bound -= ent

    return bound
def var_bound(data, model, query, z_dnk=None):
    '''
    Determines the variational bounds.

    The bound is accumulated from: the expected log prior over topics and
    its entropy, the expected log prior over vocabularies and its entropy,
    the per-document word/topic-assignment terms, and the log-probability
    of the observed (positive) links.

    Params
    data  - object exposing "words" (document/term matrix) and "links"
    model - object exposing K, topicPrior, vocabPrior, wordDists, weights
            and dtype
    query - object exposing docLens and topicDists
    z_dnk - unused; kept so existing callers keep working

    Returns
    The bound as a scalar.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    K, topicPrior, wordPrior, wordDists, weights, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.weights, model.dtype
    docLens, topicDists = \
        query.docLens, query.topicDists

    W, X = data.words, data.links
    D, T = W.shape
    # Compare as dtypes so that np.float64, np.dtype('float64') and
    # 'float64' are all recognised (an identity test only matches the first)
    minNonZero = 1E-300 if np.dtype(dtype) == np.float64 else 1E-30

    # Perform the digamma transform for E[ln \theta] etc.
    topicDists = topicDists.copy()
    diTopicDists = fns.digamma(topicDists[:, :K])
    diSumTopicDists = fns.digamma(topicDists[:, :K].sum(axis=1))
    diWordDists = fns.digamma(wordDists)
    diSumWordDists = fns.digamma(wordDists.sum(axis=1))

    # E[ln p(topics|topicPrior)] according to q(topics)
    #
    prob_topics = D * (fns.gammaln(topicPrior[:K].sum()) - fns.gammaln(topicPrior[:K]).sum()) \
        + np.sum((topicPrior[:K] - 1)[np.newaxis, :] * (diTopicDists - diSumTopicDists[:, np.newaxis]))

    bound += prob_topics

    # and its entropy
    ent_topics = _dirichletEntropy(topicDists[:, :K])
    bound += ent_topics

    # E[ln p(vocabs|vocabPrior)]
    #
    # A scalar prior (Python or NumPy scalar) is symmetric across the T
    # terms; anything with a dimension is treated as a per-term vector.
    if np.ndim(wordPrior) == 0:
        prob_vocabs = K * (fns.gammaln(wordPrior * T) - T * fns.gammaln(wordPrior)) \
            + np.sum((wordPrior - 1) * (diWordDists - diSumWordDists[:, np.newaxis]))
    else:
        prob_vocabs = K * (fns.gammaln(wordPrior.sum()) - fns.gammaln(wordPrior).sum()) \
            + np.sum((wordPrior - 1)[np.newaxis, :] * (diWordDists - diSumWordDists[:, np.newaxis]))

    bound += prob_vocabs

    # and its entropy
    ent_vocabs = _dirichletEntropy(wordDists)
    bound += ent_vocabs

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    topicMeans = _convertDirichletParamToMeans(docLens, topicDists, topicPrior)

    prob_words = 0
    prob_z = 0
    ent_z = 0
    for d in range(D):
        wordIdx, z = _infer_topics_at_d(d, data, weights, docLens, topicMeans, topicPrior, diWordDists, diSumWordDists)
        wordCounts = W[d, :].data  # sliced once per document

        # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        exLnTopic = diTopicDists[d, :K] - diSumTopicDists[d]
        prob_z += np.dot(z * exLnTopic[:, np.newaxis], wordCounts).sum()

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(
            wordCounts[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diSumWordDists[:, np.newaxis]))

        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), wordCounts).sum()

    bound += (prob_z + ent_z + prob_words)

    # Next, the distribution over links - we just focus on the positives in this case
    for d in range(D):
        links = _links_up_to(d, X)
        if len(links) == 0:
            continue

        scores = topicMeans[links, :].dot(weights * topicMeans[d])
        probs = _probit_inplace(scores) + minNonZero
        lnProbs = np.log(probs, out=probs)

        # expected probability of all links from d to p < d such that y_dp = 1
        bound += lnProbs.sum()

    _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)
    return bound
def var_bound(data, model, query, z_dnk = None):
    '''
    Determines the variational bounds.

    The bound is accumulated from: the expected log prior over topics and
    its entropy, the expected log prior over vocabularies and its entropy,
    and the per-document word/topic-assignment terms.

    Params
    data  - object exposing "words" (document/term matrix); it is also
            handed to _infer_topics_at_d
    model - object exposing K, topicPrior, vocabPrior and wordDists
    query - object exposing docLens and topicDists
    z_dnk - unused; kept so existing callers keep working

    Returns
    The bound as a scalar.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    K, topicPrior, wordPrior, wordDists = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists
    docLens, topicDists = \
        query.docLens, query.topicDists

    W = data.words
    D, T = W.shape

    # Perform the digamma transform for E[ln \theta] etc.
    topicDists = topicDists.copy()
    diTopicDists = fns.digamma(topicDists[:, :K])
    diSumTopicDists = fns.digamma(topicDists[:, :K].sum(axis=1))
    diWordDists = fns.digamma(wordDists)
    diSumWordDists = fns.digamma(wordDists.sum(axis=1))

    # E[ln p(topics|topicPrior)] according to q(topics)
    #
    prob_topics = D * (fns.gammaln(topicPrior[:K].sum()) - fns.gammaln(topicPrior[:K]).sum()) \
        + np.sum((topicPrior[:K] - 1)[np.newaxis, :] * (diTopicDists - diSumTopicDists[:, np.newaxis]))

    bound += prob_topics

    # and its entropy
    ent_topics = _dirichletEntropy(topicDists[:, :K])
    bound += ent_topics

    # E[ln p(vocabs|vocabPrior)]
    #
    # A scalar prior (Python or NumPy scalar) is symmetric across the T
    # terms; anything with a dimension is treated as a per-term vector.
    if np.ndim(wordPrior) == 0:
        prob_vocabs = K * (fns.gammaln(wordPrior * T) - T * fns.gammaln(wordPrior)) \
            + np.sum((wordPrior - 1) * (diWordDists - diSumWordDists[:, np.newaxis]))
    else:
        prob_vocabs = K * (fns.gammaln(wordPrior.sum()) - fns.gammaln(wordPrior).sum()) \
            + np.sum((wordPrior - 1)[np.newaxis, :] * (diWordDists - diSumWordDists[:, np.newaxis]))

    bound += prob_vocabs

    # and its entropy
    ent_vocabs = _dirichletEntropy(wordDists)
    bound += ent_vocabs

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    topicMeans = _convertDirichletParamToMeans(docLens, topicDists, topicPrior)

    prob_words = 0
    prob_z = 0
    ent_z = 0
    for d in range(D):
        wordIdx, z = _infer_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diSumWordDists)
        wordCounts = W[d, :].data  # sliced once per document

        # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        exLnTopic = diTopicDists[d, :K] - diSumTopicDists[d]
        prob_z += np.dot(z * exLnTopic[:, np.newaxis], wordCounts).sum()

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(
            wordCounts[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diSumWordDists[:, np.newaxis]))

        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), wordCounts).sum()

    bound += (prob_z + ent_z + prob_words)

    _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)
    return bound
def varBound (modelState, queryState, X, W, lnVocab = None, XAT=None, XTX = None, scaledWordCounts = None, UTU = None, VTV = None):
    '''
    For a current state of the model, and the query, for given inputs, outputs
    the variational lower-bound.

    Params

    modelState - the state of the model currently
    queryState - the state of the query currently
    X          - the DxF matrix of features we're querying on, where D is the
                 number of documents
    W          - the DxT matrix of words ("terms") we're querying on
    lnVocab    - accepted but not read by this function
    XAT        - DxK dot product of XA', recalculated if not provided, where
                 X is DxF and A' is FxK
    XTX        - dot product of X-transpose and X, recalculated if not
                 provided.
    scaledWordCounts - accepted, but always recomputed from W, expLmda and
                 vocab before use (see the note in the body)
    UTU        - as above for U
    VTV        - as above for V

    Returns
    The variational lower bound

    Raises
    ValueError if the shapes of W and X disagree with each other or with the
    T and F recorded in the model state.
    '''

    # Unpack the model and query state tuples for ease of use and maybe speed improvements
    # NOTE(review): sigT is unpacked but not used anywhere below.
    K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = modelState.K, modelState.Q, modelState.F, modelState.P, modelState.T, modelState.A, modelState.varA, modelState.Y, modelState.omY, modelState.sigY, modelState.sigT, modelState.U, modelState.V, modelState.vocab, modelState.topicVar, modelState.featVar, modelState.lowTopicVar, modelState.lowFeatVar
    (expLmda, nu, lxi, s, docLen) = (queryState.expLmda, queryState.nu, queryState.lxi, queryState.s, queryState.docLen)

    # The query state stores exp(lambda); recover lambda itself
    lmda = np.log(expLmda)

    # Get the number of samples from the shape. Ensure that the shapes are consistent
    # with the model parameters.
    (D, Tcheck) = W.shape
    if Tcheck != T:
        raise ValueError ("The shape of the DxT document matrix W is invalid, T is %d but the matrix W has shape (%d, %d)" % (T, D, Tcheck))

    (Dcheck, Fcheck) = X.shape
    if Dcheck != D:
        raise ValueError ("Inconsistent sizes between the matrices X and W, X has %d rows but W has %d" % (Dcheck, D))
    if Fcheck != F:
        raise ValueError ("The shape of the DxF feature matrix X is invalid. F is %d but the matrix X has shape (%d, %d)" % (F, Dcheck, Fcheck))

    # We'll need the original xi for this and also Z, the 3D tensor of which for each document D
    # and term T gives the strength of topic K. We'll also need the log of the vocab dist
    xi = deriveXi (lmda, nu, s)

    # If not already provided, we'll also need the following products
    #
    if XAT is None:
        XAT = X.dot(A.T)
    if XTX is None:
        XTX = X.T.dot(X)
    if V is not None and VTV is None:
        VTV = V.T.dot(V)
    if U is not None and UTU is None:
        UTU = U.T.dot(U)

    # also need one over the usual variances
    overSsq, overAsq, overKsq, overTsq = 1./sigmaSq, 1./alphaSq, 1./kappaSq, 1./tauSq
    overTkSq = overTsq * overKsq
    overAsSq = overAsq * overSsq

    # <ln p(Y)>
    #
    trSigY = 1 if sigY is None else np.trace(sigY)
    trOmY = 1 if omY is None else np.trace(omY)
    lnP_Y = -0.5 * (Q*P * LOG_2PI + overTkSq * trSigY * trOmY + overTkSq * np.trace(Y.dot(Y.T)))

    # <ln P(A|Y)>
    # TODO it looks like I should take the trace of omA \otimes I_K here.
    # TODO Need to check re-arranging sigY and omY is sensible.
    halfKF = 0.5 * K * F

    # Horrible, but varBound can be called by two implementations, one with Y as a matrix-variate
    # where sigY is QxQ and one with Y as a multi-varate, where sigY is a QPxQP.
    A_from_Y = Y.dot(U.T) if V is None else U.dot(Y).dot(V.T)
    A_diff = A - A_from_Y
    # NOTE(review): sigY is dereferenced here unconditionally although the
    # trace above allows for sigY being None - confirm which is intended.
    varFactorU = np.trace(sigY.dot(np.kron(VTV, UTU))) if sigY.shape[0] == Q*P else np.sum(sigY*UTU)
    # NOTE(review): V.T.dot(V) is evaluated again here even though VTV holds
    # that product once the defaults above have been filled in.
    varFactorV = 1 if V is None \
        else np.sum(omY * V.T.dot(V))
    lnP_A = -halfKF * LOG_2PI - halfKF * log (alphaSq) -halfKF * log(sigmaSq) \
        -0.5 * (overAsSq * varFactorV * varFactorU \
        + np.trace(XTX.dot(varA)) * K \
        + np.sum(np.square(A_diff)))

    # <ln p(Theta|A,X)
    #
    lmdaDiff = lmda - XAT
    lnP_Theta = -0.5 * D * LOG_2PI -0.5 * D * K * log (sigmaSq) \
        -0.5 / sigmaSq * ( \
        np.sum(nu) + D*K * np.sum(XTX * varA) + np.sum(np.square(lmdaDiff)))
    # Why is order of sigT reversed? It's cause we've not been consistent. A is KxF but lmda is DxK, and
    # note that the distribution of lmda tranpose has the same covariances, just in different positions
    # (i.e. row is col and vice-versa)

    # <ln p(Z|Theta)
    #
    docLenLmdaLxi = docLen[:, np.newaxis] * lmda * lxi
    # NOTE(review): this overwrites the scaledWordCounts argument, so a value
    # supplied by the caller is never used.
    scaledWordCounts = sparseScalarQuotientOfDot(W, expLmda, vocab)

    lnP_Z = 0.0
    lnP_Z -= np.sum(docLenLmdaLxi * lmda)
    lnP_Z -= np.sum(docLen[:, np.newaxis] * nu * nu * lxi)
    lnP_Z += 2 * np.sum (s[:, np.newaxis] * docLenLmdaLxi)
    lnP_Z -= 0.5 * np.sum (docLen[:, np.newaxis] * lmda)
    lnP_Z += np.sum (lmda * expLmda * (scaledWordCounts.dot(vocab.T))) # n(d,k) = expLmda * (scaledWordCounts.dot(vocab.T))
    lnP_Z -= np.sum(docLen[:,np.newaxis] * lxi * ((s**2)[:,np.newaxis] - xi**2))
    lnP_Z += 0.5 * np.sum(docLen[:,np.newaxis] * (s[:,np.newaxis] + xi))
    lnP_Z -= np.sum(docLen[:,np.newaxis] * safe_log_one_plus_exp_of(xi))
    lnP_Z -= np.sum (docLen * s)

    # <ln p(W|Z, vocab)>
    #
    lnP_w_dt = sparseScalarProductOfDot(scaledWordCounts, expLmda, vocab * safe_log(vocab))
    lnP_W = np.sum(lnP_w_dt.data)

    # H[q(Y)]
    lnDetOmY = 0 if omY is None else safe_log_det(omY)
    lnDetSigY = 0 if sigY is None else safe_log_det(sigY)
    ent_Y = 0.5 * (P * K * LOG_2PI_E + Q * lnDetOmY + P * lnDetSigY)

    # H[q(A|Y)]
    #
    # A few things - omA is fixed so long as tau an sigma are, so there's no benefit in
    # recalculating this every time.
    #
    # However in a recent test, la.det(omA) = 0
    # this is very strange as omA is the inverse of (s*I + t*XTX)
    #
    # NOTE(review): the statement below ends in a line continuation that only
    # joins it to the blank line that follows; it has no effect.
    ent_A = 0.5 * (F * K * LOG_2PI_E + K * safe_log_det(varA) + F * K * log (tauSq))\

    # H[q(Theta|A)]
    ent_Theta = 0.5 * (K * LOG_2PI_E + np.sum (np.log(nu * nu)))

    # H[q(Z|\Theta)
    #
    # So Z_dtk \propto expLmda_dt * vocab_tk. We let N here be the normalizer (which is
    # \sum_j expLmda_dj * vocab_tj, which implies N is DxT. We need to evaluate
    # Z_dtk * log Z_dtk. We can pull out the normalizer of the first term, but it has
    # to stay in the log Z_dtk expression, hence the third term in the sum. We can however
    # take advantage of the ability to mix dot and element-wise products for the different
    # components of Z_dtk in that three-term sum, which we denote as S
    # Finally we use np.sum to sum over d and t
    #
    # The entropy of Z is currently fixed at zero; the call that would
    # compute it is left commented out on the same line.
    ent_Z = 0 #entropyOfDot(expLmda, vocab)

    result = lnP_Y + lnP_A + lnP_Theta + lnP_Z + lnP_W + ent_Y + ent_A + ent_Theta + ent_Z

    return result
def var_bound(data, model, query, z_dnk=None):
    '''
    Determines the variational bounds.

    The bound is accumulated from the Gaussian terms over the topics given
    U and V (and the matching entropy), and from the per-document terms for
    the word/topic assignments Z and the link/topic assignments Y, neither
    of which is stored and so both are re-inferred for every document.

    Params
    data  - object exposing "words" and "links", both indexed by document
    model - object exposing K, Q, wordDists and topicCov
    query - object exposing docLens, topics, postTopicCov, U, V, lse_at_k
            and out_counts
    z_dnk - unused; kept so existing callers keep working

    Returns
    The bound as a scalar.

    NOTE(review): the original source flags the Gaussian topic term with
    "FIXME bound here is squiffy"; its arithmetic is reproduced unchanged.
    '''
    # Unpack the structs, for ease of access and efficiency
    docLens, topics, postTopicCov, U, V, lse_at_k, out_counts = \
        query.docLens, query.topics, query.postTopicCov, query.U, query.V, query.lse_at_k, query.out_counts
    K, Q, wordDists, topicCov = \
        model.K, model.Q, model.wordDists, model.topicCov

    W, L = data.words, data.links
    D = W.shape[0]

    bound = 0

    # Pre-calculate some repeated expressions
    logVagueness = log(Vagueness)
    halfDQ, halfQK, halfDK = 0.5 * D * Q, 0.5 * Q * K, 0.5 * D * K
    logTwoPi = log(2 * pi)
    logTwoPiE = log(2 * pi * e)

    # The terms for U and V are currently left out of the bound:
    #
    # # E[ln p(U)]
    # bound += -halfDQ * logTwoPi - D * Q * logVagueness - 0.5 * np.sum(U * U) # trace of U U'
    #
    # # H[q(U)]
    # bound += -halfDQ * logTwoPiE - D * Q * logVagueness
    #
    # # E[ln p(V)]
    # bound += -halfQK * logTwoPi - Q * K * logVagueness - 0.5 * np.sum(V * V) # trace of U U'
    #
    # # H[q(V)]
    # bound += -halfQK * logTwoPiE - D * Q * logVagueness

    # ln p(Topics|U, V)
    logDetCov = log(la.det(topicCov))
    kernel = topics.copy()
    kernel -= U.dot(V)
    kernel **= 2
    kernel[:] = kernel.dot(topicCov)
    kernel /= (2 * Vagueness)
    bound += -halfDK * logTwoPi - halfDK * logVagueness \
        -D * 0.5 * logDetCov \
        -np.sum(kernel) \
        -np.sum(postTopicCov)  # FIXME bound here is squiffy

    # H[q(topics)]
    bound += -halfDK * logTwoPiE - halfDK * logVagueness - D * 0.5 * logDetCov

    # We'll need these for the next steps
    diWordDists = fns.digamma(wordDists)
    diWordDistSums = fns.digamma(wordDists.sum(axis=1))

    # P(z|topic) and P(y|topic) are not stored explicitly, so we need to
    # recalculate here to calculate their expected log-probs and entropies.
    prob_words, prob_links = 0, 0
    prob_z, ent_z = 0, 0
    prob_y, ent_y = 0, 0

    for d in range(D):
        # Values shared by several terms below, evaluated once per document
        docTopics = topics[d, :]
        lseDocTopics = lse(docTopics)

        # First the word-topic assignments, note this is a KxV matrix
        wordIdx, z = _infer_word_topics_at_d(d, W, topics, diWordDists, diWordDistSums)
        wordCounts = W[d, :].data

        # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        prob_z += docTopics.dot(z * wordCounts[np.newaxis, :]).sum()
        prob_z -= docLens[d] * lseDocTopics

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(
            wordCounts[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diWordDistSums[:, np.newaxis]))

        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), wordCounts).sum()

        # Next the link-topic assignments, note this is a PxK matrix
        linkIdx, y = _infer_link_topics_at_d(d, L, topics, lse_at_k)
        linkCounts = L[d, :].data

        # Here we _start_ with the entropy of y, as y is rescaled in place
        # by the link counts immediately afterwards
        ent_y -= np.dot(linkCounts, y * safe_log(y)).sum()

        # E[ln p(Y|topics) = sum_d sum_m sum_k E[y_dmk] E[ln topicDist_dk]
        y *= linkCounts[:, np.newaxis]
        prob_y += y.dot(docTopics.T).sum()
        prob_y -= out_counts[d] * lseDocTopics

        # E[ln p(L|Y)] = sum_d sum_m sum_k sum_t E[y_dmk] l_dmp E[ln topics_pk]
        prob_links += y.dot(topics[linkIdx, :].T).sum()
        prob_links -= y.dot(lse_at_k).sum()

    bound += (prob_z + ent_z + prob_words)
    bound += (prob_y + ent_y + prob_links)

    return bound
def varBound (modelState, queryState, X, W, Z = None, lnVocab = None, varA_U = None, XA = None, XTX = None):
    '''
    For a current state of the model, and the query, for given inputs, outputs
    the variational lower-bound.

    Params

    modelState - the state of the model currently
    queryState - the state of the query currently
    X          - the DxF matrix of features we're querying on, where D is the
                 number of documents
    W          - the DxT matrix of words ("terms") we're querying on
    Z          - if this has already been calculated, it can be passed in. If
                 not, we recalculate it from the model and query states. Z is
                 the DxKxT tensor which for each document D and term T gives
                 the proportion of those terms assigned to topic K
    lnVocab    - the KxT matrix of the natural log applied to the vocabulary.
                 Recalculated if not provided
    varA_U     - the product of the column variance matrix and the matrix U.
                 Recalculated if not provided
    XA         - dot product of X and A, recalculated if not provided
    XTX        - dot product of X-transpose and X, recalculated if not
                 provided.

    Returns
    The variational lower bound

    Raises
    ValueError if the shapes of W and X disagree with each other or with the
    T and F recorded in the model state.
    '''

    # Unpack the model and query state tuples for ease of use and maybe speed improvements
    (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab) = (modelState.K, modelState.F, modelState.T, modelState.P, modelState.A, modelState.varA, modelState.V, modelState.varV, modelState.U, modelState.sigma, modelState.tau, modelState.vocab)
    (lmda, nu, lxi, s, docLen) = (queryState.lmda, queryState.nu, queryState.lxi, queryState.s, queryState.docLen)

    # Get the number of samples from the shape. Ensure that the shapes are consistent
    # with the model parameters.
    (D, Tcheck) = W.shape
    if Tcheck != T:
        raise ValueError ("The shape of the document matrix W is invalid, T is %d but the matrix W has shape (%d, %d)" % (T, D, Tcheck))

    (Dcheck, Fcheck) = X.shape
    if Dcheck != D:
        raise ValueError ("Inconsistent sizes between the matrices X and W, X has %d rows but W has %d" % (Dcheck, D))
    if Fcheck != F:
        raise ValueError ("The shape of the feature matrix X is invalid. F is %d but the matrix X has shape (%d, %d)" % (F, Dcheck, Fcheck))

    # We'll need the original xi for this and also Z, the 3D tensor of which for each document D
    # and term T gives the strength of topic K. We'll also need the log of the vocab dist
    xi = deriveXi (lmda, nu, s)
    if lnVocab is None:
        lnVocab = safe_log(vocab)
    if Z is None:
        Z = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxT

    # lnProb1 is the bound on E[p(W|Theta)]. This is a bound, not an equality as we're using
    # Bouchard's softmax bound (NIPS 2007) here. That said, most of the subsequent terms
    # will discard additive constants, so strictly speaking none of them are equalities
    docLenLmdaLxi = docLen[:, np.newaxis] * lmda * lxi

    lnProb1 = 0.0
    lnProb1 -= np.sum(docLenLmdaLxi * lmda)
    lnProb1 -= np.sum(docLen[:, np.newaxis] * nu * nu * lxi)
    lnProb1 += 2 * np.sum (s[:, np.newaxis] * docLenLmdaLxi)
    lnProb1 -= 0.5 * np.sum (docLen[:, np.newaxis] * lmda)
    lnProb1 += np.sum (lmda * np.einsum ('dt,dkt->dk', W, Z))
    lnProb1 += np.sum(lnVocab * np.einsum('dt,dkt->kt', W, Z))
    lnProb1 -= np.sum(W * np.einsum('dkt->dt', safe_x_log_x(Z)))
    lnProb1 -= np.sum(docLen[:,np.newaxis] * lxi * ((s**2)[:,np.newaxis] - xi**2))
    lnProb1 += 0.5 * np.sum(docLen[:,np.newaxis] * (s[:,np.newaxis] + xi))
    lnProb1 -= np.sum(docLen[:,np.newaxis] * safe_log_one_plus_exp_of(xi))

    # lnProb2 is E[p(Theta|A)]
    if XA is None:
        XA = X.dot(A)
    if XTX is None:
        XTX = X.T.dot(X)
    sig2 = sigma * sigma
    tau2 = tau * tau

    lnProb2 = -0.5 * D * K * log (sig2) \
        - 0.5 / sig2 * (np.sum(nu) + D*K * tau2 * np.sum(XTX * varA) + np.sum((lmda - XA)**2))

    # lnProb3 is E[p(A|V)]
    if varA_U is None:
        varA_U = varA.dot(U)

    lnProb3 = -0.5 * K * F * log (2 * pi) \
        -0.5 * K * F * log(tau2) \
        -0.5 / tau2 * \
        ( \
        np.trace(varA)*K*tau2 \
        + np.sum(varA_U * U) * K * tau2 \
        + np.sum((A - U.dot(V)) ** 2) \
        )

    # lnProb4 is E[p(V)]
    lnProb4 = -0.5 * (np.trace(varV) * K * tau2 + np.sum(V*V))

    # ent1 is H[q(Theta)]
    ent1 = 0.5 * np.sum (np.log(nu * nu))

    # ent2 is H[q(A|V)]. The log(2*pi*e) factor multiplies F*K/2, exactly as
    # it multiplies P*K/2 in ent3 below (it used to be added instead).
    ent2 = 0.5 * F * K * log(2 * pi * e) + 0.5 * K * log (la.det(varA)) + 0.5 * F * K * log (tau2)

    # ent3 is H[q(V)]
    ent3 = 0.5 * P * K * log (2 * pi * e) + 0.5 * K * log (la.det(varV)) + 0.5 * P * K * log (tau2)

    result = lnProb1 + lnProb2 + lnProb3 + lnProb4 + ent1 + ent2 + ent3

    return result
def train(modelState, X, W, iterations=10000, epsilon=0.001, logInterval = 0):
    '''
    Creates a new query state object for a topic model based on side-information.
    This contains all those estimated parameters that are specific to the actual
    date being queried - this must be used in conjunction with a model state.

    The parameters are

    modelState - the model state with all the model parameters
    X          - the D x F matrix of side information vectors
    W          - the D x V matrix of word **count** vectors.
    iterations - how long to iterate for
    epsilon    - currently ignored, in future, allows us to stop early.
    logInterval - when positive, the bound is evaluated, printed and recorded
                 every logInterval iterations, and plotted once training ends

    This returns a tuple of new model-state and query-state. The latter object
    will contain X and W and also

    s      - A D-dimensional vector describing the offset in our bound on the
             true value of ln sum_k e^theta_dk
    lxi    - A DxK matrix used in the above bound, containing the negative
             Jakkola function applied to the quadratic term xi
    lambda - the topics we've inferred for the current batch of documents
    nu     - the variance of topics we've inferred (independent)
    '''
    # Unpack the model state tuple for ease of use and maybe speed improvements
    (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab) = (modelState.K, modelState.F, modelState.T, modelState.P, modelState.A, modelState.varA, modelState.V, modelState.varV, modelState.U, modelState.sigma, modelState.tau, modelState.vocab)

    # Get ready to plot the evolution of the likelihood. Integer (floor)
    # division is required for an array size, and the count is rounded up so
    # that the last logged iteration has a slot even when iterations is not a
    # multiple of logInterval.
    if logInterval > 0:
        numLogs = (iterations + logInterval - 1) // logInterval
        elbos = np.zeros((numLogs,))
        iters = np.zeros((numLogs,))

    # We'll need the total word count per doc, and total count of docs
    docLen = W.sum(axis=1)
    D = len(docLen)

    # No need to recompute this every time
    XTX = X.T.dot(X)

    # Assign initial values to the query parameters
    lmda = rd.random((D, K))
    nu = np.ones((D,K), np.float64)
    s = np.zeros((D,))
    lxi = negJakkola (np.ones((D, K), np.float64))

    XA = X.dot(A)
    for iteration in range(iterations):

        # Save repeated computation
        tsq = tau * tau
        tsqIP = tsq * np.eye(P)
        trTsqIK = K * tsq # trace of the matrix tau * tau * np.eye(K)
        halfSig2 = 1./(sigma*sigma)
        tau2sig2 = (tau * tau) / (sigma * sigma)

        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|V) q(V)
        #   Where lambda is the posterior mean of theta.
        # =============================================================

        #
        # V, varV
        varV = la.inv (tsqIP + U.T.dot(U))
        V = varV.dot(U.T).dot(A)
        _quickPrintElbo ("E-Step: q(V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        #
        # A, varA
        # TODO, since only tau2sig2 changes at each step, would it be possible just to
        # amend the old inverse?
        # TODO Use sparse inverse
        varA = la.inv (tau2sig2 * XTX + np.eye(F))
        A = varA.dot (U.dot(V) + X.T.dot(lmda))
        XA = X.dot(A)
        _quickPrintElbo ("E-Step: q(A|V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        #
        # lmda_dk
        lnVocab = safe_log (vocab)
        Z = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxT
        rho = 2 * s[:,np.newaxis] * lxi - 0.5 \
            + np.einsum('dt,dkt->dk', W, Z) / docLen[:,np.newaxis]

        # XA was refreshed right after A was updated, so it is reused here
        rhs = docLen[:,np.newaxis] * rho + halfSig2 * XA
        lmda = rhs / (docLen[:,np.newaxis] * 2 * lxi + halfSig2)
        _quickPrintElbo ("E-Step: q(Theta|A;lamda)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        #
        # nu_dk
        # TODO Double check this again...
        nu = 1./ np.sqrt(2. * docLen[:, np.newaxis] * lxi + halfSig2)
        _quickPrintElbo ("E-Step: q(Theta|A;nu)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        # =============================================================
        # M-Step
        #    Parameters for the softmax bound: lxi and s
        #    The projection used for A: U
        #    The vocabulary : vocab
        #    The variances: tau, sigma
        # =============================================================

        #
        # s_d
        # NOTE(review): the s update reads as disabled in the source this was
        # recovered from, so s keeps its initial value of zero - confirm.
        # s = (K/4. + (lxi * lmda).sum(axis = 1)) / lxi.sum(axis=1)
        # _quickPrintElbo ("M-Step: max s", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        #
        # xi_dk
        lxi = negJakkolaOfDerivedXi(lmda, nu, s)
        _quickPrintElbo ("M-Step: max xi", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        #
        # vocab
        #
        # TODO, since vocab is in the RHS, is there any way to optimize this?
        Z = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxT
        vocab = normalizerows_ip (np.einsum('dt,dkt->kt', W, Z))
        _quickPrintElbo ("M-Step: max vocab", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        #
        # U
        U = A.dot(V.T).dot (la.inv(trTsqIK * varV + V.dot(V.T)))
        _quickPrintElbo ("M-Step: max U", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)

        #
        # sigma (update disabled)
        #    Equivalent to \frac{1}{DK} \left( \sum_d (\sum_k nu_{dk}) + tr(\Omega_A) x_d^{T} \Sigma_A x_d + (\lambda - A^{T} x_d)^{T}(\lambda - A^{T} x_d) \right)
        #
        # sigma = 1./(D*K) * (np.sum(nu) + D*K * tsq * np.sum(XTX * varA) + np.sum((lmda - XA)**2))

        #
        # tau (update disabled)
        #    Equivalent to \frac{1}{KF} \left( tr(\Sigma_A)tr(\Omega_A) + tr(\Sigma_V U U^{T})tr(\Omega_V) + tr ((M_A - U M_V)^{T} (M_A - U M_V)) \right)
        #
        # varA_U is still needed below, as an argument to varBound
        varA_U = varA.dot(U)
        # tau_term1 = np.trace(varA)*K*tsq
        # tau_term2 = sum(varA_U[p,:].dot(U[p,:]) for p in xrange(P)) * K * tsq
        # tau_term3 = np.sum((A - U.dot(V)) ** 2)
        #
        # tau = 1./(K*F) * (tau_term1 + tau_term2 + tau_term3)

        if (logInterval > 0) and (iteration % logInterval == 0):
            elbo = varBound ( \
                VbSideTopicModelState (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab), \
                VbSideTopicQueryState(lmda, nu, lxi, s, docLen),
                X, W, Z, lnVocab, varA_U, XA, XTX)

            # Array indices must be integers, hence floor division
            logIdx = iteration // logInterval
            elbos[logIdx] = elbo
            iters[logIdx] = iteration
            print ("Iteration %5d  ELBO %f" % (iteration, elbo))

    if logInterval > 0:
        plot_bound(iters, elbos)

    return (VbSideTopicModelState (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab), \
            VbSideTopicQueryState (lmda, nu, lxi, s, docLen))