def var_bound(data, model, query):
    '''
    Total nonsense in this case, retained only so that all the other
    functions continue to work.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    # Initialize z matrix if necessary
    W = data.words
    D, T = W.shape

    # ln p(x) >= sum_k q(z=k|x) * (ln p(x|z=k, phi) + ln p(z=k)) + H[q]

    # Expected joint
    like = W.dot(safe_log(wordDists).T)  # D x K
    like *= safe_log(topicMeans)

    # Entropy
    ent = (-topicMeans * safe_log(topicMeans)).sum()

    return like.sum() + bound

def var_bound(data, model, query):
    '''
    Determines the variational bounds.
    '''
    # Unpack the structs, for ease of access and efficiency
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, corpusTopicDist, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.corpusTopicDist, model.dtype

    # Initialize z matrix if necessary
    W = data.words
    D, T = W.shape

    # ln p(x) >= sum_k q(z=k|x) * (ln p(x|z=k, phi) + ln p(z=k)) + H[q]

    # Expected joint
    like = W.dot(safe_log(wordDists).T)                # D x K, E[ln p(x|z=k, phi)]
    like += safe_log(corpusTopicDist)[np.newaxis, :]   # + ln p(z=k)
    like *= topicMeans                                 # weight by q(z=k|x)

    # Entropy
    ent = (-topicMeans * safe_log(topicMeans)).sum()

    return like.sum() + ent

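# Illustrative sketch (not part of the model code): the bound assembled above is
#   sum_d sum_k q(z_d=k) * ( ln p(w_d | z_d=k, phi_k) + ln p(z_d=k) ) + H[q]
# written out below with plain numpy on tiny dense arrays. The names W, wordDists,
# corpusTopicDist and topicMeans mirror the variables above, but the data here is
# invented purely for the example.
def _example_mixture_bound():
    import numpy as np

    rng = np.random.default_rng(0)
    D, T, K = 4, 7, 3

    W = rng.integers(0, 5, size=(D, T)).astype(np.float64)   # document-term counts
    wordDists = rng.dirichlet(np.ones(T), size=K)             # K x T, p(w | z=k)
    corpusTopicDist = rng.dirichlet(np.ones(K))                # p(z=k)
    topicMeans = rng.dirichlet(np.ones(K), size=D)             # q(z_d=k), D x K

    like = W.dot(np.log(wordDists).T)                          # D x K, E[ln p(w_d | z_d=k)]
    like += np.log(corpusTopicDist)[np.newaxis, :]             # + ln p(z_d=k)
    like *= topicMeans                                         # weight by q(z_d=k)

    ent = -(topicMeans * np.log(topicMeans)).sum()             # H[q]
    return like.sum() + ent
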
def query(data, model, queryState, _):
    '''
    Infers the topic distributions in general, and specifically for each
    individual datapoint, without altering the model.

    Params:
    data       - the dataset, only the DxT document-term matrix is used
    model      - the model configuration
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    _          - the query plan (e.g. iterations, log-interval etc.); unused here

    Return:
    The model object, and a new query object with the updated query parameters
    '''
    K, wordDists, corpusTopicDist, topicPrior, vocabPrior = \
        model.K, model.wordDists, model.corpusTopicDist, model.topicPrior, model.vocabPrior
    topicDists = queryState.topicDists

    W = data.words

    wordDists = safe_log(wordDists)
    corpusTopicDist = safe_log(corpusTopicDist)

    topicDists = W.dot(wordDists.T) + corpusTopicDist[np.newaxis, :]
    norms = fns.logsumexp(topicDists, axis=1)
    topicDists -= norms[:, np.newaxis]

    return model, QueryState(queryState.docLens, np.exp(topicDists), True)

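# Illustrative sketch: the query step above is a log-space "predict" for a mixture of
# multinomials: unnormalised log posteriors are W.dot(ln phi.T) + ln p(z), normalised
# per row with logsumexp. This standalone version uses scipy.special.logsumexp
# directly; the argument names are invented for the example.
def _example_topic_posteriors(W, log_word_dists, log_corpus_topic_dist):
    import numpy as np
    from scipy.special import logsumexp

    log_post = W.dot(log_word_dists.T) + log_corpus_topic_dist[np.newaxis, :]
    log_post -= logsumexp(log_post, axis=1)[:, np.newaxis]   # each row now sums to one in probability space
    return np.exp(log_post)
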
def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
    print("Fully handcrafted example, K=Q")
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    Q = 6    # Topics: This cannot be changed without changing the code that generates the vocabulary
    K = 6    # Observed topics
    P = 8    # Features
    F = 12   # Observed features
    D = 200  # Sample documents (each with associated features)

    avgWordsPerDoc = 500

    # The vocabulary. Presented graphically there are two with horizontal bands
    # (upper lower); two with vertical bands (left, right); and two with
    # horizontal bands (inside, outside)
    vocab = makeSixTopicVocab(T)

    # Create our (sparse) features X, then our topic proportions ("tpcs")
    # then our word counts W
    lmda = np.zeros((D, K))
    X = np.zeros((D, F))
    for d in range(D):
        for _ in range(3):
            lmda[d, rd.randint(K)] += 1. / 3
        for _ in range(int(F / 3)):
            X[d, rd.randint(F)] += 1

    A = rd.random((K, F))
    X = lmda.dot(la.pinv(A).T)
    X = ssp.csr_matrix(X)

    tpcs = lmda

    docLens = rd.poisson(avgWordsPerDoc, (D,))
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32)  # truncate word counts to integers
    W = ssp.csr_matrix(W)

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, Q, F, P, T)

    (trainedState, queryState) = train(modelState, X, W, logInterval=1, iterations=1)
    tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
    W_inf = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis], dtype=np.int32)
    priorReconsError = np.sum(np.square(W - W_inf)) / D

    (trainedState, queryState) = train(modelState, X, W, logInterval=1, plotInterval=100, iterations=130)
    tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
    W_inf = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis], dtype=np.int32)

    print("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
    print("Model Driven: Final Reconstruction Error: %f" % (np.sum(np.square(W - W_inf)) / D,))

    print("End of Test")

def _sparseScalarProductOfSafeLnDot_py(A, B, C, out=None):
    '''
    Calculates A * safe_log(B.dot(C)) where A is a sparse matrix

    Retains sparsity in the result, unlike the built-in operator

    Note the type of the return-value is the same as the type of the sparse
    matrix A. If this has an integral type, this will only provide
    integer-based multiplication.
    '''
    if WarnIfSlow:
        sys.stderr.write("WARNING: Slow code path triggered (_sparseScalarProductOfSafeLnDot_py)")
    if not (A.dtype == B.dtype and B.dtype == C.dtype and (out is None or C.dtype == out.dtype)):
        raise ValueError("Inconsistent dtypes in the three matrices and possibly the out-param")

    if out is None:
        out = A.copy()
    else:
        out.data[:] = A.data

    rhs = B.dot(C)
    rhs[rhs < sys.float_info.min] = sys.float_info.min
    out.data *= safe_log(rhs)[csr_indices(out.indptr, out.indices)]

    return out

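# Illustrative sketch: a reference rendition of what the helper above computes,
# namely A * ln(max(B.dot(C), tiny)) evaluated only at A's stored non-zeros. It uses
# only standard numpy/scipy calls (no csr_indices or safe_log), so it is a slow
# cross-check rather than the optimised path; the function name is invented.
def _example_sparse_scalar_product_of_safe_ln_dot(A, B, C):
    import sys
    import numpy as np
    from scipy import sparse

    rhs = np.asarray(B.dot(C), dtype=np.float64)        # dense reconstruction, same shape as A
    rhs = np.maximum(rhs, sys.float_info.min)           # clamp so the log stays finite
    coo = sparse.coo_matrix(A)                          # COO gives aligned (row, col, data) triples
    vals = coo.data * np.log(rhs)[coo.row, coo.col]     # multiply only where A has stored entries
    return sparse.coo_matrix((vals, (coo.row, coo.col)), shape=A.shape).tocsr()
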
def sample_memberships(W, alpha, wordDists, memberships):
    _, K = memberships.shape

    priorNum = memberships.sum(axis=0) + alpha - 1
    prior = priorNum.copy()
    sample_dists = W.dot(safe_log(wordDists).T)  # D x K

    for d in range(W.shape[0]):
        priorNum -= memberships[d, :]
        prior[:] = priorNum
        prior /= priorNum.sum()

        sample_dists[d, :] += safe_log(prior)
        sample_dists[d, :] -= sample_dists[d, :].max()
        sample_dists[d, :] -= fns.logsumexp(sample_dists[d, :])

        np.exp(sample_dists[d, :], out=sample_dists[d, :])
        memberships[d, :] = rd.multinomial(1, sample_dists[d, :], size=1)

        priorNum += memberships[d, :]

    return memberships

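# Illustrative sketch: one sweep of the collapsed, one-topic-per-document sampler
# above, written against numpy's Generator API. Each document's one-hot membership
# row is resampled from softmax(ln p(w_d|k) + ln leave-one-out prior). The inputs are
# assumed rather than taken from the code above: alpha >= 1 so the leave-one-out
# pseudo-counts stay non-negative, W dense or CSR, memberships a D x K one-hot array.
def _example_membership_sweep(W, alpha, wordDists, memberships, rng=None):
    import numpy as np
    from scipy.special import logsumexp

    rng = np.random.default_rng() if rng is None else rng
    log_like = W.dot(np.log(wordDists).T)                 # D x K, ln p(w_d | z_d=k)

    counts = memberships.sum(axis=0) + alpha - 1.0        # Dirichlet-style pseudo-counts
    for d in range(W.shape[0]):
        counts -= memberships[d, :]                       # leave document d out
        log_p = log_like[d, :] + np.log(np.maximum(counts, 1e-300) / counts.sum())
        log_p -= logsumexp(log_p)                         # normalise in log space
        memberships[d, :] = rng.multinomial(1, np.exp(log_p))
        counts += memberships[d, :]                       # add the new assignment back
    return memberships
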
def iterate(iterations, D, K, T,
            W_list, docLens,
            topicPrior, vocabPrior,
            z_dnk, topicDists, wordDists):
    raise ValueError("This implementation is no longer supported")

    totalItrs = 0
    epsilon = 0.01 / K

    oldWordDists = np.empty(wordDists.shape, wordDists.dtype)
    newWordDists = wordDists

    for _ in range(iterations):
        oldWordDists, newWordDists = newWordDists, oldWordDists
        lnWordDists = safe_log(oldWordDists, out=oldWordDists)
        newWordDists.fill(vocabPrior)

        for d in range(D):
            oldTopics = topicDists[d, :].copy()
            topicDists[d, :] = 1. / K
            lnWordProbs = lnWordDists[:, W_list[d, 0:docLens[d]]]

            innerItrs = 0
            while ((innerItrs < MaxInnerItrs) or (np.sum(np.abs(oldTopics - topicDists[d, :])) > epsilon)) \
                    and (innerItrs < MaxInnerItrs):
                diTopic = fns.digamma(topicDists[d, :])
                z_dnk[:docLens[d], :] = lnWordProbs.T + diTopic[np.newaxis, :]

                # We've been working in log-space till now, before we go to true
                # probability space rescale so we don't underflow everywhere
                maxes = z_dnk.max(axis=1)
                z_dnk -= maxes[:, np.newaxis]
                np.exp(z_dnk, out=z_dnk)

                # Now normalize so probabilities sum to one
                sums = z_dnk.sum(axis=1)
                z_dnk /= sums[:, np.newaxis]

                # Update vocabulary: hard to do with a list representation

                # Now use it to infer the topic distribution
                topicDists[d, :] = topicPrior + np.sum(z_dnk[:docLens[d], :], axis=0)
                topicDists[d, :] /= np.sum(topicDists[d, :])

                innerItrs += 1

            totalItrs += innerItrs
            for k in range(K):
                for n in range(docLens[d]):
                    newWordDists[k, W_list[d, n]] += z_dnk[n, k]

        newWordDists /= newWordDists.sum(axis=1)[:, np.newaxis]

    return totalItrs

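# Illustrative sketch: the inner loop above relies on the exp-normalise trick for rows
# of log responsibilities (subtract the row maximum before exponentiating so nothing
# underflows, then divide by the row sums). As a standalone helper with an invented name:
def _example_exp_normalise_rows(log_r):
    import numpy as np

    log_r = log_r - log_r.max(axis=1)[:, np.newaxis]   # shift so the largest entry per row is 0
    r = np.exp(log_r)
    r /= r.sum(axis=1)[:, np.newaxis]                  # rows now sum to one
    return r
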
def varBound(modelState, queryState, X, W, lnVocab=None, XAT=None, XTX=None, scaledWordCounts=None, VTV=None, UTU=None):
    #
    # TODO Standardise hyperparameter handling so we can eliminate this copy and paste
    #

    # Unpack the model and query state tuples for ease of use and maybe speed improvements
    K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, _, alphaSq, kappaSq, tauSq = (
        modelState.K, modelState.Q, modelState.F, modelState.P, modelState.T,
        modelState.A, modelState.varA, modelState.Y, modelState.omY, modelState.sigY,
        modelState.sigT, modelState.U, modelState.V, modelState.vocab,
        modelState.topicVar, modelState.featVar, modelState.lowTopicVar, modelState.lowFeatVar,
    )
    (expLmda, nu, lxi, s, docLen) = (
        queryState.expLmda, queryState.nu, queryState.lxi, queryState.s, queryState.docLen)

    lmda = np.log(expLmda)
    isigT = la.inv(sigT)
    lnDetSigT = log(la.det(sigT))
    sigmaSq = 1  # A bit of a hack till hyperparameter handling is standardised

    # Get the number of samples from the shape. Ensure that the shapes are consistent
    # with the model parameters.
    (D, Tcheck) = W.shape
    if Tcheck != T:
        raise ValueError(
            "The shape of the DxT document matrix W is invalid, T is %d but the matrix W has shape (%d, %d)"
            % (T, D, Tcheck))

    (Dcheck, Fcheck) = X.shape
    if Dcheck != D:
        raise ValueError("Inconsistent sizes between the matrices X and W, X has %d rows but W has %d" % (Dcheck, D))
    if Fcheck != F:
        raise ValueError(
            "The shape of the DxF feature matrix X is invalid. F is %d but the matrix X has shape (%d, %d)"
            % (F, Dcheck, Fcheck))

    # We'll need the original xi for this and also Z, the 3D tensor of which for each document D
    # and term T gives the strength of topic K. We'll also need the log of the vocab dist
    xi = deriveXi(lmda, nu, s)

    # If not already provided, we'll also need the following products
    #
    if XAT is None:
        XAT = X.dot(A.T)
    if XTX is None:
        XTX = X.T.dot(X)
    if V is not None and VTV is None:
        VTV = V.T.dot(V)
    if U is not None and UTU is None:
        UTU = U.T.dot(U)

    # also need one over the usual variances
    overSsq, overAsq, overKsq, overTsq = 1.0 / sigmaSq, 1.0 / alphaSq, 1.0 / kappaSq, 1.0 / tauSq
    overTkSq = overTsq * overKsq
    overAsSq = overAsq * overSsq

    # <ln p(Y)>
    #
    trSigY = 1 if sigY is None else np.trace(sigY)
    trOmY = K  # Basically it's the trace of the identity matrix as the posterior and prior cancel out
    lnP_Y = -0.5 * (
        Q * P * LOG_2PI + P * lnDetSigT + overTkSq * trSigY * trOmY
        + overTkSq * np.trace(isigT.dot(Y).dot(Y.T)))

    # <ln P(A|Y)>
    # TODO it looks like I should take the trace of omA \otimes I_K here.
    # TODO Need to check re-arranging sigY and omY is sensible.
    halfKF = 0.5 * K * F

    # Horrible, but varBound can be called by two implementations, one with Y as a matrix-variate
    # where sigY is QxQ and one with Y as a multi-variate, where sigY is QPxQP.
    A_from_Y = Y.dot(U.T) if V is None else U.dot(Y).dot(V.T)
    A_diff = A - A_from_Y
    varFactorU = np.trace(sigY.dot(np.kron(VTV, UTU))) if sigY.shape[0] == Q * P else np.sum(sigY * UTU)
    varFactorV = 1 if V is None else np.sum(omY * V.T.dot(V))
    lnP_A = (
        -halfKF * LOG_2PI - halfKF * log(alphaSq) - F / 2.0 * lnDetSigT
        - 0.5 * (overAsSq * varFactorV * varFactorU
                 + np.trace(XTX.dot(varA)) * K
                 + np.sum(isigT.dot(A_diff) * A_diff)))

    # <ln p(Theta|A,X)>
    #
    lmdaDiff = lmda - XAT
    lnP_Theta = (
        -0.5 * D * LOG_2PI - 0.5 * D * lnDetSigT
        - 0.5 / sigmaSq * (np.sum(nu) + D * K * np.sum(XTX * varA) + np.sum(lmdaDiff.dot(isigT) * lmdaDiff)))
    # Why is the order of sigT reversed? It's 'cause we've not been consistent. A is KxF but lmda is DxK, and
    # note that the distribution of lmda transpose has the same covariances, just in different positions
    # (i.e. row is col and vice-versa)

    # <ln p(Z|Theta)>
    #
    docLenLmdaLxi = docLen[:, np.newaxis] * lmda * lxi
    scaledWordCounts = sparseScalarQuotientOfDot(W, expLmda, vocab, out=scaledWordCounts)

    lnP_Z = 0.0
    lnP_Z -= np.sum(docLenLmdaLxi * lmda)
    lnP_Z -= np.sum(docLen[:, np.newaxis] * nu * nu * lxi)
    lnP_Z += 2 * np.sum(s[:, np.newaxis] * docLenLmdaLxi)
    lnP_Z -= 0.5 * np.sum(docLen[:, np.newaxis] * lmda)
    lnP_Z += np.sum(lmda * expLmda * (scaledWordCounts.dot(vocab.T)))  # n(d,k) = expLmda * (scaledWordCounts.dot(vocab.T))
    lnP_Z -= np.sum(docLen[:, np.newaxis] * lxi * ((s ** 2)[:, np.newaxis] - xi ** 2))
    lnP_Z += 0.5 * np.sum(docLen[:, np.newaxis] * (s[:, np.newaxis] + xi))
    lnP_Z -= np.sum(docLen[:, np.newaxis] * safe_log_one_plus_exp_of(xi))
    lnP_Z -= np.sum(docLen * s)

    # <ln p(W|Z, vocab)>
    #
    lnP_w_dt = sparseScalarProductOfDot(scaledWordCounts, expLmda, vocab * safe_log(vocab))
    lnP_W = np.sum(lnP_w_dt.data)

    # H[q(Y)]
    lnDetOmY = 0 if omY is None else log(la.det(omY))
    lnDetSigY = 0 if sigY is None else log(max(la.det(sigY), sys.float_info.min))  # TODO FIX THIS
    ent_Y = 0.5 * (P * K * LOG_2PI_E + Q * lnDetOmY + P * lnDetSigY)

    # H[q(A|Y)]
    #
    # A few things - omA is fixed so long as tau and sigma are, so there's no benefit in
    # recalculating this every time.
    #
    # However in a recent test, la.det(omA) = 0
    # this is very strange as omA is the inverse of (s*I + t*XTX)
    #
    # ent_A = 0.5 * (F * K * LOG_2PI_E + K * log(la.det(omA)) + F * K * log(tau2))
    ent_A = 0

    # H[q(Theta|A)]
    ent_Theta = 0.5 * (K * LOG_2PI_E + np.sum(np.log(nu * nu)))

    # H[q(Z|Theta)]
    #
    # So Z_dtk \propto expLmda_dt * vocab_tk. We let N here be the normalizer (which is
    # \sum_j expLmda_dt * vocab_tj), which implies N is DxT. We need to evaluate
    # Z_dtk * log Z_dtk. We can pull out the normalizer of the first term, but it has
    # to stay in the log Z_dtk expression, hence the third term in the sum. We can however
    # take advantage of the ability to mix dot and element-wise products for the different
    # components of Z_dtk in that three-term sum, which we denote as S.
    # Finally we use np.sum to sum over d and t
    #
    ent_Z = 0  # entropyOfDot(expLmda, vocab)

    result = lnP_Y + lnP_A + lnP_Theta + lnP_Z + lnP_W + ent_Y + ent_A + ent_Theta + ent_Z

    return result

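# Illustrative sketch: safe_log_one_plus_exp_of, used in the lnP_Z terms above, is by
# its name the numerically stable softplus log(1 + exp(x)). numpy provides this
# directly via logaddexp; the helper below is an assumption-labelled stand-in, not the
# project's own implementation.
def _example_softplus(x):
    import numpy as np
    return np.logaddexp(0.0, x)   # log(exp(0) + exp(x)) = log(1 + exp(x)), no overflow for large x
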
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bounds. Values are mutated in place, but are
    reset afterwards to their initial values. So it's safe to call in a
    serial manner.
    '''
    # Unpack the structs, for ease of access and efficiency
    W, L, X = data.words, data.links, data.feats
    D, _ = W.shape
    outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens = \
        queryState.outMeans, queryState.outVarcs, queryState.inMeans, queryState.inVarcs, queryState.inDocCov, queryState.docLens
    K, topicMean, topicCov, outDocCov, vocab, A, dtype = \
        modelState.K, modelState.topicMean, modelState.topicCov, modelState.outDocCov, modelState.vocab, modelState.A, modelState.dtype

    # Calculate some implicit variables
    itopicCov = la.inv(topicCov)

    bound = 0

    expMeansOut = np.exp(outMeans - outMeans.max(axis=1)[:, np.newaxis])
    expMeansIn = np.exp(inMeans - inMeans.max(axis=0)[np.newaxis, :])
    lse_at_k = expMeansIn.sum(axis=0)

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    bound -= D / 2. * safe_log_det(outDocCov * topicCov)
    diff = outMeans - topicMean[np.newaxis, :]
    bound -= 0.5 * np.sum(diff.dot(itopicCov) * diff * 1. / outDocCov)
    bound -= (0.5 / outDocCov) * np.sum(outVarcs * np.diag(itopicCov)[np.newaxis, :])  # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.

    # And its entropy
    bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.log(outVarcs).sum()

    # Distribution over document in-links
    inDocPre = np.reciprocal(inDocCov)
    bound -= (D * K) / 2. * LN_OF_2_PI
    bound -= D / 2. * safe_log_det(topicCov)
    bound -= K / 2 * safe_log(inDocCov).sum()
    diff = inMeans - outMeans
    bound -= 0.5 * np.sum(diff.dot(itopicCov) * diff * inDocPre[:, np.newaxis])
    bound -= 0.5 * np.sum((inVarcs * inDocPre[:, np.newaxis]) * np.diag(itopicCov)[np.newaxis, :])  # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.

    # And its entropy
    bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.log(inVarcs).sum()

    # Distribution over topic assignments E[p(Z)] and E[p(Y)]
    W_weights = sparseScalarQuotientOfDot(W, expMeansOut, vocab)  # D x V, [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    top_sums = expMeansOut * (W_weights.dot(vocab.T))             # D x K

    L_weights = sparseScalarQuotientOfNormedDot(L, expMeansOut, expMeansIn, lse_at_k)
    top_sums += expMeansOut * (L_weights.dot(expMeansIn) / lse_at_k[np.newaxis, :])

    # E[p(Z,Y)]
    linkLens = np.squeeze(np.array(L.sum(axis=1)))
    bound += np.sum(outMeans * top_sums)
    bound -= np.sum((docLens + linkLens) * np.log(np.sum(expMeansOut, axis=1)))

    # H[Z]
    bound += ((W_weights.dot(vocab.T)) * expMeansOut * outMeans).sum() \
           + ((W_weights.dot((np.log(vocab) * vocab).T)) * expMeansOut).sum() \
           - np.trace(sparseScalarProductOfSafeLnDot(W_weights, expMeansOut, vocab).dot(vocab.T).dot(expMeansOut.T))

    # H[Y]
    docVocab = (expMeansIn / lse_at_k[np.newaxis, :]).T.copy()
    bound += ((L_weights.dot(docVocab.T)) * expMeansOut * outMeans).sum() \
           + ((L_weights.dot((np.log(docVocab) * docVocab).T)) * expMeansOut).sum() \
           - np.trace(sparseScalarProductOfSafeLnDot(L_weights, expMeansOut, docVocab).dot(docVocab.T).dot(expMeansOut.T))

    # E[p(W)]
    vlv = np.log(vocab) * vocab
    bound += np.trace(expMeansOut.T.dot(W_weights.dot(vlv.T)))

    # E[p(L)]
    dld = np.log(docVocab) * docVocab
    bound += np.trace(expMeansOut.T.dot(L_weights.dot(dld.T)))

    return bound

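# Illustrative sketch: the per-document Gaussian terms above appear to be instances of
# E_q[ln N(mu_d; m, c * Sigma)] for q(mu_d) = N(mean_d, diag(var_d)), i.e.
#   -K/2 ln(2 pi) - 1/2 ln|c Sigma|
#   - 1/(2c) [ (mean_d - m)' Sigma^{-1} (mean_d - m) + tr(Sigma^{-1} diag(var_d)) ]
# Written out for a single document with plain numpy; the argument names are invented.
def _example_expected_gaussian_logpdf(mean_d, var_d, m, Sigma, c):
    import numpy as np

    K = mean_d.shape[0]
    iSigma = np.linalg.inv(Sigma)
    _, logdet = np.linalg.slogdet(c * Sigma)                        # stable log-determinant
    diff = mean_d - m
    quad = diff.dot(iSigma).dot(diff) + var_d.dot(np.diag(iSigma))  # quadratic term plus trace term
    return -0.5 * K * np.log(2 * np.pi) - 0.5 * logdet - 0.5 / c * quad
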
def var_bound(data, modelState, queryState, z_dnk=None):
    '''
    Determines the variational bounds.
    '''
    # Unpack the structs, for ease of access and efficiency
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, _, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.wordDists, modelState.dtype

    W = data.words
    D, T = W.shape
    maxN = docLens.max()
    if z_dnk is None:
        z_dnk = np.empty(shape=(maxN, K), dtype=dtype)

    wordDistsMatrix = wordDists(modelState)
    diWordDists = fns.digamma(wordDistsMatrix.copy()) - fns.digamma(wordDistsMatrix.sum(axis=1))[:, np.newaxis]
    lnWordDists = np.log(wordDistsMatrix)

    bound = 0

    # Expected Probability
    #
    # P(topics|topicPrior)
    diTopicDists = fns.digamma(topicDists) - fns.digamma(topicDists.sum(axis=1))[:, np.newaxis]
    ln_b_topic = fns.gammaln(topicPrior.sum()) - fns.gammaln(topicPrior).sum()
    bound += D * ln_b_topic \
           + np.sum((topicPrior - 1) * diTopicDists)

    # and its entropy
    ent = fns.gammaln(topicDists.sum(axis=1)).sum() - fns.gammaln(topicDists).sum() \
        + np.sum((topicDists - 1) * diTopicDists)
    bound -= ent

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    # NOTE COPY AND PASTED FROM iterate_f32 / iterate_f64 (-ish)
    for d in range(D):
        lnWordProbs = lnWordDists[:, W_list[d, 0:docLens[d]]]
        diTopic = fns.digamma(topicDists[d, :])
        z_dnk[0:docLens[d], :] = lnWordProbs.T + diTopic[np.newaxis, :]

        # We've been working in log-space till now, before we go to true
        # probability space rescale so we don't underflow everywhere
        maxes = z_dnk.max(axis=1)
        z_dnk -= maxes[:, np.newaxis]
        np.exp(z_dnk, out=z_dnk)

        # Now normalize so probabilities sum to one
        sums = z_dnk.sum(axis=1)
        z_dnk /= sums[:, np.newaxis]
        # z_dnk[docLens[d]:maxN,:] = 0  # zero probabilities for words that don't exist

        # Now use to calculate E[ln p(Z|topics)], E[ln p(W|Z)] and H[Z] in that order
        diTopic -= fns.digamma(np.sum(topicDists[d, :]))
        bound += np.sum(z_dnk * diTopic[np.newaxis, :])
        bound += np.sum(z_dnk[0:docLens[d], :].T * diWordDists[:, W_list[d, 0:docLens[d]]])
        bound -= np.sum(z_dnk[0:docLens[d], :] * safe_log(z_dnk[0:docLens[d], :]))

    # p(vocabDists|vocabPrior)
    ln_b_vocab = fns.gammaln(T * vocabPrior) - T * fns.gammaln(vocabPrior)
    bound += K * ln_b_vocab \
           + (vocabPrior - 1) * np.sum(diWordDists)

    # and its entropy
    ent = fns.gammaln(wordDistsMatrix.sum(axis=1)).sum() - fns.gammaln(wordDistsMatrix).sum() \
        + np.sum((wordDistsMatrix - 1) * diWordDists)
    bound -= ent

    return bound

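# Illustrative sketch: the gammaln/digamma expressions above are the standard
# Dirichlet identities. For q(theta) = Dir(a) and prior Dir(alpha):
#   E_q[ln theta_k]        = psi(a_k) - psi(sum_j a_j)
#   E_q[ln p(theta|alpha)] = gammaln(sum alpha) - sum gammaln(alpha) + sum_k (alpha_k - 1) E_q[ln theta_k]
#   H[q]                   = -( gammaln(sum a) - sum gammaln(a) + sum_k (a_k - 1) E_q[ln theta_k] )
# A standalone check with scipy; the function and argument names are invented.
def _example_dirichlet_terms(a, alpha):
    import numpy as np
    from scipy.special import gammaln, digamma

    e_ln_theta = digamma(a) - digamma(a.sum())
    ln_b_inv = lambda x: gammaln(x.sum()) - gammaln(x).sum()   # ln 1/B(x)

    expected_log_prior = ln_b_inv(alpha) + ((alpha - 1) * e_ln_theta).sum()
    neg_entropy = ln_b_inv(a) + ((a - 1) * e_ln_theta).sum()
    return expected_log_prior, -neg_entropy
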
def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
    print("Fully handcrafted example, K=Q")
    rd.seed(0xC0FFEE)  # Global init for repeatable test

    T = 100  # Vocabulary size, the number of "terms". Must be a square number
    Q = 6    # Topics: This cannot be changed without changing the code that generates the vocabulary
    K = 6    # Observed topics
    P = 8    # Features
    F = 12   # Observed features
    D = 200  # Sample documents (each with associated features)

    avgWordsPerDoc = 500

    # The vocabulary. Presented graphically there are two with horizontal bands
    # (upper lower); two with vertical bands (left, right); and two with
    # horizontal bands (inside, outside)
    vocab = makeSixTopicVocab(T)

    # Create our (sparse) features X, then our topic proportions ("tpcs")
    # then our word counts W
    lmda = np.zeros((D, K))
    X = np.zeros((D, F))
    for d in range(D):
        for _ in range(3):
            lmda[d, rd.randint(K)] += 1. / 3
        for _ in range(int(F / 3)):
            X[d, rd.randint(F)] += 1

    A = rd.random((K, F))
    X = lmda.dot(la.pinv(A).T)
    X = ssp.csr_matrix(X)

    tpcs = lmda

    docLens = rd.poisson(avgWordsPerDoc, (D,))
    W = tpcs.dot(vocab)
    W *= docLens[:, np.newaxis]
    W = np.array(W, dtype=np.int32)  # truncate word counts to integers
    W = ssp.csr_matrix(W)

    #
    # Now finally try to train the model
    #
    modelState = newVbModelState(K, Q, F, P, T)

    (trainedState, queryState) = train(modelState, X, W, logInterval=1, iterations=1)
    tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
    W_inf = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis], dtype=np.int32)
    priorReconsError = np.sum(np.square(W - W_inf)) / D

    (trainedState, queryState) = train(modelState, X, W, logInterval=1, plotInterval=100, iterations=130)
    tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
    W_inf = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:, np.newaxis], dtype=np.int32)

    print("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
    print("Model Driven: Final Reconstruction Error: %f" % (np.sum(np.square(W - W_inf)) / D,))

    print("End of Test")

def var_bound(data, model, query, z_dnk=None):
    '''
    Determines the variational bounds.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    K, topicPrior, wordPrior, wordDists, weights, negCount, reg, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.weights, model.pseudoNegCount, model.regularizer, model.dtype
    docLens, topicDists = \
        query.docLens, query.topicDists

    W, X = data.words, data.links
    D, T = W.shape
    minNonZero = 1E-300 if dtype is np.float64 else 1E-30

    # Perform the digamma transform for E[ln \theta] etc.
    topicDists = topicDists.copy()
    diTopicDists = fns.digamma(topicDists[:, :K])
    diSumTopicDists = fns.digamma(topicDists[:, :K].sum(axis=1))
    diWordDists = fns.digamma(model.wordDists)
    diSumWordDists = fns.digamma(model.wordDists.sum(axis=1))

    # E[ln p(topics|topicPrior)] according to q(topics)
    #
    prob_topics = D * (fns.gammaln(topicPrior[:K].sum()) - fns.gammaln(topicPrior[:K]).sum()) \
                + np.sum((topicPrior[:K] - 1)[np.newaxis, :] * (diTopicDists - diSumTopicDists[:, np.newaxis]))
    bound += prob_topics

    # and its entropy
    ent_topics = _dirichletEntropy(topicDists[:, :K])
    bound += ent_topics

    # E[ln p(vocabs|vocabPrior)]
    #
    if type(model.vocabPrior) is float or type(model.vocabPrior) is int:
        prob_vocabs = K * (fns.gammaln(wordPrior * T) - T * fns.gammaln(wordPrior)) \
                    + np.sum((wordPrior - 1) * (diWordDists - diSumWordDists[:, np.newaxis]))
    else:
        prob_vocabs = K * (fns.gammaln(wordPrior.sum()) - fns.gammaln(wordPrior).sum()) \
                    + np.sum((wordPrior - 1)[np.newaxis, :] * (diWordDists - diSumWordDists[:, np.newaxis]))
    bound += prob_vocabs

    # and its entropy
    ent_vocabs = _dirichletEntropy(wordDists)
    bound += ent_vocabs

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    topicMeans = _convertDirichletParamToMeans(docLens, topicDists, topicPrior)

    prob_words = 0
    prob_z = 0
    ent_z = 0
    for d in range(D):
        wordIdx, z = _infer_topics_at_d(d, data, weights, docLens, topicMeans, topicPrior, diWordDists, diSumWordDists)

        # E[ln p(Z|topics)] = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        exLnTopic = diTopicDists[d, :K] - diSumTopicDists[d]
        prob_z += np.dot(z * exLnTopic[:, np.newaxis], W[d, :].data).sum()

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(W[d, :].data[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diSumWordDists[:, np.newaxis]))

        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum()

    bound += (prob_z + ent_z + prob_words)

    # Next, the distribution over links - we just focus on the positives in this case
    for d in range(D):
        links = _links_up_to(d, X)
        if len(links) == 0:
            continue

        scores = topicMeans[links, :].dot(weights * topicMeans[d])
        probs = _probit_inplace(scores) + minNonZero
        lnProbs = np.log(probs, out=probs)

        # expected probability of all links from d to p < d such that y_dp = 1
        bound += lnProbs.sum()

    _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return bound

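# Illustrative sketch: _probit_inplace above is not shown here; given that it maps
# real-valued link scores to probabilities, it presumably evaluates the standard
# normal CDF, as in probit regression. A non-in-place reference using scipy:
def _example_probit(scores):
    from scipy.special import ndtr   # standard normal CDF, Phi(x)
    return ndtr(scores)
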
def var_bound(data, model, query, z_dnk=None):
    '''
    Determines the variational bounds.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    K, topicPrior, wordPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype
    docLens, topicDists = \
        query.docLens, query.topicDists

    # Initialize z matrix if necessary
    W, X = data.words, data.links
    D, T = W.shape

    # Perform the digamma transform for E[ln \theta] etc.
    topicDists = topicDists.copy()
    diTopicDists = fns.digamma(topicDists)
    diSumTopicDists = fns.digamma(topicDists.sum(axis=1))
    diWordDists = fns.digamma(model.wordDists)
    diSumWordDists = fns.digamma(model.wordDists.sum(axis=1))

    # E[ln p(topics|topicPrior)] according to q(topics)
    #
    prob_topics = D * (fns.gammaln(topicPrior.sum()) - fns.gammaln(topicPrior).sum()) \
                + np.sum((topicPrior - 1)[np.newaxis, :] * (diTopicDists - diSumTopicDists[:, np.newaxis]))
    bound += prob_topics

    # and its entropy
    ent_topics = _dirichletEntropy(topicDists)
    bound += ent_topics

    # E[ln p(vocabs|vocabPrior)]
    #
    if type(model.vocabPrior) is float or type(model.vocabPrior) is int:
        prob_vocabs = K * (fns.gammaln(wordPrior * T) - T * fns.gammaln(wordPrior)) \
                    + np.sum((wordPrior - 1) * (diWordDists - diSumWordDists[:, np.newaxis]))
    else:
        prob_vocabs = K * (fns.gammaln(wordPrior.sum()) - fns.gammaln(wordPrior).sum()) \
                    + np.sum((wordPrior - 1)[np.newaxis, :] * (diWordDists - diSumWordDists[:, np.newaxis]))
    bound += prob_vocabs

    # and its entropy
    ent_vocabs = _dirichletEntropy(wordDists)
    bound += ent_vocabs

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    topicMeans = _convertDirichletParamToMeans(docLens, topicDists, topicPrior)

    prob_words = 0
    prob_z = 0
    ent_z = 0
    for d in range(D):
        wordIdx, z = _infer_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diSumWordDists)

        # E[ln p(Z|topics)] = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        exLnTopic = diTopicDists[d, :] - diSumTopicDists[d]
        prob_z += np.dot(z * exLnTopic[:, np.newaxis], W[d, :].data).sum()

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(W[d, :].data[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diSumWordDists[:, np.newaxis]))

        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum()

    bound += (prob_z + ent_z + prob_words)

    _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return bound

def var_bound(data, model, query, z_dnk=None):
    '''
    Determines the variational bounds.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    docLens, topics, postTopicCov, U, V, tsums_bydoc, tsums_bytop, exp_tsums_bydoc, exp_tsums_bytop, lse_at_k, out_counts, in_counts = \
        query.docLens, query.topics, query.postTopicCov, query.U, query.V, query.tsums_bydoc, query.tsums_bytop, query.exp_tsums_bydoc, query.exp_tsums_bytop, query.lse_at_k, query.out_counts, query.in_counts
    K, Q, topicPrior, vocabPrior, wordDists, topicCov, dtype, name = \
        model.K, model.Q, model.topicPrior, model.vocabPrior, model.wordDists, model.topicCov, model.dtype, model.name

    W, L = data.words, data.links
    D, T = W.shape

    # Pre-calculate some repeated expressions
    logVagueness = log(Vagueness)
    halfDQ, halfQK, halfDK = 0.5 * D * Q, 0.5 * Q * K, 0.5 * D * K
    logTwoPi = log(2 * pi)
    logTwoPiE = log(2 * pi * e)

    # # E[ln p(U)]
    # bound += -halfDQ * logTwoPi - D * Q * logVagueness - 0.5 * np.sum(U * U)  # trace of U U'
    #
    # # H[q(U)]
    # bound += -halfDQ * logTwoPiE - D * Q * logVagueness
    #
    # # E[ln p(V)]
    # bound += -halfQK * logTwoPi - Q * K * logVagueness - 0.5 * np.sum(V * V)  # trace of V V'
    #
    # # H[q(V)]
    # bound += -halfQK * logTwoPiE - D * Q * logVagueness

    # ln p(Topics|U, V)
    logDetCov = log(la.det(topicCov))
    kernel = topics.copy()
    kernel -= U.dot(V)
    kernel **= 2
    kernel[:] = kernel.dot(topicCov)
    kernel /= (2 * Vagueness)
    bound += -halfDK * logTwoPi - halfDK * logVagueness \
             - D * 0.5 * logDetCov \
             - np.sum(kernel) \
             - np.sum(postTopicCov)  # FIXME bound here is squiffy

    # H[q(topics)]
    bound += -halfDK * logTwoPiE - halfDK * logVagueness - D * 0.5 * logDetCov

    # We'll need these for the next steps
    diWordDists = fns.digamma(wordDists)
    diWordDistSums = fns.digamma(wordDists.sum(axis=1))

    # P(z|topic) and P(y|topic) are not stored explicitly, so we need to
    # recalculate here to calculate their expected log-probs and entropies.
    prob_words, prob_links = 0, 0
    prob_z, ent_z = 0, 0
    prob_y, ent_y = 0, 0
    for d in range(D):
        # First the word-topic assignments, note this is a KxV matrix
        wordIdx, z = _infer_word_topics_at_d(d, W, topics, diWordDists, diWordDistSums)

        # E[ln p(Z|topics)] = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        prob_z += topics[d, :].dot(z * W[d, :].data[np.newaxis, :]).sum()
        prob_z -= docLens[d] * lse(topics[d, :])

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(W[d, :].data[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diWordDistSums[:, np.newaxis]))

        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum()

        # Next the link-topic assignments, note this is a PxK matrix
        linkIdx, y = _infer_link_topics_at_d(d, L, topics, lse_at_k)

        # Here we _start_ with the entropy of y
        ent_y -= np.dot(L[d, :].data, y * safe_log(y)).sum()

        # E[ln p(Y|topics)] = sum_d sum_m sum_k E[y_dmk] E[ln topicDist_dk]
        y *= L[d, :].data[:, np.newaxis]
        prob_y += y.dot(topics[d, :].T).sum()
        prob_y -= out_counts[d] * lse(topics[d, :])

        # E[ln p(L|Y)] = sum_d sum_m sum_k sum_t E[y_dmk] l_dmp E[ln topics_pk]
        prob_links += y.dot(topics[linkIdx, :].T).sum()
        prob_links -= y.dot(lse_at_k).sum()

    bound += (prob_z + ent_z + prob_words)
    bound += (prob_y + ent_y + prob_links)

    return bound

def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for each
    individual datapoint.

    Params:
    data  - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations, log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make
    a defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, corpusTopicDist, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.corpusTopicDist, model.dtype

    W = data.words
    iters, bnds, likes = [], [], []

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    for itr in range(iterations):
        # E-Step
        safe_log(wordDists, out=wordDists)
        safe_log(corpusTopicDist, out=corpusTopicDist)

        topicDists = W.dot(wordDists.T) + corpusTopicDist[np.newaxis, :]
        # topicDists -= topicDists.max(axis=1)[:, np.newaxis]  # TODO Ensure this is okay
        norms = fns.logsumexp(topicDists, axis=1)
        topicDists -= norms[:, np.newaxis]
        np.exp(topicDists, out=topicDists)

        # M-Step
        wordDists = (W.T.dot(topicDists)).T
        wordDists += vocabPrior
        wordDists /= wordDists.sum(axis=1)[:, np.newaxis]

        corpusTopicDist = topicDists.sum(axis=0)
        corpusTopicDist[:] += topicPrior
        corpusTopicDist /= corpusTopicDist.sum()

        if itr % logFrequency == 0 or debug:
            m = ModelState(K, topicPrior, vocabPrior, wordDists, corpusTopicDist, True, dtype, model.name)
            q = QueryState(query.docLens, topicDists, True)

            iters.append(itr)
            bnds.append(var_bound(data, m, q))
            likes.append(log_likelihood(data, m, q))

            perp = perplexity_from_like(likes[-1], W.sum())
            print("Iteration %d : Train Perp = %4.0f Bound = %.3f" % (itr, perp, bnds[-1]))

            if len(iters) > 2 and iters[-1] > 50:
                lastPerp = perplexity_from_like(likes[-2], W.sum())
                if lastPerp - perp < 1:
                    break

    return ModelState(K, topicPrior, vocabPrior, wordDists, corpusTopicDist, True, dtype, model.name), \
           QueryState(query.docLens, topicDists, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))

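# Illustrative sketch: the convergence check above compares successive training
# perplexities. perplexity_from_like is not shown here; the usual definition it
# presumably follows is exp(-log_likelihood / token_count), stated explicitly below.
def _example_perplexity_from_like(log_likelihood, token_count):
    import numpy as np
    return np.exp(-log_likelihood / token_count)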