def e_step(self, wordids, wordcts):
    """Run the variational E-step on one mini-batch of documents.

    For every document the variational Dirichlet parameter ``gamma`` is
    updated until the mean absolute change drops below
    ``self._minimum_mean_change_threshold`` or
    ``self._maximum_gamma_update_iteration`` iterations have run.  phi is
    never materialised; only its normaliser ``phi_norm`` is kept.

    Args:
        wordids: one entry per document, holding the vocabulary indices of
            the words in that document.
        wordcts: parallel to ``wordids``; the count of each listed word.

    Returns:
        ``(gamma, sstats, document_level_elbo)`` where ``gamma`` has one row
        per document, ``sstats`` has the shape of ``self._beta`` and
        ``document_level_elbo`` is the document-level bound rescaled to
        ``self._number_of_documents`` (left at 0 when ``self._compute_elbo``
        is false).
    """
    batch_size = len(wordids)
    document_level_elbo = 0

    # Initialize the variational distribution q(theta|gamma) for the mini-batch.
    gamma = 1 * numpy.random.gamma(100., 1. / 100., (batch_size, self._number_of_topics))
    exp_E_log_theta = numpy.exp(compute_dirichlet_expectation(gamma))

    sstats = numpy.zeros(self._beta.shape)

    # For each document d, update that document's gamma and (implicit) phi.
    for d in range(batch_size):
        ids = wordids[d]
        cts = wordcts[d]
        gammad = gamma[d, :]
        exp_E_log_theta_d = exp_E_log_theta[d, :]
        exp_E_log_beta_d = self._exp_E_log_beta[:, ids]

        # The optimal phi_{dwk} is proportional to
        # exp_E_log_theta_d[k] * exp_E_log_beta_d[k, w]; phi_norm is the
        # normaliser.  1e-100 keeps the later division away from zero.
        phi_norm = numpy.dot(exp_E_log_theta_d, exp_E_log_beta_d) + 1e-100

        # Iterate between gamma and phi until convergence.
        for _ in range(self._maximum_gamma_update_iteration):
            lastgamma = gammad
            # Substituting the optimal phi back into the gamma update gives
            # this closed form (cf. Lee & Seung 2001).
            gammad = self._alpha_theta + exp_E_log_theta_d * numpy.dot(cts / phi_norm, exp_E_log_beta_d.T)
            exp_E_log_theta_d = numpy.exp(compute_dirichlet_expectation(gammad))
            phi_norm = numpy.dot(exp_E_log_theta_d, exp_E_log_beta_d) + 1e-100
            # If gamma hasn't changed much, we're done.
            meanchange = numpy.mean(abs(gammad - lastgamma))
            if meanchange < self._minimum_mean_change_threshold:
                break

        gamma[d, :] = gammad

        # Contribution of document d to the expected sufficient statistics
        # for the M step.
        sstats[:, ids] += numpy.outer(exp_E_log_theta_d.T, cts / phi_norm)

        if self._compute_elbo:
            E_log_theta_d = compute_dirichlet_expectation(gammad)
            # E[log p(doc | theta, beta)]: phi_norm is a plain sum of
            # exponentials, so the bound needs its logarithm.
            document_level_elbo += numpy.sum(cts * numpy.log(phi_norm))
            # E[log p(theta | alpha) - log q(theta | gamma)] uses E[log theta],
            # not its exponential.
            document_level_elbo += numpy.sum((self._alpha_theta - gammad) * E_log_theta_d)
            document_level_elbo += numpy.sum(scipy.special.gammaln(gammad) - scipy.special.gammaln(self._alpha_theta))
            document_level_elbo += numpy.sum(scipy.special.gammaln(self._alpha_theta * self._number_of_topics) - scipy.special.gammaln(numpy.sum(gammad)))

    # This finishes the sufficient statistics for the M step, so that
    # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
    #              = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
    sstats = sstats * self._exp_E_log_beta

    # Rescale the mini-batch bound to the whole corpus.  float() avoids
    # integer floor division on Python 2; an empty batch is left at 0.
    if self._compute_elbo and batch_size > 0:
        document_level_elbo *= float(self._number_of_documents) / batch_size

    return gamma, sstats, document_level_elbo
def export_beta(self, exp_beta_path, top_display=-1):
    """Write each topic's ranked path weights to ``exp_beta_path``.

    For every topic a header line is written, followed by one
    ``"<type>\\t<weight>"`` line per path in descending weight order.  A
    path's weight is the product of ``exp(E_log_eta)`` over the edges along
    that path.

    Args:
        exp_beta_path: path of the text file to create or overwrite.
        top_display: when positive, at most this many lines are written per
            topic; otherwise every path is written.
    """
    # The context manager closes the file even if an assertion or a lookup
    # below raises.
    with open(exp_beta_path, 'w') as output:
        E_log_eta = numpy.copy(self._var_beta)
        assert E_log_eta.shape == (self._number_of_topics, self._number_of_edges)

        # Replace the raw parameters of each internal node's outgoing edges
        # by their Dirichlet expectation.
        for internal_node_index in self._edges_from_internal_node:
            edge_index_list = self._edges_from_internal_node[internal_node_index]
            assert numpy.min(E_log_eta[:, edge_index_list]) >= 0
            E_log_eta[:, edge_index_list] = compute_dirichlet_expectation(E_log_eta[:, edge_index_list])

        for topic_index in range(self._number_of_topics):
            output.write("==========\t%d\t==========\n" % (topic_index))

            freqdist = nltk.probability.FreqDist()
            for path_index in self._path_index_to_word_index:
                path_rank = 1
                for edge_index in self._edges_along_path[path_index]:
                    path_rank *= numpy.exp(E_log_eta[topic_index, edge_index])
                freqdist[path_index] += path_rank

            for rank, (path_index, path_freq) in enumerate(freqdist.most_common(), 1):
                word_index = self._path_index_to_word_index[path_index]
                output.write("%s\t%g\n" % (self._index_to_type[word_index], path_freq))
                if top_display > 0 and rank >= top_display:
                    break
def approx_bound(self, docs, gamma):
    """
    Estimates the variational bound over *all documents* using only
    the documents passed in as "docs." gamma is the set of parameters
    to the variational distribution q(theta) corresponding to the
    set of documents passed in.

    The output of this function is going to be noisy, but can be
    useful for assessing convergence.

    Args:
        docs: a list of documents, or a single document given as a string.
        gamma: array with one row of variational parameters per document.

    Returns:
        The estimated bound as a scalar.
    """
    # Handle the case where someone hands us a single document that is not
    # in a list.  The type name of a string is never 'string', so test the
    # type itself; basestring only exists on Python 2.
    try:
        string_types = (basestring,)
    except NameError:
        string_types = (str,)
    if isinstance(docs, string_types):
        docs = [docs]

    (wordids, wordcts) = self.parse_doc_list(docs)
    batch_size = len(docs)

    score = 0
    Elogtheta = compute_dirichlet_expectation(gamma)

    # E[log p(docs | theta, beta)]
    for d in range(batch_size):
        ids = wordids[d]
        cts = numpy.array(wordcts[d])
        # Row k, column i holds Elogtheta[d, k] + Elogbeta[k, ids[i]].
        log_joint = Elogtheta[d, :][:, numpy.newaxis] + self._Elogbeta[:, ids]
        # Log-sum-exp over topics, shifted by the column maximum for
        # numerical stability.
        column_max = numpy.max(log_joint, axis=0)
        phinorm = numpy.log(numpy.sum(numpy.exp(log_joint - column_max), axis=0)) + column_max
        score += numpy.sum(cts * phinorm)

    # E[log p(theta | alpha) - log q(theta | gamma)]
    score += numpy.sum((self._alpha_theta - gamma) * Elogtheta)
    score += numpy.sum(gammaln(gamma) - gammaln(self._alpha_theta))
    score += numpy.sum(gammaln(self._alpha_theta * self._number_of_topics) - gammaln(numpy.sum(gamma, 1)))

    # Compensate for the subsampling of the population of documents
    score = score * self._number_of_documents / len(docs)

    # E[log p(beta | eta) - log q (beta | lambda)]
    score = score + numpy.sum((self._alpha_eta - self._beta) * self._Elogbeta)
    score = score + numpy.sum(gammaln(self._beta) - gammaln(self._alpha_eta))
    score = score + numpy.sum(gammaln(self._alpha_eta * self._vocab_size) - gammaln(numpy.sum(self._beta, 1)))

    return score
def export_beta(self, exp_beta_path, top_display=-1):
    """Write each topic's ranked word probabilities to ``exp_beta_path``.

    The rows of ``compute_dirichlet_expectation(self._beta)`` are
    normalised with a log-sum-exp and written, one topic after another, as
    a header line followed by ``"<type>\\t<probability>"`` lines in
    descending probability order.

    Args:
        exp_beta_path: path of the text file to create or overwrite.
        top_display: when positive, at most this many words are written per
            topic; otherwise the whole vocabulary is written.
    """
    # NOTE(review): scipy.misc.logsumexp was removed in SciPy 1.3; on newer
    # SciPy this needs scipy.special.logsumexp - confirm the target version.
    # The context manager guarantees the file is closed on error.
    with open(exp_beta_path, 'w') as output:
        E_log_eta = compute_dirichlet_expectation(self._beta)

        for topic_index in range(self._number_of_topics):
            output.write("==========\t%d\t==========\n" % (topic_index))

            topic_row = E_log_eta[topic_index, :]
            beta_probability = numpy.exp(topic_row - scipy.misc.logsumexp(topic_row))

            # argsort is ascending, so walk it backwards for best-first.
            ranked_types = reversed(numpy.argsort(beta_probability))
            for rank, type_index in enumerate(ranked_types, 1):
                output.write("%s\t%g\n" % (self._index_to_type[type_index], beta_probability[type_index]))
                if top_display > 0 and rank >= top_display:
                    break
def export_beta(self, exp_beta_path, top_display=-1):
    """Write each topic's ranked word probabilities to ``exp_beta_path``.

    The rows of ``compute_dirichlet_expectation(self._eta)`` are normalised
    with a log-sum-exp and written, one topic after another, as a header
    line followed by ``"<type>\\t<probability>"`` lines in descending
    probability order.

    Args:
        exp_beta_path: path of the text file to create or overwrite.
        top_display: when positive, at most this many words are written per
            topic; otherwise the whole vocabulary is written.
    """
    # NOTE(review): scipy.misc.logsumexp was removed in SciPy 1.3; on newer
    # SciPy this needs scipy.special.logsumexp - confirm the target version.
    # The context manager guarantees the file is closed on error.
    with open(exp_beta_path, 'w') as output:
        E_log_eta = compute_dirichlet_expectation(self._eta)

        for topic_index in range(self._number_of_topics):
            output.write("==========\t%d\t==========\n" % (topic_index))

            log_weights = E_log_eta[topic_index, :]
            beta_probability = numpy.exp(log_weights - scipy.misc.logsumexp(log_weights))

            # argsort is ascending, so walk it backwards for best-first.
            written = 0
            for type_index in reversed(numpy.argsort(beta_probability)):
                written += 1
                output.write("%s\t%g\n" % (self._index_to_type[type_index], beta_probability[type_index]))
                if top_display > 0 and written >= top_display:
                    break
def e_step(self, wordids):
    """Run a sampling-based E-step over one mini-batch of documents.

    Every token of every document gets a topic resampled for
    ``self._number_of_samples`` sweeps.  Samples drawn after the first
    ``self._burn_in_sweeps`` sweeps are accumulated into the topic-word
    sufficient statistics, which are finally divided by the number of
    retained sweeps.

    Args:
        wordids: one entry per document; each document is a sequence of
            vocabulary indices, one per token.

    Returns:
        ``(batch_document_topic_distribution, sufficient_statistics,
        document_level_elbo)``.  The first has one row per document, the
        second is ``(number_of_topics, vocab_size)`` and the third stays 0
        unless ``self._compute_elbo`` is true.
    """
    number_of_topics = self._number_of_topics
    batchD = len(wordids)
    document_level_elbo = 0

    sufficient_statistics = numpy.zeros((number_of_topics, self._vocab_size))
    batch_document_topic_distribution = numpy.zeros((batchD, number_of_topics))

    for d in range(batchD):
        document = wordids[d]

        # Start from a random soft assignment, normalised per token.
        phi = numpy.random.random((number_of_topics, len(document)))
        phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :]
        phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis]
        assert phi_sum.shape == (number_of_topics, 1)

        for sweep in range(self._number_of_samples):
            for n, word_id in enumerate(document):
                # Take the current token out of the per-topic totals.
                phi_sum -= phi[:, n][:, numpy.newaxis]
                # Clamp tiny negatives caused by floating-point error in the
                # subtraction above; ideally phi becomes all integers after
                # a few sweeps.
                phi_sum *= phi_sum > 0

                temp_phi = (phi_sum + self._alpha_theta).T * self._exp_E_log_beta[:, word_id]
                assert temp_phi.shape == (1, number_of_topics)
                temp_phi /= numpy.sum(temp_phi)

                # Sample a topic for this token (one-hot column vector).
                temp_phi = numpy.random.multinomial(1, temp_phi[0, :])[:, numpy.newaxis]
                assert temp_phi.shape == (number_of_topics, 1)

                phi[:, n] = temp_phi[:, 0]
                phi_sum += temp_phi

                # Discard the first few burn-in sweeps.
                if sweep < self._burn_in_sweeps:
                    continue

                sufficient_statistics[:, word_id] += temp_phi[:, 0]

        batch_document_topic_distribution[d, :] = self._alpha_theta + phi_sum.T[0, :]

        if self._compute_elbo:
            document_level_elbo += len(document)
            gammad = batch_document_topic_distribution[d]
            document_level_elbo += numpy.sum(
                (self._alpha_theta - gammad) * numpy.exp(compute_dirichlet_expectation(gammad)))
            document_level_elbo += numpy.sum(
                scipy.special.gammaln(gammad) - scipy.special.gammaln(self._alpha_theta))
            document_level_elbo += numpy.sum(
                scipy.special.gammaln(self._alpha_theta * self._number_of_topics) -
                scipy.special.gammaln(numpy.sum(gammad)))

    sufficient_statistics /= (self._number_of_samples - self._burn_in_sweeps)

    # Rescale to the whole corpus.  float() avoids integer floor division on
    # Python 2; an empty batch is left at 0 instead of dividing by zero.
    if self._compute_elbo and batchD > 0:
        document_level_elbo *= float(self._number_of_documents) / batchD

    return batch_document_topic_distribution, sufficient_statistics, document_level_elbo
def e_step(self, parsed_corpus=None, local_parameter_iteration=50, local_parameter_converge_threshold=1e-6):
    """Run the variational E-step over a corpus of (ids, counts) documents.

    Args:
        parsed_corpus: ``None`` to run on ``self._parsed_corpus`` (training),
            otherwise a pair ``(word_ids, word_cts)`` of per-document
            sequences (held-out inference).
        local_parameter_iteration: maximum number of phi/gamma updates per
            document.
        local_parameter_converge_threshold: stop a document's updates once
            the mean absolute change in gamma is at or below this value.

    Returns:
        Training (``parsed_corpus is None``): ``(document_log_likelihood,
        phi_sufficient_statistics)``; ``self._gamma`` is also overwritten.
        Otherwise: ``(words_log_likelihood, gamma_values)``.
    """
    # NOTE(review): scipy.misc.logsumexp was removed in SciPy 1.3; on newer
    # SciPy this needs scipy.special.logsumexp - confirm the target version.
    training = parsed_corpus is None
    corpus = self._parsed_corpus if training else parsed_corpus
    word_ids = corpus[0]
    word_cts = corpus[1]
    assert len(word_ids) == len(word_cts)
    number_of_documents = len(word_ids)

    document_log_likelihood = 0
    words_log_likelihood = 0

    # K-by-V matrix of phi sufficient statistics.
    phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types))

    # D-by-K matrix of gamma values.
    gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics

    E_log_eta = compute_dirichlet_expectation(self._eta)
    assert E_log_eta.shape == (self._number_of_topics, self._number_of_types)
    if not training:
        E_log_prob_eta = E_log_eta - scipy.misc.logsumexp(E_log_eta, axis=1)[:, numpy.newaxis]

    # Documents are visited in random order.
    for doc_id in numpy.random.permutation(number_of_documents):
        total_word_count = numpy.sum(word_cts[doc_id])

        # Initialize gamma for this document.
        gamma_values[doc_id, :] = self._alpha_alpha + 1.0 * total_word_count / self._number_of_topics

        term_ids = word_ids[doc_id]
        term_counts = word_cts[doc_id]
        assert term_counts.shape == (1, len(term_ids))

        # Update phi and gamma until gamma converges.
        for gamma_iteration in range(local_parameter_iteration):
            assert E_log_eta.shape == (self._number_of_topics, self._number_of_types)

            # The (1, K) psi row broadcasts over the (N, K) word rows.
            log_phi = E_log_eta[:, term_ids].T + scipy.special.psi(gamma_values[[doc_id], :])
            assert log_phi.shape == (len(term_ids), self._number_of_topics)
            log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis]
            assert log_phi.shape == (len(term_ids), self._number_of_topics)

            gamma_update = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi + numpy.log(term_counts.transpose())), axis=0))

            mean_change = numpy.mean(abs(gamma_update - gamma_values[doc_id, :]))
            gamma_values[doc_id, :] = gamma_update
            if mean_change <= local_parameter_converge_threshold:
                break

        # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms
        # involving \Psi(\gamma), are cancelled due to \gamma updates in E-step.

        # compute the alpha terms
        document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
        # compute the gamma terms
        document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]))
        # compute the phi terms
        document_log_likelihood -= numpy.sum(numpy.dot(term_counts, numpy.exp(log_phi) * log_phi))

        # Note: all terms including E_q[p(\eta | \beta)], i.e., terms
        # involving \Psi(\eta), are cancelled due to \eta updates in M-step.
        if not training:
            # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be
            # cancelled during M-step during training
            words_log_likelihood += numpy.sum(numpy.exp(log_phi.T + numpy.log(term_counts)) * E_log_prob_eta[:, term_ids])

        assert log_phi.shape == (len(term_ids), self._number_of_topics)
        phi_sufficient_statistics[:, term_ids] += numpy.exp(log_phi + numpy.log(term_counts.transpose())).T

        if (doc_id + 1) % 1000 == 0:
            # Single parenthesised argument: prints identically on Python 2 and 3.
            print("successfully processed %d documents..." % (doc_id + 1))

    if training:
        self._gamma = gamma_values
        return document_log_likelihood, phi_sufficient_statistics
    return words_log_likelihood, gamma_values
def e_step(self, parsed_corpus=None, number_of_samples=10, burn_in_samples=5):
    """Run a sampling-based E-step that assigns each token a (topic, path).

    Each token is first given a random topic and a random path among the
    paths leading to its word, then resampled for ``number_of_samples``
    sweeps.  Sweeps from index ``burn_in_samples`` onwards contribute to the
    sufficient statistics and to the document's gamma.

    Args:
        parsed_corpus: ``None`` to run on ``self._parsed_corpus`` (training),
            otherwise a sequence of documents, each a sequence of word
            indices (held-out inference).
        number_of_samples: number of sweeps over each document.
        burn_in_samples: number of leading sweeps that are discarded.

    Returns:
        Training (``parsed_corpus is None``): ``(document_log_likelihood,
        phi_sufficient_statistics)``; ``self._gamma`` is also overwritten.
        Otherwise: ``(words_log_likelihood, gamma_values)``.
    """
    training = parsed_corpus is None
    documents = self._parsed_corpus if training else parsed_corpus
    number_of_documents = len(documents)
    number_of_topics = self._number_of_topics
    retained_samples = number_of_samples - burn_in_samples

    document_log_likelihood = 0
    words_log_likelihood = 0

    # K-by-P matrix of phi contributions.
    phi_sufficient_statistics = numpy.zeros((number_of_topics, self._number_of_paths))

    gamma_values = self._alpha_alpha + 2.0 * self._number_of_paths / number_of_topics * numpy.random.random((number_of_documents, number_of_topics))

    # Replace the raw parameters of each internal node's outgoing edges by
    # their Dirichlet expectation.
    E_log_eta = numpy.copy(self._var_beta)
    for internal_node_index in self._edges_from_internal_node:
        edge_index_list = self._edges_from_internal_node[internal_node_index]
        assert numpy.min(E_log_eta[:, edge_index_list]) >= 0
        E_log_eta[:, edge_index_list] = compute_dirichlet_expectation(E_log_eta[:, edge_index_list])

    for document_index in range(number_of_documents):
        document = documents[document_index]
        document_length = len(document)

        document_gamma = numpy.zeros(self._alpha_alpha.shape)
        topic_path_assignment = {}
        topic_sum = numpy.zeros((1, number_of_topics))

        # Random initial (topic, path) for every token.
        for word_index in range(document_length):
            topic_assignment = numpy.random.randint(0, number_of_topics)
            path_assignment = numpy.random.randint(0, len(self._word_index_to_path_indices[document[word_index]]))
            topic_path_assignment[word_index] = (topic_assignment, path_assignment)
            topic_sum[0, topic_assignment] += 1

        # Both accumulators are reset every sweep, so after the loop they
        # hold the values of the last sweep only.
        phi_entropy = 0
        phi_E_log_eta = 0

        for sample_index in range(number_of_samples):
            phi_entropy = 0
            phi_E_log_eta = 0

            for word_index in range(document_length):
                word_id = document[word_index]
                # Remove the token's current topic from the totals.
                topic_sum[0, topic_path_assignment[word_index][0]] -= 1
                paths_lead_to_current_word = self._word_index_to_path_indices[word_id]
                number_of_candidate_paths = len(paths_lead_to_current_word)
                assert number_of_candidate_paths > 0

                # Per-path log weight: sum of E_log_eta over the path's edges.
                path_log_weights = [
                    numpy.sum(E_log_eta[:, self._edges_along_path[path]], axis=1)
                    for path in paths_lead_to_current_word
                ]

                path_phi = numpy.tile((topic_sum + self._alpha_alpha).T, (1, number_of_candidate_paths))
                assert path_phi.shape == (number_of_topics, number_of_candidate_paths)
                for path_index in range(number_of_candidate_paths):
                    path_phi[:, path_index] *= numpy.exp(path_log_weights[path_index])
                assert path_phi.shape == (number_of_topics, number_of_candidate_paths)

                # Normalize jointly over all (topic, path) pairs.
                path_phi /= numpy.sum(path_phi)

                # compute the phi terms
                phi_entropy += -numpy.sum(path_phi * numpy.log(path_phi + 1e-100))

                # Inverse-CDF draw over the flattened (topic, path) table.
                # If rounding leaves the draw positive after the last cell,
                # the last cell is used.
                random_number = numpy.random.random()
                for topic_index in range(number_of_topics):
                    for path_index in range(number_of_candidate_paths):
                        random_number -= path_phi[topic_index, path_index]
                        if random_number <= 0:
                            break
                    if random_number <= 0:
                        break

                topic_sum[0, topic_index] += 1
                topic_path_assignment[word_index] = (topic_index, path_index)
                if sample_index >= burn_in_samples:
                    phi_sufficient_statistics[topic_index, paths_lead_to_current_word[path_index]] += 1

                for position_index in range(number_of_candidate_paths):
                    phi_E_log_eta += numpy.sum(path_phi[:, [position_index]] * path_log_weights[position_index][:, numpy.newaxis])

            if sample_index >= burn_in_samples:
                document_gamma += self._alpha_alpha + topic_sum

        gamma_values[[document_index], :] = document_gamma / retained_samples

        # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms
        # involving \Psi(\gamma), are cancelled due to \gamma updates in E-step.

        # compute the alpha terms
        document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
        # compute the gamma terms
        document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[document_index, :])) - scipy.special.gammaln(numpy.sum(gamma_values[document_index, :]))
        # compute the phi terms
        document_log_likelihood += phi_entropy

        # Note: all terms including E_q[p(\eta | \beta)], i.e., terms
        # involving \Psi(\eta), are cancelled due to \eta updates in M-step.
        if not training:
            # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be
            # cancelled during M-step during training
            words_log_likelihood += phi_E_log_eta

        if (document_index + 1) % 1000 == 0:
            # Single parenthesised argument: prints identically on Python 2 and 3.
            print("successfully processed %d documents..." % (document_index + 1))

    phi_sufficient_statistics /= retained_samples
    assert phi_sufficient_statistics.shape == (number_of_topics, self._number_of_paths)

    if training:
        self._gamma = gamma_values
        return document_log_likelihood, phi_sufficient_statistics
    return words_log_likelihood, gamma_values
def e_step(self, parsed_corpus_response=None, number_of_samples=10, burn_in_samples=5, approximate_phi=True):
    """Run a sampling-based E-step for the model with per-document responses.

    Every token's topic is resampled for ``number_of_samples`` sweeps;
    sweeps from index ``burn_in_samples`` onwards are accumulated into the
    sufficient statistics and into the per-document average assignment.

    Args:
        parsed_corpus_response: ``None`` to run on ``self._parsed_corpus``
            and ``self._responses`` (training), otherwise a pair
            ``(word_idss, responses)``.
        number_of_samples: number of sweeps over each document.
        burn_in_samples: number of leading sweeps that are discarded.
        approximate_phi: accepted for interface compatibility; not read here.

    Returns:
        Training: ``(document_log_likelihood, phi_sufficient_statistics,
        E_A_sufficient_statistics, E_AA_sufficient_statistics)``;
        ``self._gamma`` is also overwritten.
        Otherwise: ``(words_log_likelihood, gamma_values, predictions)``
        where ``predictions`` is ``E_A_sufficient_statistics . eta^T``.
    """
    # NOTE(review): scipy.misc.logsumexp was removed in SciPy 1.3; on newer
    # SciPy this needs scipy.special.logsumexp - confirm the target version.
    training = parsed_corpus_response is None
    if training:
        word_idss = self._parsed_corpus
        responses = self._responses
    else:
        word_idss, responses = parsed_corpus_response

    number_of_documents = len(word_idss)
    number_of_topics = self._number_of_topics
    retained_samples = number_of_samples - burn_in_samples

    document_log_likelihood = 0
    words_log_likelihood = 0

    # K-by-V matrix of phi sufficient statistics.
    phi_sufficient_statistics = numpy.zeros((number_of_topics, self._number_of_types))
    E_A_sufficient_statistics = numpy.zeros((number_of_documents, number_of_topics))
    E_AA_sufficient_statistics = numpy.zeros((number_of_topics, number_of_topics))

    # D-by-K matrix of gamma values.
    gamma_values = numpy.zeros((number_of_documents, number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / number_of_topics

    E_log_beta = compute_dirichlet_expectation(self._beta)
    assert E_log_beta.shape == (number_of_topics, self._number_of_types)
    if not training:
        E_log_prob_eta = E_log_beta - scipy.misc.logsumexp(E_log_beta, axis=1)[:, numpy.newaxis]
    exp_E_log_beta = numpy.exp(E_log_beta)

    for doc_id in range(number_of_documents):
        document = word_idss[doc_id]
        document_length = len(document)

        # Start from a random soft assignment, normalised per token.
        phi = numpy.random.random((number_of_topics, document_length))
        phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :]
        phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis]
        assert phi_sum.shape == (number_of_topics, 1)

        document_phi = numpy.zeros((document_length, number_of_topics))

        for sample_index in range(number_of_samples):
            for word_pos in range(document_length):
                word_id = document[word_pos]
                phi_sum -= phi[:, word_pos][:, numpy.newaxis]
                # Clamp tiny negatives caused by floating-point error in the
                # subtraction above; ideally phi becomes all integers after
                # a few sweeps.
                phi_sum *= (phi_sum > 0)

                temp_phi = (phi_sum.T + self._alpha_alpha) * exp_E_log_beta[:, [word_id]].T
                assert temp_phi.shape == (1, number_of_topics)
                temp_phi /= numpy.sum(temp_phi)

                # Sample a topic for this token (one-hot column vector).
                temp_phi = numpy.random.multinomial(1, temp_phi[0])[:, numpy.newaxis]
                assert temp_phi.shape == (number_of_topics, 1)

                phi[:, word_pos] = temp_phi[:, 0]
                phi_sum += temp_phi

                # Discard the first few burn-in sweeps.
                if sample_index < burn_in_samples:
                    continue

                phi_sufficient_statistics[:, word_id] += temp_phi[:, 0]
                document_phi[word_pos, :] += temp_phi[:, 0]

        gamma_values[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :]

        document_phi /= retained_samples
        # this is to prevent 0 during log()
        document_phi += 1e-100
        assert document_phi.shape == (document_length, number_of_topics)

        phi_mean = numpy.mean(document_phi, axis=0)
        assert phi_mean.shape == (number_of_topics,)
        phi_mean_outer = numpy.dot(phi_mean[:, numpy.newaxis], phi_mean[numpy.newaxis, :])

        # Note: all terms including E_q[p(\theta|\alpha)], i.e., terms
        # involving \Psi(\gamma), are cancelled due to \gamma updates.
        # Note: all terms including E_q[p(\eta | \beta)], i.e., terms
        # involving \Psi(\eta), are cancelled due to \eta updates in M-step.

        # compute the alpha terms
        document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
        # compute the gamma terms
        document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]))
        # compute the phi terms
        document_log_likelihood -= numpy.sum(numpy.log(document_phi) * document_phi)
        # compute the eta terms
        document_log_likelihood -= 0.5 * numpy.log(2 * numpy.pi * self._sigma_square)
        document_log_likelihood -= 0.5 * (responses[doc_id] ** 2 - 2 * responses[doc_id] * numpy.sum(self._eta[0, :] * phi_mean) + numpy.dot(numpy.dot(self._eta, phi_mean_outer), self._eta.T)) / self._sigma_square

        if not training:
            # compute the p(w_{dn} | z_{dn}, \_eta) terms, which will be
            # cancelled during M-step during training.  phi is (K, N) and
            # so is the sliced E_log_prob_eta, so they multiply elementwise
            # without a transpose.
            words_log_likelihood += numpy.sum(phi * E_log_prob_eta[:, document])

        E_A_sufficient_statistics[doc_id, :] = phi_mean
        E_AA_sufficient_statistics += phi_mean_outer

        if (doc_id + 1) % 1000 == 0:
            # Single parenthesised argument: prints identically on Python 2 and 3.
            print("successfully processed %d documents..." % (doc_id + 1))

    phi_sufficient_statistics /= retained_samples

    if training:
        self._gamma = gamma_values
        return document_log_likelihood, phi_sufficient_statistics, E_A_sufficient_statistics, E_AA_sufficient_statistics
    return words_log_likelihood, gamma_values, numpy.dot(E_A_sufficient_statistics, self._eta.T)
def approx_bound(self, docs, gamma):
    """
    Estimates the variational bound over *all documents* using only
    the documents passed in as "docs." gamma is the set of parameters
    to the variational distribution q(theta) corresponding to the
    set of documents passed in.

    The output of this function is going to be noisy, but can be
    useful for assessing convergence.

    Args:
        docs: a list of documents, or a single document given as a string.
        gamma: array with one row of variational parameters per document.

    Returns:
        The estimated bound as a scalar.
    """
    # Handle the case where someone hands us a single document that is not
    # in a list.  The type name of a string is never 'string', so test the
    # type itself; basestring only exists on Python 2.
    try:
        text_types = (basestring,)
    except NameError:
        text_types = (str,)
    if isinstance(docs, text_types):
        docs = [docs]

    (wordids, wordcts) = self.parse_doc_list(docs)
    batch_size = len(docs)

    score = 0
    Elogtheta = compute_dirichlet_expectation(gamma)

    # E[log p(docs | theta, beta)]
    for d in range(batch_size):
        ids = wordids[d]
        cts = numpy.array(wordcts[d])
        # Row k, column i holds Elogtheta[d, k] + Elogbeta[k, ids[i]].
        topic_word_log = Elogtheta[d, :][:, numpy.newaxis] + self._Elogbeta[:, ids]
        # Log-sum-exp over topics, shifted by the column maximum for
        # numerical stability.
        shift = numpy.max(topic_word_log, axis=0)
        phinorm = numpy.log(numpy.sum(numpy.exp(topic_word_log - shift), axis=0)) + shift
        score += numpy.sum(cts * phinorm)

    # E[log p(theta | alpha) - log q(theta | gamma)]
    score += numpy.sum((self._alpha_theta - gamma) * Elogtheta)
    score += numpy.sum(gammaln(gamma) - gammaln(self._alpha_theta))
    score += numpy.sum(
        gammaln(self._alpha_theta * self._number_of_topics) -
        gammaln(numpy.sum(gamma, 1)))

    # Compensate for the subsampling of the population of documents
    score = score * self._number_of_documents / len(docs)

    # E[log p(beta | eta) - log q (beta | lambda)]
    score = score + numpy.sum(
        (self._alpha_eta - self._beta) * self._Elogbeta)
    score = score + numpy.sum(
        gammaln(self._beta) - gammaln(self._alpha_eta))
    score = score + numpy.sum(
        gammaln(self._alpha_eta * self._vocab_size) -
        gammaln(numpy.sum(self._beta, 1)))

    return score
def e_step(self, parsed_corpus=None, local_parameter_iteration=50, local_parameter_converge_threshold=1e-6):
    """Run the variational E-step over a corpus of (ids, counts) documents.

    Args:
        parsed_corpus: ``None`` to run on ``self._parsed_corpus`` (training),
            otherwise a pair ``(word_ids, word_cts)`` of per-document
            sequences (held-out inference).
        local_parameter_iteration: maximum number of phi/gamma updates per
            document.
        local_parameter_converge_threshold: stop a document's updates once
            the mean absolute change in gamma is at or below this value.

    Returns:
        Training (``parsed_corpus is None``): ``(document_log_likelihood,
        phi_sufficient_statistics)``; ``self._gamma`` is also overwritten.
        Otherwise: ``(words_log_likelihood, gamma_values)``.
    """
    # NOTE(review): scipy.misc.logsumexp was removed in SciPy 1.3; on newer
    # SciPy this needs scipy.special.logsumexp - confirm the target version.
    is_training = parsed_corpus is None
    if is_training:
        word_ids = self._parsed_corpus[0]
        word_cts = self._parsed_corpus[1]
    else:
        word_ids = parsed_corpus[0]
        word_cts = parsed_corpus[1]

    assert len(word_ids) == len(word_cts)
    number_of_documents = len(word_ids)

    document_log_likelihood = 0
    words_log_likelihood = 0

    # K-by-V matrix of phi sufficient statistics.
    phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types))

    # D-by-K matrix of gamma values.
    gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics

    E_log_eta = compute_dirichlet_expectation(self._eta)
    assert E_log_eta.shape == (self._number_of_topics, self._number_of_types)
    if not is_training:
        E_log_prob_eta = E_log_eta - scipy.misc.logsumexp(E_log_eta, axis=1)[:, numpy.newaxis]

    # Documents are visited in random order.
    for doc_id in numpy.random.permutation(number_of_documents):
        total_word_count = numpy.sum(word_cts[doc_id])

        # Initialize gamma for this document.
        gamma_values[doc_id, :] = self._alpha_alpha + 1.0 * total_word_count / self._number_of_topics

        term_ids = word_ids[doc_id]
        term_counts = word_cts[doc_id]
        assert term_counts.shape == (1, len(term_ids))

        # Update phi and gamma until gamma converges.
        for gamma_iteration in range(local_parameter_iteration):
            assert E_log_eta.shape == (self._number_of_topics, self._number_of_types)

            # The (1, K) psi row broadcasts over the (N, K) word rows.
            log_phi = E_log_eta[:, term_ids].T + scipy.special.psi(gamma_values[[doc_id], :])
            assert log_phi.shape == (len(term_ids), self._number_of_topics)
            log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis]
            assert log_phi.shape == (len(term_ids), self._number_of_topics)

            gamma_update = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi + numpy.log(term_counts.transpose())), axis=0))

            mean_change = numpy.mean(abs(gamma_update - gamma_values[doc_id, :]))
            gamma_values[doc_id, :] = gamma_update
            if mean_change <= local_parameter_converge_threshold:
                break

        # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms
        # involving \Psi(\gamma), are cancelled due to \gamma updates in E-step.

        # compute the alpha terms
        document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
        # compute the gamma terms
        document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]))
        # compute the phi terms; the dot product yields one value per topic,
        # so it is summed to keep the running likelihood a scalar.
        document_log_likelihood -= numpy.sum(numpy.dot(term_counts, numpy.exp(log_phi) * log_phi))

        # Note: all terms including E_q[p(\eta | \beta)], i.e., terms
        # involving \Psi(\eta), are cancelled due to \eta updates in M-step.
        if not is_training:
            # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be
            # cancelled during M-step during training
            words_log_likelihood += numpy.sum(numpy.exp(log_phi.T + numpy.log(term_counts)) * E_log_prob_eta[:, term_ids])

        assert log_phi.shape == (len(term_ids), self._number_of_topics)
        phi_sufficient_statistics[:, term_ids] += numpy.exp(log_phi + numpy.log(term_counts.transpose())).T

        if (doc_id + 1) % 1000 == 0:
            # Single parenthesised argument: prints identically on Python 2 and 3.
            print("successfully processed %d documents..." % (doc_id + 1))

    if is_training:
        self._gamma = gamma_values
        return document_log_likelihood, phi_sufficient_statistics
    return words_log_likelihood, gamma_values
def e_step(self, parsed_corpus_labels=None, local_gamma_iteration=10, local_phi_iteration=10,
           local_parameter_converge_threshold=1e-6, approximate_phi=False):
    """Run the variational E-step for the label-aware model.

    When ``parsed_corpus_labels`` is None the method works on
    ``self._parsed_corpus`` together with ``self._parsed_labels`` and stores
    the resulting gamma in ``self._gamma``.  Otherwise the argument is used as
    the list of documents and no labels are consulted.

    Parameters:
        parsed_corpus_labels: optional sequence of documents, each a sequence
            of word-type ids used to index the columns of ``self._beta``.
        local_gamma_iteration: maximum number of sweeps over a document.
        local_phi_iteration: number of fixed-point updates applied to each
            token's phi inside a sweep.
        local_parameter_converge_threshold: a document stops sweeping once the
            mean absolute change of its gamma row is at or below this value.
        approximate_phi: when true no phi/gamma update is performed at all
            (the approximate update is not implemented here).

    Returns:
        With the stored corpus: ``(document_log_likelihood,
        phi_sufficient_statistics, E_A_sufficient_statistics,
        E_AA_sufficient_statistics)``.
        With a supplied corpus: ``(words_log_likelihood, gamma_values,
        numpy.dot(E_A_sufficient_statistics, self._eta.T))``.
    """
    # Identity test on purpose: `== None` is evaluated element-wise when the
    # corpus is handed over as a numpy array.
    training = parsed_corpus_labels is None
    if training:
        word_idss = self._parsed_corpus
        label_idss = self._parsed_labels
    else:
        word_idss = parsed_corpus_labels
        label_idss = None

    number_of_documents = len(word_idss)

    document_log_likelihood = 0
    words_log_likelihood = 0

    # K-by-V accumulator of phi, plus per-document first/second moments of
    # the mean topic assignment
    phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types))
    E_A_sufficient_statistics = numpy.zeros((number_of_documents, self._number_of_topics))
    E_AA_sufficient_statistics = numpy.zeros(
        (number_of_documents, self._number_of_topics, self._number_of_topics))

    # D-by-K gamma values
    gamma_values = (numpy.zeros((number_of_documents, self._number_of_topics))
                    + self._alpha_alpha[numpy.newaxis, :]
                    + 1.0 * self._number_of_types / self._number_of_topics)

    E_log_beta = compute_dirichlet_expectation(self._beta)
    assert E_log_beta.shape == (self._number_of_topics, self._number_of_types)
    if not training:
        E_log_prob_eta = E_log_beta - scipy.misc.logsumexp(E_log_beta, axis=1)[:, numpy.newaxis]

    for doc_id in range(number_of_documents):
        term_ids = word_idss[doc_id]
        total_word_count = len(term_ids)
        if training:
            label_ids = label_idss[doc_id]

        # initialize gamma and phi for this document
        gamma_values[doc_id, :] = self._alpha_alpha + 1.0 * total_word_count / self._number_of_topics
        log_phi = scipy.special.psi(gamma_values[doc_id, :][numpy.newaxis, :]) + E_log_beta[:, term_ids].T
        log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis]
        assert log_phi.shape == (len(term_ids), self._number_of_topics)

        assert self._eta.shape == (self._number_of_labels, self._number_of_topics)

        # For every label: the product over tokens of sum_k phi_nk * exp(eta_lk / N)
        auxilary_variables_per_label = numpy.zeros(len(self._index_to_label))
        for label_index in self._index_to_label:
            log_sum_phi_exp_eta = scipy.misc.logsumexp(
                log_phi + self._eta[label_index, :][numpy.newaxis, :] / total_word_count, axis=1)
            assert log_sum_phi_exp_eta.shape == (len(term_ids),)
            auxilary_variables_per_label[label_index] = numpy.exp(numpy.sum(log_sum_phi_exp_eta))

        # update phi and gamma until gamma converges
        for gamma_iteration in range(local_gamma_iteration):
            if approximate_phi:
                # No approximate update exists for this model; nothing changes.
                break

            old_gamma_values = gamma_values[doc_id, :].copy()
            assert log_phi.shape == (len(term_ids), self._number_of_topics)

            for term_pos, term_id in enumerate(term_ids):
                # Divide this token's factor out of each label product and
                # collect the resulting weights into h.
                h_vector = numpy.zeros(self._number_of_topics)
                for label_index in self._index_to_label:
                    sum_phi_n_exp_eta = numpy.exp(scipy.misc.logsumexp(
                        log_phi[term_pos, :] + self._eta[label_index, :] / total_word_count))
                    auxilary_variables_per_label[label_index] /= sum_phi_n_exp_eta
                    h_vector += auxilary_variables_per_label[label_index] * numpy.exp(
                        self._eta[label_index, :] / total_word_count)

                for phi_iteration in range(local_phi_iteration):
                    phi_n = numpy.exp(log_phi[term_pos, :])

                    log_phi_n = scipy.special.psi(gamma_values[doc_id, :]) + E_log_beta[:, term_id]
                    assert log_phi_n.shape == (self._number_of_topics,)

                    if training:
                        log_phi_n += numpy.sum(self._eta[label_ids, :], axis=0) / total_word_count
                        log_phi_n -= len(label_ids) * h_vector / numpy.dot(h_vector, phi_n)
                    else:
                        log_phi_n -= h_vector / numpy.dot(h_vector, phi_n)
                    assert log_phi_n.shape == (self._number_of_topics,)

                    log_phi_n -= scipy.misc.logsumexp(log_phi_n)
                    log_phi[term_pos, :] = log_phi_n

                # Multiply the refreshed token factor back into each label product.
                for label_index in self._index_to_label:
                    sum_phi_n_exp_eta = numpy.exp(scipy.misc.logsumexp(
                        log_phi[term_pos, :] + self._eta[label_index, :] / total_word_count))
                    auxilary_variables_per_label[label_index] *= sum_phi_n_exp_eta

                # NOTE(review): gamma is refreshed after every token update (which is
                # why a snapshot is taken before the sweep); this nesting was
                # reconstructed from whitespace-collapsed code -- confirm.
                gamma_values[doc_id, :] = self._alpha_alpha + numpy.array(
                    numpy.sum(numpy.exp(log_phi), axis=0))

            mean_change = numpy.mean(abs(gamma_values[doc_id, :] - old_gamma_values))
            if mean_change <= local_parameter_converge_threshold:
                break

        phi = numpy.exp(log_phi)
        assert phi.shape == (len(term_ids), self._number_of_topics)
        phi_mean = numpy.mean(phi, axis=0)
        assert phi_mean.shape == (self._number_of_topics,)

        # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms involving \Psi(\gamma),
        # are cancelled due to \gamma updates in E-step

        # compute the alpha terms
        document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(
            scipy.special.gammaln(self._alpha_alpha))
        # compute the gamma terms
        document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) \
            - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]))
        # compute the phi terms
        document_log_likelihood -= numpy.sum(phi * log_phi)
        # compute the eta terms (only meaningful when labels are available)
        if training:
            document_log_likelihood += numpy.dot(numpy.sum(self._eta[label_ids, :], axis=0), phi_mean)
            document_log_likelihood -= numpy.log(numpy.sum(auxilary_variables_per_label))

        # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta),
        # are cancelled due to \eta updates in M-step
        if not training:
            # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step during training
            words_log_likelihood += numpy.sum(phi.T * E_log_prob_eta[:, term_ids])

        assert phi.shape == (len(term_ids), self._number_of_topics)
        # Token by token, so that repeated word types accumulate correctly.
        for term_pos, term_id in enumerate(term_ids):
            phi_sufficient_statistics[:, term_id] += phi[term_pos, :]

        E_A_sufficient_statistics[doc_id, :] = phi_mean
        E_AA_sufficient_statistics[doc_id, :, :] = numpy.dot(
            phi_mean[:, numpy.newaxis], phi_mean[numpy.newaxis, :])

        if (doc_id + 1) % 10 == 0:
            print("successfully processed %d documents..." % (doc_id + 1))

    if training:
        self._gamma = gamma_values
        return (document_log_likelihood, phi_sufficient_statistics,
                E_A_sufficient_statistics, E_AA_sufficient_statistics)
    else:
        return words_log_likelihood, gamma_values, numpy.dot(E_A_sufficient_statistics, self._eta.T)
def e_step(self, parsed_corpus=None, number_of_samples=10, burn_in_samples=5):
    """Run the sampling-based ("hybrid") E-step.

    Every token's topic is resampled ``number_of_samples`` times; samples
    drawn in sweeps with index below ``burn_in_samples`` are discarded and the
    remaining ones are averaged into the phi statistics.

    Parameters:
        parsed_corpus: optional sequence of documents, each a sequence of
            word-type ids.  When None, ``self._parsed_corpus`` is used and the
            resulting gamma is stored in ``self._gamma``.
        number_of_samples: number of sweeps over each document.
        burn_in_samples: number of leading sweeps whose samples are dropped.

    Returns:
        ``(document_log_likelihood, phi_sufficient_statistics)`` when working
        on the stored corpus, ``(words_log_likelihood, gamma_values)`` when a
        corpus is supplied.
    """
    # Identity test on purpose: `== None` is evaluated element-wise when the
    # corpus is handed over as a numpy array.
    training = parsed_corpus is None
    word_idss = self._parsed_corpus if training else parsed_corpus

    number_of_documents = len(word_idss)

    E_log_eta = compute_dirichlet_expectation(self._eta)
    exp_E_log_eta = numpy.exp(E_log_eta)

    document_log_likelihood = 0
    words_log_likelihood = 0

    # K-by-V accumulator of sampled topic assignments
    phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types))

    # D-by-K gamma values
    gamma_values = (numpy.zeros((number_of_documents, self._number_of_topics))
                    + self._alpha_alpha[numpy.newaxis, :]
                    + 1.0 * self._number_of_types / self._number_of_topics)

    retained_samples = number_of_samples - burn_in_samples

    for doc_id in range(number_of_documents):
        word_ids = word_idss[doc_id]
        document_length = len(word_ids)

        # random soft assignment to start from; phi_sum tracks per-topic totals
        phi = numpy.random.random((self._number_of_topics, document_length))
        phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :]
        phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis]
        assert phi_sum.shape == (self._number_of_topics, 1)

        document_phi = numpy.zeros((document_length, self._number_of_topics))

        # collect phi samples from empirical distribution
        for it in range(number_of_samples):
            for word_pos, word_index in enumerate(word_ids):
                # remove the token's current assignment from the totals
                phi_sum -= phi[:, word_pos][:, numpy.newaxis]
                # clip tiny negative round-off left by the subtraction; phi becomes
                # one-hot after the first sweep so the totals become counts
                phi_sum *= (phi_sum > 0)

                temp_phi = (phi_sum.T + self._alpha_alpha) * exp_E_log_eta[:, [word_index]].T
                assert temp_phi.shape == (1, self._number_of_topics)
                temp_phi /= numpy.sum(temp_phi)

                # sample a topic for this word
                temp_phi = numpy.random.multinomial(1, temp_phi[0])[:, numpy.newaxis]
                assert temp_phi.shape == (self._number_of_topics, 1)

                phi[:, word_pos][:, numpy.newaxis] = temp_phi
                phi_sum += temp_phi

                # discard the first few burn-in sweeps
                if it < burn_in_samples:
                    continue

                phi_sufficient_statistics[:, word_index] += temp_phi[:, 0]
                document_phi[word_pos, :] += temp_phi[:, 0]

        gamma_values[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :]

        document_phi /= retained_samples
        # this is to prevent 0 during log()
        document_phi += 1e-100

        # Note: all terms including E_q[p(\theta|\alpha)], i.e., terms involving \Psi(\gamma),
        # are cancelled due to \gamma updates
        # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta),
        # are cancelled due to \eta updates in M-step

        # compute the alpha terms
        document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(
            scipy.special.gammaln(self._alpha_alpha))
        # compute the gamma terms
        document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) \
            - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]))
        # compute the phi terms
        document_log_likelihood -= numpy.sum(numpy.log(document_phi) * document_phi)

        # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step
        words_log_likelihood += numpy.sum(document_phi * (E_log_eta[:, word_ids].T))

        if (doc_id + 1) % 1000 == 0:
            print("successfully processed %d documents in hybrid mode..." % (doc_id + 1))

    phi_sufficient_statistics /= retained_samples

    if training:
        self._gamma = gamma_values
        return document_log_likelihood, phi_sufficient_statistics
    else:
        return words_log_likelihood, gamma_values
def e_step(self, parsed_corpus=None, local_parameter_iteration=50, local_parameter_converge_threshold=1e-6):
    """Run the variational E-step for the path-based model.

    Each word can be reached through several paths
    (``self._word_index_to_path_indices``); phi is kept per topic and path and
    a path's score is the sum of ``E_log_eta`` over the edges along it
    (``self._edges_along_path``).

    Parameters:
        parsed_corpus: optional sequence of documents, each a mapping from
            word id to its count in the document.  When None,
            ``self._parsed_corpus`` is used and the resulting gamma is stored
            in ``self._gamma``.
        local_parameter_iteration: number of phi/gamma updates per document.
        local_parameter_converge_threshold: accepted for interface
            compatibility; it is not consulted, every document runs all
            ``local_parameter_iteration`` updates.

    Returns:
        ``(document_log_likelihood, phi_sufficient_statistics)`` when working
        on the stored corpus, ``(words_log_likelihood, gamma_values)`` when a
        corpus is supplied.
    """
    training = parsed_corpus is None
    # Bind the corpus in both modes; binding it only for the stored corpus
    # makes every call with an explicit corpus fail with NameError.
    documents = self._parsed_corpus if training else parsed_corpus

    number_of_documents = len(documents)

    document_log_likelihood = 0
    words_log_likelihood = 0

    # K-by-P accumulator of phi (P = number of paths)
    phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_paths))

    # D-by-K gamma values, randomly initialised
    gamma_values = self._alpha_alpha + 2.0 * self._number_of_paths / self._number_of_topics * numpy.random.random(
        (number_of_documents, self._number_of_topics))

    # Dirichlet expectation taken separately over the edges leaving each internal node
    E_log_eta = numpy.copy(self._var_beta)
    for internal_node_index in self._edges_from_internal_node:
        edge_index_list = self._edges_from_internal_node[internal_node_index]
        assert numpy.min(E_log_eta[:, edge_index_list]) >= 0
        E_log_eta[:, edge_index_list] = compute_dirichlet_expectation(E_log_eta[:, edge_index_list])

    # iterate over all documents
    for doc_id in range(number_of_documents):
        word_counts = documents[doc_id]

        # update phi and gamma for a fixed number of rounds
        for gamma_iteration in range(local_parameter_iteration):
            document_phi = numpy.zeros((self._number_of_topics, self._number_of_paths))
            phi_entropy = 0
            phi_E_log_eta = 0

            E_log_theta = compute_dirichlet_expectation(gamma_values[[doc_id], :]).T
            assert E_log_theta.shape == (self._number_of_topics, 1)

            for word_id in word_counts:
                word_count = word_counts[word_id]
                paths_lead_to_current_word = self._word_index_to_path_indices[word_id]

                # Score of every candidate path, computed once and reused for
                # both the phi update and the likelihood term.
                path_E_log_eta = numpy.zeros((self._number_of_topics, len(paths_lead_to_current_word)))
                for position_index, path_index in enumerate(paths_lead_to_current_word):
                    path_E_log_eta[:, position_index] = numpy.sum(
                        E_log_eta[:, self._edges_along_path[path_index]], axis=1)

                log_phi = E_log_theta + path_E_log_eta
                assert log_phi.shape == (self._number_of_topics, len(paths_lead_to_current_word))

                # normalised jointly over topics and paths
                log_phi -= scipy.misc.logsumexp(log_phi)
                path_phi = numpy.exp(log_phi)

                # compute the phi terms
                phi_entropy += -numpy.sum(path_phi * numpy.log(path_phi + 1e-100)) * word_count
                phi_E_log_eta += word_count * numpy.sum(path_phi * path_E_log_eta)

                # weight path_phi by the count of the current word
                document_phi[:, paths_lead_to_current_word] += path_phi * word_count

            gamma_values[[doc_id], :] = self._alpha_alpha + numpy.sum(document_phi, axis=1)[:, numpy.newaxis].T

        # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms involving \Psi(\gamma),
        # are cancelled due to \gamma updates in E-step

        # compute the alpha terms
        document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(
            scipy.special.gammaln(self._alpha_alpha))
        # compute the gamma terms
        document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) \
            - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]))
        # compute the phi terms
        document_log_likelihood += phi_entropy

        # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta),
        # are cancelled due to \eta updates in M-step
        if not training:
            # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step during training
            words_log_likelihood += phi_E_log_eta

        phi_sufficient_statistics += document_phi

        if (doc_id + 1) % 1000 == 0:
            print("successfully processed %d documents..." % (doc_id + 1))

    if training:
        self._gamma = gamma_values
        return document_log_likelihood, phi_sufficient_statistics
    else:
        return words_log_likelihood, gamma_values
def e_step(self, wordids):
    """Run the sampling-based E-step on one mini-batch.

    Every token's topic is resampled ``self._number_of_samples`` times; samples
    from sweeps with index below ``self._burn_in_sweeps`` are discarded and the
    remaining ones are averaged into the sufficient statistics.

    Parameters:
        wordids: sequence of documents, each a sequence of word ids used to
            index the columns of ``self._exp_E_log_beta``.

    Returns:
        ``(batch_document_topic_distribution, sufficient_statistics,
        document_level_elbo)``: the per-document gamma (one row per document),
        the averaged topic-by-word assignment counts, and the accumulated
        bound (0 unless ``self._compute_elbo`` is set).
    """
    batchD = len(wordids)
    number_of_topics = self._number_of_topics

    document_level_elbo = 0
    sufficient_statistics = numpy.zeros((number_of_topics, self._vocab_size))

    # Variational distribution q(theta|gamma) for the mini-batch
    batch_document_topic_distribution = numpy.zeros((batchD, number_of_topics))

    for d in range(batchD):
        document_word_ids = wordids[d]
        document_length = len(document_word_ids)

        # random soft assignment to start from; phi_sum tracks per-topic totals
        phi = numpy.random.random((number_of_topics, document_length))
        phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :]
        phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis]
        assert phi_sum.shape == (number_of_topics, 1)

        for it in range(self._number_of_samples):
            for n, word_id in enumerate(document_word_ids):
                # remove the token's current assignment from the totals
                phi_sum -= phi[:, n][:, numpy.newaxis]
                # clip tiny negative round-off left by the subtraction; phi becomes
                # one-hot after the first sweep so the totals become counts
                phi_sum *= phi_sum > 0

                temp_phi = (phi_sum + self._alpha_theta).T * self._exp_E_log_beta[:, word_id]
                assert temp_phi.shape == (1, number_of_topics)
                temp_phi /= numpy.sum(temp_phi)

                # sample a topic for this word
                temp_phi = numpy.random.multinomial(1, temp_phi[0, :])[:, numpy.newaxis]
                assert temp_phi.shape == (number_of_topics, 1)

                phi[:, n][:, numpy.newaxis] = temp_phi
                phi_sum += temp_phi

                # discard the first few burn-in sweeps
                if it < self._burn_in_sweeps:
                    continue

                sufficient_statistics[:, word_id] += temp_phi[:, 0]

        batch_document_topic_distribution[d, :] = self._alpha_theta + phi_sum.T[0, :]

        if self._compute_elbo:
            document_level_elbo += document_length
            gammad = batch_document_topic_distribution[d]
            # NOTE(review): a variational bound would normally use E[log theta] here
            # rather than its exponential -- confirm this is intended.
            document_level_elbo += numpy.sum(
                (self._alpha_theta - gammad) * numpy.exp(compute_dirichlet_expectation(gammad)))
            document_level_elbo += numpy.sum(
                scipy.special.gammaln(gammad) - scipy.special.gammaln(self._alpha_theta))
            document_level_elbo += numpy.sum(
                scipy.special.gammaln(self._alpha_theta * number_of_topics)
                - scipy.special.gammaln(numpy.sum(gammad)))

    sufficient_statistics /= (self._number_of_samples - self._burn_in_sweeps)

    if self._compute_elbo:
        # float() keeps the corpus/batch ratio exact; with two ints Python 2
        # would truncate it.
        document_level_elbo *= float(self._number_of_documents) / batchD

    return batch_document_topic_distribution, sufficient_statistics, document_level_elbo
def e_step(self, parsed_corpus_response=None, local_parameter_iteration=10,
           local_parameter_converge_threshold=1e-6, approximate_phi=False):
    """Run the variational E-step for the response-supervised model.

    Parameters:
        parsed_corpus_response: optional ``(documents, responses)`` pair, where
            each document is a sequence of word-type ids and ``responses`` is
            indexed by document.  When None, ``self._parsed_corpus`` and
            ``self._responses`` are used and the resulting gamma is stored in
            ``self._gamma``.
        local_parameter_iteration: maximum number of phi/gamma sweeps per
            document.
        local_parameter_converge_threshold: a document stops sweeping once the
            mean absolute change of its gamma row is at or below this value.
        approximate_phi: when true all tokens of a document are updated at once
            from the previous sweep's phi; otherwise tokens are updated one at
            a time.

    Returns:
        With the stored corpus: ``(document_log_likelihood,
        phi_sufficient_statistics, E_A_sufficient_statistics,
        E_AA_sufficient_statistics)``.
        With a supplied corpus: ``(words_log_likelihood, gamma_values,
        numpy.dot(E_A_sufficient_statistics, self._eta.T))``.
    """
    # Identity test on purpose: `== None` is not a reliable emptiness check for
    # array-like arguments.
    training = parsed_corpus_response is None
    if training:
        word_idss = self._parsed_corpus
        responses = self._responses
    else:
        word_idss, responses = parsed_corpus_response

    number_of_documents = len(word_idss)

    document_log_likelihood = 0
    words_log_likelihood = 0

    # K-by-V accumulator of phi, per-document mean assignment, and the summed
    # outer products of those means
    phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types))
    E_A_sufficient_statistics = numpy.zeros((number_of_documents, self._number_of_topics))
    E_AA_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_topics))

    # D-by-K gamma values
    gamma_values = (numpy.zeros((number_of_documents, self._number_of_topics))
                    + self._alpha_alpha[numpy.newaxis, :]
                    + 1.0 * self._number_of_types / self._number_of_topics)

    E_log_beta = compute_dirichlet_expectation(self._beta)
    assert E_log_beta.shape == (self._number_of_topics, self._number_of_types)
    if not training:
        E_log_prob_eta = E_log_beta - scipy.misc.logsumexp(E_log_beta, axis=1)[:, numpy.newaxis]

    for doc_id in range(number_of_documents):
        term_ids = word_idss[doc_id]
        total_word_count = len(term_ids)

        # initialize gamma and phi for this document
        gamma_values[doc_id, :] = self._alpha_alpha + 1.0 * total_word_count / self._number_of_topics
        log_phi = scipy.special.psi(gamma_values[doc_id, :][numpy.newaxis, :]) + E_log_beta[:, term_ids].T
        log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis]

        # update phi and gamma until gamma converges
        for gamma_iteration in range(local_parameter_iteration):
            if approximate_phi:
                phi = numpy.exp(log_phi)
                assert phi.shape == (len(term_ids), self._number_of_topics)

                # row n holds the sum of phi over all tokens except n
                phi_sum = numpy.sum(phi, axis=0)[numpy.newaxis, :]
                phi_sum_j = numpy.tile(phi_sum, (len(term_ids), 1))
                phi_sum_j -= phi
                assert phi_sum_j.shape == (len(term_ids), self._number_of_topics)

                log_phi = scipy.special.psi(gamma_values[doc_id, :][numpy.newaxis, :]) + E_log_beta[:, term_ids].T
                assert log_phi.shape == (len(term_ids), self._number_of_topics)

                assert self._eta.shape == (1, self._number_of_topics)
                log_phi += ((responses[doc_id] / (total_word_count * self._sigma_square)) * self._eta)
                assert log_phi.shape == (len(term_ids), self._number_of_topics)

                log_phi -= (numpy.dot(phi_sum_j, self._eta.T) * self._eta + 0.5 * (self._eta ** 2)) / (
                    (float(total_word_count) ** 2.) * self._sigma_square)
                assert log_phi.shape == (len(term_ids), self._number_of_topics)

                log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis]
                assert log_phi.shape == (len(term_ids), self._number_of_topics)

                gamma_update = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi), axis=0))
                mean_change = numpy.mean(abs(gamma_update - gamma_values[doc_id, :]))
                gamma_values[doc_id, :] = gamma_update
                if mean_change <= local_parameter_converge_threshold:
                    break
            else:
                old_gamma_values = gamma_values[doc_id, :].copy()

                for term_pos, term_id in enumerate(term_ids):
                    # sum of phi over every token except the current one
                    phi_sum_j = numpy.zeros(self._number_of_topics)
                    if term_pos > 0:
                        phi_sum_j += numpy.exp(scipy.misc.logsumexp(log_phi[:term_pos, :], axis=0))
                    if term_pos < len(term_ids) - 1:
                        phi_sum_j += numpy.exp(scipy.misc.logsumexp(log_phi[term_pos + 1:, :], axis=0))
                    assert phi_sum_j.shape == (self._number_of_topics,)

                    log_phi_j = scipy.special.psi(gamma_values[doc_id, :]) + E_log_beta[:, term_id]
                    assert log_phi_j.shape == (self._number_of_topics,)

                    assert self._eta.shape == (1, self._number_of_topics)
                    log_phi_j += ((responses[doc_id] / (total_word_count * self._sigma_square)) * self._eta[0, :])
                    assert log_phi_j.shape == (self._number_of_topics,)

                    log_phi_j -= (numpy.sum(phi_sum_j * self._eta[0, :]) * self._eta[0, :]
                                  + 0.5 * (self._eta[0, :] ** 2)) / (
                        (float(total_word_count) ** 2.) * self._sigma_square)
                    assert log_phi_j.shape == (self._number_of_topics,)

                    log_phi_j -= scipy.misc.logsumexp(log_phi_j)
                    assert log_phi_j.shape == (self._number_of_topics,)

                    log_phi[term_pos, :] = log_phi_j

                    # NOTE(review): gamma is refreshed after every token update (which is
                    # why a snapshot is taken before the sweep); this nesting was
                    # reconstructed from whitespace-collapsed code -- confirm.
                    gamma_values[doc_id, :] = self._alpha_alpha + numpy.array(
                        numpy.sum(numpy.exp(log_phi), axis=0))

                mean_change = numpy.mean(abs(gamma_values[doc_id, :] - old_gamma_values))
                if mean_change <= local_parameter_converge_threshold:
                    break

        phi = numpy.exp(log_phi)
        assert phi.shape == (len(term_ids), self._number_of_topics)
        phi_mean = numpy.mean(phi, axis=0)
        assert phi_mean.shape == (self._number_of_topics,)

        # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms involving \Psi(\gamma),
        # are cancelled due to \gamma updates in E-step

        # compute the alpha terms
        document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(
            scipy.special.gammaln(self._alpha_alpha))
        # compute the gamma terms
        document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) \
            - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]))
        # compute the phi terms
        document_log_likelihood -= numpy.sum(phi * log_phi)
        # compute the eta terms (Gaussian response with variance sigma^2)
        document_log_likelihood -= 0.5 * numpy.log(2 * numpy.pi * self._sigma_square)
        document_log_likelihood -= 0.5 * (
            responses[doc_id] ** 2
            - 2 * responses[doc_id] * numpy.sum(self._eta[0, :] * phi_mean)
            + numpy.dot(numpy.dot(self._eta, numpy.dot(phi_mean[:, numpy.newaxis], phi_mean[numpy.newaxis, :])),
                        self._eta.T)
        ) / self._sigma_square

        # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta),
        # are cancelled due to \eta updates in M-step
        if not training:
            # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step during training
            words_log_likelihood += numpy.sum(phi.T * E_log_prob_eta[:, term_ids])

        assert phi.shape == (len(term_ids), self._number_of_topics)
        # Token by token, so that repeated word types accumulate correctly.
        for term_pos, term_id in enumerate(term_ids):
            phi_sufficient_statistics[:, term_id] += phi[term_pos, :]

        E_A_sufficient_statistics[doc_id, :] = phi_mean
        E_AA_sufficient_statistics += numpy.dot(phi_mean[:, numpy.newaxis], phi_mean[numpy.newaxis, :])

        if (doc_id + 1) % 1000 == 0:
            print("successfully processed %d documents..." % (doc_id + 1))

    if training:
        self._gamma = gamma_values
        return (document_log_likelihood, phi_sufficient_statistics,
                E_A_sufficient_statistics, E_AA_sufficient_statistics)
    else:
        return words_log_likelihood, gamma_values, numpy.dot(E_A_sufficient_statistics, self._eta.T)
def e_step(self, parsed_corpus=None, number_of_samples=10, burn_in_samples=5):
    """Estimate the local parameters by sampling topic assignments.

    For each document the topic of every token is drawn repeatedly from its
    conditional given the other tokens' current assignments.  Draws made in
    the first ``burn_in_samples`` sweeps are thrown away; later draws are
    averaged into the returned statistics.

    Parameters:
        parsed_corpus: documents to process, each a sequence of word-type ids;
            None means ``self._parsed_corpus`` (in which case ``self._gamma``
            is overwritten with the new gamma values).
        number_of_samples: total sweeps per document.
        burn_in_samples: leading sweeps that are not recorded.

    Returns:
        ``(document_log_likelihood, phi_sufficient_statistics)`` for the stored
        corpus, otherwise ``(words_log_likelihood, gamma_values)``.
    """
    # `is None`, not `== None`: comparing an ndarray corpus with `==` yields
    # an array, whose truth value is ambiguous.
    use_stored_corpus = parsed_corpus is None
    if use_stored_corpus:
        word_idss = self._parsed_corpus
    else:
        word_idss = parsed_corpus

    topic_count = self._number_of_topics
    number_of_documents = len(word_idss)
    kept_sweeps = number_of_samples - burn_in_samples

    E_log_eta = compute_dirichlet_expectation(self._eta)
    exp_E_log_eta = numpy.exp(E_log_eta)

    document_log_likelihood = 0
    words_log_likelihood = 0

    # topic-by-type accumulator of recorded draws
    phi_sufficient_statistics = numpy.zeros((topic_count, self._number_of_types))

    # document-by-topic gamma values
    gamma_values = numpy.zeros((number_of_documents, topic_count))
    gamma_values = gamma_values + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / topic_count

    # the alpha part of the bound is identical for every document
    alpha_term = scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(
        scipy.special.gammaln(self._alpha_alpha))

    for doc_id in range(number_of_documents):
        tokens = word_idss[doc_id]
        token_count = len(tokens)

        # start from a random column-normalised assignment
        phi = numpy.random.random((topic_count, token_count))
        phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :]
        phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis]
        assert phi_sum.shape == (topic_count, 1)

        document_phi = numpy.zeros((token_count, topic_count))

        for it in range(number_of_samples):
            recording = it >= burn_in_samples
            for word_pos in range(token_count):
                word_index = tokens[word_pos]

                # take the token out of the running totals; zero any negative
                # round-off so the totals stay valid weights
                phi_sum -= phi[:, word_pos][:, numpy.newaxis]
                phi_sum *= (phi_sum > 0)

                temp_phi = (phi_sum.T + self._alpha_alpha) * exp_E_log_eta[:, [word_index]].T
                assert temp_phi.shape == (1, topic_count)
                temp_phi /= numpy.sum(temp_phi)

                # one-hot draw of the token's topic
                temp_phi = numpy.random.multinomial(1, temp_phi[0])[:, numpy.newaxis]
                assert temp_phi.shape == (topic_count, 1)

                phi[:, word_pos][:, numpy.newaxis] = temp_phi
                phi_sum += temp_phi

                if recording:
                    phi_sufficient_statistics[:, word_index] += temp_phi[:, 0]
                    document_phi[word_pos, :] += temp_phi[:, 0]

        gamma_values[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :]

        document_phi /= kept_sweeps
        # keeps log() finite for topics that were never drawn
        document_phi += 1e-100

        # Terms involving \Psi(\gamma) and \Psi(\eta) cancel because of the
        # gamma update here and the eta update in the M-step.
        document_log_likelihood += alpha_term
        document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) \
            - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]))
        document_log_likelihood -= numpy.sum(numpy.log(document_phi) * document_phi)

        # p(w_{dn} | z_{dn}, \eta) terms, which cancel during the M-step
        words_log_likelihood += numpy.sum(document_phi * (E_log_eta[:, tokens].T))

        if (doc_id + 1) % 1000 == 0:
            print("successfully processed %d documents in hybrid mode..." % (doc_id + 1))

    phi_sufficient_statistics /= kept_sweeps

    if use_stored_corpus:
        self._gamma = gamma_values
        return document_log_likelihood, phi_sufficient_statistics
    return words_log_likelihood, gamma_values
def e_step(self, wordids, wordcts):
    """Run the variational E-step on one mini-batch of documents.

    For each document the variational parameter gamma is fitted by
    fixed-point iteration with phi kept implicit, and the document's
    contribution to the expected sufficient statistics for the M-step is
    accumulated.  When ``self._compute_elbo`` is set, the document-level
    part of the variational lower bound is accumulated as well and scaled
    from the mini-batch to ``self._number_of_documents``.

    Args:
        wordids: one entry per document; entry ``d`` is used to index the
            columns of ``self._exp_E_log_beta``.
            NOTE(review): ids are presumably unique within a document,
            since ``sstats[:, ids] +=`` does not accumulate repeats.
        wordcts: one entry per document, aligned with ``wordids``; entry
            ``d`` holds the count of each word listed in ``wordids[d]``.

    Returns:
        Tuple ``(gamma, sstats, document_level_elbo)``: the fitted
        batch-by-topic gamma matrix, the sufficient statistics (same shape
        as ``self._beta``), and the scaled document-level bound (``0`` when
        ``self._compute_elbo`` is false).
    """
    batch_size = len(wordids)
    document_level_elbo = 0

    # Initialize the variational distribution q(theta|gamma) for the mini-batch
    gamma = 1 * numpy.random.gamma(
        100., 1. / 100., (batch_size, self._number_of_topics))
    E_log_theta = compute_dirichlet_expectation(gamma)
    exp_E_log_theta = numpy.exp(E_log_theta)

    sstats = numpy.zeros(self._beta.shape)

    # Now, for each document d update that document's gamma and phi
    for d in range(batch_size):
        ids = wordids[d]
        cts = wordcts[d]
        gammad = gamma[d, :]
        E_log_theta_d = E_log_theta[d, :]
        exp_E_log_theta_d = exp_E_log_theta[d, :]
        exp_E_log_beta_d = self._exp_E_log_beta[:, ids]

        # The optimal phi_{dwk} is proportional to
        # expElogthetad_k * expElogbetad_w; phi_norm is the normalizer.
        phi_norm = numpy.dot(exp_E_log_theta_d, exp_E_log_beta_d) + 1e-100

        # Iterate between gamma and phi until convergence
        for _ in range(self._maximum_gamma_update_iteration):
            lastgamma = gammad
            # phi is represented implicitly: substituting the optimal phi
            # back into the gamma update gives this expression.
            gammad = self._alpha_theta + exp_E_log_theta_d * numpy.dot(
                cts / phi_norm, exp_E_log_beta_d.T)
            E_log_theta_d = compute_dirichlet_expectation(gammad)
            exp_E_log_theta_d = numpy.exp(E_log_theta_d)
            phi_norm = numpy.dot(exp_E_log_theta_d, exp_E_log_beta_d) + 1e-100
            # If gamma hasn't changed much, we're done.
            meanchange = numpy.mean(abs(gammad - lastgamma))
            if meanchange < self._minimum_mean_change_threshold:
                break

        gamma[d, :] = gammad

        # Contribution of document d to the expected sufficient statistics
        # for the M step.
        sstats[:, ids] += numpy.outer(exp_E_log_theta_d.T, cts / phi_norm)

        if self._compute_elbo:
            # E[log p(w | theta, beta)] with the optimal phi plugged in:
            # sum_w n_dw * log sum_k exp(Elogtheta_dk + Elogbeta_kw).
            # The bound needs log(phi_norm), not phi_norm itself.
            document_level_elbo += numpy.sum(cts * numpy.log(phi_norm))

            # E[log p(theta | alpha) - log q(theta | gamma)]; this term is
            # linear in E[log theta], not in exp(E[log theta]).
            document_level_elbo += numpy.sum(
                (self._alpha_theta - gammad) * E_log_theta_d)
            document_level_elbo += numpy.sum(
                scipy.special.gammaln(gammad)
                - scipy.special.gammaln(self._alpha_theta))
            # NOTE(review): alpha * K is the Dirichlet normaliser only for a
            # symmetric scalar alpha - confirm self._alpha_theta is scalar.
            document_level_elbo += numpy.sum(
                scipy.special.gammaln(
                    self._alpha_theta * self._number_of_topics)
                - scipy.special.gammaln(numpy.sum(gammad)))

    # This step finishes computing the sufficient statistics for the M step,
    # so that sstats[k, w] = sum_d n_{dw} * phi_{dwk}
    #   = sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
    sstats = sstats * self._exp_E_log_beta

    if self._compute_elbo and batch_size > 0:
        # float() keeps the ratio exact under Python 2 integer division
        document_level_elbo *= float(self._number_of_documents) / batch_size

    return gamma, sstats, document_level_elbo
def e_step(self):
    """Run the variational E-step over every document in ``self._data``.

    For each document, ``self._gamma[doc_id]`` is refined for
    ``self._gamma_maximum_iteration`` rounds starting from its current
    value (it is not re-initialised here).  In each round, every word's
    responsibility is distributed jointly over topics and over the paths
    listed for that word in ``self._word_index_to_path_indices``; a path's
    score is the sum of ``self._E_log_beta`` over the edges in
    ``self._edges_along_path``.

    Side effects:
        Overwrites the rows of ``self._gamma`` in place.

    Returns:
        Tuple ``(phi_sufficient_statistics, document_level_log_likelihood)``:
        a topics-by-paths matrix of expected counts summed over documents,
        and the accumulated document-level likelihood terms.
    """
    document_level_log_likelihood = 0

    # K-by-P matrix (topics x paths) of phi contributions
    phi_sufficient_statistics = numpy.zeros(
        (self._number_of_topics, self._number_of_paths))

    for doc_id in range(self._number_of_documents):
        document = self._data[doc_id]

        # Defined up front so the likelihood terms below stay well-formed
        # even if no gamma iteration is executed.
        document_phi = numpy.zeros(
            (self._number_of_topics, self._number_of_paths))
        phi_entropy = 0
        phi_E_log_beta = 0

        # fixed number of phi/gamma rounds (no convergence test is applied)
        for gamma_iteration in range(self._gamma_maximum_iteration):
            document_phi = numpy.zeros(
                (self._number_of_topics, self._number_of_paths))
            phi_entropy = 0
            phi_E_log_beta = 0

            E_log_theta = compute_dirichlet_expectation(
                self._gamma[[doc_id], :]).T
            assert E_log_theta.shape == (self._number_of_topics, 1)

            for word_id in document.keys():
                word_count = document[word_id]
                paths_lead_to_current_word = \
                    self._word_index_to_path_indices[word_id]
                number_of_word_paths = len(paths_lead_to_current_word)

                # column j: E[log beta] summed over the edges of path j
                path_E_log_beta = numpy.zeros(
                    (self._number_of_topics, number_of_word_paths))
                for position_index in range(number_of_word_paths):
                    path_index = paths_lead_to_current_word[position_index]
                    path_E_log_beta[:, position_index] = numpy.sum(
                        self._E_log_beta[:, self._edges_along_path[path_index]],
                        axis=1)

                log_phi = E_log_theta + path_E_log_beta
                assert log_phi.shape == (
                    self._number_of_topics, number_of_word_paths)

                # shift by the global maximum before exponentiating so the
                # largest entry becomes exactly 1
                path_phi = numpy.exp(log_phi - numpy.max(log_phi))
                assert path_phi.shape == (
                    self._number_of_topics, number_of_word_paths)
                assert numpy.min(path_phi) >= 0, path_phi.T

                # normalize path_phi over all topics and paths
                assert numpy.sum(path_phi) > 0, log_phi.T
                path_phi /= numpy.sum(path_phi)

                # entropy with the convention 0 * log(0) = 0: entries that
                # underflowed to zero would otherwise contribute NaN
                positive = path_phi > 0
                phi_entropy -= word_count * numpy.sum(
                    path_phi[positive] * numpy.log(path_phi[positive]))

                phi_E_log_beta += word_count * numpy.sum(
                    path_phi * path_E_log_beta)

                # multiply path_phi by the count of the current word
                path_phi *= word_count
                document_phi[:, paths_lead_to_current_word] += path_phi

            self._gamma[[doc_id], :] = self._alpha + numpy.sum(
                document_phi, axis=1)[:, numpy.newaxis].T

        gamma_row = self._gamma[[doc_id], :]
        E_log_theta_row = compute_dirichlet_expectation(gamma_row)
        topic_totals = numpy.sum(document_phi, axis=1)[:, numpy.newaxis].T

        # term 1
        document_level_log_likelihood += (
            scipy.special.gammaln(numpy.sum(self._alpha))
            - numpy.sum(scipy.special.gammaln(self._alpha)))
        document_level_log_likelihood += numpy.sum(
            (self._alpha - 1) * E_log_theta_row)
        # term 2
        document_level_log_likelihood += numpy.sum(
            topic_totals * E_log_theta_row)
        # term 4
        document_level_log_likelihood += phi_E_log_beta
        # term 5
        document_level_log_likelihood += (
            - scipy.special.gammaln(numpy.sum(gamma_row))
            + numpy.sum(scipy.special.gammaln(gamma_row)))
        document_level_log_likelihood += - numpy.sum(
            (gamma_row - 1) * E_log_theta_row)
        # term 7
        document_level_log_likelihood += phi_entropy

        phi_sufficient_statistics += document_phi

        if (doc_id + 1) % 1000 == 0:
            print("successfully processed %d documents..." % (doc_id + 1))

    return phi_sufficient_statistics, document_level_log_likelihood