Example #1
    def e_step(self, wordids, wordcts):
        batch_size = len(wordids)

        document_level_elbo = 0;

        # Initialize the variational distribution q(theta|gamma) for the mini-batch
        gamma = 1*numpy.random.gamma(100., 1./100., (batch_size, self._number_of_topics))
        exp_E_log_theta = numpy.exp(compute_dirichlet_expectation(gamma))

        sstats = numpy.zeros(self._beta.shape)
        # Now, for each document d update that document's gamma and phi
        meanchange = 0
        for d in range(0, batch_size):
            # These are mostly just shorthand (but might help cache locality)
            ids = wordids[d]
            cts = wordcts[d]
            gammad = gamma[d, :]
            exp_E_log_theta_d = exp_E_log_theta[d, :]
            exp_E_log_beta_d = self._exp_E_log_beta[:, ids]
            # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w. phi_norm is the normalizer.
            phi_norm = numpy.dot(exp_E_log_theta_d, exp_E_log_beta_d) + 1e-100
            # Iterate between gamma and phi until convergence
            for it in range(0, self._maximum_gamma_update_iteration):
                lastgamma = gammad
                # We represent phi implicitly to save memory and time. Substituting the value of the optimal phi back into the update for gamma gives this update. Cf. Lee&Seung 2001.
                gammad = self._alpha_theta + exp_E_log_theta_d * numpy.dot(cts / phi_norm, exp_E_log_beta_d.T)
                exp_E_log_theta_d = numpy.exp(compute_dirichlet_expectation(gammad))
                phi_norm = numpy.dot(exp_E_log_theta_d, exp_E_log_beta_d) + 1e-100
                # If gamma hasn't changed much, we're done.
                meanchange = numpy.mean(abs(gammad - lastgamma))
                if (meanchange < self._minimum_mean_change_threshold):
                    break
            gamma[d, :] = gammad
            # Contribution of document d to the expected sufficient statistics for the M step.
            sstats[:, ids] += numpy.outer(exp_E_log_theta_d.T, cts/phi_norm)
                        
            if self._compute_elbo:
                # E[log p(w_d | theta, beta)]; phi_norm is the linear-space normalizer, so take its log
                document_level_elbo += numpy.sum(cts * numpy.log(phi_norm))

                # E[log p(theta | alpha) - log q(theta | gamma)]
                document_level_elbo += numpy.sum((self._alpha_theta - gammad) * compute_dirichlet_expectation(gammad));
                document_level_elbo += numpy.sum(scipy.special.gammaln(gammad) - scipy.special.gammaln(self._alpha_theta));
                document_level_elbo += numpy.sum(scipy.special.gammaln(self._alpha_theta * self._number_of_topics) - scipy.special.gammaln(numpy.sum(gammad)));

        # This step finishes computing the sufficient statistics for the M step, so that sstats[k, w] = \sum_d n_{dw} * phi_{dwk} = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
        sstats = sstats * self._exp_E_log_beta

        if self._compute_elbo:
            document_level_elbo *= float(self._number_of_documents) / batch_size;

        return gamma, sstats, document_level_elbo
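
The examples in this listing call a compute_dirichlet_expectation helper that is not shown. A minimal sketch of what that helper is assumed to compute -- the Dirichlet expectation E[log theta | gamma] = psi(gamma) - psi(sum(gamma)), applied row-wise for matrix input -- is:

import numpy
import scipy.special

def compute_dirichlet_expectation(dirichlet_parameter):
    # For a vector gamma, E[log theta_k] = psi(gamma_k) - psi(sum_j gamma_j).
    if len(dirichlet_parameter.shape) == 1:
        return scipy.special.psi(dirichlet_parameter) \
            - scipy.special.psi(numpy.sum(dirichlet_parameter))
    # For a matrix, each row is treated as an independent Dirichlet parameter.
    return scipy.special.psi(dirichlet_parameter) \
        - scipy.special.psi(numpy.sum(dirichlet_parameter, axis=1))[:, numpy.newaxis]

This matches how the e_step above applies it both to the full gamma matrix and to a single document's gammad vector.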
Example #2
 def export_beta(self, exp_beta_path, top_display=-1):
     output = open(exp_beta_path, 'w');
     
     E_log_eta = numpy.copy(self._var_beta);
     assert E_log_eta.shape == (self._number_of_topics, self._number_of_edges)
     for internal_node_index in self._edges_from_internal_node:
         edge_index_list = self._edges_from_internal_node[internal_node_index];
         assert numpy.min(E_log_eta[:, edge_index_list]) >= 0;
         E_log_eta[:, edge_index_list] = compute_dirichlet_expectation(E_log_eta[:, edge_index_list]);
     del internal_node_index, edge_index_list;
     
     for topic_index in xrange(self._number_of_topics):
         output.write("==========\t%d\t==========\n" % (topic_index));
         
         freqdist = nltk.probability.FreqDist()
         for path_index in self._path_index_to_word_index:
             path_rank = 1;
             for edge_index in self._edges_along_path[path_index]:
                 path_rank *= numpy.exp(E_log_eta[topic_index, edge_index]);
             freqdist[path_index] += path_rank;
         
         i = 0;
         for (path_index, path_freq) in freqdist.most_common():
             i += 1;
             output.write("%s\t%g\n" % (self._index_to_type[self._path_index_to_word_index[path_index]], freqdist[path_index]));
             if top_display > 0 and i >= top_display:
                 break;
             
     output.close();
Example #3
    def approx_bound(self, docs, gamma):
        """
        Estimates the variational bound over *all documents* using only
        the documents passed in as "docs." gamma is the set of parameters
        to the variational distribution q(theta) corresponding to the
        set of documents passed in.

        The output of this function is going to be noisy, but can be
        useful for assessing convergence.
        """

        # This is to handle the case where someone just hands us a single
        # document, not in a list.
        if isinstance(docs, basestring):
            temp = list()
            temp.append(docs)
            docs = temp

        (wordids, wordcts) = self.parse_doc_list(docs)
        batch_size = len(docs)

        score = 0
        Elogtheta = compute_dirichlet_expectation(gamma)
        expElogtheta = numpy.exp(Elogtheta)

        # E[log p(docs | theta, beta)]
        for d in range(0, batch_size):
            gammad = gamma[d, :]
            ids = wordids[d]
            cts = numpy.array(wordcts[d])
            phinorm = numpy.zeros(len(ids))
            for i in range(0, len(ids)):
                temp = Elogtheta[d, :] + self._Elogbeta[:, ids[i]]
                tmax = max(temp)
                phinorm[i] = numpy.log(sum(numpy.exp(temp - tmax))) + tmax
            score += numpy.sum(cts * phinorm)
#             oldphinorm = phinorm
#             phinorm = n.dot(expElogtheta[d, :], self._exp_E_log_beta[:, ids])
#             print oldphinorm
#             print n.log(phinorm)
#             score += n.sum(cts * n.log(phinorm))

        # E[log p(theta | alpha) - log q(theta | gamma)]
        score += numpy.sum((self._alpha_theta - gamma)*Elogtheta)
        score += numpy.sum(gammaln(gamma) - gammaln(self._alpha_theta))
        score += sum(gammaln(self._alpha_theta*self._number_of_topics) - gammaln(numpy.sum(gamma, 1)))

        # Compensate for the subsampling of the population of documents
        score = score * self._number_of_documents / len(docs)

        # E[log p(beta | eta) - log q (beta | lambda)]
        score = score + numpy.sum((self._alpha_eta-self._beta)*self._Elogbeta)
        score = score + numpy.sum(gammaln(self._beta) - gammaln(self._alpha_eta))
        score = score + numpy.sum(gammaln(self._alpha_eta*self._vocab_size) - 
                              gammaln(numpy.sum(self._beta, 1)))

        return(score)
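
approx_bound is typically evaluated once per mini-batch to track a per-word bound during training. A hedged sketch of such a monitoring step -- model, minibatch_docs, and the exact method signatures are assumptions that follow the examples above, not code from the original class -- is:

# Illustrative only: `model` and `minibatch_docs` are assumed names.
(wordids, wordcts) = model.parse_doc_list(minibatch_docs)
gamma, sstats, elbo = model.e_step(wordids, wordcts)
bound = model.approx_bound(minibatch_docs, gamma)
tokens_in_batch = sum(sum(cts) for cts in wordcts)
# Undo the D / |batch| rescaling inside approx_bound to get a per-token value.
per_word_bound = bound * len(minibatch_docs) / (model._number_of_documents * tokens_in_batch)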
Example #4
    def export_beta(self, exp_beta_path, top_display=-1):
        output = open(exp_beta_path, 'w');
        E_log_eta = compute_dirichlet_expectation(self._beta);
        for topic_index in xrange(self._number_of_topics):
            output.write("==========\t%d\t==========\n" % (topic_index));
            
            beta_probability = numpy.exp(E_log_eta[topic_index, :] - scipy.misc.logsumexp(E_log_eta[topic_index, :]));

            i = 0;
            for type_index in reversed(numpy.argsort(beta_probability)):
                i += 1;
                output.write("%s\t%g\n" % (self._index_to_type[type_index], beta_probability[type_index]));
                if top_display > 0 and i >= top_display:
                    break;
                
        output.close();
Example #5
    def export_beta(self, exp_beta_path, top_display=-1):
        output = open(exp_beta_path, 'w');
        E_log_eta = compute_dirichlet_expectation(self._eta);
        for topic_index in xrange(self._number_of_topics):
            output.write("==========\t%d\t==========\n" % (topic_index));
            
            beta_probability = numpy.exp(E_log_eta[topic_index, :] - scipy.misc.logsumexp(E_log_eta[topic_index, :]));

            i = 0;
            for type_index in reversed(numpy.argsort(beta_probability)):
                i += 1;
                output.write("%s\t%g\n" % (self._index_to_type[type_index], beta_probability[type_index]));
                if top_display > 0 and i >= top_display:
                    break;
                
        output.close();
Example #6
    def e_step(self, wordids):
        batchD = len(wordids)

        document_level_elbo = 0

        sufficient_statistics = numpy.zeros(
            (self._number_of_topics, self._vocab_size))

        # Initialize the variational distribution q(theta|gamma) for the mini-batch
        batch_document_topic_distribution = numpy.zeros(
            (batchD, self._number_of_topics))

        # Now, for each document d update that document's gamma and phi
        for d in xrange(batchD):
            phi = numpy.random.random(
                (self._number_of_topics, len(wordids[d])))
            phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :]
            phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis]
            assert (phi_sum.shape == (self._number_of_topics, 1))

            for it in xrange(self._number_of_samples):
                for n in xrange(len(wordids[d])):
                    id = wordids[d][n]

                    phi_sum -= phi[:, n][:, numpy.newaxis]

                    # clamp tiny negative values caused by floating-point error in the subtraction above; ideally, phi_sum becomes integer-valued after a few sweeps
                    phi_sum *= phi_sum > 0
                    #assert(numpy.all(phi_sum >= 0));

                    temp_phi = (phi_sum + self._alpha_theta
                                ).T * self._exp_E_log_beta[:, wordids[d][n]]
                    assert (temp_phi.shape == (1, self._number_of_topics))
                    temp_phi /= numpy.sum(temp_phi)

                    # sample a topic for this word
                    temp_phi = numpy.random.multinomial(
                        1, temp_phi[0, :])[:, numpy.newaxis]
                    assert (temp_phi.shape == (self._number_of_topics, 1))

                    phi[:, n][:, numpy.newaxis] = temp_phi
                    phi_sum += temp_phi

                    # discard the first few burn-in sweeps
                    if it < self._burn_in_sweeps:
                        continue

                    sufficient_statistics[:, id] += temp_phi[:, 0]

            batch_document_topic_distribution[
                d, :] = self._alpha_theta + phi_sum.T[0, :]

            if self._compute_elbo:
                document_level_elbo += len(wordids[d])

                gammad = batch_document_topic_distribution[d]
                document_level_elbo += numpy.sum(
                    (self._alpha_theta - gammad) *
                    compute_dirichlet_expectation(gammad))
                document_level_elbo += numpy.sum(
                    scipy.special.gammaln(gammad) -
                    scipy.special.gammaln(self._alpha_theta))
                document_level_elbo += numpy.sum(
                    scipy.special.gammaln(self._alpha_theta *
                                          self._number_of_topics) -
                    scipy.special.gammaln(numpy.sum(gammad)))

        sufficient_statistics /= (self._number_of_samples -
                                  self._burn_in_sweeps)

        if self._compute_elbo:
            document_level_elbo *= float(self._number_of_documents) / batchD

        return batch_document_topic_distribution, sufficient_statistics, document_level_elbo
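
The m_step that consumes these sufficient statistics is not part of this listing. A minimal sketch of the standard stochastic variational update such statistics usually feed -- all names and defaults here (alpha_eta, tau, kappa) are assumptions, not taken from the class -- is:

import numpy

def online_m_step(current_lambda, sufficient_statistics, number_of_documents,
                  batch_size, update_count, alpha_eta=0.01, tau=1.0, kappa=0.5):
    # Step size that decays with the number of mini-batch updates seen so far.
    rho = numpy.power(tau + update_count, -kappa)
    # Noisy estimate of the topic parameter based on this mini-batch alone.
    lambda_hat = alpha_eta + float(number_of_documents) / batch_size * sufficient_statistics
    # Blend it into the running estimate, as in stochastic variational inference.
    return (1.0 - rho) * current_lambda + rho * lambda_hat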
Example #7
    def e_step(self, parsed_corpus=None, local_parameter_iteration=50, local_parameter_converge_threshold=1e-6):
        if parsed_corpus == None:
            word_ids = self._parsed_corpus[0];
            word_cts = self._parsed_corpus[1];
        else:
            word_ids = parsed_corpus[0]
            word_cts = parsed_corpus[1];
        
        assert len(word_ids) == len(word_cts);
        number_of_documents = len(word_ids);
        
        document_log_likelihood = 0;
        words_log_likelihood = 0;

        # initialize a V-by-K matrix phi sufficient statistics
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types));
        
        # initialize a D-by-K matrix gamma values
        gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;
        
        E_log_eta = compute_dirichlet_expectation(self._eta);
        assert E_log_eta.shape == (self._number_of_topics, self._number_of_types);
        if parsed_corpus != None:
            E_log_prob_eta = E_log_eta - scipy.misc.logsumexp(E_log_eta, axis=1)[:, numpy.newaxis]
        
        # iterate over all documents
        # for doc_id in xrange(number_of_documents):
        for doc_id in numpy.random.permutation(number_of_documents):
            # compute the total number of words
            # total_word_count = self._corpus[doc_id].N()
            total_word_count = numpy.sum(word_cts[doc_id]);

            # initialize gamma for this document
            gamma_values[doc_id, :] = self._alpha_alpha + 1.0 * total_word_count / self._number_of_topics;
            
            # term_ids = numpy.array(self._corpus[doc_id].keys());
            # term_counts = numpy.array([self._corpus[doc_id].values()]);
            term_ids = word_ids[doc_id];
            term_counts = word_cts[doc_id];
            assert term_counts.shape == (1, len(term_ids));

            # update phi and gamma until gamma converges
            for gamma_iteration in xrange(local_parameter_iteration):
                assert E_log_eta.shape == (self._number_of_topics, self._number_of_types);
                # log_phi = self._E_log_eta[:, term_ids].T + numpy.tile(scipy.special.psi(self._gamma[[doc_id], :]), (len(self._corpus[doc_id]), 1));
                log_phi = E_log_eta[:, term_ids].T + numpy.tile(scipy.special.psi(gamma_values[[doc_id], :]), (word_ids[doc_id].shape[0], 1));
                assert log_phi.shape == (len(term_ids), self._number_of_topics);
                # phi_normalizer = numpy.log(numpy.sum(numpy.exp(log_phi), axis=1)[:, numpy.newaxis]);
                # assert phi_normalizer.shape == (len(term_ids), 1);
                # log_phi -= phi_normalizer;
                log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis];
                assert log_phi.shape == (len(term_ids), self._number_of_topics);
                
                gamma_update = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi + numpy.log(term_counts.transpose())), axis=0));
                
                mean_change = numpy.mean(abs(gamma_update - gamma_values[doc_id, :]));
                gamma_values[doc_id, :] = gamma_update;
                if mean_change <= local_parameter_converge_threshold:
                    break;
            
            # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms involving \Psi(\gamma), are cancelled due to \gamma updates in E-step
            
            # compute the alpha terms
            document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
            # compute the gamma terms
            document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]));
            # compute the phi terms
            document_log_likelihood -= numpy.sum(numpy.dot(term_counts, numpy.exp(log_phi) * log_phi));
            
            # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta), are cancelled due to \eta updates in M-step
            if parsed_corpus != None:
                # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step during training
                words_log_likelihood += numpy.sum(numpy.exp(log_phi.T + numpy.log(term_counts)) * E_log_prob_eta[:, term_ids]);
            
            assert(log_phi.shape == (len(term_ids), self._number_of_topics));
            phi_sufficient_statistics[:, term_ids] += numpy.exp(log_phi + numpy.log(term_counts.transpose())).T;
            
            if (doc_id + 1) % 1000 == 0:
                print "successfully processed %d documents..." % (doc_id + 1);
        
        if parsed_corpus == None:
            self._gamma = gamma_values;
            return document_log_likelihood, phi_sufficient_statistics
        else:
            return words_log_likelihood, gamma_values
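
For reference, the coordinate-ascent updates this E-step implements (in log space, with logsumexp normalization) can be written, in the notation of the comments above, as

\[
\phi_{dnk} \;\propto\; \exp\!\left(\Psi(\gamma_{dk}) + \mathbb{E}_q[\log \eta_{k,w_{dn}}]\right),
\qquad
\gamma_{dk} \;=\; \alpha_{k} + \sum_{n} c_{dn}\,\phi_{dnk},
\]

where c_{dn} is the count of word w_{dn} in document d; the loop exits once the mean absolute change in gamma drops below local_parameter_converge_threshold.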
Example #8
    def e_step(self, parsed_corpus=None, number_of_samples=10, burn_in_samples=5):
        if parsed_corpus == None:
            documents = self._parsed_corpus
        else:
            documents = parsed_corpus
        
        number_of_documents = len(documents);

        document_log_likelihood = 0;
        words_log_likelihood = 0;

        # initialize a V-by-K matrix phi contribution
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_paths));
        
        # gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;
        gamma_values = self._alpha_alpha + 2.0 * self._number_of_paths / self._number_of_topics * numpy.random.random((number_of_documents, self._number_of_topics));

        E_log_eta = numpy.copy(self._var_beta);
        for internal_node_index in self._edges_from_internal_node:
            edge_index_list = self._edges_from_internal_node[internal_node_index];
            assert numpy.min(E_log_eta[:, edge_index_list]) >= 0;
            E_log_eta[:, edge_index_list] = compute_dirichlet_expectation(E_log_eta[:, edge_index_list]);
        del internal_node_index, edge_index_list;

        # iterate over all documents
        for document_index in xrange(number_of_documents):
            document_gamma = numpy.zeros(self._alpha_alpha.shape);
            
            topic_path_assignment = {};
            topic_sum = numpy.zeros((1, self._number_of_topics));
            for word_index in xrange(len(documents[document_index])):
                topic_assignment = numpy.random.randint(0, self._number_of_topics);
                path_assignment = numpy.random.randint(0, len(self._word_index_to_path_indices[documents[document_index][word_index]]));
                topic_path_assignment[word_index] = (topic_assignment, path_assignment);
                topic_sum[0, topic_assignment] += 1;
            del word_index, topic_assignment, path_assignment;

            # update path_phi and phi_sum until phi_sum converges
            for sample_index in xrange(number_of_samples):
                # document_phi = numpy.zeros((self._number_of_topics, self._number_of_paths));
                
                phi_entropy = 0;
                phi_E_log_eta = 0;
                
                for word_index in xrange(len(documents[document_index])):
                    word_id = documents[document_index][word_index];
                    topic_sum[0, topic_path_assignment[word_index][0]] -= 1;
                    
                    paths_lead_to_current_word = self._word_index_to_path_indices[word_id];
                    assert len(paths_lead_to_current_word) > 0
                    
                    # path_phi = numpy.tile(scipy.special.psi(self._gamma[[document_index], :]).T, (1, len(paths_lead_to_current_word)));
                    path_phi = numpy.tile((topic_sum + self._alpha_alpha).T, (1, len(paths_lead_to_current_word)));
                    assert path_phi.shape == (self._number_of_topics, len(paths_lead_to_current_word));
                    
                    for path_index in xrange(len(paths_lead_to_current_word)):
                        path_phi[:, path_index] *= numpy.exp(numpy.sum(E_log_eta[:, self._edges_along_path[paths_lead_to_current_word[path_index]]], axis=1));
                    del path_index
                    
                    assert path_phi.shape == (self._number_of_topics, len(paths_lead_to_current_word));
                    # normalize path_phi jointly over all topics and paths
                    path_phi /= numpy.sum(path_phi);
                    
                    # compute the phi terms
                    phi_entropy += -numpy.sum(path_phi * numpy.log(path_phi + 1e-100));
                    
                    random_number = numpy.random.random();
                    for topic_index in xrange(self._number_of_topics):
                        for path_index in xrange(len(paths_lead_to_current_word)):
                            random_number -= path_phi[topic_index, path_index];
                            if random_number <= 0:
                                break;
                        if random_number <= 0:
                            break;
                    topic_sum[0, topic_index] += 1;
                    topic_path_assignment[word_index] = (topic_index, path_index);
                    
                    if sample_index >= burn_in_samples:
                        phi_sufficient_statistics[topic_index, paths_lead_to_current_word[path_index]] += 1;
                    
                    for position_index in xrange(len(paths_lead_to_current_word)):
                        phi_E_log_eta += numpy.sum(path_phi[:, [position_index]] * numpy.sum(E_log_eta[:, self._edges_along_path[paths_lead_to_current_word[position_index]]], axis=1)[:, numpy.newaxis])
                    del position_index
                    
                del word_index, paths_lead_to_current_word
                
                if sample_index >= burn_in_samples:
                    document_gamma += self._alpha_alpha + topic_sum
                    
            # gamma_values[[document_index], :] = self._alpha_alpha + topic_sum;    
            gamma_values[[document_index], :] = document_gamma / (number_of_samples - burn_in_samples);
            
            # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms involving \Psi(\gamma), are cancelled due to \gamma updates in E-step
            # document_log_likelihood += numpy.sum((self._alpha_alpha - 1) * compute_dirichlet_expectation(gamma_values[[document_index], :]));
            # document_log_likelihood += numpy.sum(numpy.sum(document_phi, axis=1)[:, numpy.newaxis].T * compute_dirichlet_expectation(gamma_values[[document_index], :]));
            # document_log_likelihood += -numpy.sum((gamma_values[[document_index], :] - 1) * compute_dirichlet_expectation(gamma_values[[document_index], :]));
            
            # compute the alpha terms
            document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha));
            
            # compute the gamma terms
            document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[document_index, :])) - scipy.special.gammaln(numpy.sum(gamma_values[document_index, :]));
            
            # compute the phi terms
            # phi_entropy += -numpy.sum(path_phi * numpy.log(path_phi)) * documents[doc_id][word_id];
            document_log_likelihood += phi_entropy;
            
            # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta), are cancelled due to \eta updates in M-step
            if parsed_corpus != None:
                # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step during training
                words_log_likelihood += phi_E_log_eta;
            
            # phi_sufficient_statistics += document_phi;
        
            if (document_index + 1) % 1000 == 0:
                print "successfully processed %d documents..." % (document_index + 1);
                
            del document_index

        phi_sufficient_statistics /= (number_of_samples - burn_in_samples);
        assert phi_sufficient_statistics.shape == (self._number_of_topics, self._number_of_paths);
                
        if parsed_corpus == None:
            self._gamma = gamma_values;
            return document_log_likelihood, phi_sufficient_statistics
        else:
            return words_log_likelihood, gamma_values
Example #9
    def e_step(self,
               parsed_corpus_response=None,
               number_of_samples=10,
               burn_in_samples=5,
               approximate_phi=True):
        
        if parsed_corpus_response == None:
            word_idss = self._parsed_corpus;
            responses = self._responses
        else:
            word_idss, responses = parsed_corpus_response;
        
        number_of_documents = len(word_idss);
        
        document_log_likelihood = 0;
        words_log_likelihood = 0;
        
        # initialize a V-by-K matrix phi sufficient statistics
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types));
        E_A_sufficient_statistics = numpy.zeros((number_of_documents, self._number_of_topics))
        E_AA_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_topics))
        
        # initialize a D-by-K matrix gamma values
        gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;
        
        E_log_beta = compute_dirichlet_expectation(self._beta);
        assert E_log_beta.shape == (self._number_of_topics, self._number_of_types);
        if parsed_corpus_response != None:
            E_log_prob_eta = E_log_beta - scipy.misc.logsumexp(E_log_beta, axis=1)[:, numpy.newaxis]
        exp_E_log_beta = numpy.exp(E_log_beta);

        for doc_id in xrange(number_of_documents):
            
            phi = numpy.random.random((self._number_of_topics, len(word_idss[doc_id])));
            phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :];
            phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis];
            assert(phi_sum.shape == (self._number_of_topics, 1));
            
            document_phi = numpy.zeros((len(word_idss[doc_id]), self._number_of_topics));
            
            for iter in xrange(number_of_samples):
                for word_pos in xrange(len(word_idss[doc_id])):
                    word_id = word_idss[doc_id][word_pos];
                    
                    phi_sum -= phi[:, word_pos][:, numpy.newaxis];
                    
                    # clamp tiny negative values caused by floating-point error in the subtraction above; ideally, phi_sum becomes integer-valued after a few sweeps
                    phi_sum *= (phi_sum > 0);
                    # assert(numpy.all(phi_sum >= 0));

                    temp_phi = (phi_sum.T + self._alpha_alpha) * exp_E_log_beta[:, [word_id]].T;
                    assert(temp_phi.shape == (1, self._number_of_topics));
                    temp_phi /= numpy.sum(temp_phi);

                    # sample a topic for this word
                    temp_phi = numpy.random.multinomial(1, temp_phi[0])[:, numpy.newaxis];
                    assert(temp_phi.shape == (self._number_of_topics, 1));
                    
                    phi[:, word_pos][:, numpy.newaxis] = temp_phi;
                    phi_sum += temp_phi;

                    # discard the first few burn-in sweeps
                    if iter < burn_in_samples:
                        continue;
                    
                    phi_sufficient_statistics[:, word_id] += temp_phi[:, 0];
                    document_phi[word_pos, :] += temp_phi[:, 0];

            gamma_values[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :];
            # batch_document_topic_distribution[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :];
            
            document_phi /= (number_of_samples - burn_in_samples);
            # this is to prevent 0 during log()
            document_phi += 1e-100;
            assert document_phi.shape == (len(word_idss[doc_id]), self._number_of_topics);
            
            phi_mean = numpy.mean(document_phi, axis=0)
            assert phi_mean.shape == (self._number_of_topics,);
            
            # Note: all terms including E_q[p(\theta|\alpha)], i.e., terms involving \Psi(\gamma), are cancelled due to \gamma updates
            # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta), are cancelled due to \eta updates in M-step
            
            # compute the alpha terms
            document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
            # compute the gamma terms
            document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]));
            # compute the phi terms
            document_log_likelihood -= numpy.sum(numpy.log(document_phi) * document_phi);

            # compute the eta terms
            document_log_likelihood -= 0.5 * numpy.log(2 * numpy.pi * self._sigma_square)
            document_log_likelihood -= 0.5 * (responses[doc_id] ** 2 - 2 * responses[doc_id] * numpy.sum(self._eta[0, :] * phi_mean) + numpy.dot(numpy.dot(self._eta, numpy.dot(phi_mean[:, numpy.newaxis], phi_mean[numpy.newaxis, :])), self._eta.T)) / self._sigma_square
            
            # Note: all terms including E_q[p(\_eta | \_beta)], i.e., terms involving \Psi(\_eta), are cancelled due to \_eta updates in M-step
            if parsed_corpus_response != None:
                # compute the p(w_{dn} | z_{dn}, \_eta) terms, which will be cancelled during M-step during training
                words_log_likelihood += numpy.sum(phi.T * E_log_prob_eta[:, word_idss[doc_id]]);
            
            E_A_sufficient_statistics[doc_id, :] = phi_mean;
            E_AA_sufficient_statistics += numpy.dot(phi_mean[:, numpy.newaxis], phi_mean[numpy.newaxis, :])
            
            if (doc_id + 1) % 1000 == 0:
                print "successfully processed %d documents..." % (doc_id + 1);

        phi_sufficient_statistics /= (number_of_samples - burn_in_samples);
        
        # compute mean absolute error
        mean_absolute_error = numpy.abs(numpy.dot(E_A_sufficient_statistics, self._eta.T) - responses[:, numpy.newaxis]).sum()
        
        if parsed_corpus_response == None:
            self._gamma = gamma_values;
            return document_log_likelihood, phi_sufficient_statistics, E_A_sufficient_statistics, E_AA_sufficient_statistics
        else:
            return words_log_likelihood, gamma_values, numpy.dot(E_A_sufficient_statistics, self._eta.T)
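
On held-out data this e_step returns the response predictions numpy.dot(E_A_sufficient_statistics, self._eta.T) alongside gamma. A hedged usage sketch follows; model, heldout_word_idss, and heldout_responses are assumed names, and heldout_responses is assumed to be a 1-D numpy array:

import numpy

# Illustrative only; not part of the original class.
words_ll, gamma, predicted = model.e_step(
    parsed_corpus_response=(heldout_word_idss, heldout_responses))
# With a single regression response, predicted has shape (D, 1).
mean_absolute_error = numpy.abs(predicted[:, 0] - heldout_responses).mean()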
Example #10
    def approx_bound(self, docs, gamma):
        """
        Estimates the variational bound over *all documents* using only
        the documents passed in as "docs." gamma is the set of parameters
        to the variational distribution q(theta) corresponding to the
        set of documents passed in.

        The output of this function is going to be noisy, but can be
        useful for assessing convergence.
        """

        # This is to handle the case where someone just hands us a single
        # document, not in a list.
        if isinstance(docs, basestring):
            temp = list()
            temp.append(docs)
            docs = temp

        (wordids, wordcts) = self.parse_doc_list(docs)
        batch_size = len(docs)

        score = 0
        Elogtheta = compute_dirichlet_expectation(gamma)
        expElogtheta = numpy.exp(Elogtheta)

        # E[log p(docs | theta, beta)]
        for d in range(0, batch_size):
            gammad = gamma[d, :]
            ids = wordids[d]
            cts = numpy.array(wordcts[d])
            phinorm = numpy.zeros(len(ids))
            for i in range(0, len(ids)):
                temp = Elogtheta[d, :] + self._Elogbeta[:, ids[i]]
                tmax = max(temp)
                phinorm[i] = numpy.log(sum(numpy.exp(temp - tmax))) + tmax
            score += numpy.sum(cts * phinorm)
#             oldphinorm = phinorm
#             phinorm = n.dot(expElogtheta[d, :], self._exp_E_log_beta[:, ids])
#             print oldphinorm
#             print n.log(phinorm)
#             score += n.sum(cts * n.log(phinorm))

        # E[log p(theta | alpha) - log q(theta | gamma)]
        score += numpy.sum((self._alpha_theta - gamma) * Elogtheta)
        score += numpy.sum(gammaln(gamma) - gammaln(self._alpha_theta))
        score += sum(
            gammaln(self._alpha_theta * self._number_of_topics) -
            gammaln(numpy.sum(gamma, 1)))

        # Compensate for the subsampling of the population of documents
        score = score * self._number_of_documents / len(docs)

        # E[log p(beta | eta) - log q (beta | lambda)]
        score = score + numpy.sum(
            (self._alpha_eta - self._beta) * self._Elogbeta)
        score = score + numpy.sum(
            gammaln(self._beta) - gammaln(self._alpha_eta))
        score = score + numpy.sum(
            gammaln(self._alpha_eta * self._vocab_size) -
            gammaln(numpy.sum(self._beta, 1)))

        return (score)
Example #11
    def e_step(self, parsed_corpus=None, local_parameter_iteration=50, local_parameter_converge_threshold=1e-6):
        if parsed_corpus==None:
            word_ids = self._parsed_corpus[0];
            word_cts = self._parsed_corpus[1];
        else:
            word_ids = parsed_corpus[0]
            word_cts = parsed_corpus[1];
        
        assert len(word_ids)==len(word_cts);
        number_of_documents = len(word_ids);
        
        document_log_likelihood = 0;
        words_log_likelihood = 0;

        # initialize a V-by-K matrix phi sufficient statistics
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types));
        
        # initialize a D-by-K matrix gamma values
        gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;
        
        E_log_eta = compute_dirichlet_expectation(self._eta);
        assert E_log_eta.shape==(self._number_of_topics, self._number_of_types);
        if parsed_corpus!=None:
            E_log_prob_eta = E_log_eta-scipy.misc.logsumexp(E_log_eta, axis=1)[:, numpy.newaxis]
        
        # iterate over all documents
        #for doc_id in xrange(number_of_documents):
        for doc_id in numpy.random.permutation(number_of_documents):
            # compute the total number of words
            #total_word_count = self._corpus[doc_id].N()
            total_word_count = numpy.sum(word_cts[doc_id]);

            # initialize gamma for this document
            gamma_values[doc_id, :] = self._alpha_alpha + 1.0 * total_word_count / self._number_of_topics;
            
            #term_ids = numpy.array(self._corpus[doc_id].keys());
            #term_counts = numpy.array([self._corpus[doc_id].values()]);
            term_ids = word_ids[doc_id];
            term_counts = word_cts[doc_id];
            assert term_counts.shape == (1, len(term_ids));

            # update phi and gamma until gamma converges
            for gamma_iteration in xrange(local_parameter_iteration):
                assert E_log_eta.shape==(self._number_of_topics, self._number_of_types);
                #log_phi = self._E_log_eta[:, term_ids].T + numpy.tile(scipy.special.psi(self._gamma[[doc_id], :]), (len(self._corpus[doc_id]), 1));
                log_phi = E_log_eta[:, term_ids].T + numpy.tile(scipy.special.psi(gamma_values[[doc_id], :]), (word_ids[doc_id].shape[0], 1));
                assert log_phi.shape==(len(term_ids), self._number_of_topics);
                #phi_normalizer = numpy.log(numpy.sum(numpy.exp(log_phi), axis=1)[:, numpy.newaxis]);
                #assert phi_normalizer.shape == (len(term_ids), 1);
                #log_phi -= phi_normalizer;
                log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis];
                assert log_phi.shape==(len(term_ids), self._number_of_topics);
                
                gamma_update = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi + numpy.log(term_counts.transpose())), axis=0));
                
                mean_change = numpy.mean(abs(gamma_update - gamma_values[doc_id, :]));
                gamma_values[doc_id, :] = gamma_update;
                if mean_change <= local_parameter_converge_threshold:
                    break;
            
            # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms involving \Psi(\gamma), are cancelled due to \gamma updates in E-step

            
            # compute the alpha terms
            document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
            # compute the gamma terms
            document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]));
            # compute the phi terms
            document_log_likelihood -= numpy.sum(numpy.dot(term_counts, numpy.exp(log_phi) * log_phi));
            
            # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta), are cancelled due to \eta updates in M-step
            if parsed_corpus!=None:
                # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step during training
                words_log_likelihood += numpy.sum(numpy.exp(log_phi.T + numpy.log(term_counts)) * E_log_prob_eta[:, term_ids]);
            
            assert(log_phi.shape == (len(term_ids), self._number_of_topics));
            phi_sufficient_statistics[:, term_ids] += numpy.exp(log_phi + numpy.log(term_counts.transpose())).T;
            
            if (doc_id+1) % 1000==0:
                print "successfully processed %d documents..." % (doc_id+1);
        
        if parsed_corpus==None:
            self._gamma = gamma_values;
            return document_log_likelihood, phi_sufficient_statistics
        else:
            return words_log_likelihood, gamma_values
Example #12
    def e_step(self,
               parsed_corpus_labels=None,
               local_gamma_iteration=10,
               local_phi_iteration=10,
               local_parameter_converge_threshold=1e-6,
               approximate_phi=False):
        
        if parsed_corpus_labels == None:
            word_idss = self._parsed_corpus;
            label_idss = self._parsed_labels
        else:
            word_idss = parsed_corpus_labels;
            label_idss = None;

        number_of_documents = len(word_idss);
        
        document_log_likelihood = 0;
        words_log_likelihood = 0;
        
        # initialize a V-by-K matrix phi sufficient statistics
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types));
        E_A_sufficient_statistics = numpy.zeros((number_of_documents, self._number_of_topics))
        E_AA_sufficient_statistics = numpy.zeros((number_of_documents, self._number_of_topics, self._number_of_topics))
        
        # initialize a D-by-K matrix gamma values
        gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;
        
        E_log_beta = compute_dirichlet_expectation(self._beta);
        assert E_log_beta.shape == (self._number_of_topics, self._number_of_types);
        if parsed_corpus_labels != None:
            E_log_prob_eta = E_log_beta - scipy.misc.logsumexp(E_log_beta, axis=1)[:, numpy.newaxis]

        for doc_id in xrange(number_of_documents):
            total_word_count = len(word_idss[doc_id]);
            term_ids = word_idss[doc_id];
            if parsed_corpus_labels == None:
                label_ids = label_idss[doc_id];
            
            # initialize gamma for this document
            gamma_values[doc_id, :] = self._alpha_alpha + 1.0 * total_word_count / self._number_of_topics;
            
            log_phi = scipy.special.psi(gamma_values[doc_id, :][numpy.newaxis, :]) + E_log_beta[:, term_ids].T;
            log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis];
            assert log_phi.shape == (len(term_ids), self._number_of_topics);
            # phi = numpy.exp(log_phi);
            
            assert self._eta.shape == (self._number_of_labels, self._number_of_topics);
            auxilary_variables_per_label = numpy.zeros(len(self._index_to_label));
            # log_auxilary_variables_per_label_token = numpy.zeros((len(self._index_to_label), total_word_count));
            for label_index in self._index_to_label:
                log_sum_phi_exp_eta = scipy.misc.logsumexp(log_phi + self._eta[label_index, :][numpy.newaxis, :] / total_word_count, axis=1);
                assert log_sum_phi_exp_eta.shape == (len(term_ids),)
                # log_auxilary_variables_per_label_token[:, label_index] = log_sum_phi_exp_eta;
                auxilary_variables_per_label[label_index] = numpy.exp(numpy.sum(log_sum_phi_exp_eta));
            
            # update phi and gamma until gamma converges
            for gamma_iteration in xrange(local_gamma_iteration):
                if approximate_phi:
                    '''
                    phi = numpy.exp(log_phi);
                    assert phi.shape == (len(term_ids), self._number_of_topics);
                    
                    phi_sum = numpy.sum(phi, axis=0)[numpy.newaxis, :];
                    phi_sum_j = numpy.tile(phi_sum, (len(term_ids), 1));
                    phi_sum_j -= phi;
                    assert phi_sum_j.shape == (len(term_ids), self._number_of_topics);
                
                    # log_phi = self._E_log_eta[:, term_ids].T + numpy.tile(scipy.special.psi(self._gamma[[doc_id], :]), (len(self._corpus[doc_id]), 1));
                    # log_phi = E_log_beta[:, term_ids].T + numpy.tile(scipy.special.psi(gamma_values[[doc_id], :]), (word_ids[doc_id].shape[0], 1));
                    log_phi = scipy.special.psi(gamma_values[doc_id, :][numpy.newaxis, :]) + E_log_beta[:, term_ids].T;
                    assert log_phi.shape == (len(term_ids), self._number_of_topics);
                    
                    assert self._eta.shape == (1, self._number_of_topics);
                    
                    log_phi += ((label_idss[doc_id] / (total_word_count * self._sigma_square)) * self._eta)
                    assert log_phi.shape == (len(term_ids), self._number_of_topics);
                    
                    log_phi -= (numpy.dot(phi_sum_j, self._eta.T) * self._eta + 0.5 * (self._eta ** 2)) / ((numpy.float(total_word_count) ** 2.) * self._sigma_square)
                    assert log_phi.shape == (len(term_ids), self._number_of_topics);
                    
                    # phi_normalizer = numpy.log(numpy.sum(numpy.exp(log_phi), axis=1)[:, numpy.newaxis]);
                    # assert phi_normalizer.shape == (len(term_ids), 1);
                    # log_phi -= phi_normalizer;
                    log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis];
                    assert log_phi.shape == (len(term_ids), self._number_of_topics);
                    
                    gamma_update = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi), axis=0));
                    mean_change = numpy.mean(abs(gamma_update - gamma_values[doc_id, :]));
                    gamma_values[doc_id, :] = gamma_update;
                    if mean_change <= local_parameter_converge_threshold:
                        break;
                    '''
                    pass
                else:
                    old_gamma_values = gamma_values[doc_id, :].copy();

                    assert log_phi.shape == (len(term_ids), self._number_of_topics);
                    for term_pos in xrange(len(term_ids)):
                        term_id = term_ids[term_pos];
                        
                        h_vector = numpy.zeros(self._number_of_topics);
                        for label_index in self._index_to_label:
                            log_sum_phi_n_exp_eta = scipy.misc.logsumexp(log_phi[term_pos, :] + self._eta[label_index, :] / total_word_count);
                            sum_phi_n_exp_eta = numpy.exp(log_sum_phi_n_exp_eta);
                            # numpy.sum(log_auxilary_variables_per_label_token[:term_pos, label_index]) + numpy.sum(log_auxilary_variables_per_label_token[term_pos:, label_index])
                            auxilary_variables_per_label[label_index] /= sum_phi_n_exp_eta;
                            
                            h_vector += auxilary_variables_per_label[label_index] * numpy.exp(self._eta[label_index, :] / total_word_count)
                        
                        for phi_iteration in xrange(local_phi_iteration):
                            phi_n = numpy.exp(log_phi[term_pos, :]);
                            # log_phi = self._E_log_eta[:, term_ids].T + numpy.tile(scipy.special.psi(self._gamma[[doc_id], :]), (len(self._corpus[doc_id]), 1));
                            # log_phi = E_log_beta[:, term_ids].T + numpy.tile(scipy.special.psi(gamma_values[[doc_id], :]), (word_ids[doc_id].shape[0], 1));
                            log_phi_n = scipy.special.psi(gamma_values[doc_id, :]) + E_log_beta[:, term_id];
                            assert log_phi_n.shape == (self._number_of_topics,);

                            if parsed_corpus_labels == None:
                                log_phi_n += numpy.sum(self._eta[label_ids, :], axis=0) / total_word_count
                                log_phi_n -= len(label_ids) * h_vector / numpy.dot(h_vector, phi_n)
                            else:
                                log_phi_n -= h_vector / numpy.dot(h_vector, phi_n)
                            assert log_phi_n.shape == (self._number_of_topics,);
                            
                            log_phi_n -= scipy.misc.logsumexp(log_phi_n);
                            
                            log_phi[term_pos, :] = log_phi_n;
                        
                        for label_index in self._index_to_label:
                            log_sum_phi_n_exp_eta = scipy.misc.logsumexp(log_phi[term_pos, :] + self._eta[label_index, :] / total_word_count);
                            sum_phi_n_exp_eta = numpy.exp(log_sum_phi_n_exp_eta);
                            # numpy.sum(log_auxilary_variables_per_label_token[:term_pos, label_index]) + numpy.sum(log_auxilary_variables_per_label_token[term_pos:, label_index])
                            auxilary_variables_per_label[label_index] *= sum_phi_n_exp_eta;
                            
                        gamma_values[doc_id, :] = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi), axis=0));
                    mean_change = numpy.mean(abs(gamma_values[doc_id, :] - old_gamma_values));
                    if mean_change <= local_parameter_converge_threshold:
                        break;
                
                '''
                # TODO: We could also update the gamma after all phi updates.
                gamma_update = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi), axis=0));
                mean_change = numpy.mean(abs(gamma_update - gamma_values[doc_id, :]));
                gamma_values[doc_id, :] = gamma_update;
                if mean_change <= local_parameter_converge_threshold:
                    break;
                '''
        
            phi = numpy.exp(log_phi);
            assert phi.shape == (len(term_ids), self._number_of_topics);
            phi_mean = numpy.mean(phi, axis=0)
            assert phi_mean.shape == (self._number_of_topics,);
            
            # Note: all terms including E_q[p(\theta | \_alpha_alpha)], i.e., terms involving \Psi(\gamma), are cancelled due to \gamma updates in E-step
            
            # compute the _alpha_alpha terms
            document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
            # compute the gamma terms
            document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]));
            # compute the phi terms
            document_log_likelihood -= numpy.sum(phi * log_phi);
            # compute the eta terms
            if parsed_corpus_labels == None:
                document_log_likelihood += numpy.dot(numpy.sum(self._eta[label_ids, :], axis=0), phi_mean);
            document_log_likelihood -= numpy.log(numpy.sum(auxilary_variables_per_label))
            
            # Note: all terms including E_q[p(\_eta | \_beta)], i.e., terms involving \Psi(\_eta), are cancelled due to \_eta updates in M-step
            if parsed_corpus_labels != None:
                # compute the p(w_{dn} | z_{dn}, \_eta) terms, which will be cancelled during M-step during training
                words_log_likelihood += numpy.sum(phi.T * E_log_prob_eta[:, term_ids]);
                
            assert(phi.shape == (len(term_ids), self._number_of_topics));
            for term_pos in xrange(len(term_ids)):
                term_id = term_ids[term_pos];
                phi_sufficient_statistics[:, term_id] += phi[term_pos, :];
            # phi_sufficient_statistics[:, term_ids] += numpy.exp(log_phi + numpy.log(term_counts.transpose())).T;
            
            E_A_sufficient_statistics[doc_id, :] = phi_mean;
            E_AA_sufficient_statistics[doc_id, :, :] = numpy.dot(phi_mean[:, numpy.newaxis], phi_mean[numpy.newaxis, :]);
                
            if (doc_id + 1) % 10 == 0:
                print "successfully processed %d documents..." % (doc_id + 1);
            
        # compute mean absolute error
        # mean_absolute_error = numpy.abs(numpy.dot(E_A_sufficient_statistics, self._eta.T) - label_idss[:, numpy.newaxis]).sum()
        
        if parsed_corpus_labels == None:
            self._gamma = gamma_values;
            return document_log_likelihood, phi_sufficient_statistics, E_A_sufficient_statistics, E_AA_sufficient_statistics
        else:
            return words_log_likelihood, gamma_values, numpy.dot(E_A_sufficient_statistics, self._eta.T)
Example #13
    def e_step(self, parsed_corpus=None, number_of_samples=10, burn_in_samples=5):
        if parsed_corpus==None:
            word_idss = self._parsed_corpus;
        else:
            word_idss = parsed_corpus;
        number_of_documents = len(word_idss);
        
        E_log_eta = compute_dirichlet_expectation(self._eta)
        exp_E_log_eta = numpy.exp(E_log_eta);
        
        document_log_likelihood = 0;
        words_log_likelihood = 0;

        # initialize a V-by-K matrix phi contribution
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types));
        
        # initialize a D-by-K matrix gamma values
        gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;

        # iterate over all documents
        for doc_id in xrange(number_of_documents):
            phi = numpy.random.random((self._number_of_topics, len(word_idss[doc_id])));
            phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :];
            phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis];
            assert(phi_sum.shape == (self._number_of_topics, 1));
            
            document_phi = numpy.zeros((len(word_idss[doc_id]), self._number_of_topics));

            # collect phi samples from empirical distribution
            for it in xrange(number_of_samples):
                for word_pos in xrange(len(word_idss[doc_id])):
                    word_index = word_idss[doc_id][word_pos];
                    
                    phi_sum -= phi[:, word_pos][:, numpy.newaxis];
                    
                    # clamp tiny negative values caused by floating-point error in the subtraction above; ideally, phi_sum becomes integer-valued after a few sweeps
                    phi_sum *= (phi_sum > 0);
                    #assert(numpy.all(phi_sum >= 0));

                    temp_phi = (phi_sum.T + self._alpha_alpha) * exp_E_log_eta[:, [word_index]].T;
                    assert(temp_phi.shape == (1, self._number_of_topics));
                    temp_phi /= numpy.sum(temp_phi);

                    # sample a topic for this word
                    temp_phi = numpy.random.multinomial(1, temp_phi[0])[:, numpy.newaxis];
                    assert(temp_phi.shape == (self._number_of_topics, 1));
                    
                    phi[:, word_pos][:, numpy.newaxis] = temp_phi;
                    phi_sum += temp_phi;

                    # discard the first few burn-in sweeps
                    if it < burn_in_samples:
                        continue;
                    
                    phi_sufficient_statistics[:, word_index] += temp_phi[:, 0];
                    document_phi[word_pos, :] += temp_phi[:, 0];

            gamma_values[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :];
            #batch_document_topic_distribution[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :];
            
            document_phi /= (number_of_samples - burn_in_samples);
            # this is to prevent 0 during log()
            document_phi += 1e-100;
            
            # Note: all terms including E_q[p(\theta|\alpha)], i.e., terms involving \Psi(\gamma), are cancelled due to \gamma updates
            # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta), are cancelled due to \eta updates in M-step
            
            # compute the alpha terms
            document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
            # compute the gamma terms
            document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]));
            # compute the phi terms
            document_log_likelihood -= numpy.sum(numpy.log(document_phi) * document_phi);
            
            # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step
            words_log_likelihood += numpy.sum(document_phi * (E_log_eta[:, word_idss[doc_id]].T));
            
            if (doc_id+1) % 1000==0:
                print "successfully processed %d documents in hybrid mode..." % (doc_id+1);

        phi_sufficient_statistics /= (number_of_samples - burn_in_samples);
        
        if parsed_corpus==None:
            self._gamma = gamma_values;
            return document_log_likelihood, phi_sufficient_statistics
        else:
            return words_log_likelihood, gamma_values
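
When called on a held-out corpus, the e_step above returns words_log_likelihood together with gamma_values. A small sketch of the usual per-word perplexity computed from that value -- this helper is an assumption for illustration, not part of the class, and takes the token count from the parsed word-id lists:

import numpy

def per_word_perplexity(words_log_likelihood, word_idss):
    # Perplexity = exp(-log-likelihood / total number of held-out tokens).
    number_of_tokens = sum(len(word_ids) for word_ids in word_idss)
    return numpy.exp(-words_log_likelihood / number_of_tokens)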
Example #14
    def e_step(self, parsed_corpus=None, local_parameter_iteration=50, local_parameter_converge_threshold=1e-6):
        if parsed_corpus == None:
            documents = self._parsed_corpus
        else:
            documents = parsed_corpus
        
        number_of_documents = len(documents);

        document_log_likelihood = 0;
        words_log_likelihood = 0;

        # initialize a V-by-K matrix phi contribution
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_paths));
        
        # gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;
        gamma_values = self._alpha_alpha + 2.0 * self._number_of_paths / self._number_of_topics * numpy.random.random((number_of_documents, self._number_of_topics));

        E_log_eta = numpy.copy(self._var_beta);
        for internal_node_index in self._edges_from_internal_node:
            edge_index_list = self._edges_from_internal_node[internal_node_index];
            assert numpy.min(E_log_eta[:, edge_index_list]) >= 0;
            E_log_eta[:, edge_index_list] = compute_dirichlet_expectation(E_log_eta[:, edge_index_list]);
        del internal_node_index, edge_index_list;

        # iterate over all documents
        for doc_id in xrange(number_of_documents):
            # update phi and gamma until gamma converges
            for gamma_iteration in xrange(local_parameter_iteration):
                document_phi = numpy.zeros((self._number_of_topics, self._number_of_paths));
                
                phi_entropy = 0;
                phi_E_log_eta = 0;

                # E_log_theta = scipy.special.psi(self._gamma[[doc_id], :]).T;
                # assert E_log_theta.shape==(self._number_of_topics, 1);
                
                E_log_theta = compute_dirichlet_expectation(gamma_values[[doc_id], :]).T;
                assert E_log_theta.shape == (self._number_of_topics, 1);
                
                for word_id in documents[doc_id]:  # word_ids:
                    paths_lead_to_current_word = self._word_index_to_path_indices[word_id];
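                    # each word type maps to one or more root-to-leaf paths in the vocabulary
                    # tree; phi for this word is a joint distribution over (topic, path) pairs,
                    # hence the K-by-|paths| matrix built below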

                    # log_phi = numpy.tile(scipy.special.psi(self._gamma[[doc_id], :]).T, (1, len(paths_lead_to_current_word)));
                    log_phi = numpy.tile(E_log_theta, (1, len(paths_lead_to_current_word)));
                    assert log_phi.shape == (self._number_of_topics, len(paths_lead_to_current_word));

                    for position_index in xrange(len(paths_lead_to_current_word)):
                        log_phi[:, position_index] += numpy.sum(E_log_eta[:, self._edges_along_path[paths_lead_to_current_word[position_index]]], axis=1);
                    del position_index

                    # log_phi -= scipy.misc.logsumexp(log_phi, axis=0)[numpy.newaxis, :]
                    log_phi -= scipy.misc.logsumexp(log_phi)
                    path_phi = numpy.exp(log_phi)
                    
                    # compute the phi terms
                    phi_entropy += -numpy.sum(path_phi * numpy.log(path_phi + 1e-100)) * documents[doc_id][word_id];
                    
                    for position_index in xrange(len(paths_lead_to_current_word)):
                        phi_E_log_eta += documents[doc_id][word_id] * numpy.sum(path_phi[:, [position_index]] * numpy.sum(E_log_eta[:, self._edges_along_path[paths_lead_to_current_word[position_index]]], axis=1)[:, numpy.newaxis])
                    del position_index
                    
                    # multiply path_phi by the count of the current word
                    document_phi[:, paths_lead_to_current_word] += path_phi * documents[doc_id][word_id];
                
                del word_id, paths_lead_to_current_word
                # print doc_id, "before", self._gamma[[doc_id], :];
                gamma_values[[doc_id], :] = self._alpha_alpha + numpy.sum(document_phi, axis=1)[:, numpy.newaxis].T;
                
            # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms involving \Psi(\gamma), are cancelled due to \gamma updates in E-step
            # document_log_likelihood += numpy.sum((self._alpha_alpha - 1) * compute_dirichlet_expectation(gamma_values[[doc_id], :]));
            # document_log_likelihood += numpy.sum(numpy.sum(document_phi, axis=1)[:, numpy.newaxis].T * compute_dirichlet_expectation(gamma_values[[doc_id], :]));
            # document_log_likelihood += -numpy.sum((gamma_values[[doc_id], :] - 1) * compute_dirichlet_expectation(gamma_values[[doc_id], :]));
            
            # compute the alpha terms
            document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha));
            
            # compute the gamma terms
            document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]));
            
            # compute the phi terms
            document_log_likelihood += phi_entropy;
            
            # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta), are cancelled due to \eta updates in M-step
            if parsed_corpus != None:
                # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step during training
                words_log_likelihood += phi_E_log_eta;
            
            phi_sufficient_statistics += document_phi;
        
            if (doc_id + 1) % 1000 == 0:
                print "successfully processed %d documents..." % (doc_id + 1);
                
            del doc_id
        
        if parsed_corpus == None:
            self._gamma = gamma_values;
            return document_log_likelihood, phi_sufficient_statistics
        else:
            return words_log_likelihood, gamma_values
Beispiel #15
0
    def e_step(self, wordids):
        batchD = len(wordids)
        
        document_level_elbo = 0;

        sufficient_statistics = numpy.zeros((self._number_of_topics, self._vocab_size));

        # Initialize the variational distribution q(theta|gamma) for the mini-batch
        batch_document_topic_distribution = numpy.zeros((batchD, self._number_of_topics));

        # Now, for each document d update that document's gamma and phi
        for d in xrange(batchD):
            phi = numpy.random.random((self._number_of_topics, len(wordids[d])));
            phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :];
            phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis];
            assert(phi_sum.shape == (self._number_of_topics, 1));

            for it in xrange(self._number_of_samples):
                for n in xrange(len(wordids[d])):
                    id = wordids[d][n];
                    
                    phi_sum -= phi[:, n][:, numpy.newaxis];
                    
                    # clamp tiny negative values caused by floating-point error in the subtraction above; after the first full sweep phi holds one-hot samples, so phi_sum should be non-negative integer counts
                    phi_sum *= phi_sum > 0;
                    #assert(numpy.all(phi_sum >= 0));

                    temp_phi = (phi_sum + self._alpha_theta).T * self._exp_E_log_beta[:, wordids[d][n]];
                    assert(temp_phi.shape == (1, self._number_of_topics));
                    temp_phi /= numpy.sum(temp_phi);
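                    # the normalized temp_phi is the collapsed conditional
                    # p(z_dn = k | z_-dn) proportional to (N_dk^{-dn} + alpha_theta) * exp(E[log beta_{k, w_dn}]),
                    # where N_dk^{-dn} is phi_sum with the current word's assignment removed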

                    # sample a topic for this word
                    temp_phi = numpy.random.multinomial(1, temp_phi[0, :])[:, numpy.newaxis];
                    assert(temp_phi.shape == (self._number_of_topics, 1));
                    
                    phi[:, n][:, numpy.newaxis] = temp_phi;
                    phi_sum += temp_phi;

                    # discard the first few burn-in sweeps
                    if it < self._burn_in_sweeps:
                        continue;
                    
                    sufficient_statistics[:, id] += temp_phi[:, 0];
                    
            batch_document_topic_distribution[d, :] = self._alpha_theta + phi_sum.T[0, :];
            
            if self._compute_elbo:
                document_level_elbo += len(wordids[d]);

                gammad = batch_document_topic_distribution[d];
                document_level_elbo += numpy.sum((self._alpha_theta - gammad) * numpy.exp(compute_dirichlet_expectation(gammad)));
                document_level_elbo += numpy.sum(scipy.special.gammaln(gammad) - scipy.special.gammaln(self._alpha_theta));
                document_level_elbo += numpy.sum(scipy.special.gammaln(self._alpha_theta * self._number_of_topics) - scipy.special.gammaln(numpy.sum(gammad)));

        sufficient_statistics /= (self._number_of_samples - self._burn_in_sweeps);
        
        if self._compute_elbo:
            document_level_elbo *= self._number_of_documents / batchD;

        return batch_document_topic_distribution, sufficient_statistics, document_level_elbo
Beispiel #16
0
    def e_step(self,
               parsed_corpus_response=None,
               local_parameter_iteration=10,
               local_parameter_converge_threshold=1e-6,
               approximate_phi=False):
        
        if parsed_corpus_response == None:
            word_idss = self._parsed_corpus;
            responses = self._responses
        else:
            word_idss, responses = parsed_corpus_response;
        
        '''
        if parsed_corpus == None:
            word_ids = self._parsed_corpus[0];
            word_cts = self._parsed_corpus[1];
        else:
            word_ids = parsed_corpus[0];
            word_cts = parsed_corpus[1];
        assert len(word_ids) == len(word_cts);
        number_of_documents = len(word_ids);
        '''
        
        number_of_documents = len(word_idss);
        
        document_log_likelihood = 0;
        words_log_likelihood = 0;
        
        # initialize a K-by-V matrix of phi sufficient statistics
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types));
        E_A_sufficient_statistics = numpy.zeros((number_of_documents, self._number_of_topics))
        E_AA_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_topics))
        
        # initialize a D-by-K matrix gamma values
        gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;
        
        E_log_beta = compute_dirichlet_expectation(self._beta);
        assert E_log_beta.shape == (self._number_of_topics, self._number_of_types);
        if parsed_corpus_response != None:
            E_log_prob_eta = E_log_beta - scipy.misc.logsumexp(E_log_beta, axis=1)[:, numpy.newaxis]

        for doc_id in xrange(number_of_documents):
            '''
            total_word_count = numpy.sum(word_cts[doc_id]);
            term_ids = word_ids[doc_id];
            term_counts = word_cts[doc_id];
            assert term_counts.shape == (1, len(term_ids));
            '''
            
            total_word_count = len(word_idss[doc_id]);
            term_ids = word_idss[doc_id];
            
            # initialize gamma for this document
            gamma_values[doc_id, :] = self._alpha_alpha + 1.0 * total_word_count / self._number_of_topics;
            
            log_phi = scipy.special.psi(gamma_values[doc_id, :][numpy.newaxis, :]) + E_log_beta[:, term_ids].T;
            log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis];
            
            # update phi and gamma until gamma converges
            for gamma_iteration in xrange(local_parameter_iteration):
                if approximate_phi:
                    phi = numpy.exp(log_phi);
                    assert phi.shape == (len(term_ids), self._number_of_topics);
                    
                    phi_sum = numpy.sum(phi, axis=0)[numpy.newaxis, :];
                    phi_sum_j = numpy.tile(phi_sum, (len(term_ids), 1));
                    phi_sum_j -= phi;
                    assert phi_sum_j.shape == (len(term_ids), self._number_of_topics);
                
                    # log_phi = self._E_log_eta[:, term_ids].T + numpy.tile(scipy.special.psi(self._gamma[[doc_id], :]), (len(self._corpus[doc_id]), 1));
                    # log_phi = E_log_beta[:, term_ids].T + numpy.tile(scipy.special.psi(gamma_values[[doc_id], :]), (word_ids[doc_id].shape[0], 1));
                    log_phi = scipy.special.psi(gamma_values[doc_id, :][numpy.newaxis, :]) + E_log_beta[:, term_ids].T;
                    assert log_phi.shape == (len(term_ids), self._number_of_topics);
                    
                    assert self._eta.shape == (1, self._number_of_topics);
                    
                    log_phi += ((responses[doc_id] / (total_word_count * self._sigma_square)) * self._eta)
                    assert log_phi.shape == (len(term_ids), self._number_of_topics);
                    
                    log_phi -= (numpy.dot(phi_sum_j, self._eta.T) * self._eta + 0.5 * (self._eta ** 2)) / ((numpy.float(total_word_count) ** 2.) * self._sigma_square)
                    assert log_phi.shape == (len(term_ids), self._number_of_topics);
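                    # the two response-dependent corrections above appear to follow the supervised-LDA
                    # (regression response) phi update: the first term rewards topics aligned with the
                    # response y_d, the second comes from expanding E_q[(eta' z_bar_d)^2] using the other
                    # positions' phi (phi_sum_j) plus the 0.5 * eta^2 diagonal contribution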
                    
                    # phi_normalizer = numpy.log(numpy.sum(numpy.exp(log_phi), axis=1)[:, numpy.newaxis]);
                    # assert phi_normalizer.shape == (len(term_ids), 1);
                    # log_phi -= phi_normalizer;
                    log_phi -= scipy.misc.logsumexp(log_phi, axis=1)[:, numpy.newaxis];
                    assert log_phi.shape == (len(term_ids), self._number_of_topics);
                    
                    gamma_update = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi), axis=0));
                
                    mean_change = numpy.mean(abs(gamma_update - gamma_values[doc_id, :]));
                    gamma_values[doc_id, :] = gamma_update;
                    if mean_change <= local_parameter_converge_threshold:
                        break;
                else:
                    old_gamma_values = gamma_values[doc_id, :].copy();
                    
                    # assert phi.shape == (len(term_ids), self._number_of_topics);
                    for term_pos in xrange(len(term_ids)):
                        term_id = term_ids[term_pos];
                        
                        phi_sum_j = numpy.zeros(self._number_of_topics)
                        if term_pos > 0:
                            phi_sum_j += numpy.exp(scipy.misc.logsumexp(log_phi[:term_pos, :], axis=0));
                        if term_pos < len(term_ids) - 1:
                            phi_sum_j += numpy.exp(scipy.misc.logsumexp(log_phi[term_pos + 1:, :], axis=0));
                        assert phi_sum_j.shape == (self._number_of_topics,);
                        
                        # phi_sum = numpy.sum(phi, axis=0)[numpy.newaxis, :];
                        # phi_sum_j = numpy.tile(phi_sum, (len(term_ids), 1));
                        # phi_sum_j -= phi;
                        # assert phi_sum_j.shape == (len(term_ids), self._number_of_topics);
                        
                        # log_phi = self._E_log_eta[:, term_ids].T + numpy.tile(scipy.special.psi(self._gamma[[doc_id], :]), (len(self._corpus[doc_id]), 1));
                        # log_phi = E_log_beta[:, term_ids].T + numpy.tile(scipy.special.psi(gamma_values[[doc_id], :]), (word_ids[doc_id].shape[0], 1));
                        log_phi_j = scipy.special.psi(gamma_values[doc_id, :]) + E_log_beta[:, term_id];
                        assert log_phi_j.shape == (self._number_of_topics,);
                        
                        assert self._eta.shape == (1, self._number_of_topics);
                        
                        log_phi_j += ((responses[doc_id] / (total_word_count * self._sigma_square)) * self._eta[0, :])
                        assert log_phi_j.shape == (self._number_of_topics,);
                        
                        log_phi_j -= (numpy.sum(phi_sum_j * self._eta[0, :]) * self._eta[0, :] + 0.5 * (self._eta[0, :] ** 2)) / ((numpy.float(total_word_count) ** 2.) * self._sigma_square)
                        assert log_phi_j.shape == (self._number_of_topics,);
                        
                        # phi_normalizer = numpy.log(numpy.sum(numpy.exp(log_phi), axis=1)[:, numpy.newaxis]);
                        # assert phi_normalizer.shape == (len(term_ids), 1);
                        # log_phi -= phi_normalizer;
                        log_phi_j -= scipy.misc.logsumexp(log_phi_j);
                        assert log_phi_j.shape == (self._number_of_topics,);
                        
                        log_phi[term_pos, :] = log_phi_j;
                        
                        gamma_values[doc_id, :] = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi), axis=0));
                    mean_change = numpy.mean(abs(gamma_values[doc_id, :] - old_gamma_values));
                    if mean_change <= local_parameter_converge_threshold:
                        break;
                    
                '''
                # TODO: We could also update the gamma after all phi updates.
                gamma_update = self._alpha_alpha + numpy.array(numpy.sum(numpy.exp(log_phi), axis=0));
                
                mean_change = numpy.mean(abs(gamma_update - gamma_values[doc_id, :]));
                gamma_values[doc_id, :] = gamma_update;
                if mean_change <= local_parameter_converge_threshold:
                    break;
                '''
                        
            phi = numpy.exp(log_phi);
            assert phi.shape == (len(term_ids), self._number_of_topics);
            phi_mean = numpy.mean(phi, axis=0)
            assert phi_mean.shape == (self._number_of_topics,);
            
            # Note: all terms including E_q[p(\theta | \alpha)], i.e., terms involving \Psi(\gamma), are cancelled due to \gamma updates in E-step
            
            # compute the alpha terms
            document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
            # compute the gamma terms
            document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]));
            # compute the phi terms
            document_log_likelihood -= numpy.sum(phi * log_phi);
            # compute the eta terms
            document_log_likelihood -= 0.5 * numpy.log(2 * numpy.pi * self._sigma_square)
            document_log_likelihood -= 0.5 * (responses[doc_id] ** 2 - 2 * responses[doc_id] * numpy.sum(self._eta[0, :] * phi_mean) + numpy.dot(numpy.dot(self._eta, numpy.dot(phi_mean[:, numpy.newaxis], phi_mean[numpy.newaxis, :])), self._eta.T)) / self._sigma_square
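            # the two eta terms above are E_q[log p(y_d | z_bar_d, eta, sigma^2)] for a Gaussian response,
            # i.e. -0.5*log(2*pi*sigma^2) - (y_d^2 - 2*y_d*eta'E[z_bar_d] + eta'E[z_bar_d z_bar_d']eta) / (2*sigma^2),
            # with E[z_bar_d z_bar_d'] approximated here by the outer product of the phi means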
            
            # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta), are cancelled due to \eta updates in M-step
            if parsed_corpus_response != None:
                # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step during training
                words_log_likelihood += numpy.sum(phi.T * E_log_prob_eta[:, term_ids]);
                
            assert(phi.shape == (len(term_ids), self._number_of_topics));
            for term_pos in xrange(len(term_ids)):
                term_id = term_ids[term_pos];
                phi_sufficient_statistics[:, term_id] += phi[term_pos, :];
            # phi_sufficient_statistics[:, term_ids] += numpy.exp(log_phi + numpy.log(term_counts.transpose())).T;
            
            E_A_sufficient_statistics[doc_id, :] = phi_mean;
            E_AA_sufficient_statistics += numpy.dot(phi_mean[:, numpy.newaxis], phi_mean[numpy.newaxis, :])
            
            if (doc_id + 1) % 1000 == 0:
                print "successfully processed %d documents..." % (doc_id + 1);
            
        # compute mean absolute error
        mean_absolute_error = numpy.abs(numpy.dot(E_A_sufficient_statistics, self._eta.T) - responses[:, numpy.newaxis]).mean()
        
        if parsed_corpus_response == None:
            self._gamma = gamma_values;
            return document_log_likelihood, phi_sufficient_statistics, E_A_sufficient_statistics, E_AA_sufficient_statistics
        else:
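            # for held-out documents, also return the predicted responses E[z_bar_d]' eta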
            return words_log_likelihood, gamma_values, numpy.dot(E_A_sufficient_statistics, self._eta.T)
Beispiel #17
0
    def e_step(self,
               parsed_corpus=None,
               number_of_samples=10,
               burn_in_samples=5):
        if parsed_corpus == None:
            word_idss = self._parsed_corpus
        else:
            word_idss = parsed_corpus
        number_of_documents = len(word_idss)

        E_log_eta = compute_dirichlet_expectation(self._eta)
        exp_E_log_eta = numpy.exp(E_log_eta)

        document_log_likelihood = 0
        words_log_likelihood = 0

        # initialize a K-by-V matrix of phi sufficient statistics
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types))

        # initialize a D-by-K matrix gamma values
        gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics

        # iterate over all documents
        for doc_id in xrange(number_of_documents):
            phi = numpy.random.random((self._number_of_topics, len(word_idss[doc_id])))
            phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :]
            phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis]
            assert (phi_sum.shape == (self._number_of_topics, 1))

            document_phi = numpy.zeros((len(word_idss[doc_id]), self._number_of_topics))

            # collect phi samples from empirical distribution
            for it in xrange(number_of_samples):
                for word_pos in xrange(len(word_idss[doc_id])):
                    word_index = word_idss[doc_id][word_pos]

                    phi_sum -= phi[:, word_pos][:, numpy.newaxis]

                    # clamp tiny negative values caused by floating-point error in the subtraction above; after the first full sweep phi holds one-hot samples, so phi_sum should be non-negative integer counts
                    phi_sum *= (phi_sum > 0)
                    #assert(numpy.all(phi_sum >= 0));

                    temp_phi = (phi_sum.T + self._alpha_alpha) * exp_E_log_eta[:, [word_index]].T
                    assert (temp_phi.shape == (1, self._number_of_topics))
                    temp_phi /= numpy.sum(temp_phi)

                    # sample a topic for this word
                    temp_phi = numpy.random.multinomial(1, temp_phi[0])[:, numpy.newaxis]
                    assert (temp_phi.shape == (self._number_of_topics, 1))

                    phi[:, word_pos][:, numpy.newaxis] = temp_phi
                    phi_sum += temp_phi

                    # discard the first few burn-in sweeps
                    if it < burn_in_samples:
                        continue

                    phi_sufficient_statistics[:, word_index] += temp_phi[:, 0]
                    document_phi[word_pos, :] += temp_phi[:, 0]

            gamma_values[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :]
            #batch_document_topic_distribution[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :];

            document_phi /= (number_of_samples - burn_in_samples)
            # add a tiny constant to avoid log(0) below
            document_phi += 1e-100

            # Note: all terms including E_q[p(\theta|\alpha)], i.e., terms involving \Psi(\gamma), are cancelled due to \gamma updates
            # Note: all terms including E_q[p(\eta | \beta)], i.e., terms involving \Psi(\eta), are cancelled due to \eta updates in M-step

            # compute the alpha terms
            document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
            # compute the gamma terms
            document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]))
            # compute the phi terms
            document_log_likelihood -= numpy.sum(numpy.log(document_phi) * document_phi)

            # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step
            words_log_likelihood += numpy.sum(document_phi * (E_log_eta[:, word_idss[doc_id]].T))

            if (doc_id + 1) % 1000 == 0:
                print "successfully processed %d documents in hybrid mode..." % (doc_id + 1)

        phi_sufficient_statistics /= (number_of_samples - burn_in_samples)

        if parsed_corpus == None:
            self._gamma = gamma_values
            return document_log_likelihood, phi_sufficient_statistics
        else:
            return words_log_likelihood, gamma_values
Beispiel #18
0
    def e_step(self, wordids, wordcts):
        batch_size = len(wordids)

        document_level_elbo = 0

        # Initialize the variational distribution q(theta|gamma) for the mini-batch
        gamma = 1 * numpy.random.gamma(100., 1. / 100.,
                                       (batch_size, self._number_of_topics))
        exp_E_log_theta = numpy.exp(compute_dirichlet_expectation(gamma))

        sstats = numpy.zeros(self._beta.shape)
        # Now, for each document d update that document's gamma and phi
        meanchange = 0
        for d in range(0, batch_size):
            # These are mostly just shorthand (but might help cache locality)
            ids = wordids[d]
            cts = wordcts[d]
            gammad = gamma[d, :]
            exp_E_log_theta_d = exp_E_log_theta[d, :]
            exp_E_log_beta_d = self._exp_E_log_beta[:, ids]
            # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w. phi_norm is the normalizer.
            phi_norm = numpy.dot(exp_E_log_theta_d, exp_E_log_beta_d) + 1e-100
            # Iterate between gamma and phi until convergence
            for it in range(0, self._maximum_gamma_update_iteration):
                lastgamma = gammad
                # We represent phi implicitly to save memory and time. Substituting the value of the optimal phi back into the update for gamma gives this update. Cf. Lee&Seung 2001.
                gammad = self._alpha_theta + exp_E_log_theta_d * numpy.dot(
                    cts / phi_norm, exp_E_log_beta_d.T)
                exp_E_log_theta_d = numpy.exp(
                    compute_dirichlet_expectation(gammad))
                phi_norm = numpy.dot(exp_E_log_theta_d,
                                     exp_E_log_beta_d) + 1e-100
                # If gamma hasn't changed much, we're done.
                meanchange = numpy.mean(abs(gammad - lastgamma))
                if (meanchange < self._minimum_mean_change_threshold):
                    break
            gamma[d, :] = gammad
            # Contribution of document d to the expected sufficient statistics for the M step.
            sstats[:, ids] += numpy.outer(exp_E_log_theta_d.T, cts / phi_norm)

            if self._compute_elbo:
                document_level_elbo += numpy.sum(cts * phi_norm)

                # E[log p(theta | alpha) - log q(theta | gamma)]
                document_level_elbo += numpy.sum(
                    (self._alpha_theta - gammad) * exp_E_log_theta_d)
                document_level_elbo += numpy.sum(
                    scipy.special.gammaln(gammad) -
                    scipy.special.gammaln(self._alpha_theta))
                document_level_elbo += numpy.sum(
                    scipy.special.gammaln(self._alpha_theta *
                                          self._number_of_topics) -
                    scipy.special.gammaln(numpy.sum(gammad)))

        # This step finishes computing the sufficient statistics for the M step, so that sstats[k, w] = \sum_d n_{dw} * phi_{dwk} = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
        sstats = sstats * self._exp_E_log_beta

        if self._compute_elbo:
            document_level_elbo *= self._number_of_documents / batch_size

        return gamma, sstats, document_level_elbo
Beispiel #19
0
    def e_step(self):
        document_level_log_likelihood = 0;

        # initialize a K-by-P matrix of phi sufficient statistics (topics by paths)
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_paths));
        
        # iterate over all documents
        for doc_id in xrange(self._number_of_documents):
        #for doc_id in xrange(0, 1):
            # compute the total number of words
            #total_word_count = self._data[doc_id].N()

            # initialize gamma for this document
            #self._gamma[[doc_id], :] = self._alpha + 1.0 * total_word_count / self._number_of_topics;
            #self._gamma[[doc_id], :] = self._alpha + 2.0 * total_word_count / self._number_of_topics * numpy.random.random((1, self._number_of_topics));
            
            #word_ids = numpy.array(self._data[doc_id].keys());
            #word_counts = numpy.array([self._data[doc_id].values()]);
            #assert(word_counts.shape == (1, len(word_ids)));

            # update phi and gamma until gamma converges
            for gamma_iteration in xrange(self._gamma_maximum_iteration):
                document_phi = numpy.zeros((self._number_of_topics, self._number_of_paths));
                
                phi_entropy = 0;
                phi_E_log_beta = 0;

                #E_log_theta = scipy.special.psi(self._gamma[[doc_id], :]).T;
                #assert E_log_theta.shape==(self._number_of_topics, 1);
                
                E_log_theta = compute_dirichlet_expectation(self._gamma[[doc_id], :]).T;
                assert E_log_theta.shape==(self._number_of_topics, 1);

                for word_id in self._data[doc_id].keys():#word_ids:
                    paths_lead_to_current_word = self._word_index_to_path_indices[word_id];

                    #log_phi = numpy.tile(scipy.special.psi(self._gamma[[doc_id], :]).T, (1, len(paths_lead_to_current_word)));
                    log_phi = numpy.tile(E_log_theta, (1, len(paths_lead_to_current_word)));
                    assert log_phi.shape==(self._number_of_topics, len(paths_lead_to_current_word));

                    for position_index in xrange(len(paths_lead_to_current_word)):
                        log_phi[:, position_index] += numpy.sum(self._E_log_beta[:, self._edges_along_path[paths_lead_to_current_word[position_index]]], axis=1);
                    del position_index

                    # log normalize
                    # TODO: error with 2-level tree
                    #log_phi = log_normalize(log_phi)
                    
                    # convert it into normal scale
                    path_phi = numpy.exp(log_phi - numpy.max(log_phi));
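                    # subtracting the maximum before exponentiating prevents overflow; the constant
                    # cancels when path_phi is normalized below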
                    assert path_phi.shape==(self._number_of_topics, len(paths_lead_to_current_word));
                    assert numpy.min(path_phi)>=0, path_phi.T
                    assert numpy.sum(path_phi)>0, log_phi.T
                    
                    # normalize path_phi over all topics and paths
                    path_phi /= numpy.sum(path_phi);
                    #path_phi /= numpy.sum(path_phi, axis=1);
                    
                    phi_entropy += -numpy.sum(path_phi * numpy.log(path_phi + 1e-100)) * self._data[doc_id][word_id];
                    
                    for position_index in xrange(len(paths_lead_to_current_word)):
                        phi_E_log_beta += self._data[doc_id][word_id] * numpy.sum( path_phi[:, [position_index]] * numpy.sum(self._E_log_beta[:, self._edges_along_path[paths_lead_to_current_word[position_index]]], axis=1)[:, numpy.newaxis] )
                    del position_index
                    
                    # multiply path_phi by the count of the current word
                    path_phi *= self._data[doc_id][word_id];
                    
                    document_phi[:, paths_lead_to_current_word] += path_phi;
                
                del word_id, paths_lead_to_current_word
                #print doc_id, "before", self._gamma[[doc_id], :];
                self._gamma[[doc_id], :] = self._alpha + numpy.sum(document_phi, axis=1)[:, numpy.newaxis].T;
                
            # term 1
            document_level_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha)) - numpy.sum(scipy.special.gammaln(self._alpha));
            document_level_log_likelihood += numpy.sum((self._alpha - 1) * compute_dirichlet_expectation(self._gamma[[doc_id], :]));

            # term 2
            document_level_log_likelihood += numpy.sum(numpy.sum(document_phi, axis=1)[:, numpy.newaxis].T * compute_dirichlet_expectation(self._gamma[[doc_id], :]));
            
            # term 4
            document_level_log_likelihood += phi_E_log_beta;
            
            # term 5
            document_level_log_likelihood += - scipy.special.gammaln(numpy.sum(self._gamma[[doc_id], :])) + numpy.sum(scipy.special.gammaln(self._gamma[[doc_id], :]))
            document_level_log_likelihood += - numpy.sum( (self._gamma[[doc_id], :] - 1) * compute_dirichlet_expectation(self._gamma[[doc_id], :]) );
            
            # term 7
            document_level_log_likelihood += phi_entropy;
            
            phi_sufficient_statistics += document_phi;
        
            if (doc_id+1) % 1000==0:
                print "successfully processed %d documents..." % (doc_id+1);
                
            del doc_id
        
        return phi_sufficient_statistics, document_level_log_likelihood;
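Beispiel #19 normalizes path_phi in the exponentiated domain after subtracting the maximum, while Beispiel #14 does the equivalent step directly in log space via scipy.misc.logsumexp. A minimal, self-contained sketch of that log-space normalization (a hypothetical helper, not part of the snippets above):

import numpy
from scipy.misc import logsumexp  # scipy.special.logsumexp in newer SciPy releases

def log_normalize(log_phi):
    # subtract the log of the total unnormalized mass so that exp(log_phi) sums to one,
    # jointly over all topics and paths, without exponentiating the raw values first
    return log_phi - logsumexp(log_phi)

# usage: a 3-topics-by-2-paths block of unnormalized log weights
log_phi = numpy.log(numpy.array([[0.2, 0.1], [0.3, 0.1], [0.2, 0.1]]))
path_phi = numpy.exp(log_normalize(log_phi))
assert abs(numpy.sum(path_phi) - 1.0) < 1e-12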