def __init__(self, K, alpha, eta, tau0, kappa, sanity_check=False,
             parse=parse):
    """
    Arguments:
    K:     Number of topics.
    alpha: Hyperparameter for the prior on the weight vectors theta.
    eta:   Hyperparameter for the prior on the topics beta.
    tau0:  A (positive) learning parameter that downweights early
           iterations.
    kappa: Learning rate: the exponential decay rate. Should be in
           (0.5, 1.0] to guarantee asymptotic convergence. Note that if
           you pass in the same set of D documents every time and set
           kappa=0, this class can also be used to do batch VB.
    """
    if not isinstance(K, int):
        raise ParameterError('K (number of topics) must be an integer.')

    # Set the model-level parameters.
    self._K = K
    self._alpha = alpha
    self._eta = eta
    self._tau0 = tau0 + 1
    self._kappa = kappa
    self.sanity_check = sanity_check

    # Number of documents seen *so far*. Updated each time a new batch is
    # submitted.
    self._D = 0

    # Number of batches processed so far.
    self._batches_to_date = 0

    # Cache the wordids and wordcts for the most recent batch so they
    # don't have to be recalculated when computing perplexity.
    self.recentbatch = {'wordids': None, 'wordcts': None}

    # Initialize lambda as a DirichletWords object, which has a non-zero
    # probability for any character sequence, even those unseen.
    self._lambda = DirichletWords(self._K, sanity_check=self.sanity_check,
                                  initialize=True)
    self._lambda_mat = self._lambda.as_matrix()

    # Set the variational distribution q(beta|lambda).
    self._Elogbeta = self._lambda_mat          # num_topics x num_words
    self._expElogbeta = n.exp(self._Elogbeta)  # num_topics x num_words

    # Function used to normalize and parse document strings.
    self.parse = parse
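# A brief usage sketch (hypothetical: the enclosing class name `OnlineLDA`
# and the `document_batches` iterable are assumptions, not shown in this
# excerpt):
#
#   olda = OnlineLDA(K=10, alpha=0.1, eta=0.01, tau0=1024, kappa=0.7)
#   for batch in document_batches:  # each batch is a list of strings
#       gamma, new_lambda = olda.do_e_step(batch)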
def do_e_step(self, docs):
    """
    Given a mini-batch of documents, estimates the parameters gamma
    controlling the variational distribution over the topic weights for
    each document in the mini-batch.

    Arguments:
    docs: List of D documents. Each document must be represented as a
          string. (Word order is unimportant.) Any words not in the
          vocabulary will be ignored.

    Returns a tuple containing the estimated values of gamma, as well as
    the sufficient statistics needed to update lambda.
    """
    # Handle the case where the caller passes in a single document rather
    # than a list of documents.
    if isinstance(docs, str):
        docs = [docs, ]

    (wordids, wordcts) = self.parse_new_docs(docs)
    # Don't use len(docs) here: any empty documents are skipped in the
    # parse step above, so len(docs) can be larger than len(wordids).
    batchD = len(wordids)

    # Initialize the variational distribution q(theta|gamma) for the
    # mini-batch.
    gamma = n.random.gamma(100., 1. / 100., (batchD, self._K))  # batchD x K
    Elogtheta = dirichlet_expectation(gamma)                    # batchD x K
    expElogtheta = n.exp(Elogtheta)

    # Create a new_lambda to store the stats for this batch.
    new_lambda = DirichletWords(self._K, sanity_check=self.sanity_check)

    # Now, for each document d, update that document's gamma and phi.
    it = 0
    meanchange = 0
    for d in range(0, batchD):
        if d % 10 == 0:
            print 'Updating gamma and phi for document %d in batch' % d
        # These are mostly just shorthand (but might help cache locality).
        ids = wordids[d]
        cts = wordcts[d]
        gammad = gamma[d, :]
        Elogthetad = Elogtheta[d, :]        # K x 1
        expElogthetad = expElogtheta[d, :]  # K x 1, for this document

        # Make sure exp/Elogbeta is initialized for all the needed indices.
        self.Elogbeta_sizecheck(ids)
        expElogbetad = self._expElogbeta[:, ids]  # K x len(doc_vocab)

        # The optimal phi_{dwk} is proportional to
        # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
        phinorm = n.dot(expElogthetad, expElogbetad) + 1e-100

        # Iterate between gamma and phi until convergence.
        for it in range(0, 100):
            lastgamma = gammad
            # In these steps, phi is represented implicitly to save memory
            # and time. Substituting the value of the optimal phi back
            # into the update for gamma gives this update. Cf. Lee & Seung
            # (2001).
            gammad = self._alpha + expElogthetad * \
                n.dot(cts / phinorm, expElogbetad.T)
            Elogthetad = dirichlet_expectation(gammad)
            expElogthetad = n.exp(Elogthetad)
            phinorm = n.dot(expElogthetad, expElogbetad) + 1e-100
            # If gamma hasn't changed much, we're done. (meanchangethresh
            # is a convergence threshold defined elsewhere in this module.)
            meanchange = n.mean(abs(gammad - lastgamma))
            if meanchange < meanchangethresh:
                break
        gamma[d, :] = gammad

        # Contribution of document d to the expected sufficient statistics
        # for the M step. This updates the statistics only for words in
        # the ids list, with their respective counts in cts (also a list),
        # using the multiplying factor from self._expElogbeta.
        # lambda_stats is basically phi multiplied by the word counts, i.e.
        # lambda_stats_wk = n_dw * phi_dwk. The sum over documents shown
        # in equation (5) happens as each document is iterated over.
        # lambda_stats is K x len(ids), while the actual word ids can be
        # any integer, so we need a way to map word ids to their
        # lambda_stats (i.e. we can't just index into the lambda_stats
        # array using the wordid, because it could be out of range). So we
        # create lambda_data, a list of 2-tuples of length len(ids): the
        # first tuple item is the wordid, and the second is a numpy array
        # with the statistics for each topic for that word.
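        # In symbols (a restatement of the comment above, not new
        # behavior): the normalized phi is
        #   phi_dwk = expElogthetad_k * expElogbetad_kw / phinorm_w,
        # so
        #   lambda_stats[k, w] = cts[w] * phi_dwk
        #                      = expElogthetad_k * (cts[w] / phinorm_w)
        #                        * expElogbetad_kw,
        # which is exactly the outer product computed on the next line.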
        lambda_stats = n.outer(expElogthetad.T, cts / phinorm) * expElogbetad
        lambda_data = zip(ids, lambda_stats.T)
        for wordid, stats in lambda_data:
            word = self._lambda.dictionary(wordid)
            for topic in xrange(self._K):
                stats_wk = stats[topic]
                new_lambda.update_count(word, topic, stats_wk)

    return (gamma, new_lambda)
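# For reference: `dirichlet_expectation`, used in the E step above, is
# assumed to be defined elsewhere in this module. A minimal sketch of the
# standard implementation (as in Hoffman's onlineldavb.py), computing
# E[log(theta)] for theta ~ Dir(alpha), row-wise for matrix input:
#
#   from scipy.special import psi
#
#   def dirichlet_expectation(alpha):
#       if len(alpha.shape) == 1:
#           return psi(alpha) - psi(n.sum(alpha))
#       return psi(alpha) - psi(n.sum(alpha, 1))[:, n.newaxis]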