def vi(self, i, ids, cts, words_no, expElogbetad, no_iter=1000):
    alpha = self.G_0.G_0 * self.m_gamma
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
    counts = np.array(cts)
    for _ in range(no_iter):
        lastgamma = gamma
        gamma = alpha + expElogtheta * np.dot(counts / phinorm, expElogbetad.T)
        expElogtheta = np.exp(dirichlet_expectation(gamma))
        phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
        meanchange = mean_absolute_difference(gamma, lastgamma)
        if meanchange < meanchangethresh:
            break
    pro_mat = np.outer(expElogtheta.T, 1 / phinorm) * expElogbetad
    mat_z = my_multinomial(pro_mat)
    self.mat_z[i][self.effe_list] = mat_z
    self.mat_z_sum[i][self.effe_list] = np.dot(mat_z, counts)
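# The `vi` step above relies on two helpers that are not shown in this file:
# `my_multinomial` and `mean_absolute_difference` (the latter is sketched
# further below). A plausible sketch of `my_multinomial`, assuming it draws a
# one-hot topic indicator for each word (column) of the K x N matrix
# `pro_mat`, whose columns are already (approximately) normalized -- this is
# an assumption, not the original implementation:
import numpy as np


def my_multinomial(pro_mat, rng=np.random):
    """Hypothetical sketch: sample a one-hot topic assignment per column of `pro_mat`."""
    probs = pro_mat / pro_mat.sum(axis=0, keepdims=True)  # guard against round-off
    out = np.zeros_like(probs)
    for n in range(probs.shape[1]):
        out[rng.choice(probs.shape[0], p=probs[:, n]), n] = 1.0
    return out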
def testDirichletExpectation(self):
    # test dirichlet_expectation
    rs = self.random_state
    for dtype in [np.float16, np.float32, np.float64]:
        for i in range(self.num_runs):
            # 1 dimensional case
            input_1d = rs.uniform(.01, 10000, size=(self.num_topics,))
            known_good = dirichlet_expectation(input_1d)
            test_values = matutils.dirichlet_expectation(input_1d)
            msg = "dirichlet_expectation_1d failed for dtype={}".format(dtype)
            self.assertTrue(np.allclose(known_good, test_values), msg)

            # 2 dimensional case
            input_2d = rs.uniform(.01, 10000, size=(1, self.num_topics,))
            known_good = dirichlet_expectation(input_2d)
            test_values = matutils.dirichlet_expectation(input_2d)
            msg = "dirichlet_expectation_2d failed for dtype={}".format(dtype)
            self.assertTrue(np.allclose(known_good, test_values), msg)
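# For reference, the pure-NumPy `dirichlet_expectation` used as the known-good
# baseline in the test above is, in essence, psi(alpha) - psi(sum(alpha)); a
# minimal sketch matching the 1-d and 2-d behaviour exercised by the test:
import numpy as np
from scipy.special import psi  # the digamma function


def dirichlet_expectation(alpha):
    """For a vector theta ~ Dir(alpha), compute E[log theta] (row-wise for 2-d input)."""
    if len(alpha.shape) == 1:
        return psi(alpha) - psi(np.sum(alpha))
    return psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]


# Several snippets in this file also reference a module-level convergence
# constant; in Hoffman's onlineldavb.py it is:
meanchangethresh = 0.001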
def bound(self, corpus, gamma=None, subsample_ratio=1.0):
    """
    Estimate the variational bound of documents from `corpus`:
    E_q[log p(corpus)] - E_q[log q(corpus)]

    `gamma` are the variational parameters on topic weights for each `corpus`
    document (=2d matrix=what comes out of `inference()`).
    If not supplied, it will be inferred from the model.
    """
    score = 0.0
    _lambda = self.state.get_lambda()
    Elogbeta = dirichlet_expectation(_lambda)

    for d, doc in enumerate(corpus):  # stream the input doc-by-doc, in case it's too large to fit in RAM
        if d % self.chunksize == 0:
            logger.debug("bound: at document #%i", d)
        if gamma is None:
            gammad, _ = self.inference([doc])
        else:
            gammad = gamma[d]
        Elogthetad = dirichlet_expectation(gammad)

        # E[log p(doc | theta, beta)]
        score += sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)

        # E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector
        score += np.sum((self.alpha - gammad) * Elogthetad)
        score += np.sum(gammaln(gammad) - gammaln(self.alpha))
        score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gammad))

    # Compensate likelihood for when `corpus` above is only a sample of the whole corpus. This ensures
    # that the likelihood is always roughly on the same scale.
    score *= subsample_ratio

    # E[log p(beta | eta) - log q (beta | lambda)]; assumes eta is a scalar
    score += np.sum((self.eta - _lambda) * Elogbeta)
    score += np.sum(gammaln(_lambda) - gammaln(self.eta))

    if np.ndim(self.eta) == 0:
        sum_eta = self.eta * self.num_terms
    else:
        sum_eta = np.sum(self.eta)

    score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

    return score
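# Hypothetical usage of `bound`: the score is a log-likelihood lower bound in
# nats, so a per-word figure (and a perplexity-style number) can be derived as
# follows, assuming `model` is a trained LdaModel and `corpus` a BoW corpus:
corpus_words = sum(cnt for document in corpus for _, cnt in document)
per_word_bound = model.bound(corpus) / corpus_words
print("per-word bound %.4f (perplexity ~ %.1f)" % (per_word_bound, np.exp(-per_word_bound)))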
def delete_empty_list(self, delete_list):
    delete_no = np.sum(delete_list)
    delete_list2 = self.effe_list[delete_list]
    if delete_no != 0:
        self.m_K -= delete_no
        self.effe_list = self.effe_list[np.logical_not(delete_list)]
        self.m_lambda[delete_list2] = np.zeros_like(self.m_lambda[delete_list2])
        self.m_dir_exp_lambda[delete_list2] = np.exp(
            dirichlet_expectation(self.m_lambda[delete_list2] + self.m_beta))
        self.mat_phi[delete_list2] = np.zeros((delete_no, self.chunk_doc_no))
        self.G_0.G_0 = self.G_0.G_0[np.logical_not(delete_list)]
        self.G_0.m_K = int(len(self.G_0.G_0) - 1)
        self.G_0.G_0[0] = 1 - np.sum(self.G_0.G_0[1:])
        for i in range(self.chunk_doc_no):
            self.mat_z[i][delete_list2] = np.zeros_like(self.mat_z[i][delete_list2])
            self.mat_z_avrg[i][delete_list2] = np.zeros_like(self.mat_z_avrg[i][delete_list2])
def get_initial(self):
    self.G_0 = Global_Prior(self.m_alpha, self.m_K, self.m_gamma, self.m_D,
                            self.m_W, self.chunksize)
    self.effe_list = np.arange(self.m_K + 1)
    self.m_lambda = np.zeros((self.max_K + 1, self.m_W))
    self.m_dir_exp_lambda = np.exp(dirichlet_expectation(self.m_lambda + self.m_beta))
def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
    """Performs EM-iteration on a single document for calculation of likelihood for a maximum iteration of `max_iter`.

    Parameters
    ----------
    doc_word_ids : iterable of int
        Ids of the words that occur in the document.
    doc_word_counts : iterable of int
        Counts of those words in the document.
    alpha : numpy.ndarray
        Lda equivalent value of alpha.
    beta : numpy.ndarray
        Lda equivalent value of beta.
    max_iter : int, optional
        Maximum number of times the expectation will be maximised.

    Returns
    -------
    (float, numpy.ndarray)
        Computed (:math:`likelihood`, :math:`\\gamma`).

    """
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    betad = beta[:, doc_word_ids]
    phinorm = np.dot(expElogtheta, betad) + 1e-100
    counts = np.array(doc_word_counts)
    for _ in xrange(max_iter):
        lastgamma = gamma
        gamma = alpha + expElogtheta * np.dot(counts / phinorm, betad.T)
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        phinorm = np.dot(expElogtheta, betad) + 1e-100
        meanchange = np.mean(abs(gamma - lastgamma))
        if meanchange < meanchangethresh:
            break

    likelihood = np.sum(counts * np.log(phinorm))
    likelihood += np.sum((alpha - gamma) * Elogtheta)
    likelihood += np.sum(gammaln(gamma) - gammaln(alpha))
    likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma))

    return likelihood, gamma
def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
    r"""Performs EM-iteration on a single document for calculation of likelihood for a maximum iteration of `max_iter`.

    Parameters
    ----------
    doc_word_ids : iterable of int
        Ids of the words that occur in the document.
    doc_word_counts : iterable of int
        Counts of those words in the document.
    alpha : numpy.ndarray
        Lda equivalent value of alpha.
    beta : numpy.ndarray
        Lda equivalent value of beta.
    max_iter : int, optional
        Maximum number of times the expectation will be maximised.

    Returns
    -------
    (float, numpy.ndarray)
        Computed (:math:`likelihood`, :math:`\gamma`).

    """
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    betad = beta[:, doc_word_ids]
    phinorm = np.dot(expElogtheta, betad) + 1e-100
    counts = np.array(doc_word_counts)
    for _ in range(max_iter):
        lastgamma = gamma
        gamma = alpha + expElogtheta * np.dot(counts / phinorm, betad.T)
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        phinorm = np.dot(expElogtheta, betad) + 1e-100
        meanchange = mean_absolute_difference(gamma, lastgamma)
        if meanchange < meanchangethresh:
            break

    likelihood = np.sum(counts * np.log(phinorm))
    likelihood += np.sum((alpha - gamma) * Elogtheta)
    likelihood += np.sum(gammaln(gamma) - gammaln(alpha))
    likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma))

    return likelihood, gamma
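# A small smoke test for `lda_e_step`, assuming `dirichlet_expectation`,
# `mean_absolute_difference` and `meanchangethresh` are in scope (see the
# sketches elsewhere in this file): 3 topics, a 5-word vocabulary, one toy
# document. The ids/counts below are made up for illustration.
rng = np.random.RandomState(0)
K, V = 3, 5
alpha = np.full(K, 1.0 / K)
beta = rng.dirichlet(np.ones(V), size=K)  # each row is a topic-word distribution
likelihood, gamma = lda_e_step([0, 2, 4], [2, 1, 3], alpha, beta)
print(likelihood, gamma / gamma.sum())  # normalized gamma ~ topic proportions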
def update_eta(self, lambdat, rho):
    """
    Update parameters for the Dirichlet prior on the per-topic
    word weights `eta` given the last `lambdat`.
    """
    N = float(lambdat.shape[0])
    logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))

    self.eta = update_dir_prior(self.eta, N, logphat, rho)

    return self.eta
def lda_e_step(ids, cts, alpha, expElogbetad, max_iter=1000):
    """
    Run the variational E-step on a single document and return its
    normalized topic proportions, gamma / sum(gamma).
    """
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
    counts = np.array(cts)
    for _ in range(max_iter):
        lastgamma = gamma
        gamma = alpha + expElogtheta * np.dot(counts / phinorm, expElogbetad.T)
        expElogtheta = np.exp(dirichlet_expectation(gamma))
        phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
        meanchange = mean_absolute_difference(gamma, lastgamma)
        if meanchange < meanchangethresh:
            break
    return gamma / np.sum(gamma)
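# The stopping test above uses `mean_absolute_difference`, gensim's tiny
# matutils helper (Cython-accelerated there); it is equivalent to:
def mean_absolute_difference(a, b):
    """Mean absolute difference between two arrays of equal shape."""
    return np.mean(np.abs(a - b))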
def update_alpha(self, gammat, rho):
    """
    Update parameters for the Dirichlet prior on the per-document
    topic weights `alpha` given the last `gammat`.
    """
    N = float(len(gammat))
    logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N

    self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
    logger.info("optimized alpha %s", list(self.alpha))

    return self.alpha
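# Both `update_alpha` and `update_eta` delegate to `update_dir_prior`, the
# Newton step for maximum-likelihood Dirichlet parameters (Huang: "Maximum
# Likelihood Estimation of Dirichlet Distribution Parameters"). A sketch of
# gensim's implementation; the rho-scaled Newton update is only applied when
# it keeps the prior positive:
from scipy.special import polygamma, psi


def update_dir_prior(prior, N, logphat, rho):
    """One Newton step on a Dirichlet prior, given mean sufficient statistics `logphat`."""
    gradf = N * (psi(np.sum(prior)) - psi(prior) + logphat)
    c = N * polygamma(1, np.sum(prior))
    q = -N * polygamma(1, prior)
    b = np.sum(gradf / q) / (1 / c + np.sum(1 / q))
    dprior = -(gradf - b) / q
    if all(rho * dprior + prior > 0):
        prior += rho * dprior
    else:
        logger.warning("updated prior not positive, keeping old prior")
    return prior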
def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    betad = beta[:, doc_word_ids]
    phinorm = np.dot(expElogtheta, betad) + 1e-100
    counts = np.array(doc_word_counts)
    for _ in xrange(max_iter):
        lastgamma = gamma
        gamma = alpha + expElogtheta * np.dot(counts / phinorm, betad.T)
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        phinorm = np.dot(expElogtheta, betad) + 1e-100
        meanchange = np.mean(abs(gamma - lastgamma))
        if meanchange < meanchangethresh:
            break

    likelihood = np.sum(counts * np.log(phinorm))
    likelihood += np.sum((alpha - gamma) * Elogtheta)
    likelihood += np.sum(gammaln(gamma) - gammaln(alpha))
    likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma))

    return likelihood, gamma
def update_lambda(self, rhot):
    self.m_lambda[self.effe_list] -= rhot * self.m_lambda[self.effe_list]
    for i in range(self.chunk_doc_no):
        ids = self.chunk_doc_word_ids_list[i]
        cts = self.chunk_doc_word_counts_list[i]
        self.m_lambda[np.ix_(self.effe_list, ids)] += \
            rhot * (self.m_D / self.chunksize) * \
            (np.tile(cts, (self.m_K + 1, 1)) * self.mat_z_avrg[i][self.effe_list])
    self.m_dir_exp_lambda[self.effe_list] = np.exp(
        dirichlet_expectation(self.m_lambda[self.effe_list] + self.m_beta))
def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
    """
    Estimate the variational bound of documents from `corpus`:
    E_q[log p(corpus)] - E_q[log q(corpus)]

    There are basically two use cases of this method:
    1. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided,
       indicating the indexes of the documents in the training corpus.
    2. `chunk` is a test set (held-out data), and `author2doc` and `doc2author`
       corresponding to this test set are provided. There must not be any new authors
       passed to this method. `chunk_doc_idx` is not needed in this case.

    To obtain the per-word bound, compute:

    >>> corpus_words = sum(cnt for document in corpus for _, cnt in document)
    >>> model.bound(corpus, author2doc=author2doc, doc2author=doc2author) / corpus_words

    """
    # TODO: enable evaluation of documents with new authors. One could, for example, make it
    # possible to pass a list of documents to self.inference with no author dictionaries,
    # assuming all the documents correspond to one (unseen) author, learn the author's
    # gamma, and return gamma (without adding it to self.state.gamma). Of course,
    # collect_sstats should be set to false, so that the model is not updated w.r.t. these
    # new documents.

    _lambda = self.state.get_lambda()
    Elogbeta = dirichlet_expectation(_lambda)
    expElogbeta = np.exp(Elogbeta)

    gamma = self.state.gamma

    if author2doc is None and doc2author is None:
        # Evaluating on training documents (chunk of self.corpus).
        author2doc = self.author2doc
        doc2author = self.doc2author

        if not chunk_doc_idx:
            # If author2doc and doc2author are not provided, chunk is assumed to be a subset of
            # self.corpus, and chunk_doc_idx is thus required.
            raise ValueError(
                'Either author dictionaries or chunk_doc_idx must be provided. '
                'Consult documentation of bound method.'
            )
    elif author2doc is not None and doc2author is not None:
        # Training on held-out documents (documents not seen during training).
        # All authors in dictionaries must still be seen during training.
        for a in author2doc.keys():
            if not self.author2doc.get(a):
                raise ValueError('bound cannot be called with authors not seen during training.')

        if chunk_doc_idx:
            raise ValueError(
                'Either author dictionaries or chunk_doc_idx must be provided, not both. '
                'Consult documentation of bound method.'
            )
    else:
        raise ValueError(
            'Either both author2doc and doc2author should be provided, or neither. '
            'Consult documentation of bound method.'
        )

    Elogtheta = dirichlet_expectation(gamma)
    expElogtheta = np.exp(Elogtheta)

    word_score = 0.0
    theta_score = 0.0
    for d, doc in enumerate(chunk):
        if chunk_doc_idx:
            doc_no = chunk_doc_idx[d]
        else:
            doc_no = d
        # Get all authors in current document, and convert the author names to integer IDs.
        authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
        ids = np.array([id for id, _ in doc])  # Word IDs in doc.
        cts = np.array([cnt for _, cnt in doc])  # Word counts.

        if d % self.chunksize == 0:
            logger.debug("bound: at document #%i in chunk", d)

        # Computing the bound requires summing over expElogtheta[a, k] * expElogbeta[k, v], which
        # is the same computation as in normalizing phi.
        phinorm = self.compute_phinorm(expElogtheta[authors_d, :], expElogbeta[:, ids])
        word_score += np.log(1.0 / len(authors_d)) * sum(cts) + cts.dot(np.log(phinorm))

    # Compensate likelihood for when `chunk` above is only a sample of the whole corpus. This ensures
    # that the likelihood is always roughly on the same scale.
    word_score *= subsample_ratio

    # E[log p(theta | alpha) - log q(theta | gamma)]
    for a in author2doc.keys():
        a = self.author2id[a]
        theta_score += np.sum((self.alpha - gamma[a, :]) * Elogtheta[a, :])
        theta_score += np.sum(gammaln(gamma[a, :]) - gammaln(self.alpha))
        theta_score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gamma[a, :]))

    # theta_score is rescaled in a similar fashion.
    # TODO: treat this in a more general way, similar to how it is done with word_score.
    theta_score *= self.num_authors / len(author2doc)

    # E[log p(beta | eta) - log q (beta | lambda)]
    beta_score = 0.0
    beta_score += np.sum((self.eta - _lambda) * Elogbeta)
    beta_score += np.sum(gammaln(_lambda) - gammaln(self.eta))
    sum_eta = np.sum(self.eta)
    beta_score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

    total_score = word_score + theta_score + beta_score

    return total_score
def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, chunk_doc_idx=None):
    """
    Given a chunk of sparse document vectors, update gamma (parameters
    controlling the topic weights) for each author corresponding to the
    documents in the chunk.

    The whole input chunk of documents is assumed to fit in RAM; chunking of
    a large corpus must be done earlier in the pipeline.

    If `collect_sstats` is True, also collect sufficient statistics needed
    to update the model's topic-word distributions, and return a 2-tuple
    `(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`.
    `gamma_chunk` is of shape `len(chunk_authors) x self.num_topics`, where
    `chunk_authors` is the number of authors in the documents in the
    current chunk.

    Avoids computing the `phi` variational parameter directly using the
    optimization presented in **Lee, Seung: Algorithms for non-negative matrix
    factorization, NIPS 2001**.
    """
    try:
        len(chunk)
    except TypeError:
        # convert iterators/generators to plain list, so we have len() etc.
        chunk = list(chunk)
    if len(chunk) > 1:
        logger.debug("performing inference on a chunk of %i documents", len(chunk))

    # Initialize the variational distribution q(theta|gamma) for the chunk
    if collect_sstats:
        sstats = np.zeros_like(self.expElogbeta)
    else:
        sstats = None
    converged = 0

    # Stack all the computed gammas into this output array.
    gamma_chunk = np.zeros((0, self.num_topics))

    # Now, for each document d update gamma and phi w.r.t. all authors in those documents.
    for d, doc in enumerate(chunk):
        if chunk_doc_idx is not None:
            doc_no = chunk_doc_idx[d]
        else:
            doc_no = d
        # Get the IDs and counts of all the words in the current document.
        # TODO: this is duplication of code in LdaModel. Refactor.
        if doc and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
            # make sure the term IDs are ints, otherwise np will get upset
            ids = [int(idx) for idx, _ in doc]
        else:
            ids = [idx for idx, _ in doc]
        cts = np.array([cnt for _, cnt in doc])

        # Get all authors in current document, and convert the author names to integer IDs.
        authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]

        gammad = self.state.gamma[authors_d, :]  # gamma of document d before update.
        tilde_gamma = gammad.copy()  # gamma that will be updated.

        # Compute the expectation of the log of the Dirichlet parameters theta and beta.
        Elogthetad = dirichlet_expectation(tilde_gamma)
        expElogthetad = np.exp(Elogthetad)
        expElogbetad = self.expElogbeta[:, ids]

        # Compute the normalizing constant of phi for the current document.
        phinorm = self.compute_phinorm(expElogthetad, expElogbetad)

        # Iterate between gamma and phi until convergence
        for _ in xrange(self.iterations):
            lastgamma = tilde_gamma.copy()

            # Update gamma.
            # phi is computed implicitly below,
            for ai, a in enumerate(authors_d):
                tilde_gamma[ai, :] = self.alpha + len(self.author2doc[self.id2author[a]]) * \
                    expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T)

            # Update gamma.
            # Interpolation between document d's "local" gamma (tilde_gamma),
            # and "global" gamma (gammad).
            tilde_gamma = (1 - rhot) * gammad + rhot * tilde_gamma

            # Update Elogtheta and Elogbeta, since gamma and lambda have been updated.
            Elogthetad = dirichlet_expectation(tilde_gamma)
            expElogthetad = np.exp(Elogthetad)

            # Update the normalizing constant in phi.
            phinorm = self.compute_phinorm(expElogthetad, expElogbetad)

            # Check for convergence.
            # Criterion is mean change in "local" gamma.
            meanchange_gamma = np.mean(abs(tilde_gamma - lastgamma))
            gamma_condition = meanchange_gamma < self.gamma_threshold
            if gamma_condition:
                converged += 1
                break
        # End of iterations loop.

        # Store the updated gammas in the model state.
        self.state.gamma[authors_d, :] = tilde_gamma

        # Stack the new gammas into the output array.
        gamma_chunk = np.vstack([gamma_chunk, tilde_gamma])

        if collect_sstats:
            # Contribution of document d to the expected sufficient
            # statistics for the M step.
            expElogtheta_sum_a = expElogthetad.sum(axis=0)
            sstats[:, ids] += np.outer(expElogtheta_sum_a.T, cts / phinorm)

    if len(chunk) > 1:
        logger.debug(
            "%i/%i documents converged within %i iterations",
            converged, len(chunk), self.iterations
        )

    if collect_sstats:
        # This step finishes computing the sufficient statistics for the
        # M step, so that
        # sstats[k, w] = \sum_d n_{dw} * \sum_a phi_{dwak}
        #             = \sum_d n_{dw} * exp{Elogtheta_{ak} + Elogbeta_{kw}} / phinorm_{dw}.
        sstats *= self.expElogbeta
    return gamma_chunk, sstats
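# `compute_phinorm` is not shown in this file; in the author-topic model the
# normalizer of phi sums expElogtheta over a document's authors before the
# usual dot product with expElogbeta. A minimal sketch consistent with the
# two-argument call sites above:
def compute_phinorm(self, expElogthetad, expElogbetad):
    """Normalizer of phi for one document, summed over the document's authors."""
    expElogtheta_sum = expElogthetad.sum(axis=0)
    phinorm = expElogtheta_sum.dot(expElogbetad) + 1e-100
    return phinorm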
def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None,
             chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
             alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
             gamma_threshold=0.001, serialized=False, serialization_path=None,
             minimum_probability=0.01, random_state=None):
    """
    If the iterable corpus and one of author2doc/doc2author dictionaries are given,
    start training straight away. If not given, the model is left untrained
    (presumably because you want to call the `update` method manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `author2doc` is a dictionary where the keys are the names of authors, and the
    values are lists of documents that the author contributes to.

    `doc2author` is a dictionary where the keys are document IDs (indexes to corpus)
    and the values are lists of author names. I.e. this is the reverse mapping of
    `author2doc`. Only one of the two, `author2doc` and `doc2author`, has to be supplied.

    `passes` is the number of times the model makes a pass over the entire training data.

    `iterations` is the maximum number of times the model loops over each document
    (M-step). The iterations stop when convergence is reached.

    `chunksize` controls the size of the mini-batches.

    `alpha` and `eta` are hyperparameters that affect sparsity of the author-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics prior.

    `alpha` can be set to an explicit array = prior of your choice. It also
    supports special values of 'asymmetric' and 'auto': the former uses a fixed
    normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
    prior directly from your data.

    `eta` can be a scalar for a symmetric prior over topic/word
    distributions, or a vector of shape num_words, which can be used to
    impose (user defined) asymmetric priors over the word distribution.
    It also supports the special value 'auto', which learns an asymmetric
    prior over words directly from your data. `eta` can also be a matrix
    of shape num_topics x num_words, which can be used to impose
    asymmetric priors over the word distribution on a per-topic basis
    (can not be learned from data).

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates. Set to None to disable perplexity estimation.

    `decay` and `offset` parameters are the same as Kappa and Tau_0 in
    Hoffman et al, respectively. `decay` controls how quickly old documents are
    forgotten, while `offset` down-weights early iterations.

    `minimum_probability` controls filtering the topics returned for a document (bow).

    `random_state` can be an integer or a numpy.random.RandomState object. Set the
    state of the random number generator inside the author-topic model, to ensure
    reproducibility of your experiments, for example.

    `serialized` indicates whether the input corpora to the model are simple
    in-memory lists (`serialized = False`) or saved to the hard-drive
    (`serialized = True`). Note that this behaviour is quite different from
    other Gensim models. If your data is too large to fit in to memory, use
    this functionality. Note that calling `AuthorTopicModel.update` with new data
    may be cumbersome as it requires all the existing data to be re-serialized.

    `serialization_path` must be set to a filepath, if `serialized = True` is used.
    Use, for example, `serialization_path = /tmp/serialized_model.mm` or use your
    working directory by setting `serialization_path = serialized_model.mm`. An existing
    file *cannot* be overwritten; either delete the old file or choose a different
    name.

    Example:

    >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word)  # train model
    >>> model.update(corpus2)  # update the author-topic model with additional documents

    >>> model = AuthorTopicModel(
    ...     corpus, num_topics=50, author2doc=author2doc, id2word=id2word,
    ...     alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
    # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.
    distributed = False
    self.dispatcher = None
    self.numworkers = 1

    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError(
            "at least one of corpus/id2word must be specified, to establish input space dimensionality"
        )

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError("cannot compute the author-topic model over an empty collection (no terms)")

    logger.info('Vocabulary consists of %d words.', self.num_terms)

    self.author2doc = {}
    self.doc2author = {}

    self.distributed = distributed
    self.num_topics = num_topics
    self.num_authors = 0
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0
    self.total_docs = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    self.author2id = {}
    self.id2author = {}

    self.serialized = serialized
    if serialized and not serialization_path:
        raise ValueError(
            "If serialized corpora are used, the path to a folder where the corpus "
            "should be saved must be provided (serialization_path)."
        )
    if serialized and serialization_path:
        assert not isfile(serialization_path), \
            "A file already exists at the serialization_path path; " \
            "choose a different serialization_path, or delete the file."
    self.serialization_path = serialization_path

    # Initialize an empty self.corpus.
    self.init_empty_corpus()

    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

    assert self.alpha.shape == (self.num_topics,), \
        "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

    if isinstance(eta, six.string_types):
        if eta == 'asymmetric':
            raise ValueError("The 'asymmetric' option cannot be used for eta")

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

    self.random_state = utils.get_random_state(random_state)

    assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
        "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
        (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
    )

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # Initialize the variational distributions q(beta|lambda) and q(theta|gamma)
    self.state = AuthorTopicState(self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics))
    self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None and (author2doc is not None or doc2author is not None):
        use_numpy = self.dispatcher is not None
        self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy)
def update_doc(self, i, max_iter=500):
    self.mat_z[i] = np.zeros(((self.max_K + 1), self.chunk_doc_word_no[i]))
    self.mat_z_avrg[i] = np.copy(self.mat_z[i])
    self.mat_z_sum[i] = np.zeros((self.max_K + 1))
    ids = self.chunk_doc_word_ids_list[i]
    cts = self.chunk_doc_word_counts_list[i]
    words_no = self.chunk_doc_word_no[i]
    expElogbetad = self.m_dir_exp_lambda[np.ix_(self.effe_list, ids)]
    self.vi(i, ids, cts, words_no, expElogbetad, no_iter=1000)
    self.gibbs_samplings(i, ids, cts, words_no, expElogbetad, max_iter=10)
    it = 2
    aver_sum = np.copy(self.mat_z_sum[i])
    aver_phi = digamma(self.G_0.G_0 * self.m_gamma + self.mat_z_sum[i][self.effe_list])
    while it < max_iter:
        last_aver_sum = np.copy(aver_sum)
        self.gibbs_samplings(i, ids, cts, words_no, expElogbetad, max_iter=1)
        # Running (iterate-averaged) estimates over the Gibbs samples.
        self.mat_z_avrg[i] -= 1 / it * (self.mat_z_avrg[i] - self.mat_z[i])
        aver_sum -= 1 / it * (last_aver_sum - self.mat_z_sum[i])
        aver_phi -= 1 / it * (aver_phi - digamma(self.G_0.G_0 * self.m_gamma + self.mat_z_sum[i][self.effe_list]))
        it += 1
        meanchange = mean_absolute_difference(
            aver_sum[self.effe_list], last_aver_sum[self.effe_list]) / np.sum(cts)
        if meanchange < meanchangethresh:
            break
    self.mat_phi[self.effe_list, i] = aver_phi - digamma(self.G_0.G_0 * self.m_gamma)
    if np.sum(self.mat_z_avrg[i][0]) > 0:
        # Words were assigned to the "new topic" slot 0: promote it to a real topic.
        add_vector = self.mat_z_sum[i][0]
        add_no = 1
        add_list = ids
        self.m_K += add_no
        new_k = find_gap_in_np_array(self.effe_list, add_no)
        self.effe_list = np.sort(self.effe_list.tolist() + new_k)
        self.mat_z_avrg[i][new_k] = self.mat_z_avrg[i][0]
        self.mat_z_avrg[i][0] = np.zeros_like(self.mat_z_avrg[i][0])
        self.mat_z[i][new_k] = self.mat_z[i][0]
        self.mat_z[i][0] = np.zeros_like(self.mat_z[i][0])
        self.mat_phi[new_k, i] = self.mat_phi[0, i]
        self.mat_phi[0, i] = np.zeros_like(self.mat_phi[0, i])
        self.G_0.add_new(add_no)
        self.m_lambda[np.ix_(new_k, add_list)] += self.rhot * self.m_D / self.chunksize * \
            np.array(cts) * self.mat_z_avrg[i][new_k]
        self.m_dir_exp_lambda[new_k] = np.exp(
            dirichlet_expectation(self.m_lambda[new_k] + self.m_beta))
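# `find_gap_in_np_array` above is assumed to return `n` free topic indices not
# currently present in the sorted index array (slots vacated by deleted
# topics). A plausible sketch, not the original implementation:
def find_gap_in_np_array(arr, n):
    """Hypothetical sketch: return `n` indices missing from the sorted array `arr`."""
    candidates = np.setdiff1d(np.arange(arr.max() + n + 1), arr)
    return candidates[:n].tolist()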
def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
             chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
             gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
             outputdir=None, random_state=None):
    """
    Parameters
    ----------
    corpus : iterable of list of (int, float)
        Corpus in BoW format.
    id2word : :class:`~gensim.corpora.dictionary.Dictionary`
        Dictionary for the input corpus.
    max_chunks : int, optional
        Upper bound on how many chunks to process. It wraps around corpus beginning in another corpus pass,
        if there are not enough chunks in the corpus.
    max_time : int, optional
        Upper bound on time (in seconds) for which model will be trained.
    chunksize : int, optional
        Number of documents in one chunk.
    kappa : float, optional
        Learning parameter which acts as exponential decay factor to influence extent of learning from each batch.
    tau : float, optional
        Learning parameter which down-weights early iterations of documents.
    K : int, optional
        Second level truncation level.
    T : int, optional
        Top level truncation level.
    alpha : int, optional
        Second level concentration.
    gamma : int, optional
        First level concentration.
    eta : float, optional
        The topic Dirichlet.
    scale : float, optional
        Weights information from the mini-chunk of corpus to calculate rhot.
    var_converge : float, optional
        Lower bound on the right side of convergence. Used when updating variational parameters
        for a single document.
    outputdir : str, optional
        Stores topic and options information in the specified directory.
    random_state : {None, int, array_like, :class:`~np.random.RandomState`}, optional
        Adds a little random jitter to randomize results around same alpha when trying to fetch a closest
        corresponding lda model from :meth:`~gensim.models.hdpmodel.HdpModel.suggested_lda_model`.

    """
    self.corpus = corpus
    self.id2word = id2word
    self.chunksize = chunksize
    self.max_chunks = max_chunks
    self.max_time = max_time
    self.outputdir = outputdir

    self.random_state = utils.get_random_state(random_state)

    self.lda_alpha = None
    self.lda_beta = None

    self.m_W = len(id2word)
    self.m_D = 0
    if corpus:
        self.m_D = len(corpus)

    self.m_T = T
    self.m_K = K
    self.m_alpha = alpha
    self.m_gamma = gamma

    self.m_var_sticks = np.zeros((2, T - 1))
    self.m_var_sticks[0] = 1.0
    self.m_var_sticks[1] = range(T - 1, 0, -1)
    self.m_varphi_ss = np.zeros(T)

    self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
    self.m_eta = eta
    self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)

    self.m_tau = tau + 1
    self.m_kappa = kappa
    self.m_scale = scale
    self.m_updatect = 0
    self.m_status_up_to_date = True
    self.m_num_docs_processed = 0

    self.m_timestamp = np.zeros(self.m_W, dtype=int)
    self.m_r = [0]
    self.m_lambda_sum = np.sum(self.m_lambda, axis=1)

    self.m_var_converge = var_converge

    if self.outputdir:
        self.save_options()

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        self.update(corpus)
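# The `scale`/`tau`/`kappa` parameters above define the usual online-VB step
# size of Hoffman et al.; each global update in the HDP model uses, in essence:
def rho(scale, tau, kappa, updatect):
    """Step size rho_t = scale * (tau + t)^(-kappa) for update number t."""
    return scale * pow(tau + updatect, -kappa)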
def get_Elogbeta(self):
    return dirichlet_expectation(self.get_lambda())
def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=None,
             chunk_size=2000, passes=1, update_every=1, alpha='symmetric',
             eta=None, decay=0.5, offset=1.0, evaluate_every=10, iterations=200,
             gamma_threshold=0.001, min_prob=0.01, random_state=None, ns_conf={},
             min_phi_val=0.01, per_word_topics=False):
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError('At least one of corpus/id2word must be specified.')
    if self.id2word is None:
        logger.warning('No word-id mapping provided; initializing from corpus, assuming identity')
        self.id2word = corpus_handle.dict_from_corpus(corpus)
        self.num_items = len(self.id2word)
    elif len(self.id2word) > 0:
        self.num_items = 1 + max(self.id2word.keys())
    else:
        self.num_items = 0
    if self.num_items == 0:
        raise ValueError("Cannot compute LDA over an empty collection (no items)")
    self.distributed = distributed
    self.num_topics = num_topics
    self.chunk_size = chunk_size
    self.decay = decay
    self.offset = offset
    self.min_prob = min_prob
    self.num_updates = 0
    self.passes = passes
    self.update_every = update_every
    self.evaluate_every = evaluate_every
    self.min_phi_val = min_phi_val
    self.alpha, self.optimize_alpha = self.init_dirichlet_prior(alpha, 'alpha')
    assert self.alpha.shape == (self.num_topics,), \
        "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
    if isinstance(eta, six.string_types):
        if eta == 'asymmetric':
            raise ValueError("The 'asymmetric' option cannot be used for eta")
    self.eta, self.optimize_eta = self.init_dirichlet_prior(eta, 'eta')
    self.random_state = corpus_handle.get_random_state(random_state)
    assert (self.eta.shape == (self.num_items,) or self.eta.shape == (self.num_topics, self.num_items)), (
        "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
        (str(self.eta.shape), self.num_items, self.num_topics, self.num_items)
    )
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold
    if not distributed:
        logger.info("Using serial LDA version on this node.")
        self.dispatcher = None
        self.num_workers = 1
    else:
        pass
    self.state = LDAState(self.eta, (self.num_topics, self.num_items))
    self.state.s_stats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_items))
    self.expElogbeta = np.exp(dirichlet_expectation(self.state.s_stats))
    # if a training corpus was provided, start estimating the model right away.
    if corpus is not None:
        use_numpy = self.dispatcher is not None
        self.update(corpus, chunk_as_numpy=use_numpy)
def inference(self, chunk, collect_sstats=False):
    """
    Given a chunk of sparse document vectors, estimate gamma (parameters
    controlling the topic weights) for each document in the chunk.

    This function does not modify the model (=is read-only aka const). The
    whole input chunk of documents is assumed to fit in RAM; chunking of a
    large corpus must be done earlier in the pipeline.

    If `collect_sstats` is True, also collect sufficient statistics needed
    to update the model's topic-word distributions, and return a 2-tuple
    `(gamma, sstats)`. Otherwise, return `(gamma, None)`. `gamma` is of shape
    `len(chunk) x self.num_topics`.

    Avoids computing the `phi` variational parameter directly using the
    optimization presented in **Lee, Seung: Algorithms for non-negative matrix
    factorization, NIPS 2001**.
    """
    try:
        _ = len(chunk)
    except TypeError:
        # convert iterators/generators to plain list, so we have len() etc.
        chunk = list(chunk)
    if len(chunk) > 1:
        logger.debug("performing inference on a chunk of %i documents", len(chunk))

    # Initialize the variational distribution q(theta|gamma) for the chunk
    gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
    Elogtheta = dirichlet_expectation(gamma)
    expElogtheta = np.exp(Elogtheta)
    if collect_sstats:
        sstats = np.zeros_like(self.expElogbeta)
    else:
        sstats = None
    converged = 0

    # Now, for each document d update that document's gamma and phi
    # Inference code copied from Hoffman's `onlineldavb.py` (esp. the
    # Lee&Seung trick which speeds things up by an order of magnitude, compared
    # to Blei's original LDA-C code, cool!).
    for d, doc in enumerate(chunk):
        if len(doc) > 0 and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
            # make sure the term IDs are ints, otherwise np will get upset
            ids = [int(id) for id, _ in doc]
        else:
            ids = [id for id, _ in doc]
        cts = np.array([cnt for _, cnt in doc])
        gammad = gamma[d, :]
        Elogthetad = Elogtheta[d, :]
        expElogthetad = expElogtheta[d, :]
        expElogbetad = self.expElogbeta[:, ids]

        # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
        # phinorm is the normalizer.
        # TODO treat zeros explicitly, instead of adding 1e-100?
        phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100

        # Iterate between gamma and phi until convergence
        for _ in xrange(self.iterations):
            lastgamma = gammad
            # We represent phi implicitly to save memory and time.
            # Substituting the value of the optimal phi back into
            # the update for gamma gives this update. Cf. Lee&Seung 2001.
            gammad = self.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
            Elogthetad = dirichlet_expectation(gammad)
            expElogthetad = np.exp(Elogthetad)
            phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
            # If gamma hasn't changed much, we're done.
            meanchange = np.mean(abs(gammad - lastgamma))
            if meanchange < self.gamma_threshold:
                converged += 1
                break
        gamma[d, :] = gammad
        if collect_sstats:
            # Contribution of document d to the expected sufficient
            # statistics for the M step.
            sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)

    if len(chunk) > 1:
        logger.debug("%i/%i documents converged within %i iterations", converged, len(chunk), self.iterations)

    if collect_sstats:
        # This step finishes computing the sufficient statistics for the
        # M step, so that
        # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
        #             = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
        sstats *= self.expElogbeta
    return gamma, sstats
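# Hypothetical usage of `inference`: a read-only E-step on two toy BoW
# documents, assuming `lda` is a trained model instance. Normalizing each row
# of gamma gives the per-document topic mixture:
gamma, _ = lda.inference([[(0, 2.0), (3, 1.0)], [(1, 4.0)]])
theta = gamma / gamma.sum(axis=1, keepdims=True)  # per-document topic proportions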
import torch
from torch.autograd import Variable

print change_index_result.shape
'''
gamma_test = -1 * Variable(torch.from_numpy(np.random.rand(3, 6)))
para_test = Variable(torch.from_numpy(np.random.rand(6, 4)))
print gamma_test
print gamma_test.abs()
print gamma_test.abs().sum(dim=1).view(-1, 1)
print gamma_test.abs() / (gamma_test.abs().sum(dim=1).view(-1, 1))
print para_test
mum = gamma_test.abs().mm(para_test)
print mum
print 'max: '
max = mum.max(dim=1)[0]
print max
sum = max.sum()
print sum
re = Variable(torch.DoubleTensor([1.0])) / sum
print re
'''
a = Variable(torch.FloatTensor([1.1]))
print a
print psi([1.0])
# print psi(a)
print matutils.dirichlet_expectation(np.array([1.0]))
print matutils.dirichlet_expectation(a.data)
print str('"yang"').replace('"', '')
def __init__(self, corpus=None, num_topics=100, id2word=None,
             distributed=False, chunksize=2000, passes=1, update_every=1,
             alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
             iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
             random_state=None, ns_conf={}, minimum_phi_value=0.01,
             per_word_topics=False):
    """
    If given, start training from the iterable `corpus` straight away. If not given,
    the model is left untrained (presumably because you want to call `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics prior.

    `alpha` can be set to an explicit array = prior of your choice. It also
    supports special values of 'asymmetric' and 'auto': the former uses a fixed
    normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
    prior directly from your data.

    `eta` can be a scalar for a symmetric prior over topic/word
    distributions, or a vector of shape num_words, which can be used to
    impose (user defined) asymmetric priors over the word distribution.
    It also supports the special value 'auto', which learns an asymmetric
    prior over words directly from your data. `eta` can also be a matrix
    of shape num_topics x num_words, which can be used to impose
    asymmetric priors over the word distribution on a per-topic basis
    (can not be learned from data).

    Turn on `distributed` to force distributed computing (see the `web tutorial
    <http://radimrehurek.com/gensim/distributed.html>`_ on how to set up a
    cluster of machines for gensim).

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates (setting this to 1 slows down training ~2x;
    default is 10 for better performance). Set to None to disable perplexity estimation.

    `decay` and `offset` parameters are the same as Kappa and Tau_0 in
    Hoffman et al, respectively.

    `minimum_probability` controls filtering the topics returned for a document (bow).

    `random_state` can be a np.random.RandomState object or the seed for one.

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)  # train model
    >>> print(lda[doc_bow])  # get topic probability distribution for a document
    >>> lda.update(corpus2)  # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every
    self.minimum_phi_value = minimum_phi_value
    self.per_word_topics = per_word_topics

    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

    assert self.alpha.shape == (self.num_topics,), \
        "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

    if isinstance(eta, six.string_types):
        if eta == 'asymmetric':
            raise ValueError("The 'asymmetric' option cannot be used for eta")

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

    self.random_state = utils.get_random_state(random_state)

    assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
        "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
        (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
    )

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        if self.optimize_alpha:
            raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
        # set up distributed version
        try:
            import Pyro4
            with utils.getNS(**ns_conf) as ns:
                from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
                self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
                logger.debug("looking for dispatcher at %s", str(self.dispatcher._pyroUri))
                self.dispatcher.initialize(
                    id2word=self.id2word, num_topics=self.num_topics,
                    chunksize=chunksize, alpha=alpha, eta=eta, distributed=False
                )
                self.numworkers = len(self.dispatcher.getworkers())
                logger.info("using distributed version with %i workers", self.numworkers)
        except Exception as err:
            logger.error("failed to initialize distributed LDA (%s)", err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

    # Initialize the variational distribution q(beta|lambda)
    self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
    self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        use_numpy = self.dispatcher is not None
        self.update(corpus, chunks_as_numpy=use_numpy)
def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, chunk_doc_idx=None):
    """
    Given a chunk of sparse document vectors, update gamma (parameters
    controlling the topic weights) for each author corresponding to the
    documents in the chunk.

    The whole input chunk of documents is assumed to fit in RAM; chunking of
    a large corpus must be done earlier in the pipeline.

    If `collect_sstats` is True, also collect sufficient statistics needed
    to update the model's topic-word distributions, and return a 2-tuple
    `(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`.
    `gamma_chunk` is of shape `len(chunk_authors) x self.num_topics`, where
    `chunk_authors` is the number of authors in the documents in the
    current chunk.

    Avoids computing the `phi` variational parameter directly using the
    optimization presented in **Lee, Seung: Algorithms for non-negative matrix
    factorization, NIPS 2001**.
    """
    try:
        _ = len(chunk)
    except TypeError:
        # convert iterators/generators to plain list, so we have len() etc.
        chunk = list(chunk)
    if len(chunk) > 1:
        logger.debug("performing inference on a chunk of %i documents", len(chunk))

    # Initialize the variational distribution q(theta|gamma) for the chunk
    if collect_sstats:
        sstats = np.zeros_like(self.expElogbeta)
    else:
        sstats = None
    converged = 0

    # Stack all the computed gammas into this output array.
    gamma_chunk = np.zeros((0, self.num_topics))

    # Now, for each document d update gamma and phi w.r.t. all authors in those documents.
    for d, doc in enumerate(chunk):
        if chunk_doc_idx is not None:
            doc_no = chunk_doc_idx[d]
        else:
            doc_no = d
        # Get the IDs and counts of all the words in the current document.
        # TODO: this is duplication of code in LdaModel. Refactor.
        if doc and not isinstance(doc[0][0], six.integer_types):
            # make sure the term IDs are ints, otherwise np will get upset
            ids = [int(id) for id, _ in doc]
        else:
            ids = [id for id, _ in doc]
        cts = np.array([cnt for _, cnt in doc])

        # Get all authors in current document, and convert the author names to integer IDs.
        authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]

        gammad = self.state.gamma[authors_d, :]  # gamma of document d before update.
        tilde_gamma = gammad.copy()  # gamma that will be updated.

        # Compute the expectation of the log of the Dirichlet parameters theta and beta.
        Elogthetad = dirichlet_expectation(tilde_gamma)
        expElogthetad = np.exp(Elogthetad)
        expElogbetad = self.expElogbeta[:, ids]

        # Compute the normalizing constant of phi for the current document.
        phinorm = self.compute_phinorm(ids, authors_d, expElogthetad, expElogbetad)

        # Iterate between gamma and phi until convergence
        for iteration in xrange(self.iterations):
            lastgamma = tilde_gamma.copy()

            # Update gamma.
            # phi is computed implicitly below,
            for ai, a in enumerate(authors_d):
                tilde_gamma[ai, :] = self.alpha + len(self.author2doc[self.id2author[a]]) * \
                    expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T)

            # Update gamma.
            # Interpolation between document d's "local" gamma (tilde_gamma),
            # and "global" gamma (gammad).
            tilde_gamma = (1 - rhot) * gammad + rhot * tilde_gamma

            # Update Elogtheta and Elogbeta, since gamma and lambda have been updated.
            Elogthetad = dirichlet_expectation(tilde_gamma)
            expElogthetad = np.exp(Elogthetad)

            # Update the normalizing constant in phi.
            phinorm = self.compute_phinorm(ids, authors_d, expElogthetad, expElogbetad)

            # Check for convergence.
            # Criterion is mean change in "local" gamma.
            meanchange_gamma = np.mean(abs(tilde_gamma - lastgamma))
            gamma_condition = meanchange_gamma < self.gamma_threshold
            if gamma_condition:
                converged += 1
                break
        # End of iterations loop.

        # Store the updated gammas in the model state.
        self.state.gamma[authors_d, :] = tilde_gamma

        # Stack the new gammas into the output array.
        gamma_chunk = np.vstack([gamma_chunk, tilde_gamma])

        if collect_sstats:
            # Contribution of document d to the expected sufficient
            # statistics for the M step.
            expElogtheta_sum_a = expElogthetad.sum(axis=0)
            sstats[:, ids] += np.outer(expElogtheta_sum_a.T, cts / phinorm)

    if len(chunk) > 1:
        logger.debug("%i/%i documents converged within %i iterations", converged, len(chunk), self.iterations)

    if collect_sstats:
        # This step finishes computing the sufficient statistics for the
        # M step, so that
        # sstats[k, w] = \sum_d n_{dw} * \sum_a phi_{dwak}
        #             = \sum_d n_{dw} * exp{Elogtheta_{ak} + Elogbeta_{kw}} / phinorm_{dw}.
        sstats *= self.expElogbeta
    return gamma_chunk, sstats
def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
             chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
             gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
             outputdir=None, random_state=None):
    """
    `gamma`: first level concentration
    `alpha`: second level concentration
    `eta`: the topic Dirichlet
    `T`: top level truncation level
    `K`: second level truncation level
    `kappa`: learning rate
    `tau`: slow down parameter
    `max_time`: stop training after this many seconds
    `max_chunks`: stop after having processed this many chunks (wrap around
    corpus beginning in another corpus pass, if there are not enough chunks
    in the corpus)
    """
    self.corpus = corpus
    self.id2word = id2word
    self.chunksize = chunksize
    self.max_chunks = max_chunks
    self.max_time = max_time
    self.outputdir = outputdir

    self.random_state = utils.get_random_state(random_state)

    self.lda_alpha = None
    self.lda_beta = None

    self.m_W = len(id2word)
    self.m_D = 0
    if corpus:
        self.m_D = len(corpus)

    self.m_T = T
    self.m_K = K
    self.m_alpha = alpha
    self.m_gamma = gamma

    self.m_var_sticks = np.zeros((2, T - 1))
    self.m_var_sticks[0] = 1.0
    self.m_var_sticks[1] = range(T - 1, 0, -1)
    self.m_varphi_ss = np.zeros(T)

    self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
    self.m_eta = eta
    self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)

    self.m_tau = tau + 1
    self.m_kappa = kappa
    self.m_scale = scale
    self.m_updatect = 0
    self.m_status_up_to_date = True
    self.m_num_docs_processed = 0

    self.m_timestamp = np.zeros(self.m_W, dtype=int)
    self.m_r = [0]
    self.m_lambda_sum = np.sum(self.m_lambda, axis=1)

    self.m_var_converge = var_converge

    if self.outputdir:
        self.save_options()

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        self.update(corpus)
def get_inference_penalty(net, hidden_size, docs_path, topic_num):
    # train the lda model
    selected_docs = pd.read_csv(docs_path, header=None, index_col=[0]).values
    print 'number of docs:', selected_docs.shape
    # print selected_docs[:5]
    texts = [[word for word in doc[0].split(' ')] for doc in selected_docs]
    # pprint(texts[:5])
    dictionary = corpora.Dictionary(texts)
    dictionary.save_as_text(Path + '/data-repository/available_word_in_literature.csv')
    print dictionary
    # print dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    print corpus[:5]
    print len(corpus)
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_num,
                                update_every=1, chunksize=1000, passes=1)

    # to inference the new doc
    # initialize the variational distribution q(theta|gamma) for the chunk
    init_gamma = utils.get_random_state(None).gamma(100., 1. / 100., (hidden_size, topic_num))
    Elogtheta = matutils.dirichlet_expectation(init_gamma)
    expElogtheta = np.exp(Elogtheta)
    converged = 0

    # Now, for each document d update that document's gamma and phi
    # Inference code copied from Hoffman's `onlineldavb.py` (esp. the
    # Lee&Seung trick which speeds things up by an order of magnitude, compared
    # to Blei's original LDA-C code, cool!).
    for para_iter, para in enumerate(net.parameters()):
        if para_iter == 0:
            para_data = para.abs()
    # NOTE: `chunk` is never defined in the original snippet; it is presumably
    # the list of BoW documents to be inferred (e.g. `corpus` above).
    for d, doc in enumerate(chunk):
        if len(doc) > 0 and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
            # make sure the term IDs are ints, otherwise np will get upset
            ids = [int(idx) for idx, _ in doc]
        else:
            ids = [idx for idx, _ in doc]
        cts = np.array([cnt for _, cnt in doc])
        gammad = init_gamma[d, :]
        Elogthetad = Elogtheta[d, :]
        expElogthetad = expElogtheta[d, :]
        expElogbetad = lda_model.expElogbeta[:, ids]

        # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
        # phinorm is the normalizer.
        # TODO treat zeros explicitly, instead of adding 1e-100?
        phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100

        # Iterate between gamma and phi until convergence
        for _ in xrange(lda_model.iterations):
            lastgamma = gammad
            # We represent phi implicitly to save memory and time.
            # Substituting the value of the optimal phi back into
            # the update for gamma gives this update. Cf. Lee&Seung 2001.
            gammad = lda_model.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
            Elogthetad = matutils.dirichlet_expectation(gammad)
            expElogthetad = np.exp(Elogthetad)
            phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
            # If gamma hasn't changed much, we're done.
            meanchange = np.mean(abs(gammad - lastgamma))
            if meanchange < lda_model.gamma_threshold:
                converged += 1
                break
        init_gamma[d, :] = gammad
def inference(self, chunk, collect_sstats=False):
    """
    Given a chunk of sparse document vectors, estimate gamma (parameters
    controlling the topic weights) for each document in the chunk.

    This function does not modify the model (=is read-only aka const). The
    whole input chunk of documents is assumed to fit in RAM; chunking of a
    large corpus must be done earlier in the pipeline.

    If `collect_sstats` is True, also collect sufficient statistics needed
    to update the model's topic-word distributions, and return a 2-tuple
    `(gamma, sstats)`. Otherwise, return `(gamma, None)`. `gamma` is of shape
    `len(chunk) x self.num_topics`.

    Avoids computing the `phi` variational parameter directly using the
    optimization presented in **Lee, Seung: Algorithms for non-negative matrix
    factorization, NIPS 2001**.
    """
    try:
        _ = len(chunk)
    except TypeError:
        # convert iterators/generators to plain list, so we have len() etc.
        chunk = list(chunk)
    if len(chunk) > 1:
        logger.debug("performing inference on a chunk of %i documents", len(chunk))

    # Initialize the variational distribution q(theta|gamma) for the chunk
    gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
    Elogtheta = dirichlet_expectation(gamma)
    expElogtheta = np.exp(Elogtheta)
    if collect_sstats:
        sstats = np.zeros_like(self.expElogbeta)
    else:
        sstats = None
    converged = 0

    # Now, for each document d update that document's gamma and phi
    # Inference code copied from Hoffman's `onlineldavb.py` (esp. the
    # Lee&Seung trick which speeds things up by an order of magnitude, compared
    # to Blei's original LDA-C code, cool!).
    for d, doc in enumerate(chunk):
        if doc and not isinstance(doc[0][0], six.integer_types):
            # make sure the term IDs are ints, otherwise np will get upset
            ids = [int(id) for id, _ in doc]
        else:
            ids = [id for id, _ in doc]
        cts = np.array([cnt for _, cnt in doc])
        gammad = gamma[d, :]
        Elogthetad = Elogtheta[d, :]
        expElogthetad = expElogtheta[d, :]
        expElogbetad = self.expElogbeta[:, ids]

        # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
        # phinorm is the normalizer.
        # TODO treat zeros explicitly, instead of adding 1e-100?
        phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100

        # Iterate between gamma and phi until convergence
        for _ in xrange(self.iterations):
            lastgamma = gammad
            # We represent phi implicitly to save memory and time.
            # Substituting the value of the optimal phi back into
            # the update for gamma gives this update. Cf. Lee&Seung 2001.
            gammad = self.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
            Elogthetad = dirichlet_expectation(gammad)
            expElogthetad = np.exp(Elogthetad)
            phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
            # If gamma hasn't changed much, we're done.
            meanchange = np.mean(abs(gammad - lastgamma))
            if meanchange < self.gamma_threshold:
                converged += 1
                break
        gamma[d, :] = gammad
        if collect_sstats:
            # Contribution of document d to the expected sufficient
            # statistics for the M step.
            sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)

    if len(chunk) > 1:
        logger.debug("%i/%i documents converged within %i iterations", converged, len(chunk), self.iterations)

    if collect_sstats:
        # This step finishes computing the sufficient statistics for the
        # M step, so that
        # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
        #             = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
        sstats *= self.expElogbeta
    return gamma, sstats
def __init__(self, corpus=None, num_topics=100, id2word=None,
             author2doc=None, doc2author=None, chunksize=2000, passes=1,
             iterations=50, decay=0.5, offset=1.0, alpha='symmetric',
             eta='symmetric', update_every=1, eval_every=10,
             gamma_threshold=0.001, serialized=False, serialization_path=None,
             minimum_probability=0.01, random_state=None):
    """
    If the iterable corpus and one of the author2doc/doc2author dictionaries are
    given, start training straight away. If not given, the model is left
    untrained (presumably because you want to call the `update` method manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `author2doc` is a dictionary where the keys are the names of authors, and
    the values are lists of documents that the author contributes to.

    `doc2author` is a dictionary where the keys are document IDs (indexes to
    corpus) and the values are lists of author names. I.e. this is the reverse
    mapping of `author2doc`. Only one of the two, `author2doc` and `doc2author`,
    has to be supplied.

    `passes` is the number of times the model makes a pass over the entire
    training data.

    `iterations` is the maximum number of times the model loops over each
    document (E-step). The iterations stop when convergence is reached.

    `chunksize` controls the size of the mini-batches.

    `alpha` and `eta` are hyperparameters that affect sparsity of the
    author-topic (theta) and topic-word (lambda) distributions. Both default to
    a symmetric 1.0/num_topics prior.

    `alpha` can be set to an explicit array = prior of your choice. It also
    supports the special values 'asymmetric' and 'auto': the former uses a
    fixed normalized asymmetric 1.0/topicno prior, the latter learns an
    asymmetric prior directly from your data.

    `eta` can be a scalar for a symmetric prior over topic/word distributions,
    or a vector of shape num_words, which can be used to impose (user defined)
    asymmetric priors over the word distribution. It also supports the special
    value 'auto', which learns an asymmetric prior over words directly from
    your data. `eta` can also be a matrix of shape num_topics x num_words,
    which can be used to impose asymmetric priors over the word distribution on
    a per-topic basis (cannot be learned from data).

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates. Set to None to disable perplexity estimation.

    `decay` and `offset` parameters are the same as Kappa and Tau_0 in Hoffman
    et al, respectively. `decay` controls how quickly old documents are
    forgotten, while `offset` down-weights early iterations.

    `minimum_probability` controls filtering the topics returned for a document (bow).

    `random_state` can be an integer or a numpy.random.RandomState object. Set
    the state of the random number generator inside the author-topic model, to
    ensure reproducibility of your experiments, for example.

    `serialized` indicates whether the input corpora to the model are simple
    in-memory lists (`serialized = False`) or saved to the hard-drive
    (`serialized = True`). Note that this behaviour is quite different from
    other Gensim models. If your data is too large to fit in memory, use this
    functionality. Note that calling `AuthorTopicModel.update` with new data
    may be cumbersome as it requires all the existing data to be re-serialized.

    `serialization_path` must be set to a filepath if `serialized = True` is
    used. Use, for example, `serialization_path = '/tmp/serialized_model.mm'`
    or use your working directory by setting
    `serialization_path = 'serialized_model.mm'`. An existing file *cannot* be
    overwritten; either delete the old file or choose a different name.

    Example:

    >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word)  # train model
    >>> model.update(corpus2)  # update the author-topic model with additional documents

    >>> model = AuthorTopicModel(
    ...     corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5
    ... )  # train asymmetric alpha from data
    """
    # NOTE: as the distributed version of this model is not implemented, "distributed" is set to false. Some of the
    # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.
    distributed = False
    self.dispatcher = None
    self.numworkers = 1

    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError("cannot compute the author-topic model over an empty collection (no terms)")

    logger.info('Vocabulary consists of %d words.', self.num_terms)

    self.author2doc = {}
    self.doc2author = {}

    self.distributed = distributed
    self.num_topics = num_topics
    self.num_authors = 0
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0
    self.total_docs = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    self.author2id = {}
    self.id2author = {}

    self.serialized = serialized
    if serialized and not serialization_path:
        raise ValueError(
            "If serialized corpora are used, the path to a folder where the corpus "
            "should be saved must be provided (serialization_path).")
    if serialized and serialization_path:
        assert not isfile(serialization_path), \
            "A file already exists at the serialization_path path; choose a different serialization_path, or delete the file."
    self.serialization_path = serialization_path

    # Initialize an empty self.corpus.
    self.init_empty_corpus()

    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

    assert self.alpha.shape == (self.num_topics,), \
        "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

    if isinstance(eta, six.string_types):
        if eta == 'asymmetric':
            raise ValueError("The 'asymmetric' option cannot be used for eta")

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

    self.random_state = utils.get_random_state(random_state)

    assert self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms), (
        "Invalid eta shape. Got shape %s, but expected (%d,) or (%d, %d)" %
        (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms))

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # Initialize the variational distributions q(beta|lambda) and q(theta|gamma)
    self.state = AuthorTopicState(self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics))
    self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None and (author2doc is not None or doc2author is not None):
        use_numpy = self.dispatcher is not None
        self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy)
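# Sketch of building `doc2author` as the reverse mapping of `author2doc`, as the
# docstring above describes; the author names and document ids are invented.
author2doc = {'alice': [0, 1], 'bob': [1, 2]}
doc2author = {}
for author, doc_ids in author2doc.items():
    for doc_id in doc_ids:
        doc2author.setdefault(doc_id, []).append(author)
# doc2author is now {0: ['alice'], 1: ['alice', 'bob'], 2: ['bob']}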
def __init__(self, corpus=None, num_topics=100, id2word=None,
             distributed=False, chunksize=2000, passes=1, update_every=1,
             alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
             iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
             random_state=None, ns_conf={}, minimum_phi_value=0.01,
             per_word_topics=False):
    """
    If given, start training from the iterable `corpus` straight away. If not
    given, the model is left untrained (presumably because you want to call
    `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `alpha` and `eta` are hyperparameters that affect sparsity of the
    document-topic (theta) and topic-word (lambda) distributions. Both default
    to a symmetric 1.0/num_topics prior.

    `alpha` can be set to an explicit array = prior of your choice. It also
    supports the special values 'asymmetric' and 'auto': the former uses a
    fixed normalized asymmetric 1.0/topicno prior, the latter learns an
    asymmetric prior directly from your data.

    `eta` can be a scalar for a symmetric prior over topic/word distributions,
    or a vector of shape num_words, which can be used to impose (user defined)
    asymmetric priors over the word distribution. It also supports the special
    value 'auto', which learns an asymmetric prior over words directly from
    your data. `eta` can also be a matrix of shape num_topics x num_words,
    which can be used to impose asymmetric priors over the word distribution on
    a per-topic basis (cannot be learned from data).

    Turn on `distributed` to force distributed computing (see the `web tutorial
    <http://radimrehurek.com/gensim/distributed.html>`_ on how to set up a
    cluster of machines for gensim).

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates (setting this to 1 slows down training ~2x;
    default is 10 for better performance). Set to None to disable perplexity
    estimation.

    `decay` and `offset` parameters are the same as Kappa and Tau_0 in Hoffman
    et al, respectively.

    `minimum_probability` controls filtering the topics returned for a document (bow).

    `random_state` can be a np.random.RandomState object or the seed for one.

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)  # train model
    >>> print(lda[doc_bow])  # get topic probability distribution for a document
    >>> lda.update(corpus2)  # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data
    """
    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every
    self.minimum_phi_value = minimum_phi_value
    self.per_word_topics = per_word_topics

    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

    assert self.alpha.shape == (self.num_topics,), \
        "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

    if isinstance(eta, six.string_types):
        if eta == 'asymmetric':
            raise ValueError("The 'asymmetric' option cannot be used for eta")

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

    self.random_state = utils.get_random_state(random_state)

    assert self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms), (
        "Invalid eta shape. Got shape %s, but expected (%d,) or (%d, %d)" %
        (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms))

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        if self.optimize_alpha:
            raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
        # set up distributed version
        try:
            import Pyro4
            with utils.getNS(**ns_conf) as ns:
                from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
                self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
                logger.debug("looking for dispatcher at %s", str(self.dispatcher._pyroUri))
                self.dispatcher.initialize(
                    id2word=self.id2word, num_topics=self.num_topics, chunksize=chunksize,
                    alpha=alpha, eta=eta, distributed=False)
                self.numworkers = len(self.dispatcher.getworkers())
                logger.info("using distributed version with %i workers", self.numworkers)
        except Exception as err:
            logger.error("failed to initialize distributed LDA (%s)", err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

    # Initialize the variational distribution q(beta|lambda)
    self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
    self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        use_numpy = self.dispatcher is not None
        self.update(corpus, chunks_as_numpy=use_numpy)
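# Sketch of the `alpha` prior options described in the docstring above; `corpus`
# and `id2word` are assumed to exist, and the explicit array is an invented example.
lda_sym = LdaModel(corpus, id2word=id2word, num_topics=20)                       # symmetric 1/num_topics
lda_asym = LdaModel(corpus, id2word=id2word, num_topics=20, alpha='asymmetric')  # fixed asymmetric prior
lda_auto = LdaModel(corpus, id2word=id2word, num_topics=20, alpha='auto')        # prior learned from data
lda_custom = LdaModel(corpus, id2word=id2word, num_topics=20,
                      alpha=np.full(20, 0.05))                                   # explicit per-topic prior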
def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
    """
    Estimate the variational bound of documents from `chunk`:
    E_q[log p(chunk)] - E_q[log q(chunk)]

    There are basically two use cases of this method:

    1. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided,
       indicating the indexes of the documents in the training corpus.
    2. `chunk` is a test set (held-out data), and the author2doc and doc2author
       corresponding to this test set are provided. There must not be any new authors
       passed to this method. `chunk_doc_idx` is not needed in this case.

    To obtain the per-word bound, compute:

    >>> corpus_words = sum(cnt for document in corpus for _, cnt in document)
    >>> model.bound(corpus, author2doc=author2doc, doc2author=doc2author) / corpus_words
    """
    # TODO: enable evaluation of documents with new authors. One could, for example, make it
    # possible to pass a list of documents to self.inference with no author dictionaries,
    # assuming all the documents correspond to one (unseen) author, learn the author's
    # gamma, and return gamma (without adding it to self.state.gamma). Of course,
    # collect_sstats should be set to false, so that the model is not updated w.r.t. these
    # new documents.

    _lambda = self.state.get_lambda()
    Elogbeta = dirichlet_expectation(_lambda)
    expElogbeta = np.exp(Elogbeta)

    gamma = self.state.gamma

    if author2doc is None and doc2author is None:
        # Evaluating on training documents (chunk of self.corpus).
        author2doc = self.author2doc
        doc2author = self.doc2author

        if not chunk_doc_idx:
            # If author2doc and doc2author are not provided, chunk is assumed to be a subset of
            # self.corpus, and chunk_doc_idx is thus required.
            raise ValueError('Either author dictionaries or chunk_doc_idx must be provided. Consult documentation of bound method.')
    elif author2doc is not None and doc2author is not None:
        # Evaluating on held-out documents (documents not seen during training).
        # All authors in the dictionaries must still have been seen during training.
        for a in author2doc.keys():
            if not self.author2doc.get(a):
                raise ValueError('bound cannot be called with authors not seen during training.')

        if chunk_doc_idx:
            raise ValueError('Either author dictionaries or chunk_doc_idx must be provided, not both. Consult documentation of bound method.')
    else:
        raise ValueError('Either both author2doc and doc2author should be provided, or neither. Consult documentation of bound method.')

    Elogtheta = dirichlet_expectation(gamma)
    expElogtheta = np.exp(Elogtheta)

    word_score = 0.0
    theta_score = 0.0
    for d, doc in enumerate(chunk):
        if chunk_doc_idx:
            doc_no = chunk_doc_idx[d]
        else:
            doc_no = d
        # Get all authors in current document, and convert the author names to integer IDs.
        authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
        ids = np.array([id for id, _ in doc])  # Word IDs in doc.
        cts = np.array([cnt for _, cnt in doc])  # Word counts.

        if d % self.chunksize == 0:
            logger.debug("bound: at document #%i in chunk", d)

        # Computing the bound requires summing over expElogtheta[a, k] * expElogbeta[k, v], which
        # is the same computation as in normalizing phi.
        phinorm = self.compute_phinorm(ids, authors_d, expElogtheta[authors_d, :], expElogbeta[:, ids])
        word_score += np.log(1.0 / len(authors_d)) + cts.dot(np.log(phinorm))

    # Compensate likelihood for when `chunk` above is only a sample of the whole corpus. This ensures
    # that the likelihood is always roughly on the same scale.
    word_score *= subsample_ratio

    # E[log p(theta | alpha) - log q(theta | gamma)]
    for a in author2doc.keys():
        a = self.author2id[a]
        theta_score += np.sum((self.alpha - gamma[a, :]) * Elogtheta[a, :])
        theta_score += np.sum(gammaln(gamma[a, :]) - gammaln(self.alpha))
        theta_score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gamma[a, :]))

    # theta_score is rescaled in a similar fashion.
    # TODO: treat this in a more general way, similar to how it is done with word_score.
    theta_score *= self.num_authors / len(author2doc)

    # E[log p(beta | eta) - log q(beta | lambda)]
    beta_score = 0.0
    beta_score += np.sum((self.eta - _lambda) * Elogbeta)
    beta_score += np.sum(gammaln(_lambda) - gammaln(self.eta))
    sum_eta = np.sum(self.eta)
    beta_score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

    total_score = word_score + theta_score + beta_score

    return total_score
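# Per-word bound and a perplexity-style score from `bound`, following the docstring
# recipe above; `model`, `test_corpus` and the test-set author dictionaries are
# assumed to exist and to contain only authors seen during training.
corpus_words = sum(cnt for document in test_corpus for _, cnt in document)
per_word_bound = model.bound(test_corpus, author2doc=test_author2doc,
                             doc2author=test_doc2author) / corpus_words
perplexity = np.exp2(-per_word_bound)  # lower is better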
def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
             chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
             gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
             outputdir=None, random_state=None):
    """
    Parameters
    ----------
    corpus : iterable of list of (int, float)
        Corpus in BoW format.
    id2word : :class:`~gensim.corpora.dictionary.Dictionary`
        Dictionary for the input corpus.
    max_chunks : int, optional
        Upper bound on how many chunks to process. It wraps around the corpus beginning in another corpus pass,
        if there are not enough chunks in the corpus.
    max_time : int, optional
        Upper bound on time (in seconds) for which model will be trained.
    chunksize : int, optional
        Number of documents in one chunk.
    kappa : float, optional
        Learning parameter which acts as exponential decay factor to influence extent of learning from each batch.
    tau : float, optional
        Learning parameter which down-weights early iterations of documents.
    K : int, optional
        Second level truncation level.
    T : int, optional
        Top level truncation level.
    alpha : int, optional
        Second level concentration.
    gamma : int, optional
        First level concentration.
    eta : float, optional
        The topic Dirichlet.
    scale : float, optional
        Weights information from the mini-chunk of corpus to calculate rhot.
    var_converge : float, optional
        Lower bound on the right side of convergence. Used when updating variational parameters
        for a single document.
    outputdir : str, optional
        Stores topic and options information in the specified directory.
    random_state : {None, int, array_like, :class:`~np.random.RandomState`}, optional
        Adds a little random jitter to randomize results around same alpha when trying to fetch a closest
        corresponding lda model from :meth:`~gensim.models.hdpmodel.HdpModel.suggested_lda_model`.
    """
    self.corpus = corpus
    self.id2word = id2word
    self.chunksize = chunksize
    self.max_chunks = max_chunks
    self.max_time = max_time
    self.outputdir = outputdir

    self.random_state = utils.get_random_state(random_state)

    self.lda_alpha = None
    self.lda_beta = None

    self.m_W = len(id2word)
    self.m_D = 0
    if corpus:
        self.m_D = len(corpus)

    self.m_T = T
    self.m_K = K
    self.m_alpha = alpha
    self.m_gamma = gamma

    self.m_var_sticks = np.zeros((2, T - 1))
    self.m_var_sticks[0] = 1.0
    self.m_var_sticks[1] = range(T - 1, 0, -1)
    self.m_varphi_ss = np.zeros(T)

    self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
    self.m_eta = eta
    self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)

    self.m_tau = tau + 1
    self.m_kappa = kappa
    self.m_scale = scale
    self.m_updatect = 0
    self.m_status_up_to_date = True
    self.m_num_docs_processed = 0

    self.m_timestamp = np.zeros(self.m_W, dtype=int)
    self.m_r = [0]
    self.m_lambda_sum = np.sum(self.m_lambda, axis=1)

    self.m_var_converge = var_converge

    if self.outputdir:
        self.save_options()

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        self.update(corpus)
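# Minimal HdpModel usage sketch; `common_corpus` and `common_dictionary` stand in
# for any BoW corpus and its matching Dictionary.
hdp = HdpModel(common_corpus, common_dictionary, K=15, T=150, chunksize=256)
for topic in hdp.print_topics(num_topics=5, num_words=10):
    print(topic)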