def _init_latent_vars(self, n_features):
    """Initialize latent variables."""
    self.random_state_ = check_random_state(self.random_state)
    self.n_batch_iter_ = 1
    self.n_iter_ = 0

    if self.doc_topic_prior is None:
        self.doc_topic_prior_ = 1. / self._n_components
    else:
        self.doc_topic_prior_ = self.doc_topic_prior

    if self.topic_word_prior is None:
        self.topic_word_prior_ = 1. / self._n_components
    else:
        self.topic_word_prior_ = self.topic_word_prior

    init_gamma = 100.
    init_var = 1. / init_gamma
    # In the literature, this is called `lambda`
    self.components_ = self.random_state_.gamma(
        init_gamma, init_var, (self._n_components, n_features))

    # In the literature, this is `exp(E[log(beta)])`
    self.exp_dirichlet_component_ = np.exp(
        _dirichlet_expectation_2d(self.components_))
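
# --- Illustrative reference (not library code) ----------------------------
# `_dirichlet_expectation_2d` above is a Cython routine. A minimal
# pure-NumPy/SciPy sketch of the quantity it computes, using the standard
# identity E[log(X_k)] = psi(x_k) - psi(sum_k x_k) for X ~ Dirichlet(x);
# the function name here is ours, chosen for the demo:
import numpy as np
from scipy.special import psi


def dirichlet_expectation_2d_reference(arr):
    """Row-wise E[log(X)] for X ~ Dirichlet(arr[i]) (pure-NumPy reference)."""
    return psi(arr) - psi(np.sum(arr, axis=1)[:, np.newaxis])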
def approx_bound(lda_model, documents, doc_topic_dist):
    """Word-likelihood term of the variational bound (prior terms omitted)."""
    n_samples, n_topics = doc_topic_dist.shape
    n_features = lda_model.components_.shape[1]
    score = 0

    dirichlet_doc_topic = _dirichlet_expectation_2d(doc_topic_dist)
    dirichlet_component_ = _dirichlet_expectation_2d(lda_model.components_)
    doc_topic_prior = lda_model.doc_topic_prior_
    topic_word_prior = lda_model.topic_word_prior_

    for idx_d in range(n_samples):
        ids = np.nonzero(documents[idx_d, :])[0]
        cnts = documents[idx_d, ids]
        norm_phi = logsumexp(dirichlet_doc_topic[idx_d, :, np.newaxis]
                             + dirichlet_component_[:, ids], axis=0)
        score += np.dot(cnts, norm_phi)

    # The prior terms of the bound are left out in this helper:
    # score += log_likelihood(doc_topic_prior, doc_topic_dist,
    #                         dirichlet_doc_topic, n_topics)
    # score += log_likelihood(topic_word_prior, lda_model.components_,
    #                         dirichlet_component_, n_features)
    return score
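
# --- Illustrative sketch (toy values, not library code) --------------------
# The per-document term accumulated above is the word-likelihood piece of
# the bound: for each word id, a logsumexp over topics of
# E[log(theta_{dk})] + E[log(beta_{kw})]. A tiny standalone demo:
import numpy as np
from scipy.special import logsumexp

log_theta_d = np.log([0.2, 0.3, 0.5])                      # toy E[log(theta_d)]
log_beta_w = np.log([[0.1, 0.3], [0.4, 0.2], [0.2, 0.1]])  # toy E[log(beta)][:, ids]
per_word = logsumexp(log_theta_d[:, np.newaxis] + log_beta_w, axis=0)
print(per_word)  # one bound contribution per word column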
def test_dirichlet_expectation():
    """Test Cython version of Dirichlet expectation calculation."""
    x = np.logspace(-100, 10, 10000)
    expectation = np.empty_like(x)
    _dirichlet_expectation_1d(x, 0, expectation)
    assert_allclose(expectation,
                    np.exp(psi(x) - psi(np.sum(x))),
                    atol=1e-19)

    x = x.reshape(100, 100)
    assert_allclose(_dirichlet_expectation_2d(x),
                    psi(x) - psi(np.sum(x, axis=1)[:, np.newaxis]),
                    rtol=1e-11, atol=3e-9)
def _em_step(self, X, total_samples, batch_update, parallel=None):
    """EM update for 1 iteration.

    Updates `components_` by batch VB or online VB.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    total_samples : integer
        Total number of documents. It is only used when
        `batch_update` is `False`.

    batch_update : boolean
        Parameter that controls the updating method.
        `True` for batch learning, `False` for online learning.

    parallel : joblib.Parallel
        Pre-initialized instance of joblib.Parallel.

    Returns
    -------
    None
        `components_` and related attributes are updated in place.
    """
    # E-step
    _, suff_stats = self._e_step(X, cal_sstats=True, random_init=True,
                                 parallel=parallel)

    # M-step
    if batch_update:
        self.components_ = self.topic_word_prior_ + suff_stats
    else:
        # online update
        # In the literature, the weight is `rho`
        weight = np.power(self.learning_offset + self.n_batch_iter_,
                          -self.learning_decay)
        doc_ratio = float(total_samples) / X.shape[0]
        self.components_ *= (1 - weight)
        self.components_ += (weight *
                             (self.topic_word_prior_
                              + doc_ratio * suff_stats))

    # update `components_` related variables
    self.exp_dirichlet_component_ = np.exp(
        _dirichlet_expectation_2d(self.components_))
    self.n_batch_iter_ += 1
    return
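
# --- Illustrative sketch (not library code) --------------------------------
# The online learning rate used above is
#   rho_t = (learning_offset + t) ** (-learning_decay).
# With scikit-learn-style defaults (learning_offset=10.0, learning_decay=0.7,
# an assumption for this demo), the step sizes decay monotonically:
import numpy as np

learning_offset, learning_decay = 10.0, 0.7
t = np.arange(1, 6)
rho = np.power(learning_offset + t, -learning_decay)
print(rho)  # decreasing weights for the online M-step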
def _approx_bound(self, X, doc_topic_distr, sub_sampling):
    """Estimate the variational bound.

    Estimate the variational bound over "all documents" using only the
    documents passed in as X. Since the log-likelihood of each word
    cannot be computed directly, we use this bound to estimate it.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    doc_topic_distr : array, shape=(n_samples, n_components)
        Document topic distribution. In the literature, this is called
        `gamma`.

    sub_sampling : boolean
        Compensate for the subsampling of documents. It is used to
        calculate the bound in online learning.

    Returns
    -------
    score : float
    """

    def _loglikelihood(prior, distr, dirichlet_distr, size):
        # calculate log-likelihood
        score = np.sum((prior - distr) * dirichlet_distr)
        score += np.sum(gammaln(distr) - gammaln(prior))
        score += np.sum(gammaln(prior * size) - gammaln(np.sum(distr, 1)))
        return score

    is_sparse_x = sp.issparse(X)
    n_samples, n_components = doc_topic_distr.shape
    n_features = self.components_.shape[1]
    score = 0

    dirichlet_doc_topic = _dirichlet_expectation_2d(doc_topic_distr)
    dirichlet_component_ = _dirichlet_expectation_2d(self.components_)
    doc_topic_prior = self.doc_topic_prior_
    topic_word_prior = self.topic_word_prior_

    if is_sparse_x:
        X_data = X.data
        X_indices = X.indices
        X_indptr = X.indptr

    # E[log p(docs | theta, beta)]
    for idx_d in range(n_samples):
        if is_sparse_x:
            ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]]
            cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]]
        else:
            ids = np.nonzero(X[idx_d, :])[0]
            cnts = X[idx_d, ids]
        temp = (dirichlet_doc_topic[idx_d, :, np.newaxis]
                + dirichlet_component_[:, ids])
        norm_phi = logsumexp(temp, axis=0)
        score += np.dot(cnts, norm_phi)

    # compute E[log p(theta | alpha) - log q(theta | gamma)]
    score += _loglikelihood(doc_topic_prior, doc_topic_distr,
                            dirichlet_doc_topic, self._n_components)

    # Compensate for the subsampling of the population of documents
    if sub_sampling:
        doc_ratio = float(self.total_samples) / n_samples
        score *= doc_ratio

    # E[log p(beta | eta) - log q(beta | lambda)]
    score += _loglikelihood(topic_word_prior, self.components_,
                            dirichlet_component_, n_features)

    return score
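
# --- Hedged usage sketch ----------------------------------------------------
# Assuming the surrounding code is scikit-learn's LatentDirichletAllocation,
# the bound above backs the public `score` and `perplexity` methods:
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

rng = np.random.RandomState(0)
X = rng.randint(0, 5, size=(20, 30))   # toy document-word count matrix
lda = LatentDirichletAllocation(n_components=3, random_state=0).fit(X)
print(lda.score(X))        # approximate log-likelihood from the bound
print(lda.perplexity(X))   # perplexity derived from the same bound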
def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior,
                             max_iters, mean_change_tol, cal_sstats,
                             random_state):
    """E-step: update document-topic distribution.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Document word matrix.

    exp_topic_word_distr : dense matrix, shape=(n_topics, n_features)
        Exponential value of the expectation of the log topic word
        distribution. In the literature, this is `exp(E[log(beta)])`.

    doc_topic_prior : float
        Prior of the document topic distribution `theta`.

    max_iters : int
        Max number of iterations for updating the document topic
        distribution in the E-step.

    mean_change_tol : float
        Stopping tolerance for updating the document topic distribution
        in the E-step.

    cal_sstats : boolean
        Whether to calculate sufficient statistics. Set `cal_sstats` to
        `True` when we need to run the M-step.

    random_state : RandomState instance or None
        Controls how the document topic distribution is initialized.
        If `random_state` is None, the document topic distribution is
        initialized to a constant number.

    Returns
    -------
    (doc_topic_distr, suff_stats) :
        `doc_topic_distr` is the unnormalized topic distribution for
        each document. In the literature, this is `gamma`. We can
        calculate `E[log(theta)]` from it.
        `suff_stats` is the expected sufficient statistics for the
        M-step. When `cal_sstats == False`, this will be None.
    """
    is_sparse_x = sp.issparse(X)
    n_samples, n_features = X.shape
    n_topics = exp_topic_word_distr.shape[0]

    if random_state:
        doc_topic_distr = random_state.gamma(100., 0.01,
                                             (n_samples, n_topics))
    else:
        doc_topic_distr = np.ones((n_samples, n_topics))

    # In the literature, this is `exp(E[log(theta)])`
    exp_doc_topic = np.exp(_dirichlet_expectation_2d(doc_topic_distr))

    # Expected sufficient statistics for the M-step (only calculated
    # when `cal_sstats` is True).
    suff_stats = np.zeros(exp_topic_word_distr.shape) if cal_sstats else None

    if is_sparse_x:
        X_data = X.data
        X_indices = X.indices
        X_indptr = X.indptr

    for idx_d in range(n_samples):
        if is_sparse_x:
            ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]]
            cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]]
        else:
            ids = np.nonzero(X[idx_d, :])[0]
            cnts = X[idx_d, ids]

        doc_topic_d = doc_topic_distr[idx_d, :]
        # The next one is a copy, since the inner loop overwrites it.
        exp_doc_topic_d = exp_doc_topic[idx_d, :].copy()
        exp_topic_word_d = exp_topic_word_distr[:, ids]

        # Iterate between `doc_topic_d` and `norm_phi` until convergence
        for _ in range(0, max_iters):
            last_d = doc_topic_d

            # The optimal phi_{dwk} is proportional to
            # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]).
            norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS

            doc_topic_d = (exp_doc_topic_d *
                           np.dot(cnts / norm_phi, exp_topic_word_d.T))
            # Note: adds doc_topic_prior to doc_topic_d, in-place.
            _dirichlet_expectation_1d(doc_topic_d, doc_topic_prior,
                                      exp_doc_topic_d)

            if mean_change(last_d, doc_topic_d) < mean_change_tol:
                break
        doc_topic_distr[idx_d, :] = doc_topic_d

        # Contribution of document d to the expected sufficient
        # statistics for the M step.
        if cal_sstats:
            norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS
            suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi)

    return (doc_topic_distr, suff_stats)
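
# --- Illustrative sketch (toy sizes, not library code) ----------------------
# One fixed-point update from the inner loop above, for a single document:
# gamma_dk is proportional to exp(E[log(theta_dk)]) times the phi-weighted
# word counts, before `_dirichlet_expectation_1d` adds the prior.
import numpy as np

EPS = np.finfo(float).eps
rng = np.random.RandomState(0)
n_topics, n_ids = 3, 4
exp_doc_topic_d = rng.gamma(100., 0.01, n_topics)            # exp(E[log theta_d])
exp_topic_word_d = rng.gamma(100., 0.01, (n_topics, n_ids))  # exp(E[log beta])[:, ids]
cnts = np.array([2., 1., 3., 1.])                            # word counts at ids

norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS   # phi normalizer per word
doc_topic_d = exp_doc_topic_d * np.dot(cnts / norm_phi, exp_topic_word_d.T)
print(doc_topic_d)  # unnormalized gamma_d after one update (prior not yet added)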