def vi(self, i, ids, cts, words_no, expElogbetad, no_iter=1000):
    """Run mean-field variational inference for document `i` and initialize
    its topic-assignment matrices.

    Iterates the standard LDA gamma update until the mean absolute change of
    gamma drops below `meanchangethresh`, then draws hard assignments from the
    resulting word-topic probabilities.

    Parameters
    ----------
    i : int
        Index of the document within the current chunk.
    ids : iterable of int
        Word ids of the document (kept for interface parity with
        `gibbs_samplings`; not read here).
    cts : iterable of int
        Word counts aligned with `ids`.
    words_no : int
        Number of distinct words in the document (not read here).
    expElogbetad : numpy.ndarray
        exp(E[log beta]) restricted to the active topics and this document's
        word ids.
    no_iter : int, optional
        Maximum number of variational update iterations.

    Side effects: writes `self.mat_z[i]` and `self.mat_z_sum[i]` for the
    active topics in `self.effe_list`.
    """
    alpha = self.G_0.G_0 * self.m_gamma
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
    counts = np.array(cts)
    for _ in range(no_iter):
        lastgamma = gamma
        # Consistency fix: use the pre-converted `counts` ndarray (the
        # original passed the raw `cts` list, leaving `counts` unused;
        # the sibling lda_e_step uses `counts` here).
        gamma = alpha + expElogtheta * np.dot(counts / phinorm, expElogbetad.T)
        expElogtheta = np.exp(dirichlet_expectation(gamma))
        phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
        meanchange = mean_absolute_difference(gamma, lastgamma)
        if meanchange < meanchangethresh:
            break
    # Per-word topic probabilities (topics x words), then one multinomial
    # draw per word to get hard assignments.
    pro_mat = np.outer(expElogtheta.T, 1 / phinorm) * expElogbetad
    mat_z = my_multinomial(pro_mat)
    self.mat_z[i][self.effe_list] = mat_z
    self.mat_z_sum[i][self.effe_list] = np.dot(mat_z, counts)
def testMeanAbsoluteDifference(self):
    """Check `matutils.mean_absolute_difference` against the reference
    implementation across float dtypes and several random draws."""
    rs = self.random_state
    for dtype in [np.float16, np.float32, np.float64]:
        for i in range(self.num_runs):
            # Bug fix: actually cast the inputs to the dtype under test.
            # Previously the dtype loop had no effect — rs.uniform always
            # returns float64, so float16/float32 were never exercised.
            input1 = rs.uniform(-10000, 10000, size=(self.num_topics,)).astype(dtype)
            input2 = rs.uniform(-10000, 10000, size=(self.num_topics,)).astype(dtype)
            known_good = mean_absolute_difference(input1, input2)
            test_values = matutils.mean_absolute_difference(input1, input2)
            msg = "mean_absolute_difference failed for dtype={}".format(dtype)
            self.assertTrue(np.allclose(known_good, test_values), msg)
def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
    r"""Performs EM-iteration on a single document for calculation of likelihood
    for a maximum iteration of `max_iter`.

    Parameters
    ----------
    doc_word_ids : iterable of int
        Ids of the words appearing in the document (doc fix: these index
        columns of `beta`, so a sequence, not a single int).
    doc_word_counts : iterable of int
        Counts of each word in the document, aligned with `doc_word_ids`.
    alpha : numpy.ndarray
        Lda equivalent value of alpha.
    beta : numpy.ndarray
        Lda equivalent value of beta.
    max_iter : int, optional
        Maximum number of times the expectation will be maximised.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Computed (:math:`likelihood`, :math:`\gamma`).
    """
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    betad = beta[:, doc_word_ids]
    phinorm = np.dot(expElogtheta, betad) + 1e-100
    counts = np.array(doc_word_counts)
    for _ in range(max_iter):
        lastgamma = gamma
        gamma = alpha + expElogtheta * np.dot(counts / phinorm, betad.T)
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        phinorm = np.dot(expElogtheta, betad) + 1e-100
        meanchange = mean_absolute_difference(gamma, lastgamma)
        if meanchange < meanchangethresh:
            break
    # Variational lower bound on the document log-likelihood.
    likelihood = np.sum(counts * np.log(phinorm))
    likelihood += np.sum((alpha - gamma) * Elogtheta)
    likelihood += np.sum(gammaln(gamma) - gammaln(alpha))
    likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma))
    return likelihood, gamma
def lda_e_step(ids, cts, alpha, expElogbetad, max_iter=1000):
    """Run the LDA E-step for one document and return its normalized gamma.

    Parameters
    ----------
    ids : iterable of int
        Word ids of the document (kept for interface compatibility; not
        read here — `expElogbetad` is already restricted to these ids).
    cts : iterable of int
        Word counts aligned with `ids`.
    alpha : numpy.ndarray
        Dirichlet prior over topics.
    expElogbetad : numpy.ndarray
        exp(E[log beta]) for this document's words.
    max_iter : int, optional
        Maximum number of variational iterations.

    Returns
    -------
    numpy.ndarray
        Gamma normalized to sum to 1 (expected topic proportions).
    """
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
    counts = np.array(cts)
    for _ in range(max_iter):
        lastgamma = gamma
        # Consistency fix: use the pre-converted `counts` ndarray (the
        # original passed the raw `cts` list, leaving `counts` unused).
        gamma = alpha + expElogtheta * np.dot(counts / phinorm, expElogbetad.T)
        expElogtheta = np.exp(dirichlet_expectation(gamma))
        phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
        meanchange = mean_absolute_difference(gamma, lastgamma)
        if meanchange < meanchangethresh:
            break
    return gamma / np.sum(gamma)
def update_doc(self, i, max_iter=500):
    """Update the local (per-document) state for chunk document `i`.

    Runs variational inference (`self.vi`) to initialize topic assignments,
    then iteratively averages Gibbs samples until the per-word mean change of
    the averaged topic counts falls below `meanchangethresh`. Finally, if the
    reserved slot 0 accumulated mass, promotes it to a freshly allocated
    topic id (new-topic "birth" step) and updates the global lambda.

    Parameters
    ----------
    i : int
        Index of the document within the current chunk.
    max_iter : int, optional
        Maximum number of Gibbs averaging iterations.
    """
    # Reset per-document matrices; row 0 is reserved for a potential
    # brand-new topic (hence max_K + 1 rows).
    self.mat_z[i] = np.zeros(((self.max_K + 1), self.chunk_doc_word_no[i]))
    self.mat_z_avrg[i] = np.copy(self.mat_z[i])
    self.mat_z_sum[i] = np.zeros((self.max_K + 1))
    ids = self.chunk_doc_word_ids_list[i]
    cts = self.chunk_doc_word_counts_list[i]
    words_no = self.chunk_doc_word_no[i]
    # exp(E[log beta]) restricted to the active topics and this doc's words.
    expElogbetad = self.m_dir_exp_lambda[np.ix_(self.effe_list, ids)]
    # Initialize assignments with VI, then burn in with 10 Gibbs sweeps.
    self.vi(i, ids, cts, words_no, expElogbetad, no_iter=1000)
    self.gibbs_samplings(i, ids, cts, words_no, expElogbetad, max_iter=10)
    # Online (running) averages start at iteration 2 so the 1/iter step
    # size gives equal weight to the first two samples.
    iter = 2
    aver_sum = np.copy(self.mat_z_sum[i])
    aver_phi = digamma(self.G_0.G_0 * self.m_gamma + self.mat_z_sum[i][self.effe_list])
    while iter < max_iter:
        last_aver_sum = np.copy(aver_sum)
        # One Gibbs sweep, then fold the new sample into the running means.
        self.gibbs_samplings(i, ids, cts, words_no, expElogbetad, max_iter=1)
        self.mat_z_avrg[i] -= 1 / iter * (self.mat_z_avrg[i] - self.mat_z[i])
        aver_sum -= 1 / iter * (last_aver_sum - self.mat_z_sum[i])
        aver_phi -= 1 / iter * (aver_phi - digamma(self.G_0.G_0 * self.m_gamma + self.mat_z_sum[i][self.effe_list]))
        iter += 1
        # Convergence test: per-word mean change of the averaged topic counts.
        meanchange = mean_absolute_difference(
            aver_sum[self.effe_list], last_aver_sum[self.effe_list]) / np.sum(cts)
        if meanchange < meanchangethresh:
            break
    self.mat_phi[self.effe_list, i] = aver_phi - digamma(self.G_0.G_0 * self.m_gamma)
    # New-topic birth: slot 0 collected assignments, so allocate a real id.
    if np.sum(self.mat_z_avrg[i][0]) > 0:
        # NOTE(review): `add_vector` is assigned but never read below —
        # confirm whether it was meant to feed the lambda update.
        add_vector = self.mat_z_sum[i][0]
        add_no = 1
        add_list = ids
        self.m_K += add_no
        # Find a free topic slot and mark it active.
        new_k = find_gap_in_np_array(self.effe_list, add_no)
        self.effe_list = np.sort(self.effe_list.tolist() + new_k)
        # Move slot 0's statistics into the new topic id and clear slot 0.
        self.mat_z_avrg[i][new_k] = self.mat_z_avrg[i][0]
        self.mat_z_avrg[i][0] = np.zeros_like(self.mat_z_avrg[i][0])
        self.mat_z[i][new_k] = self.mat_z[i][0]
        self.mat_z[i][0] = np.zeros_like(self.mat_z[i][0])
        self.mat_phi[new_k, i] = self.mat_phi[0, i]
        self.mat_phi[0, i] = np.zeros_like(self.mat_phi[0, i])
        self.G_0.add_new(add_no)
        # Online update of global lambda for the new topic, scaled by the
        # step size rhot and the corpus/chunk ratio.
        self.m_lambda[np.ix_(new_k, add_list)] += self.rhot * self.m_D / self.chunksize * np.array(cts) * \
            self.mat_z_avrg[i][new_k]
        self.m_dir_exp_lambda[new_k] = np.exp(
            dirichlet_expectation(self.m_lambda[new_k] + self.m_beta))