Example No. 1
0
    def vi(self, i, ids, cts, words_no, expElogbetad, no_iter=1000):
        """Run mean-field variational inference for document `i`.

        Iterates the standard online-LDA gamma update until the mean absolute
        change in gamma drops below ``meanchangethresh`` (or `no_iter` rounds),
        then draws discrete topic assignments from the resulting per-word
        topic posteriors and stores them in ``self.mat_z``/``self.mat_z_sum``.

        Parameters
        ----------
        i : int
            Document index within the current chunk.
        ids : sequence of int
            Word ids of the document (unused here; kept for interface parity
            with ``gibbs_samplings`` — see ``update_doc``).
        cts : sequence of int
            Per-word counts for the document.
        words_no : int
            Number of distinct words (unused here; interface parity).
        expElogbetad : numpy.ndarray
            exp(E[log beta]) restricted to this document's words.
        no_iter : int, optional
            Maximum number of variational iterations.
        """
        # Prior over topic weights: stick weights scaled by concentration.
        alpha = self.G_0.G_0 * self.m_gamma

        gamma = np.ones(len(alpha))
        expElogtheta = np.exp(dirichlet_expectation(gamma))

        # 1e-100 guards against division by zero below.
        phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
        counts = np.array(cts)
        for _ in range(no_iter):
            lastgamma = gamma

            # BUGFIX: use the ndarray `counts` (previously assigned but
            # unused) instead of the raw `cts`, matching lda_e_step.
            gamma = alpha + expElogtheta * np.dot(counts / phinorm,
                                                  expElogbetad.T)
            expElogtheta = np.exp(dirichlet_expectation(gamma))

            phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
            meanchange = mean_absolute_difference(gamma, lastgamma)
            if meanchange < meanchangethresh:
                break

        # Per-word topic posterior: phi[k, w] proportional to
        # expElogtheta[k] * expElogbetad[k, w] (normalized by phinorm).
        pro_mat = np.outer(expElogtheta.T, 1 / phinorm) * expElogbetad

        # Sample hard assignments z from the posterior.
        mat_z = my_multinomial(pro_mat)

        self.mat_z[i][self.effe_list] = mat_z
        self.mat_z_sum[i][self.effe_list] = np.dot(mat_z, counts)
Example No. 2
0
    def testMeanAbsoluteDifference(self):
        """Check matutils.mean_absolute_difference against the reference
        implementation for several dtypes and random inputs."""
        rs = self.random_state

        for dtype in [np.float16, np.float32, np.float64]:
            for i in range(self.num_runs):
                # BUGFIX: cast to the dtype under test — previously `dtype`
                # was never used, so every iteration tested float64 inputs.
                input1 = rs.uniform(-10000, 10000, size=(self.num_topics,)).astype(dtype)
                input2 = rs.uniform(-10000, 10000, size=(self.num_topics,)).astype(dtype)

                known_good = mean_absolute_difference(input1, input2)
                test_values = matutils.mean_absolute_difference(input1, input2)

                msg = "mean_absolute_difference failed for dtype={}".format(dtype)
                self.assertTrue(np.allclose(known_good, test_values), msg)
Example No. 3
0
def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
    r"""Performs EM-iteration on a single document for calculation of likelihood for a maximum iteration of `max_iter`.

    Parameters
    ----------
    doc_word_ids : sequence of int
        Ids of the words appearing in the document.
    doc_word_counts : sequence of int
        Corresponding counts of those words in the document.
    alpha : numpy.ndarray
        Lda equivalent value of alpha.
    beta : numpy.ndarray
        Lda equivalent value of beta.
    max_iter : int, optional
        Maximum number of times the expectation will be maximised.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Computed (:math:`likelihood`, :math:`\gamma`).

    """
    gamma = np.ones(len(alpha))
    # Initialize Elogtheta here as well, so the likelihood computation below
    # is well-defined even if the loop body never executes (max_iter <= 0).
    Elogtheta = dirichlet_expectation(gamma)
    expElogtheta = np.exp(Elogtheta)
    # Restrict beta to this document's vocabulary.
    betad = beta[:, doc_word_ids]
    # 1e-100 guards against division by zero.
    phinorm = np.dot(expElogtheta, betad) + 1e-100
    counts = np.array(doc_word_counts)
    for _ in range(max_iter):
        lastgamma = gamma

        # Standard online-LDA variational update for gamma.
        gamma = alpha + expElogtheta * np.dot(counts / phinorm, betad.T)
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        phinorm = np.dot(expElogtheta, betad) + 1e-100
        meanchange = mean_absolute_difference(gamma, lastgamma)
        if meanchange < meanchangethresh:
            break

    # Variational lower bound on the document log-likelihood.
    likelihood = np.sum(counts * np.log(phinorm))
    likelihood += np.sum((alpha - gamma) * Elogtheta)
    likelihood += np.sum(gammaln(gamma) - gammaln(alpha))
    likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma))

    return likelihood, gamma
Example No. 4
0
def lda_e_step(ids, cts, alpha, expElogbetad, max_iter=1000):
    """Run the LDA variational E-step for one document and return its
    normalized topic distribution.

    Parameters
    ----------
    ids : sequence of int
        Word ids of the document (unused in the computation; `expElogbetad`
        is assumed to already be restricted to these words).
    cts : sequence of int
        Per-word counts for the document.
    alpha : numpy.ndarray
        Dirichlet prior over topics.
    expElogbetad : numpy.ndarray
        exp(E[log beta]) restricted to this document's words.
    max_iter : int, optional
        Maximum number of variational iterations.

    Returns
    -------
    numpy.ndarray
        `gamma` normalized to sum to 1.
    """
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))

    # 1e-100 guards against division by zero.
    phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
    counts = np.array(cts)
    for _ in range(max_iter):
        lastgamma = gamma

        # BUGFIX: use the ndarray `counts` (previously assigned but unused)
        # instead of the raw `cts`, matching the other lda_e_step variant.
        gamma = alpha + expElogtheta * np.dot(counts / phinorm, expElogbetad.T)
        expElogtheta = np.exp(dirichlet_expectation(gamma))

        phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
        meanchange = mean_absolute_difference(gamma, lastgamma)
        if meanchange < meanchangethresh:
            break

    return gamma / np.sum(gamma)
Example No. 5
0
    def update_doc(self, i, max_iter=500):
        """Update topic assignments for document `i` via variational
        initialization followed by averaged Gibbs sampling sweeps.

        Runs ``vi`` once to initialize, then repeatedly calls
        ``gibbs_samplings`` while maintaining running averages of the
        assignment matrix and per-topic counts, stopping when the averaged
        counts converge (mean change per word < ``meanchangethresh``).
        If mass accumulates in slot 0 — presumably the "new topic" slot of
        the truncated representation (TODO confirm) — a fresh topic index is
        allocated and the global statistics are updated.
        """
        # Reset per-document assignment matrices; row dimension max_K + 1
        # leaves an extra slot at index 0 (used below for topic creation).
        self.mat_z[i] = np.zeros(((self.max_K + 1), self.chunk_doc_word_no[i]))
        self.mat_z_avrg[i] = np.copy(self.mat_z[i])
        self.mat_z_sum[i] = np.zeros((self.max_K + 1))

        ids = self.chunk_doc_word_ids_list[i]
        cts = self.chunk_doc_word_counts_list[i]
        words_no = self.chunk_doc_word_no[i]
        # Slice exp(E[log lambda]) down to active topics x this doc's words.
        expElogbetad = self.m_dir_exp_lambda[np.ix_(self.effe_list, ids)]

        # Variational warm start, then a short Gibbs burn-in.
        self.vi(i, ids, cts, words_no, expElogbetad, no_iter=1000)
        self.gibbs_samplings(i, ids, cts, words_no, expElogbetad, max_iter=10)

        # NOTE: `iter` shadows the builtin; kept as-is (doc-only change).
        iter = 2
        aver_sum = np.copy(self.mat_z_sum[i])
        aver_phi = digamma(self.G_0.G_0 * self.m_gamma +
                           self.mat_z_sum[i][self.effe_list])

        while iter < max_iter:
            last_aver_sum = np.copy(aver_sum)

            # One Gibbs sweep, then fold the new sample into the running
            # averages via the incremental-mean update x -= (x - new) / t.
            self.gibbs_samplings(i,
                                 ids,
                                 cts,
                                 words_no,
                                 expElogbetad,
                                 max_iter=1)
            self.mat_z_avrg[i] -= 1 / iter * (self.mat_z_avrg[i] -
                                              self.mat_z[i])
            aver_sum -= 1 / iter * (last_aver_sum - self.mat_z_sum[i])
            aver_phi -= 1 / iter * (aver_phi -
                                    digamma(self.G_0.G_0 * self.m_gamma +
                                            self.mat_z_sum[i][self.effe_list]))

            iter += 1

            # Convergence: mean absolute change of averaged topic counts,
            # normalized by total word count of the document.
            meanchange = mean_absolute_difference(
                aver_sum[self.effe_list],
                last_aver_sum[self.effe_list]) / np.sum(cts)
            if meanchange < meanchangethresh:
                break

        # Store E[log theta] relative to the prior term for this document.
        self.mat_phi[self.effe_list,
                     i] = aver_phi - digamma(self.G_0.G_0 * self.m_gamma)

        # If the spare slot 0 collected any averaged mass, promote it to a
        # real topic: find a free index, move slot-0 statistics there, and
        # zero out slot 0 again.
        if np.sum(self.mat_z_avrg[i][0]) > 0:
            add_vector = self.mat_z_sum[i][0]
            add_no = 1
            add_list = ids

            self.m_K += add_no
            # find_gap_in_np_array presumably returns `add_no` unused
            # indices within/after effe_list — TODO confirm.
            new_k = find_gap_in_np_array(self.effe_list, add_no)

            self.effe_list = np.sort(self.effe_list.tolist() + new_k)

            self.mat_z_avrg[i][new_k] = self.mat_z_avrg[i][0]
            self.mat_z_avrg[i][0] = np.zeros_like(self.mat_z_avrg[i][0])

            self.mat_z[i][new_k] = self.mat_z[i][0]
            self.mat_z[i][0] = np.zeros_like(self.mat_z[i][0])

            self.mat_phi[new_k, i] = self.mat_phi[0, i]
            self.mat_phi[0, i] = np.zeros_like(self.mat_phi[0, i])

            # Grow the base measure G_0 by the new topic(s).
            self.G_0.add_new(add_no)

            # Online (stochastic) lambda update for the new topic, scaled by
            # step size rhot and the corpus/chunk size ratio.
            self.m_lambda[np.ix_(new_k, add_list)] += self.rhot * self.m_D / self.chunksize * np.array(cts) * \
                                                      self.mat_z_avrg[i][new_k]
            self.m_dir_exp_lambda[new_k] = np.exp(
                dirichlet_expectation(self.m_lambda[new_k] + self.m_beta))