コード例 #1
0
    def test_LDA_held_out_basic(self):
        """
        Check basic properties of the left-to-right sequential sampler
        evaluation method exposed through ``score``.

        Two properties are exercised on a held-out tail of ``self.dtm``:

        * consistency -- scoring the same documents twice with the same
          ``random_state`` yields exactly the same summed score;
        * monotonicity -- a model fit with 20 iterations must obtain a
          higher summed score than a model fit with 10 iterations.
        """
        held_out_fraction = 0.05
        R = 5
        n_topics, _ = self.topics.shape
        corpus = self.dtm

        # The last ``held_out_fraction`` of the rows is kept for scoring.
        n_docs = len(corpus)
        n_held_out = int(held_out_fraction * n_docs)
        n_fit = n_docs - n_held_out
        train_docs, held_out_docs = corpus[:n_fit], corpus[n_fit:]

        # A single RandomState instance is shared by both fits below.
        rng = np.random.RandomState(5)

        def fit_model(iterations):
            model = horizont.LDA(n_topics=n_topics,
                                 n_iter=iterations,
                                 random_state=rng)
            return model.fit(train_docs)

        def total_score(fitted, docs, **score_kwargs):
            return np.sum(fitted.score(docs, R=R, **score_kwargs))

        # Model with the lower iteration count.
        short_fit = fit_model(10)

        # Consistency: same documents, same seed, same result.
        head = held_out_docs[:10]
        first = total_score(short_fit, head, random_state=5)
        second = total_score(short_fit, head, random_state=5)
        self.assertEqual(first, second)

        baseline = total_score(short_fit, held_out_docs, random_state=5)

        # Model with the higher iteration count.
        long_fit = fit_model(20)
        # NOTE(review): unlike the calls above, this one does not pass
        # ``random_state`` to ``score`` -- confirm that is intentional.
        improved = total_score(long_fit, held_out_docs)
        self.assertGreater(improved, baseline)
コード例 #2
0
 def test_LDA_loglikelihood(self):
     """
     Check the ordering of log likelihood values for a fitted model.

     The value returned by ``_loglikelihood_conditional`` for the fitted
     ``theta_`` and ``phi_`` must be below -240000 and above the value
     returned by the model's own ``loglikelihood()``.
     """
     n_topics, _ = self.topics.shape
     corpus = self.dtm

     model = horizont.LDA(n_topics=n_topics, n_iter=10, random_state=5)
     model.fit(corpus)

     fitted_theta, fitted_phi = model.theta_, model.phi_
     model_ll = model.loglikelihood()
     conditional_ll = _loglikelihood_conditional(corpus,
                                                 fitted_theta,
                                                 fitted_phi)

     # Upper bound on the conditional value, then its relation to the
     # model-reported value.
     self.assertGreater(-240000, conditional_ll)
     self.assertGreater(conditional_ll, model_ll)
コード例 #3
0
 def test_LDA_random_seed(self):
     """
     Fitting twice from the same integer seed must end in the same state.

     Two models are built with identical arguments and fit on ``self.dtm``;
     their ``nzw_``, ``ndz_``, ``theta_`` and ``components_`` attributes and
     their ``loglikelihood()`` values are then required to be equal.
     """
     corpus = self.dtm

     def fit_once():
         model = horizont.LDA(n_topics=NUM_TOPICS, n_iter=2, random_state=5)
         model.fit(corpus)
         return model

     first, second = fit_once(), fit_once()

     for attr in ('nzw_', 'ndz_', 'theta_', 'components_'):
         np.testing.assert_array_equal(getattr(first, attr),
                                       getattr(second, attr))
     np.testing.assert_array_equal(first.loglikelihood(),
                                   second.loglikelihood())
コード例 #4
0
 def test_LDA(self):
     """
     Fit from four seeds and check log likelihood thresholds.

     The sampler sometimes gets stuck, so instead of relying on a single
     run the model is fit with seeds 0..3 and two conditions are checked
     on ``loglikelihood() / np.sum(dtm)``:

     1. every run is above the lower threshold ``-267000 / 1e5``;
     2. the best run is above the higher threshold ``-255000 / 1e5``.
     """
     corpus = self.dtm
     total_words = np.sum(corpus)

     # NOTE(review): the original comments quote -266000 "after 20
     # iterations" and about -255000 "after 100 iterations", while this
     # test runs 50 iterations -- confirm the thresholds are still apt.
     lower_threshold = -267000 / 1e5
     higher_threshold = -255000 / 1e5

     def fitted_loglikelihood(seed):
         model = horizont.LDA(n_topics=NUM_TOPICS,
                              n_iter=50,
                              random_state=seed)
         model.fit(corpus)
         return model.loglikelihood()

     scores = [fitted_loglikelihood(seed) for seed in range(4)]

     for score in scores:
         self.assertGreater(score / total_words, lower_threshold)
     self.assertGreater(max(scores) / total_words, higher_threshold)
コード例 #5
0
ファイル: test_lda_ap.py プロジェクト: shiozakixlg/horizont
 def setUpClass(cls):
     """
     Load the ``ap.dat`` corpus and fit a small model shared by the tests.

     Sets three class attributes:

     * ``cls.dtm`` -- result of ``horizont.utils.ldac2dtm`` on ``ap.dat``
       (located next to this test module), read with ``offset=0``;
     * ``cls.model`` -- an ``horizont.LDA`` built with 10 topics,
       2 iterations and ``random_state=1``;
     * ``cls.doctopic`` -- the value returned by ``model.fit_transform``.
     """
     test_dir = os.path.dirname(__file__)
     ap_ldac_fn = os.path.join(test_dir, 'ap.dat')
     # Use a context manager so the file handle is closed deterministically
     # instead of being left for the garbage collector.
     with open(ap_ldac_fn) as ldac_file:
         cls.dtm = dtm = horizont.utils.ldac2dtm(ldac_file, offset=0)
     cls.model = model = horizont.LDA(n_topics=10, n_iter=2, random_state=1)
     cls.doctopic = model.fit_transform(dtm)