def test_LDA_held_out_basic(self):
    """Test basic properties of the left-to-right sequential sampler
    held-out evaluation method (``LDA.score``).

    Checks two properties:
      1. consistency — scoring twice with the same seed is identical;
      2. monotonicity — a model fit with more iterations scores the
         held-out documents at least as well as one fit with fewer.
    """
    hold_prop = 0.05
    n_iter = 10
    R = 5  # number of particles for the sequential sampler
    n_topics_true, vocab_size = self.topics.shape
    dtm = self.dtm

    # hold out the last `hold_prop` fraction of documents
    num_test = int(hold_prop * len(dtm))
    num_train = len(dtm) - num_test
    random_state = np.random.RandomState(5)
    dtm_train = dtm[:num_train]
    dtm_test = dtm[num_train:]

    # fit with lower n_iter
    fit = horizont.LDA(n_topics=n_topics_true, n_iter=n_iter,
                       random_state=random_state).fit(dtm_train)

    # quick test for consistency: identical seeds must give identical scores
    logprob1 = np.sum(fit.score(dtm_test[:10], R=R, random_state=5))
    logprob2 = np.sum(fit.score(dtm_test[:10], R=R, random_state=5))
    self.assertEqual(logprob1, logprob2)

    # score the lower-n_iter fit on the full held-out set
    logprob_orig = np.sum(fit.score(dtm_test, R=R, random_state=5))

    # refit with higher n_iter and compare
    n_iter = 20
    fit = horizont.LDA(n_topics=n_topics_true, n_iter=n_iter,
                       random_state=random_state).fit(dtm_train)
    # BUGFIX: seed this score call too (the original omitted random_state
    # here), so the monotonicity comparison is deterministic rather than
    # dependent on an unseeded sampler run.
    logprob = np.sum(fit.score(dtm_test, R=R, random_state=5))
    self.assertGreater(logprob, logprob_orig)
def test_LDA_loglikelihood(self):
    """Test log likelihood calculations.

    The conditional log likelihood given (theta, phi) should lie below a
    known upper bound for this corpus and above the full-model log
    likelihood reported by the fitted estimator.
    """
    n_topics_true, _ = self.topics.shape
    model = horizont.LDA(n_topics=n_topics_true, n_iter=10, random_state=5)
    model.fit(self.dtm)

    ll_full = model.loglikelihood()
    ll_cond = _loglikelihood_conditional(self.dtm, model.theta_, model.phi_)

    # empirically, ll_cond should sit between the full-model ll and -240000
    self.assertGreater(-240000, ll_cond)
    self.assertGreater(ll_cond, ll_full)
def test_LDA_random_seed(self):
    """Make sure two fits starting from the same random seed end in
    exactly the same fitted state.
    """
    dtm = self.dtm

    def fit_once():
        # LDA.fit returns the estimator itself
        return horizont.LDA(n_topics=NUM_TOPICS, n_iter=2,
                            random_state=5).fit(dtm)

    first = fit_once()
    second = fit_once()

    # all fitted arrays must match element-for-element
    for attr in ('nzw_', 'ndz_', 'theta_', 'components_'):
        np.testing.assert_array_equal(getattr(first, attr),
                                      getattr(second, attr))
    np.testing.assert_array_equal(first.loglikelihood(),
                                  second.loglikelihood())
def test_LDA(self):
    """Fit LDA from several random seeds and check per-word log likelihood.

    The sampler sometimes gets stuck, so rather than asserting on one run
    we fit with 4 random initializations and require that:
      1. every run clears a loose per-word threshold, and
      2. at least one run clears a tighter threshold.
    """
    dtm = self.dtm
    n_words = np.sum(dtm)
    n_iter = 50

    lls = [
        horizont.LDA(n_topics=NUM_TOPICS, n_iter=n_iter,
                     random_state=seed).fit(dtm).loglikelihood()
        for seed in range(4)
    ]

    # NOTE(review): thresholds below reference 20/100 iterations while
    # n_iter is 50 — presumably loose bounds bracketing the 50-iter run.
    # LDA after 20 iterations should be around -266000
    loose_bound = -267000 / 1e5
    # LDA after 100 iterations should be around -255000
    tight_bound = -255000 / 1e5

    for ll in lls:
        self.assertGreater(ll / n_words, loose_bound)
    self.assertGreater(max(lls) / n_words, tight_bound)
def setUpClass(cls):
    """Load the AP corpus and fit a small LDA model once for all tests.

    Populates class attributes:
      dtm      -- document-term matrix parsed from the LDA-C file 'ap.dat'
      model    -- LDA estimator (10 topics, 2 iterations, seeded)
      doctopic -- document-topic matrix from fit_transform
    """
    test_dir = os.path.dirname(__file__)
    ap_ldac_fn = os.path.join(test_dir, 'ap.dat')
    # BUGFIX: the original left the corpus file handle open; use a
    # context manager so it is closed deterministically.
    with open(ap_ldac_fn) as ap_ldac_f:
        cls.dtm = dtm = horizont.utils.ldac2dtm(ap_ldac_f, offset=0)
    cls.model = model = horizont.LDA(n_topics=10, n_iter=2, random_state=1)
    cls.doctopic = model.fit_transform(dtm)