def test_random_topic_selection(self):
    '''Sanity check: every sampled topic must belong to the valid
    topic space {0, 1} when the estimator has two topics.'''
    annots = self.create_annots(test.DELICIOUS_FILE)
    estimator = LDAEstimator(annots, 2, .5, .5, .5, 0, 0, 1, 0)
    valid_topics = [0, 1]
    #Sample many times so an out-of-range draw is very unlikely to hide
    for _ in xrange(1000):
        drawn = estimator._sample_topic(0, 0, 0, 0)
        self.assertTrue(drawn in valid_topics)
def test_initial_population(self):
    '''Checks user/document/topic counts right after the initial random
    topic population (zero Gibbs iterations, so sampling never runs).

    Fixes: uses the non-deprecated assertEqual consistently (the
    original mixed assertEquals and assertEqual) and hoists the
    mid-body Counter import to the top of the function.
    '''
    from collections import Counter

    annots = self.create_annots(test.SMALL_DEL_FILE)
    #With zero iterations Gibbs will not run
    estimator = LDAEstimator(annots, 2, .5, .5, .5, 0, 0, 1, 0)

    user_cnt = estimator._get_user_counts()
    topic_cnt = estimator._get_topic_counts()
    document_cnt = estimator._get_document_counts()

    self.assertEqual(user_cnt[0], 4)
    self.assertEqual(user_cnt[1], 4)
    self.assertEqual(user_cnt[2], 2)

    self.assertEqual(document_cnt[0], 5)
    self.assertEqual(document_cnt[1], 1)
    self.assertEqual(document_cnt[2], 2)
    self.assertEqual(document_cnt[3], 1)
    self.assertEqual(document_cnt[4], 1)

    #10 assignments = 10 topics
    self.assertEqual(10, sum(topic_cnt))

    user_topic_cnt = estimator._get_user_topic_counts()
    topic_document_cnt = estimator._get_topic_document_counts()
    topic_term_cnt = estimator._get_topic_term_counts()

    #We can only test shapes and sums, since assignments are random
    self.assertEqual((3, 2), user_topic_cnt.shape)
    self.assertEqual((2, 5), topic_document_cnt.shape)
    self.assertEqual((2, 6), topic_term_cnt.shape)
    self.assertEqual(10, user_topic_cnt.sum())
    self.assertEqual(10, topic_document_cnt.sum())
    self.assertEqual(10, topic_term_cnt.sum())

    topic_assigments = estimator._get_topic_assignments()
    self.assertEqual(10, len(topic_assigments))
    self.assertEqual(10, estimator._get_topic_counts().sum())

    #Were the topics populated correctly? Every annotation triple must
    #have received an assignment.
    for annot in annots:
        aux = (annot['user'], annot['item'], annot['tag'])
        self.assertTrue(aux in topic_assigments)

    #Simple sanity check on topic assignments: check that topics have
    #valid ids and that per-topic counts match the count matrix
    c = Counter(topic_assigments.values())
    for topic in c:
        self.assertTrue(topic in [0, 1])
        self.assertTrue(c[topic] == topic_cnt[topic])
def create_lda_estimator(annotations_it, gamma, num_items, num_tags, num_topics=200):
    '''
    Creates the lda estimator with the parameters described in [1]_.
    Alpha and Beta are defined as a function of the number of items and
    tags, thus only gamma is needed to be varied.

    References
    ----------
    ..[1] Harvey, M., Ruthven, I., & Carman, M. J. (2011).
    "Improving social bookmark search using personalised latent variable
    language models."
    Proceedings of the fourth ACM international conference on Web search
    and data mining - WSDM '11. doi:10.1145/1935826.1935898
    '''
    #Priors scale inversely with the corpus dimensions
    alpha = 0.1 / num_items
    beta = 0.1 / num_tags

    num_iterations = 300
    num_burn_in = 200
    thinning = 5  #based on the author thesis
    rnd_seed = 0  #time based seed

    return LDAEstimator(annotations_it, num_topics, alpha, beta, gamma,
                        num_iterations, num_burn_in, thinning, rnd_seed)
def test_gibbs_sample_with_same_sample_seed(self): annots = self.create_annots(test.DELICIOUS_FILE) #Last two parameters -> sample_every=1, seed=0 estimator_seed_one_a = LDAEstimator(annots, 10, .5, .5, .5, 5, 2, 1, 1) estimator_seed_one_b = LDAEstimator(annots, 10, .5, .5, .5, 5, 2, 1, 1) ut_1a = estimator_seed_one_a._get_user_topic_prb() td_1a = estimator_seed_one_a._get_topic_document_prb() tt_1a = estimator_seed_one_a._get_topic_term_prb() ut_1b = estimator_seed_one_b._get_user_topic_prb() td_1b = estimator_seed_one_b._get_topic_document_prb() tt_1b = estimator_seed_one_b._get_topic_term_prb() self.assertFalse((ut_1a - ut_1b).any()) self.assertFalse((td_1a - td_1b).any()) self.assertFalse((tt_1a - tt_1b).any())
def test_gibbs_sample_with_sample_user_every(self):
    '''Same seed but different sample_every values (1 vs 3) should
    produce different probability matrices.

    Fix: the original asserted np.sum(a - b) != 0, but positive and
    negative cell differences cancel -- two row-normalized probability
    matrices have the same total, so the sum of their difference is ~0
    regardless of whether cells differ, and the original check only
    passed thanks to floating point noise. Compare cell-wise instead.
    '''
    annots = self.create_annots(test.DELICIOUS_FILE)
    #Last two parameters -> sample_every=1, seed=1
    estimator_seed_one_a = LDAEstimator(annots, 10, .5, .5, .5, 5, 2, 1, 1)
    #Last two parameters -> sample_every=3, seed=1
    estimator_seed_one_b = LDAEstimator(annots, 10, .5, .5, .5, 5, 2, 3, 1)

    ut_1a = estimator_seed_one_a._get_user_topic_prb()
    td_1a = estimator_seed_one_a._get_topic_document_prb()
    tt_1a = estimator_seed_one_a._get_topic_term_prb()

    ut_1b = estimator_seed_one_b._get_user_topic_prb()
    td_1b = estimator_seed_one_b._get_topic_document_prb()
    tt_1b = estimator_seed_one_b._get_topic_term_prb()

    #At least one cell must differ in each matrix pair
    self.assertTrue((ut_1a != ut_1b).any())
    self.assertTrue((td_1a != td_1b).any())
    self.assertTrue((tt_1a != tt_1b).any())
def create_lda_estimator(annotations_it, num_docs, num_tags):
    '''
    Creates the lda estimator with the parameters described in [1]_

    NOTE(review): a sibling factory in this file computes the priors as
    0.1 / num_items and 0.1 / num_tags, while this one *multiplies* by
    the corpus sizes -- confirm against [1] which form is intended.

    NOTE(review): this constructor call passes 7 arguments while other
    LDAEstimator constructions in this file pass 9 (sample_every and
    seed are omitted here) -- verify the constructor provides defaults.

    References
    ----------
    ..[1] Harvey, M., Ruthven, I., & Carman, M. J. (2011).
    "Improving social bookmark search using personalised latent variable
    language models."
    Proceedings of the fourth ACM international conference on Web search
    and data mining - WSDM '11. doi:10.1145/1935826.1935898
    '''
    num_topics = 200
    alpha = 0.1 * num_docs
    beta = 0.1 * num_tags
    gamma = 25
    iterations = 300
    burn_in = 200
    lda_estimator = LDAEstimator(annotations_it, num_topics, alpha, beta,
                                 gamma, iterations, burn_in)
    return lda_estimator
def test_valid_probabilities(self):
    '''Every probability vector returned by the estimator must be a
    proper distribution: sums to ~1, all entries strictly in (0, 1).'''
    def check_distribution(probs):
        total = probs.sum()
        within_open_unit = (probs > 0).all() and (probs < 1).all()
        return 0.99999 <= total <= 1.00001 and within_open_unit

    annots = self.create_annots(test.SMALL_DEL_FILE)
    estimator = LDAEstimator(annots, 2, .1, .2, .3, 2, 0, 1, 0)
    gamma = np.arange(5)

    self.assertTrue(check_distribution(estimator.prob_items(gamma)))
    self.assertTrue(
            check_distribution(estimator.prob_items_given_tag(0, gamma)))
    self.assertTrue(
            check_distribution(estimator.prob_items_given_user(0, gamma)))
    self.assertTrue(
            check_distribution(estimator.prob_items_given_user_tag(0, 0, gamma)))
def test_gibbs_sample(self):
    '''Runs the sampler on the larger dataset and sanity-checks the
    resulting probability matrices.'''
    annots = self.create_annots(test.DELICIOUS_FILE)
    estimator = LDAEstimator(annots, 10, .5, .5, .5, 5, 2, 1, 0)
    self.assertEqual(estimator.get_iter(), 4)

    matrices = (estimator._get_user_topic_prb(),
                estimator._get_topic_document_prb(),
                estimator._get_topic_term_prb())
    for prb in matrices:
        #Not all-zero, and every cell within [0, 1]
        self.assertTrue(prb.any())
        self.assertTrue((prb >= 0).all())
        self.assertTrue((prb <= 1).all())
def test_valid_run(self):
    '''Full Gibbs run on the small dataset: conditional item
    distributions are valid, the chain likelihood is non-zero, and all
    probability matrices contain only values in [0, 1].'''
    def check_distribution(probs):
        total = probs.sum()
        within_open_unit = (probs > 0).all() and (probs < 1).all()
        return 0.99999 <= total <= 1.00001 and within_open_unit

    annots = self.create_annots(test.SMALL_DEL_FILE)
    estimator = LDAEstimator(annots, 200, .001, .002, .003, 100, 50, 5, 0)
    gamma = np.arange(5)

    self.assertTrue(check_distribution(estimator.prob_items(gamma)))
    self.assertTrue(
            check_distribution(estimator.prob_items_given_tag(0, gamma)))
    self.assertTrue(
            check_distribution(estimator.prob_items_given_user(0, gamma)))
    self.assertTrue(
            check_distribution(estimator.prob_items_given_user_tag(0, 0, gamma)))

    self.assertTrue(estimator.chain_likelihood().all())

    matrices = (estimator._get_user_topic_prb(),
                estimator._get_topic_document_prb(),
                estimator._get_topic_term_prb())
    for matrix in matrices:
        self.assertTrue((matrix >= 0).all())
        self.assertTrue((matrix <= 1).all())
        self.assertTrue(matrix.any())
def test_gibbs_update(self):
    #This test checks that a Gibbs update decreases the count matrices
    #for the old topic and re-increases them for the newly drawn topic:
    #when the topic changes, the old topic's counts drop by exactly one;
    #when it stays the same, the counts are unchanged (decrement then
    #re-increment on the same cell).
    annots = self.create_annots(test.DELICIOUS_FILE)
    #Zero iterations: only the initial population runs, so we can drive
    #_gibbs_update manually per assignment
    estimator = LDAEstimator(annots, 2, .5, .5, .5, 0, 0, 1, 0)
    changed = False
    for annot, old_topic in estimator._get_topic_assignments().items():
        user, document, term = annot
        #Counts for the OLD topic before the update
        old_ut = estimator._get_user_topic_counts()[user, old_topic]
        old_td = estimator._get_topic_document_counts()[old_topic, document]
        old_tr = estimator._get_topic_term_counts()[old_topic, term]
        new_topic = estimator._gibbs_update(user, old_topic, document,
                                            term, 0)
        #Counts for the OLD topic after the update
        new_ut = estimator._get_user_topic_counts()[user, old_topic]
        new_td = estimator._get_topic_document_counts()[old_topic, document]
        new_tr = estimator._get_topic_term_counts()[old_topic, term]
        if old_topic != new_topic:
            changed = True
            self.assertEqual(new_ut, old_ut - 1)
            self.assertEqual(new_td, old_td - 1)
            self.assertEqual(new_tr, old_tr - 1)
        else:
            self.assertEqual(new_ut, old_ut)
            self.assertEqual(new_td, old_td)
            self.assertEqual(new_tr, old_tr)
    #At least one assignment must have changed across the dataset
    self.assertTrue(changed)
    #Total assignments and total topic counts must remain in sync
    self.assertEqual(len(estimator._get_topic_assignments()),
                     estimator._get_topic_counts().sum())