コード例 #1
0
 def test_random_topic_selection(self):
     # Sanity check: with two topics configured, every sampled topic id must
     # be a valid topic index (0 or 1).
     annots = self.create_annots(test.DELICIOUS_FILE)
     # Two topics; the remaining positional values follow the same layout
     # used by the other LDAEstimator call sites in this code base
     # (presumably three hyper-parameters, iterations, burn-in,
     # sample_every and seed -- confirm against LDAEstimator's signature).
     estimator = LDAEstimator(annots, 2, .5, .5, .5, 0, 0, 1, 0)

     valid_topics = (0, 1)
     # range (instead of the Python-2-only xrange) keeps this test runnable
     # on both Python 2 and Python 3; 1000 elements is negligible either way.
     for _ in range(1000):
         # assertIn reports the offending value when the check fails.
         self.assertIn(estimator._sample_topic(0, 0, 0, 0), valid_topics)
コード例 #2
0
    def test_initial_population(self):
        # Verifies the count structures the estimator builds at construction
        # time, before any Gibbs iteration has been executed.
        annots = self.create_annots(test.SMALL_DEL_FILE)
        # With zero iterations the Gibbs sampler will not run
        estimator = LDAEstimator(annots, 2, .5, .5, .5, 0, 0, 1, 0)

        user_cnt = estimator._get_user_counts()
        topic_cnt = estimator._get_topic_counts()
        document_cnt = estimator._get_document_counts()

        # Expected number of annotations per user in the small fixture
        self.assertEqual(user_cnt[0], 4)
        self.assertEqual(user_cnt[1], 4)
        self.assertEqual(user_cnt[2], 2)

        # Expected number of annotations per document in the small fixture
        self.assertEqual(document_cnt[0], 5)
        self.assertEqual(document_cnt[1], 1)
        self.assertEqual(document_cnt[2], 2)
        self.assertEqual(document_cnt[3], 1)
        self.assertEqual(document_cnt[4], 1)

        # 10 annotations -> 10 topic assignments in total
        self.assertEqual(10, sum(topic_cnt))

        user_topic_cnt = estimator._get_user_topic_counts()
        topic_document_cnt = estimator._get_topic_document_counts()
        topic_term_cnt = estimator._get_topic_term_counts()

        # We can only test shapes and sums, since assignments are random
        self.assertEqual((3, 2), user_topic_cnt.shape)
        self.assertEqual((2, 5), topic_document_cnt.shape)
        self.assertEqual((2, 6), topic_term_cnt.shape)

        self.assertEqual(10, user_topic_cnt.sum())
        self.assertEqual(10, topic_document_cnt.sum())
        self.assertEqual(10, topic_term_cnt.sum())

        topic_assignments = estimator._get_topic_assignments()
        self.assertEqual(10, len(topic_assignments))
        self.assertEqual(10, estimator._get_topic_counts().sum())

        # Every (user, item, tag) annotation must have received a topic
        for annot in annots:
            key = (annot['user'], annot['item'], annot['tag'])
            self.assertIn(key, topic_assignments)

        # Simple sanity check on topic assignments: every assigned topic has
        # a valid id and its frequency matches the topic count vector.
        from collections import Counter
        assigned_per_topic = Counter(topic_assignments.values())
        for topic in assigned_per_topic:
            self.assertIn(topic, [0, 1])
            self.assertEqual(assigned_per_topic[topic], topic_cnt[topic])
コード例 #3
0
def create_lda_estimator(annotations_it, gamma, num_items, num_tags,
                         num_topics=200):
    '''
    Builds an `LDAEstimator` configured with the settings described in [1]_.

    Alpha is derived as ``0.1 / num_items`` and beta as ``0.1 / num_tags``,
    so gamma is the only hyper-parameter the caller has to vary. The
    remaining sampler settings (iterations, burn-in, sampling interval and
    seed) are fixed below.

    Arguments
    ---------
    annotations_it: passed through unchanged as the estimator's first argument
    gamma: passed through unchanged to the estimator
    num_items: divisor used to derive alpha
    num_tags: divisor used to derive beta
    num_topics: number of topics handed to the estimator (default 200)

    References
    ----------
    ..[1] Harvey, M., Ruthven, I., & Carman, M. J. (2011). 
    "Improving social bookmark search using personalised latent variable 
    language models." 
    Proceedings of the fourth ACM international conference on Web search and 
    data mining - WSDM  ’11. doi:10.1145/1935826.1935898
    '''
    return LDAEstimator(
        annotations_it,
        num_topics,
        0.1 / num_items,  # alpha
        0.1 / num_tags,   # beta
        gamma,
        300,  # iterations
        200,  # burn_in
        5,    # sample_every, based on the author's thesis
        0,    # seed; noted by the original author as "time based seed" --
              # presumably 0 asks the estimator to seed from the clock
    )
コード例 #4
0
    def test_gibbs_sample_with_same_sample_seed(self):
        # Two estimators built with identical arguments -- including the same
        # seed -- must end up with identical probability matrices.
        annots = self.create_annots(test.DELICIOUS_FILE)

        def probability_matrices(estimator):
            # user-topic, topic-document and topic-term matrices, in order
            return (estimator._get_user_topic_prb(),
                    estimator._get_topic_document_prb(),
                    estimator._get_topic_term_prb())

        # Last two parameters -> sample_every=1, seed=1 (for both estimators)
        first = LDAEstimator(annots, 10, .5, .5, .5, 5, 2, 1, 1)
        second = LDAEstimator(annots, 10, .5, .5, .5, 5, 2, 1, 1)

        matrix_pairs = zip(probability_matrices(first),
                           probability_matrices(second))
        for from_first, from_second in matrix_pairs:
            # Any non-zero cell in the difference means the runs diverged
            self.assertFalse((from_first - from_second).any())
コード例 #5
0
    def test_gibbs_sample_with_sample_user_every(self):
        # Two estimators that share the seed but differ in sample_every are
        # expected to produce different probability matrices.
        annots = self.create_annots(test.DELICIOUS_FILE)

        #Last two parameters -> sample_every=1, seed=1
        estimator_seed_one_a = LDAEstimator(annots, 10, .5, .5, .5, 5, 2, 1, 1)

        #Last two parameters -> sample_every=3, seed=1
        estimator_seed_one_b = LDAEstimator(annots, 10, .5, .5, .5, 5, 2, 3, 1)

        ut_1a = estimator_seed_one_a._get_user_topic_prb()
        td_1a = estimator_seed_one_a._get_topic_document_prb()
        tt_1a = estimator_seed_one_a._get_topic_term_prb()

        ut_1b = estimator_seed_one_b._get_user_topic_prb()
        td_1b = estimator_seed_one_b._get_topic_document_prb()
        tt_1b = estimator_seed_one_b._get_topic_term_prb()

        # At least one cell must differ between the two runs. The difference
        # is tested element-wise with .any() rather than by summing it:
        # positive and negative differences can cancel out in a sum, which
        # would make differing matrices look identical.
        self.assertTrue((ut_1a - ut_1b).any())
        self.assertTrue((td_1a - td_1b).any())
        self.assertTrue((tt_1a - tt_1b).any())
コード例 #6
0
def create_lda_estimator(annotations_it, num_docs, num_tags):
    '''
    Builds an `LDAEstimator` configured with the settings described in [1]_.

    Alpha is computed as ``0.1 * num_docs`` and beta as ``0.1 * num_tags``;
    the number of topics, gamma, the iteration count and the burn-in are
    fixed below.

    NOTE(review): alpha and beta are *multiplied* by the collection sizes
    here -- confirm against [1]_ that scaling up (rather than down) is the
    intended parameterisation.

    References
    ----------
    ..[1] Harvey, M., Ruthven, I., & Carman, M. J. (2011). 
    "Improving social bookmark search using personalised latent variable 
    language models." 
    Proceedings of the fourth ACM international conference on Web search and 
    data mining - WSDM  ’11. doi:10.1145/1935826.1935898
    '''
    return LDAEstimator(
        annotations_it,
        200,              # num_topics
        0.1 * num_docs,   # alpha
        0.1 * num_tags,   # beta
        25,               # gamma
        300,              # iterations
        200,              # burn_in
    )
コード例 #7
0
    def test_valid_probabilities(self):
        # Every item-probability vector returned by the estimator must be a
        # proper distribution.

        def isvalid(probs):
            # Sums to one (within 1e-5) and every entry lies strictly
            # between 0 and 1.
            total = probs.sum()
            if not 0.99999 <= total <= 1.00001:
                return False
            return (probs > 0).all() and (probs < 1).all()

        annots = self.create_annots(test.SMALL_DEL_FILE)
        estimator = LDAEstimator(annots, 2, .1, .2, .3, 2, 0, 1, 0)

        # Array handed to every prob_* call -- presumably the ids of the
        # items to score (0..4); confirm against the estimator's API.
        queried = np.arange(5)
        distributions = [
            estimator.prob_items(queried),
            estimator.prob_items_given_tag(0, queried),
            estimator.prob_items_given_user(0, queried),
            estimator.prob_items_given_user_tag(0, 0, queried),
        ]

        for probs in distributions:
            self.assertTrue(isvalid(probs))
コード例 #8
0
    def test_gibbs_sample(self):
        # Runs the whole sampler on a large dataset and checks that the
        # resulting probability matrices are populated and lie in [0, 1].
        annots = self.create_annots(test.DELICIOUS_FILE)
        estimator = LDAEstimator(annots, 10, .5, .5, .5, 5, 2, 1, 0)

        # The estimator is built with 5 as its sixth argument and is expected
        # to report 4 from get_iter() afterwards (presumably the zero-based
        # index of the last iteration).
        self.assertEqual(estimator.get_iter(), 4)

        matrices = (
            estimator._get_user_topic_prb(),
            estimator._get_topic_document_prb(),
            estimator._get_topic_term_prb(),
        )

        # Each matrix holds at least one non-zero entry...
        for matrix in matrices:
            self.assertTrue(matrix.any())

        # ...no entry is negative...
        for matrix in matrices:
            self.assertTrue((matrix >= 0).all())

        # ...and no entry exceeds one.
        for matrix in matrices:
            self.assertTrue((matrix <= 1).all())
コード例 #9
0
    def test_valid_run(self):
        # Full run on the small fixture: item distributions must be proper
        # distributions and the probability matrices must be populated and
        # bounded by [0, 1].

        def isvalid(probs):
            # Sums to one (within 1e-5) and every entry lies strictly
            # between 0 and 1.
            total = probs.sum()
            if not 0.99999 <= total <= 1.00001:
                return False
            return (probs > 0).all() and (probs < 1).all()

        annots = self.create_annots(test.SMALL_DEL_FILE)
        estimator = LDAEstimator(annots, 200, .001, .002, .003, 100, 50, 5, 0)

        # Array handed to every prob_* call -- presumably the ids of the
        # items to score (0..4); confirm against the estimator's API.
        queried = np.arange(5)
        distributions = [
            estimator.prob_items(queried),
            estimator.prob_items_given_tag(0, queried),
            estimator.prob_items_given_user(0, queried),
            estimator.prob_items_given_user_tag(0, 0, queried),
        ]
        for probs in distributions:
            self.assertTrue(isvalid(probs))

        # No zero entry is allowed in the chain likelihood
        self.assertTrue(estimator.chain_likelihood().all())

        # The getters are invoked afresh for every check, exactly once per
        # assertion.
        matrix_getters = (
            estimator._get_user_topic_prb,
            estimator._get_topic_document_prb,
            estimator._get_topic_term_prb,
        )

        for fetch in matrix_getters:
            self.assertTrue((fetch() >= 0).all())

        for fetch in matrix_getters:
            self.assertTrue((fetch() <= 1).all())

        for fetch in matrix_getters:
            self.assertTrue((fetch()).any())
コード例 #10
0
    def test_gibbs_update(self):
        # Checks the bookkeeping of a single Gibbs update: when an annotation
        # moves to another topic, the old topic loses exactly one count in
        # each count matrix; when it stays, the old topic's counts are
        # unchanged.
        annots = self.create_annots(test.DELICIOUS_FILE)
        estimator = LDAEstimator(annots, 2, .5, .5, .5, 0, 0, 1, 0)

        def counts_for(user, topic, document, term):
            # Current user-topic, topic-document and topic-term counts
            return (estimator._get_user_topic_counts()[user, topic],
                    estimator._get_topic_document_counts()[topic, document],
                    estimator._get_topic_term_counts()[topic, term])

        saw_change = False
        assignments = estimator._get_topic_assignments().items()
        for annot, old_topic in assignments:
            user, document, term = annot

            before = counts_for(user, old_topic, document, term)
            new_topic = estimator._gibbs_update(user, old_topic, document,
                                                term, 0)
            after = counts_for(user, old_topic, document, term)

            if old_topic == new_topic:
                removed = 0
            else:
                removed = 1
                saw_change = True

            for old_count, new_count in zip(before, after):
                self.assertEqual(new_count, old_count - removed)

        # At least one annotation must have switched topic
        self.assertTrue(saw_change)
        # The total topic count still matches the number of assignments
        self.assertEqual(len(estimator._get_topic_assignments()),
                         estimator._get_topic_counts().sum())