def setUp(self):
    """Build the multi-chain Gibbs sampler over the test model/vocabulary."""
    lda_model = Model(20)
    lda_model.load('../testdata/lda_model')
    vocab = Vocabulary()
    vocab.load('../testdata/vocabulary.dat')
    self.multi_chain_gibbs_sampler = MultiChainGibbsSampler(
        lda_model, vocab, 10, 10, 5)
Esempio n. 2
0
class SparseLDATrainGibbsSamplerTest(unittest.TestCase):
    """Tests corpus loading, training iterations, and checkpoint recovery."""

    def setUp(self):
        self.model = Model(20)
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')
        self.sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
            self.model, self.vocabulary)

    def test_load_corpus(self):
        trainer = self.sparselda_train_gibbs_sampler
        trainer.load_corpus('../testdata/corpus')
        # The test corpus contains exactly four documents.
        self.assertEqual(4, len(trainer.documents))

    def test_gibbs_sampling(self):
        trainer = self.sparselda_train_gibbs_sampler
        trainer.load_corpus('../testdata/corpus')
        rand = random.Random()
        for iteration in xrange(1, 101):
            trainer.gibbs_sampling(rand)
            if iteration % 10 == 0:  # checkpoint every 10th iteration
                trainer.save_checkpoint('../testdata/checkpoint', iteration)
        trainer.save_model('../testdata/train_model', 100)

    def test_load_checkpoint(self):
        trainer = self.sparselda_train_gibbs_sampler
        cur_iteration = trainer.load_checkpoint('../testdata/checkpoint')
        rand = random.Random()
        # Resume from the saved iteration and continue up to 200.
        for iteration in xrange(cur_iteration + 1, 201):
            trainer.gibbs_sampling(rand)
            if iteration % 10 == 0:
                trainer.save_checkpoint('../testdata/checkpoint', iteration)
Esempio n. 3
0
class ModelEvaluatorTest(unittest.TestCase):
    """Checks model_evaluator.compute_loglikelihood on a fixed document."""

    def setUp(self):
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

    def test_compute_loglikelihood(self):
        # Token mix covers three cases: in vocabulary and model,
        # vocabulary-only, and entirely unknown.
        doc_tokens = [
            'macbook', 'ipad',     # exist in vocabulary and model
            'mac os x', 'chrome',  # only exist in vocabulary
            'nokia', 'null',       # inexistent
        ]
        document = Document(self.model.num_topics)
        rand = random.Random()
        rand.seed(0)  # fixed seed keeps the expected value reproducible
        document.parse_from_tokens(doc_tokens, rand, self.vocabulary,
                                   self.model)
        documents = [document, document]
        expected = -14.113955684239654
        self.assertEqual(
            expected,
            model_evaluator.compute_loglikelihood(self.model, self.vocabulary,
                                                  documents))
 def setUp(self):
     """Construct the multi-chain Gibbs sampler shared by the tests."""
     vocab = Vocabulary()
     vocab.load('../testdata/vocabulary.dat')
     lda_model = Model(20)
     lda_model.load('../testdata/lda_model')
     self.multi_chain_gibbs_sampler = MultiChainGibbsSampler(
         lda_model, vocab, 10, 10, 5)
class SparseLDATrainGibbsSamplerTest(unittest.TestCase):
    """Covers corpus loading, Gibbs sampling, and checkpoint handling."""

    def setUp(self):
        self.model = Model(20)
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')
        self.sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
            self.model, self.vocabulary)

    def test_load_corpus(self):
        self.sparselda_train_gibbs_sampler.load_corpus('../testdata/corpus')
        # Four documents are expected in the bundled test corpus.
        self.assertEqual(4, len(self.sparselda_train_gibbs_sampler.documents))

    def test_gibbs_sampling(self):
        sampler = self.sparselda_train_gibbs_sampler
        sampler.load_corpus('../testdata/corpus')
        gibbs_rand = random.Random()
        for it in xrange(1, 101):
            sampler.gibbs_sampling(gibbs_rand)
            if it % 10 == 0:  # persist a checkpoint every ten iterations
                sampler.save_checkpoint('../testdata/checkpoint', it)
        sampler.save_model('../testdata/train_model', 100)

    def test_load_checkpoint(self):
        sampler = self.sparselda_train_gibbs_sampler
        start = sampler.load_checkpoint('../testdata/checkpoint')
        gibbs_rand = random.Random()
        # Resume training from the restored iteration through 200.
        for it in xrange(start + 1, 201):
            sampler.gibbs_sampling(gibbs_rand)
            if it % 10 == 0:
                sampler.save_checkpoint('../testdata/checkpoint', it)
Esempio n. 6
0
 def setUp(self):
     """Load the test model and vocabulary, then build the sampler."""
     lda_model = Model(20)
     lda_model.load('../testdata/lda_model')
     vocab = Vocabulary()
     vocab.load('../testdata/vocabulary.dat')
     self.sparselda_gibbs_sampler = SparseLDAGibbsSampler(
         lda_model, vocab, 10, 5)
 def setUp(self):
     """Prepare a SparseLDAGibbsSampler backed by the test data files."""
     vocabulary = Vocabulary()
     vocabulary.load('../testdata/vocabulary.dat')
     model = Model(20)
     model.load('../testdata/lda_model')
     self.sparselda_gibbs_sampler = SparseLDAGibbsSampler(
         model, vocabulary, 10, 5)
Esempio n. 8
0
class TopicWordsStatTest(unittest.TestCase):
    def setUp(self):
        self.model = Model(20)
        self.model.load("../testdata/lda_model")
        self.vocabulary = Vocabulary()
        self.vocabulary.load("../testdata/vocabulary.dat")

        self.topic_words_stat = TopicWordsStat(self.model, self.vocabulary)

    def test_save(self):
        print self.topic_words_stat.save("../testdata/topic_top_words.dat", 0.8)

    def test_get_topic_top_words(self):
        print self.topic_words_stat.get_topic_top_words(0.8)

    def test_compute_topic_word_distribution(self):
        print self.topic_words_stat.compute_topic_word_distribution()
Esempio n. 9
0
def main(args):
    model = Model(0)
    model.load(args.model_dir)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary)
    multi_chain_gibbs_sampler = MultiChainGibbsSampler(model, vocabulary,
            args.num_markov_chains, args.total_iterations,
            args.burn_in_iterations)

    fp = open(args.documents, 'r')
    for doc_str in fp.readlines():
        doc_str = doc_str.decode('gbk')
        doc_tokens = doc_str.strip().split('\t')
        topic_dist = multi_chain_gibbs_sampler.infer_topics(doc_tokens)
        print doc_str
        print topic_dist
    fp.close()
def main(args):
    model = Model(0)
    model.load(args.model_dir)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary)
    multi_chain_gibbs_sampler = MultiChainGibbsSampler(model, vocabulary,
                                                       args.num_markov_chains,
                                                       args.total_iterations,
                                                       args.burn_in_iterations)

    fp = open(args.documents, 'r')
    for doc_str in fp.readlines():
        doc_str = doc_str.decode('gbk')
        doc_tokens = doc_str.strip().split('\t')
        topic_dist = multi_chain_gibbs_sampler.infer_topics(doc_tokens)
        print doc_str
        print topic_dist
    fp.close()
Esempio n. 11
0
class TopicWordsStatTest(unittest.TestCase):
    def setUp(self):
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

        self.topic_words_stat = TopicWordsStat(self.model, self.vocabulary)

    def test_save(self):
        print self.topic_words_stat.save('../testdata/topic_top_words.dat',
                                         0.8)

    def test_get_topic_top_words(self):
        print self.topic_words_stat.get_topic_top_words(0.8)

    def test_compute_topic_word_distribution(self):
        print self.topic_words_stat.compute_topic_word_distribution()
Esempio n. 12
0
class ModelEvaluatorTest(unittest.TestCase):
    """Verifies the loglikelihood computed for a fixed, seeded document."""

    def setUp(self):
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

    def test_compute_loglikelihood(self):
        # Mix of tokens: known to model+vocabulary, vocabulary-only, unknown.
        doc_tokens = ['macbook', 'ipad',     # exist in vocabulary and model
                      'mac os x', 'chrome',  # only exist in vocabulary
                      'nokia', 'null']       # inexistent
        document = Document(self.model.num_topics)
        rand = random.Random()
        rand.seed(0)  # deterministic initialization for a stable expectation
        document.parse_from_tokens(doc_tokens, rand, self.vocabulary,
                                   self.model)
        documents = [document, document]
        actual = model_evaluator.compute_loglikelihood(
            self.model, self.vocabulary, documents)
        self.assertEqual(-14.113955684239654, actual)
Esempio n. 13
0
def main(args):
    """Train a SparseLDA model with Gibbs sampling.

    Periodically saves the model and topic-top-words report, dumps a
    checkpoint, and logs the corpus loglikelihood, each at its own
    configured interval (and always at the first iteration).
    """
    model = Model(args.num_topics, args.topic_prior, args.word_prior)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary_file)
    sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
        model, vocabulary)
    sparselda_train_gibbs_sampler.load_corpus(args.corpus_dir)

    rand = random.Random()
    # ModelEvaluator only wraps (model, vocabulary), both fixed for the whole
    # run, so construct it once instead of on every evaluation interval.
    model_evaluator = ModelEvaluator(model, vocabulary)

    for i in xrange(args.total_iterations):
        logging.info('sparselda trainer, gibbs sampling iteration %d.' %
                     (i + 1))
        sparselda_train_gibbs_sampler.gibbs_sampling(rand)

        # dump lda model
        if i == 0 or (i + 1) % args.save_model_interval == 0:
            logging.info('iteration %d start saving lda model.' % (i + 1))
            sparselda_train_gibbs_sampler.save_model(args.model_dir, i + 1)
            topic_words_stat = TopicWordsStat(model, vocabulary)
            topic_words_stat.save(
                '%s/topic_top_words.%d' % (args.model_dir, i + 1),
                args.topic_word_accumulated_prob_threshold)
            logging.info('iteration %d save lda model ok.' % (i + 1))

        # dump checkpoint
        if i == 0 or (i + 1) % args.save_checkpoint_interval == 0:
            logging.info('iteration %d start saving checkpoint.' % (i + 1))
            sparselda_train_gibbs_sampler.save_checkpoint(
                args.checkpoint_dir, i + 1)
            logging.info('iteration %d save checkpoint ok.' % (i + 1))

        # compute the loglikelihood
        if i == 0 or (i + 1) % args.compute_loglikelihood_interval == 0:
            logging.info('iteration %d start computing loglikelihood.' %
                         (i + 1))
            ll = model_evaluator.compute_loglikelihood(
                sparselda_train_gibbs_sampler.documents)
            logging.info('iteration %d loglikelihood is %f.' % (i + 1, ll))
Esempio n. 14
0
def main(args):
    """Train a SparseLDA model with Gibbs sampling.

    Periodically saves the model and topic-top-words report, dumps a
    checkpoint, and logs the corpus loglikelihood, each at its own
    configured interval (and always at the first iteration).
    """
    model = Model(args.num_topics, args.topic_prior, args.word_prior)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary_file)
    sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
            model, vocabulary)
    sparselda_train_gibbs_sampler.load_corpus(args.corpus_dir)

    rand = random.Random()
    # The evaluator depends only on (model, vocabulary), which never change
    # identity during the run — build it once, not once per interval.
    model_evaluator = ModelEvaluator(model, vocabulary)

    for i in xrange(args.total_iterations):
        logging.info('sparselda trainer, gibbs sampling iteration %d.'
                % (i + 1))
        sparselda_train_gibbs_sampler.gibbs_sampling(rand)

        # dump lda model
        if i == 0 or (i + 1) % args.save_model_interval == 0:
            logging.info('iteration %d start saving lda model.' % (i + 1))
            sparselda_train_gibbs_sampler.save_model(args.model_dir, i + 1)
            topic_words_stat = TopicWordsStat(model, vocabulary)
            topic_words_stat.save(
                    '%s/topic_top_words.%d' % (args.model_dir, i + 1),
                    args.topic_word_accumulated_prob_threshold)
            logging.info('iteration %d save lda model ok.' % (i + 1))

        # dump checkpoint
        if i == 0 or (i + 1) % args.save_checkpoint_interval == 0:
            logging.info('iteration %d start saving checkpoint.' % (i + 1))
            sparselda_train_gibbs_sampler.save_checkpoint(
                    args.checkpoint_dir, i + 1)
            logging.info('iteration %d save checkpoint ok.' % (i + 1))

        # compute the loglikelihood
        if i == 0 or (i + 1) % args.compute_loglikelihood_interval == 0:
            logging.info('iteration %d start computing loglikelihood.' % (i + 1))
            ll = model_evaluator.compute_loglikelihood(
                    sparselda_train_gibbs_sampler.documents)
            logging.info('iteration %d loglikelihood is %f.' % (i + 1, ll))