def setUp(self):
    """Build a MultiChainGibbsSampler fixture from the test model and
    vocabulary on disk."""
    lda_model = Model(20)
    lda_model.load('../testdata/lda_model')
    vocab = Vocabulary()
    vocab.load('../testdata/vocabulary.dat')
    # 10 Markov chains, 10 total iterations, 5 burn-in iterations.
    self.multi_chain_gibbs_sampler = MultiChainGibbsSampler(
        lda_model, vocab, 10, 10, 5)
class SparseLDATrainGibbsSamplerTest(unittest.TestCase):
    """Tests for SparseLDATrainGibbsSampler: corpus loading, Gibbs
    sampling, and checkpoint save/restore."""

    def setUp(self):
        self.model = Model(20)
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')
        self.sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
            self.model, self.vocabulary)

    def test_load_corpus(self):
        sampler = self.sparselda_train_gibbs_sampler
        sampler.load_corpus('../testdata/corpus')
        # The test corpus is expected to contain exactly 4 documents.
        self.assertEqual(4, len(sampler.documents))

    def test_gibbs_sampling(self):
        sampler = self.sparselda_train_gibbs_sampler
        sampler.load_corpus('../testdata/corpus')
        rng = random.Random()
        # Run 100 iterations, checkpointing every 10th.
        for iteration in xrange(1, 101):
            sampler.gibbs_sampling(rng)
            if iteration % 10 == 0:
                sampler.save_checkpoint('../testdata/checkpoint', iteration)
        sampler.save_model('../testdata/train_model', 100)

    def test_load_checkpoint(self):
        sampler = self.sparselda_train_gibbs_sampler
        # Resume from the last saved checkpoint and continue to 200
        # iterations, checkpointing every 10th.
        cur_iteration = sampler.load_checkpoint('../testdata/checkpoint')
        rng = random.Random()
        for iteration in xrange(cur_iteration + 1, 201):
            sampler.gibbs_sampling(rng)
            if iteration % 10 == 0:
                sampler.save_checkpoint('../testdata/checkpoint', iteration)
class ModelEvaluatorTest(unittest.TestCase):
    """Tests for the model_evaluator module's log-likelihood computation."""

    def setUp(self):
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

    def test_compute_loglikelihood(self):
        # Tokens cover three cases: present in both model and vocabulary,
        # present only in the vocabulary, and entirely unknown.
        doc_tokens = [
            'macbook', 'ipad',     # exist in vocabulary and model
            'mac os x', 'chrome',  # only exist in vocabulary
            'nokia', 'null',       # inexistent
        ]
        document = Document(self.model.num_topics)
        rand = random.Random()
        rand.seed(0)  # fixed seed keeps the expected value deterministic
        document.parse_from_tokens(
            doc_tokens, rand, self.vocabulary, self.model)
        documents = [document, document]
        self.assertEqual(
            -14.113955684239654,
            model_evaluator.compute_loglikelihood(
                self.model, self.vocabulary, documents))
def setUp(self):
    """Build a SparseLDAGibbsSampler fixture from the test model and
    vocabulary on disk."""
    lda_model = Model(20)
    lda_model.load('../testdata/lda_model')
    vocab = Vocabulary()
    vocab.load('../testdata/vocabulary.dat')
    # 10 total iterations, 5 burn-in iterations.
    self.sparselda_gibbs_sampler = SparseLDAGibbsSampler(
        lda_model, vocab, 10, 5)
class TopicWordsStatTest(unittest.TestCase): def setUp(self): self.model = Model(20) self.model.load("../testdata/lda_model") self.vocabulary = Vocabulary() self.vocabulary.load("../testdata/vocabulary.dat") self.topic_words_stat = TopicWordsStat(self.model, self.vocabulary) def test_save(self): print self.topic_words_stat.save("../testdata/topic_top_words.dat", 0.8) def test_get_topic_top_words(self): print self.topic_words_stat.get_topic_top_words(0.8) def test_compute_topic_word_distribution(self): print self.topic_words_stat.compute_topic_word_distribution()
def main(args): model = Model(0) model.load(args.model_dir) vocabulary = Vocabulary() vocabulary.load(args.vocabulary) multi_chain_gibbs_sampler = MultiChainGibbsSampler(model, vocabulary, args.num_markov_chains, args.total_iterations, args.burn_in_iterations) fp = open(args.documents, 'r') for doc_str in fp.readlines(): doc_str = doc_str.decode('gbk') doc_tokens = doc_str.strip().split('\t') topic_dist = multi_chain_gibbs_sampler.infer_topics(doc_tokens) print doc_str print topic_dist fp.close()
class TopicWordsStatTest(unittest.TestCase): def setUp(self): self.model = Model(20) self.model.load('../testdata/lda_model') self.vocabulary = Vocabulary() self.vocabulary.load('../testdata/vocabulary.dat') self.topic_words_stat = TopicWordsStat(self.model, self.vocabulary) def test_save(self): print self.topic_words_stat.save('../testdata/topic_top_words.dat', 0.8) def test_get_topic_top_words(self): print self.topic_words_stat.get_topic_top_words(0.8) def test_compute_topic_word_distribution(self): print self.topic_words_stat.compute_topic_word_distribution()
class ModelEvaluatorTest(unittest.TestCase):
    """Tests log-likelihood computation over a small synthetic document."""

    def setUp(self):
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

    def test_compute_loglikelihood(self):
        # Three token categories: known to model+vocabulary, vocabulary
        # only, and completely unknown.
        doc_tokens = [
            'macbook', 'ipad',     # exist in vocabulary and model
            'mac os x', 'chrome',  # only exist in vocabulary
            'nokia', 'null',       # inexistent
        ]
        document = Document(self.model.num_topics)
        rand = random.Random()
        rand.seed(0)  # deterministic parse -> deterministic expected value
        document.parse_from_tokens(
            doc_tokens, rand, self.vocabulary, self.model)
        documents = [document, document]
        expected = -14.113955684239654
        actual = model_evaluator.compute_loglikelihood(
            self.model, self.vocabulary, documents)
        self.assertEqual(expected, actual)
def main(args):
    """Train a SparseLDA model with collapsed Gibbs sampling.

    Runs args.total_iterations sampling passes over the corpus, and on the
    first iteration plus every configured interval: saves the model (with
    top topic words), saves a resumable checkpoint, and logs the corpus
    log-likelihood.
    """
    model = Model(args.num_topics, args.topic_prior, args.word_prior)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary_file)
    sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
        model, vocabulary)
    sparselda_train_gibbs_sampler.load_corpus(args.corpus_dir)

    rand = random.Random()
    for i in xrange(args.total_iterations):
        # Lazy %-args: logging formats only if the record is emitted.
        logging.info('sparselda trainer, gibbs sampling iteration %d.', i + 1)
        sparselda_train_gibbs_sampler.gibbs_sampling(rand)

        # dump lda model
        if i == 0 or (i + 1) % args.save_model_interval == 0:
            logging.info('iteration %d start saving lda model.', i + 1)
            sparselda_train_gibbs_sampler.save_model(args.model_dir, i + 1)
            topic_words_stat = TopicWordsStat(model, vocabulary)
            topic_words_stat.save(
                '%s/topic_top_words.%d' % (args.model_dir, i + 1),
                args.topic_word_accumulated_prob_threshold)
            logging.info('iteration %d save lda model ok.', i + 1)

        # dump checkpoint
        if i == 0 or (i + 1) % args.save_checkpoint_interval == 0:
            logging.info('iteration %d start saving checkpoint.', i + 1)
            sparselda_train_gibbs_sampler.save_checkpoint(
                args.checkpoint_dir, i + 1)
            logging.info('iteration %d save checkpoint ok.', i + 1)

        # compute the loglikelihood
        if i == 0 or (i + 1) % args.compute_loglikelihood_interval == 0:
            logging.info('iteration %d start computing loglikelihood.', i + 1)
            model_evaluator = ModelEvaluator(model, vocabulary)
            ll = model_evaluator.compute_loglikelihood(
                sparselda_train_gibbs_sampler.documents)
            logging.info('iteration %d loglikelihood is %f.', i + 1, ll)