def test_random_seed(self):
    """Two models trained with the same random_seed must produce identical topics."""
    if not self.mallet_path:
        return
    seed, n_topics, n_iter = 10, 10, 500

    def fit_model():
        # train one model with a fixed seed; called twice to compare runs
        return ldamallet.LdaMallet(
            self.mallet_path,
            corpus=corpus,
            num_topics=n_topics,
            id2word=dictionary,
            random_seed=seed,
            iterations=n_iter,
        )

    first, second = fit_model(), fit_model()
    self.assertTrue(np.allclose(first.word_topics, second.word_topics))
    for doc in corpus:
        dense_a = matutils.sparse2full(first[doc], n_topics)
        dense_b = matutils.sparse2full(second[doc], n_topics)
        self.assertTrue(np.allclose(dense_a, dense_b))
def testTransform(self):
    """Project one document and check the topic mixture matches the expected values."""
    if not self.mallet_path:
        return
    expected = [0.49, 0.51]
    passed = False
    for attempt in range(5):  # LDA is stochastic: allow up to 5 restarts
        # create the transformation model
        model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200)
        # transform one document into a dense vector for easier equality tests
        first_doc = list(corpus)[0]
        dense = matutils.sparse2full(model[first_doc], 2)
        # must contain the same values, up to re-ordering of topics
        passed = numpy.allclose(sorted(dense), sorted(expected), atol=1e-2)
        if passed:
            break
        logging.warning(
            "LDA failed to converge on attempt %i (got %s, expected %s)" % (attempt, sorted(dense), sorted(expected)))
    self.assertTrue(passed)
def testMallet2Model(self):
    """Converting via malletmodel2ldamodel must preserve topics and document projections.

    Compares the shown topics exactly, and the per-document (topic, probability)
    pairs approximately (probabilities only to 1 decimal place).
    """
    if not self.mallet_path:
        return
    tm1 = ldamallet.LdaMallet(self.mallet_path, corpus=corpus, num_topics=2, id2word=dictionary)
    tm2 = ldamallet.malletmodel2ldamodel(tm1)

    # set num_topics=-1 to exclude random influence
    self.assertEqual(tm1.show_topics(-1, 10), tm2.show_topics(-1, 10))

    for document in corpus:
        element1_1, element1_2 = tm1[document][0]  # (topic_id, probability)
        element2_1, element2_2 = tm2[document][0]
        self.assertAlmostEqual(element1_1, element2_1)
        # probabilities only need to agree to 1 decimal place
        self.assertAlmostEqual(element1_2, element2_2, 1)
        element1_1, element1_2 = tm1[document][1]
        element2_1, element2_2 = tm2[document][1]
        self.assertAlmostEqual(element1_1, element2_1)
        self.assertAlmostEqual(element1_2, element2_2, 1)
        # BUGFIX: these values are floats/tuples, not ints — '%d' truncated the
        # probabilities in the debug output; '%s' shows them faithfully.
        logging.debug('%s %s', element1_1, element2_1)
        logging.debug('%s %s', element1_2, element2_2)
        logging.debug('%s %s', tm1[document][1], tm2[document][1])
def testLargeMmap(self):
    """Round-trip a model whose large arrays are stored separately, then load them via mmap."""
    if not self.mallet_path:
        return
    model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)

    def check_loaded(loaded):
        # the reloaded model must match the original, including an empty-vector projection
        self.assertEqual(model.num_topics, loaded.num_topics)
        self.assertTrue(numpy.allclose(model.wordtopics, loaded.wordtopics))
        empty_doc = []
        self.assertTrue(numpy.allclose(model[empty_doc], loaded[empty_doc]))

    # simulate storing large arrays separately
    model.save(testfile(), sep_limit=0)
    check_loaded(ldamodel.LdaModel.load(testfile()))
    # test loading the large model arrays with mmap
    check_loaded(ldamodel.LdaModel.load(testfile(), mmap='r'))
def build_model(year, topwords_folder, model_folder, year_token_chunk, num_topics,
                num_iterations, write_output=False, num_words=50):
    """Train an LdaMallet topic model on one year's token chunk, save it, and score it.

    Parameters
    ----------
    year : label used in the output file names.
    topwords_folder : directory receiving the per-topic top-words CSV.
    model_folder : directory where the trained model is saved.
    year_token_chunk : list of tokenized documents (each a list of tokens).
    num_topics : number of LDA topics (k).
    num_iterations : number of Mallet sampling iterations.
    write_output : if True, also export the top `num_words` words per topic to CSV.
    num_words : number of top words per topic to export.

    Returns
    -------
    float : the c_v coherence score of the trained model.
    """
    print("building model...")
    print("- number of documents:", len(year_token_chunk))
    print("- number of topics:", num_topics)

    build_dict = Dictionary(year_token_chunk)
    build_corpus = [build_dict.doc2bow(text) for text in year_token_chunk]
    malletmodel = ldamallet.LdaMallet(PATH_TO_MALLET, build_corpus, num_topics=num_topics,
                                      id2word=build_dict, iterations=num_iterations)
    malletmodel.save(model_folder + "/MALLET-k-" + str(num_topics) + "-" + str(year) + ".model")

    # evaluate the model
    coherence = CoherenceModel(model=malletmodel, texts=year_token_chunk, corpus=build_corpus,
                               dictionary=build_dict, coherence="c_v")
    coh_score = coherence.get_coherence()
    print("- coherence score:", coh_score)

    # write output
    if write_output:
        print("writing output...")
        # show_topics(formatted=False) yields (topic_id, [(word, weight), ...]) pairs;
        # the previous identity rebuild of these tuples was redundant and is removed,
        # along with dead commented-out code and a lambda arg that shadowed a local.
        shown = malletmodel.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
        topic_df = pd.DataFrame()
        for topic_id, words in shown:
            # one column per topic, (word, weight) pairs sorted by descending weight
            topic_df["topic_" + str(topic_id)] = sorted(words, key=lambda pair: pair[1], reverse=True)
        topic_df.to_csv(topwords_folder + "/topic-words-" + str(year) + "k-" + str(num_topics) + ".csv", index=False)

    return coh_score
def train(self):
    """Fit the Mallet LDA model on the corpus, then persist article topics and topic words."""
    # merge the configured model parameters with the corpus dictionary and worker count
    params = dict(self.model_params)
    params['id2word'] = self.corpus.dictionary
    params['workers'] = self.workers
    self.model = ldamallet.LdaMallet(self.mallet_path, self.corpus, **params)
    self.save_article_topics()
    self.save_topic_words()
def setUp(self):
    """Locate the Mallet binary via MALLET_HOME and train a tiny baseline model."""
    mallet_home = os.environ.get('MALLET_HOME', None)
    if mallet_home:
        self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet')
    else:
        self.mallet_path = None
        raise unittest.SkipTest("MALLET_HOME not specified. Skipping Mallet tests.")
    self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
    # self.model is used in TestBaseTopicModel
    self.model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=1)
def ldaMalletLoad():
    """Train (if not already persisted) and load the LDA Mallet model.

    Note: lda(mallet) model has an unfixed problem, see "4.3.2 gensim:..." in README.md
    """
    model_file = ModelDir + LDA_MalletModelName
    if not path.exists(model_file):
        # no saved model yet: build the corpus and train from scratch
        lda_corpus, id2word = genCorpus(
            corpus_name=ModelDir + CORP_NAME,
            f_inputs=[DataDir + TrainFile for TrainFile in TrainFiles])
        lda_model = ldamallet.LdaMallet(
            ModelDir + 'mallet-2.0.8/bin/mallet',
            corpus=lda_corpus, num_topics=TOPICS_NUM, id2word=id2word,
            workers=CPU_NUM, iterations=ITERS)
        lda_model.save(model_file)
    return ldamallet.LdaMallet.load(model_file)
def testLargeMmapCompressed(self):
    """Loading a gzip-compressed model with mmap must raise IOError."""
    if not self.mallet_path:
        return
    compressed_name = testfile() + '.gz'
    model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
    # store the large arrays separately, inside a compressed archive
    model.save(compressed_name, sep_limit=0)
    # mmap cannot address a compressed file, so loading must fail
    self.assertRaises(IOError, ldamodel.LdaModel.load, compressed_name, mmap='r')
def testPersistenceCompressed(self):
    """A model saved to a .gz file must round-trip through load unchanged."""
    if not self.mallet_path:
        return
    compressed_name = testfile() + '.gz'
    model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
    model.save(compressed_name)
    reloaded = ldamallet.LdaMallet.load(compressed_name, mmap=None)
    self.assertEqual(model.num_topics, reloaded.num_topics)
    self.assertTrue(numpy.allclose(model.wordtopics, reloaded.wordtopics))
    empty_doc = []
    # projecting an empty vector must give identical results
    self.assertTrue(numpy.allclose(model[empty_doc], reloaded[empty_doc]))
def testMallet2Model(self):
    """malletmodel2ldamodel conversion must preserve per-document topic distributions exactly."""
    if not self.mallet_path:
        return
    # (removed an unused 'passed = False' flag left over from a retry-loop pattern)
    tm1 = ldamallet.LdaMallet(self.mallet_path, corpus=corpus, num_topics=2, id2word=dictionary)
    tm2 = ldamallet.malletmodel2ldamodel(tm1)
    for document in corpus:
        # each entry is a (topic_id, probability) tuple; both models must agree exactly
        self.assertEqual(tm1[document][0], tm2[document][0])
        self.assertEqual(tm1[document][1], tm2[document][1])
        # BUGFIX: '%d' cannot format a tuple (logging raised a formatting error
        # internally and the message was lost); '%s' logs the tuples as intended.
        logging.debug('%s %s', tm1[document][0], tm2[document][0])
        logging.debug('%s %s', tm1[document][1], tm2[document][1])
def testPersistence(self):
    """Saving and reloading an LdaMallet model must preserve topics and projections."""
    if not self.mallet_path:
        return
    fname = get_tmpfile('gensim_models_lda_mallet.tst')
    model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
    model.save(fname)
    reloaded = ldamallet.LdaMallet.load(fname)
    self.assertEqual(model.num_topics, reloaded.num_topics)
    self.assertTrue(np.allclose(model.word_topics, reloaded.word_topics))
    empty_doc = []
    # projecting an empty vector must give identical results
    self.assertTrue(np.allclose(model[empty_doc], reloaded[empty_doc]))
def testMallet2ModelOn20NewsGroups(self):
    """Train on 20-newsgroups and check the topics survive conversion to a gensim LdaModel."""
    texts = [simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")]
    vocab = Dictionary(texts)
    bow_corpus = [vocab.doc2bow(text) for text in texts]
    lda_mallet_model = ldamallet.LdaMallet(
        self.mallet_path, corpus=bow_corpus, num_topics=20, id2word=vocab, iterations=500)
    lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model, iterations=1000)
    # the converted model must expose exactly the same topics
    self.assertEqual(lda_mallet_model.show_topics(20, 50), lda_gensim_model.show_topics(20, 50))
def testLargeMmap(self):
    """Large arrays stored separately must load back correctly when memory-mapped."""
    if not self.mallet_path:
        return
    fname = get_tmpfile('gensim_models_lda_mallet.tst')
    model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
    # store the large arrays in separate files
    model.save(fname, sep_limit=0)
    # load them back, memory-mapped read-only
    reloaded = ldamodel.LdaModel.load(fname, mmap='r')
    self.assertEqual(model.num_topics, reloaded.num_topics)
    self.assertTrue(isinstance(reloaded.word_topics, np.memmap))
    self.assertTrue(np.allclose(model.word_topics, reloaded.word_topics))
    empty_doc = []
    # projecting an empty vector must give identical results
    self.assertTrue(np.allclose(model[empty_doc], reloaded[empty_doc]))
def testMallet2Model(self):
    """malletmodel2ldamodel must keep per-document (topic, probability) pairs close to the original.

    Topic ids must match exactly; probabilities only to 1 decimal place.
    """
    if not self.mallet_path:
        return
    # (removed an unused 'passed = False' flag left over from a retry-loop pattern)
    tm1 = ldamallet.LdaMallet(self.mallet_path, corpus=corpus, num_topics=2, id2word=dictionary)
    tm2 = ldamallet.malletmodel2ldamodel(tm1)
    for document in corpus:
        element1_1, element1_2 = tm1[document][0]  # (topic_id, probability)
        element2_1, element2_2 = tm2[document][0]
        self.assertAlmostEqual(element1_1, element2_1)
        # probabilities only need to agree to 1 decimal place
        self.assertAlmostEqual(element1_2, element2_2, 1)
        element1_1, element1_2 = tm1[document][1]
        element2_1, element2_2 = tm2[document][1]
        self.assertAlmostEqual(element1_1, element2_1)
        self.assertAlmostEqual(element1_2, element2_2, 1)
        # BUGFIX: '%d' truncated the float probabilities and cannot format the
        # final tuple arguments at all (logging raised internally); use '%s'.
        logging.debug('%s %s', element1_1, element2_1)
        logging.debug('%s %s', element1_2, element2_2)
        logging.debug('%s %s', tm1[document][1], tm2[document][1])