def test_random_seed(self):
        """Check that two models trained with the same ``random_seed`` agree.

        Two ``LdaMallet`` models are trained with identical arguments on the
        ``corpus`` / ``dictionary`` defined outside this function. Their
        ``word_topics`` arrays and the dense topic vector of every document
        must be close according to ``np.allclose``. Returns early when
        ``self.mallet_path`` is not set.
        """
        if not self.mallet_path:
            return

        seed, topic_count, n_iter = 10, 10, 500

        def train():
            # Every call receives exactly the same arguments, seed included.
            return ldamallet.LdaMallet(
                self.mallet_path,
                corpus=corpus,
                num_topics=topic_count,
                id2word=dictionary,
                random_seed=seed,
                iterations=n_iter,
            )

        first = train()
        second = train()
        self.assertTrue(np.allclose(first.word_topics, second.word_topics))

        # The per-document topic treatment has to match as well.
        for bow in corpus:
            dense_first = matutils.sparse2full(first[bow], topic_count)
            dense_second = matutils.sparse2full(second[bow], topic_count)
            self.assertTrue(np.allclose(dense_first, dense_second))
    def testTransform(self):
        """Train a 2-topic model and check the topic vector of the first document.

        Training is retried up to five times. The test passes as soon as one
        attempt yields the expected values (in any order, ``atol=1e-2``) and
        fails if none does. Returns early when ``self.mallet_path`` is not set.
        """
        if not self.mallet_path:
            return

        target = [0.49, 0.51]
        converged = False
        for attempt in range(5):  # restart at most 5 times
            # create the transformation model
            lda = ldamallet.LdaMallet(
                self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200)

            # transform one document and densify it for easier equality tests
            first_doc = list(corpus)[0]
            dense = matutils.sparse2full(lda[first_doc], 2)

            # must contain the same values, up to re-ordering
            converged = numpy.allclose(sorted(dense), sorted(target), atol=1e-2)
            if converged:
                break
            logging.warning(
                "LDA failed to converge on attempt %i (got %s, expected %s)",
                attempt, sorted(dense), sorted(target))
        self.assertTrue(converged)
Exemple #3
0
    def testMallet2Model(self):
        """Check that ``malletmodel2ldamodel`` yields a model matching its source.

        A 2-topic ``LdaMallet`` model is trained and converted. Both models
        must report the same ``show_topics(-1, 10)``, and for every document
        the first two entries of the result must agree: the first tuple
        element with the default precision, the second to one decimal place.
        Returns early when ``self.mallet_path`` is not set.
        """
        if not self.mallet_path:
            return

        tm1 = ldamallet.LdaMallet(self.mallet_path,
                                  corpus=corpus,
                                  num_topics=2,
                                  id2word=dictionary)
        tm2 = ldamallet.malletmodel2ldamodel(tm1)

        # set num_topics=-1 to exclude random influence
        self.assertEqual(tm1.show_topics(-1, 10), tm2.show_topics(-1, 10))

        for document in corpus:
            # Query each model once per document and reuse the result for
            # every comparison and log line below.
            topics1 = tm1[document]
            topics2 = tm2[document]
            for position in (0, 1):
                topic1, weight1 = topics1[position]
                topic2, weight2 = topics2[position]
                self.assertAlmostEqual(topic1, topic2)
                self.assertAlmostEqual(weight1, weight2, 1)
                logging.debug('%d %d', topic1, topic2)
                # %s instead of %d: the second element is compared to one
                # decimal place, and %d would drop any fractional part.
                logging.debug('%s %s', weight1, weight2)
                logging.debug('%s %s', topics1[position], topics2[position])
Exemple #4
0
    def testLargeMmap(self):
        """Save a model with separately stored arrays and reload it twice.

        The saved model is loaded once with default options and once with
        ``mmap='r'``. Each reloaded model must match the trained one in
        ``num_topics``, ``wordtopics`` and the projection of an empty
        document. Returns early when ``self.mallet_path`` is not set.
        """
        if not self.mallet_path:
            return
        trained = ldamallet.LdaMallet(
            self.mallet_path, self.corpus, num_topics=2, iterations=100)

        # simulate storing large arrays separately
        trained.save(testfile(), sep_limit=0)

        # first a plain load, then loading the large model arrays with mmap
        for load_options in ({}, {'mmap': 'r'}):
            restored = ldamodel.LdaModel.load(testfile(), **load_options)
            self.assertEqual(trained.num_topics, restored.num_topics)
            self.assertTrue(
                numpy.allclose(trained.wordtopics, restored.wordtopics))
            empty_doc = []  # try projecting an empty vector
            self.assertTrue(
                numpy.allclose(trained[empty_doc], restored[empty_doc]))
Exemple #5
0
def build_model(year,
                topwords_folder,
                model_folder,
                year_token_chunk,
                num_topics,
                num_iterations,
                write_output=False,
                num_words=50):
    """Train a MALLET LDA model, save it and return its ``c_v`` coherence.

    Args:
        year: Label used in the file names of the saved model and the CSV.
        topwords_folder: Folder receiving the topic-words CSV; only used
            when ``write_output`` is true.
        model_folder: Folder receiving the saved model.
        year_token_chunk: Documents passed to ``Dictionary`` and converted
            to bag-of-words with ``doc2bow``.
        num_topics: Number of topics passed to ``LdaMallet``.
        num_iterations: Passed to ``LdaMallet`` as ``iterations``.
        write_output: When true, also write the words of every topic to CSV.
        num_words: Number of words requested per topic for the CSV.

    Returns:
        The result of ``CoherenceModel.get_coherence()`` for the model.
    """
    print("building model...")
    print("- number of documents:", len(year_token_chunk))
    print("- number of topics:", num_topics)

    vocabulary = Dictionary(year_token_chunk)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in year_token_chunk]
    malletmodel = ldamallet.LdaMallet(
        PATH_TO_MALLET,
        bow_corpus,
        num_topics=num_topics,
        id2word=vocabulary,
        iterations=num_iterations,
    )

    model_file = "".join(
        [model_folder, "/MALLET-k-", str(num_topics), "-", str(year), ".model"])
    malletmodel.save(model_file)

    # evaluate the model
    coh_score = CoherenceModel(
        model=malletmodel,
        texts=year_token_chunk,
        corpus=bow_corpus,
        dictionary=vocabulary,
        coherence="c_v",
    ).get_coherence()
    print("- coherence score:", coh_score)

    if not write_output:
        return coh_score

    # write output
    print("writing output...")
    shown = malletmodel.show_topics(
        num_topics=num_topics, num_words=num_words, formatted=False)
    topicwords = [
        (label, [(first, second) for first, second in entries])
        for label, entries in shown
    ]

    topic_df = pd.DataFrame()
    for index in range(num_topics):
        label, pairs = topicwords[index]
        # one column per topic, pairs ordered by descending second element
        ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)
        topic_df["topic_" + str(label)] = ranked

    csv_file = "".join(
        [topwords_folder, "/topic-words-", str(year), "k-", str(num_topics), ".csv"])
    topic_df.to_csv(csv_file, index=False)

    return coh_score
Exemple #6
0
 def train(self):
     """Fit an ``LdaMallet`` model on ``self.corpus`` and save its outputs.

     The keyword arguments are ``self.model_params`` extended (and, on a
     name clash, overridden) by ``id2word`` and ``workers``. The fitted
     model is stored in ``self.model`` before ``save_article_topics`` and
     ``save_topic_words`` are called.
     """
     options = dict(self.model_params)
     options.update(id2word=self.corpus.dictionary, workers=self.workers)
     self.model = ldamallet.LdaMallet(self.mallet_path, self.corpus, **options)
     self.save_article_topics()
     self.save_topic_words()
    def setUp(self):
        """Locate MALLET through ``MALLET_HOME`` and build the test fixtures.

        Sets ``self.mallet_path`` to ``<MALLET_HOME>/bin/mallet``, loads
        ``testcorpus.mm`` into ``self.corpus`` and trains ``self.model``.

        Raises:
            unittest.SkipTest: If ``MALLET_HOME`` is unset or empty;
                ``self.mallet_path`` is ``None`` in that case.
        """
        mallet_home = os.environ.get('MALLET_HOME')
        if not mallet_home:
            self.mallet_path = None
            raise unittest.SkipTest("MALLET_HOME not specified. Skipping Mallet tests.")

        self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet')
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))

        # self.model is used in TestBaseTopicModel
        self.model = ldamallet.LdaMallet(
            self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=1)
Exemple #8
0
def ldaMalletLoad():
    '''
    Return the lda(mallet) model loaded from ModelDir+LDA_MalletModelName.
    If that file does not exist yet, the model is trained and saved there first.
    Note: lda(mallet) model has an unfixed problem, see "4.3.2 gensim:..." in README.md
    '''
    model_file = ModelDir+LDA_MalletModelName
    if path.exists(model_file):
        return ldamallet.LdaMallet.load(model_file)

    # No saved model yet: build the corpus, train, save, then load from disk.
    train_inputs = [DataDir+name for name in TrainFiles]
    lda_corpus, id2word = genCorpus(corpus_name=ModelDir+CORP_NAME, f_inputs=train_inputs)
    lda_model = ldamallet.LdaMallet(
        ModelDir+'mallet-2.0.8/bin/mallet',
        corpus=lda_corpus,
        num_topics=TOPICS_NUM,
        id2word=id2word,
        workers=CPU_NUM,
        iterations=ITERS,
    )
    lda_model.save(model_file)
    return ldamallet.LdaMallet.load(model_file)
Exemple #9
0
    def testLargeMmapCompressed(self):
        """Check that mmap-loading a model saved under a ``.gz`` name raises ``IOError``.

        Returns early when ``self.mallet_path`` is not set.
        """
        if not self.mallet_path:
            return
        gz_path = testfile() + '.gz'
        trained = ldamallet.LdaMallet(
            self.mallet_path, self.corpus, num_topics=2, iterations=100)

        # simulate storing large arrays separately
        trained.save(gz_path, sep_limit=0)

        # test loading the large model arrays with mmap
        with self.assertRaises(IOError):
            ldamodel.LdaModel.load(gz_path, mmap='r')
Exemple #10
0
 def testPersistenceCompressed(self):
     """Save a model under a ``.gz`` file name, reload it and compare both.

     The reloaded model must match the trained one in ``num_topics``,
     ``wordtopics`` and the projection of an empty document. Returns early
     when ``self.mallet_path`` is not set.
     """
     if not self.mallet_path:
         return
     gz_path = testfile() + '.gz'
     trained = ldamallet.LdaMallet(
         self.mallet_path, self.corpus, num_topics=2, iterations=100)
     trained.save(gz_path)
     reloaded = ldamallet.LdaMallet.load(gz_path, mmap=None)

     self.assertEqual(trained.num_topics, reloaded.num_topics)
     same_word_topics = numpy.allclose(trained.wordtopics, reloaded.wordtopics)
     self.assertTrue(same_word_topics)
     empty_doc = []  # try projecting an empty vector
     self.assertTrue(numpy.allclose(trained[empty_doc], reloaded[empty_doc]))
 def testMallet2Model(self):
     """Check that ``malletmodel2ldamodel`` yields a model matching its source.

     A 2-topic ``LdaMallet`` model is trained and converted. For every
     document, the first two entries returned by both models must be equal.
     Returns early when ``self.mallet_path`` is not set.
     """
     if not self.mallet_path:
         return
     tm1 = ldamallet.LdaMallet(self.mallet_path, corpus=corpus, num_topics=2, id2word=dictionary)
     tm2 = ldamallet.malletmodel2ldamodel(tm1)
     for document in corpus:
         # Query each model once per document and reuse the result for both
         # the comparison and the log line.
         topics1 = tm1[document]
         topics2 = tm2[document]
         for position in (0, 1):
             self.assertEqual(topics1[position], topics2[position])
             # %s formats any entry; %d only accepts a real number and makes
             # the logging call fail for anything else (e.g. a tuple).
             logging.debug('%s %s', topics1[position], topics2[position])
Exemple #12
0
 def testPersistence(self):
     """Save a model to a temporary file, reload it and compare both.

     The reloaded model must match the trained one in ``num_topics``,
     ``word_topics`` and the projection of an empty document. Returns early
     when ``self.mallet_path`` is not set.
     """
     if not self.mallet_path:
         return
     tmp_path = get_tmpfile('gensim_models_lda_mallet.tst')
     trained = ldamallet.LdaMallet(
         self.mallet_path, self.corpus, num_topics=2, iterations=100)
     trained.save(tmp_path)
     reloaded = ldamallet.LdaMallet.load(tmp_path)

     self.assertEqual(trained.num_topics, reloaded.num_topics)
     same_word_topics = np.allclose(trained.word_topics, reloaded.word_topics)
     self.assertTrue(same_word_topics)
     empty_doc = []  # try projecting an empty vector
     self.assertTrue(np.allclose(trained[empty_doc], reloaded[empty_doc]))
Exemple #13
0
    def testMallet2ModelOn20NewsGroups(self):
        """Compare a MALLET model with its converted model on ``20-newsgroups``.

        The ``"data"`` field of every record is preprocessed, turned into
        bag-of-words, and used to train a 20-topic ``LdaMallet`` model. After
        conversion with ``malletmodel2ldamodel`` both models must return the
        same ``show_topics(20, 50)``.
        """
        tokenized = [
            simple_preprocess(record["data"])
            for record in api.load("20-newsgroups")
        ]
        vocabulary = Dictionary(tokenized)
        bows = [vocabulary.doc2bow(tokens) for tokens in tokenized]

        mallet_lda = ldamallet.LdaMallet(
            self.mallet_path,
            corpus=bows,
            num_topics=20,
            id2word=vocabulary,
            iterations=500,
        )
        gensim_lda = ldamallet.malletmodel2ldamodel(mallet_lda, iterations=1000)

        self.assertEqual(mallet_lda.show_topics(20, 50),
                         gensim_lda.show_topics(20, 50))
Exemple #14
0
    def testLargeMmap(self):
        """Save a model with separately stored arrays and reload it with mmap.

        The reloaded model must expose ``word_topics`` as an ``np.memmap``
        and match the trained model in ``num_topics``, ``word_topics`` and
        the projection of an empty document. Returns early when
        ``self.mallet_path`` is not set.
        """
        if not self.mallet_path:
            return
        tmp_path = get_tmpfile('gensim_models_lda_mallet.tst')
        trained = ldamallet.LdaMallet(
            self.mallet_path, self.corpus, num_topics=2, iterations=100)

        # simulate storing large arrays separately
        trained.save(tmp_path, sep_limit=0)

        # test loading the large model arrays with mmap
        mapped = ldamodel.LdaModel.load(tmp_path, mmap='r')
        self.assertEqual(trained.num_topics, mapped.num_topics)
        is_memmap = isinstance(mapped.word_topics, np.memmap)
        self.assertTrue(is_memmap)
        self.assertTrue(np.allclose(trained.word_topics, mapped.word_topics))
        empty_doc = []  # try projecting an empty vector
        self.assertTrue(np.allclose(trained[empty_doc], mapped[empty_doc]))
Exemple #15
0
 def testMallet2Model(self):
     """Check that ``malletmodel2ldamodel`` yields a model matching its source.

     A 2-topic ``LdaMallet`` model is trained and converted. For every
     document the first two entries of both results must agree: the first
     tuple element with the default precision, the second to one decimal
     place. Returns early when ``self.mallet_path`` is not set.
     """
     if not self.mallet_path:
         return
     tm1 = ldamallet.LdaMallet(self.mallet_path,
                               corpus=corpus,
                               num_topics=2,
                               id2word=dictionary)
     tm2 = ldamallet.malletmodel2ldamodel(tm1)
     for document in corpus:
         # Query each model once per document and reuse the result for
         # every comparison and log line below.
         topics1 = tm1[document]
         topics2 = tm2[document]
         for position in (0, 1):
             topic1, weight1 = topics1[position]
             topic2, weight2 = topics2[position]
             self.assertAlmostEqual(topic1, topic2)
             self.assertAlmostEqual(weight1, weight2, 1)
             logging.debug('%d %d', topic1, topic2)
             # %s instead of %d: the second element is compared to one
             # decimal place, and %d would drop any fractional part.
             logging.debug('%s %s', weight1, weight2)
             # The entries are 2-tuples (unpacked above); %d cannot format a
             # tuple and would turn this call into a logging error.
             logging.debug('%s %s', topics1[position], topics2[position])