Python TopicModel Examples, textacy.tm.TopicModel Python Examples

Example #1

0

Show file

File: test_topic_model.py Project: jakemcc/textacy

 def setUp(self):
     """Prepare the fixtures shared by the tests in this case.

     Builds a document-term matrix from eight short nursery-rhyme
     sentences, fits a 5-topic 'nmf' TopicModel on it, and creates a
     scratch directory next to this test file. Results are stored on
     ``self`` as ``doc_term_matrix``, ``id2term``, ``model`` and
     ``tempdir``.
     """
     rhyme = [
         "Mary had a little lamb. Its fleece was white as snow.",
         "Everywhere that Mary went the lamb was sure to go.",
         "It followed her to school one day, which was against the rule.",
         "It made the children laugh and play to see a lamb at school.",
         "And so the teacher turned it out, but still it lingered near.",
         "It waited patiently about until Mary did appear.",
         "Why does the lamb love Mary so? The eager children cry.",
         "Mary loves the lamb, you know, the teacher did reply.",
     ]
     corpus = TextCorpus.from_texts('en', rhyme)

     # One list of terms per document: plain words only.
     docs_as_terms = []
     for doc in corpus:
         docs_as_terms.append(
             doc.as_terms_list(words=True, ngrams=False,
                               named_entities=False))

     matrix_options = dict(
         weighting='tf',
         normalize=False,
         sublinear_tf=False,
         smooth_idf=True,
         min_df=1,
         max_df=1.0,
         min_ic=0.0,
         max_n_terms=None,
     )
     dtm, id2term = build_doc_term_matrix(docs_as_terms, **matrix_options)
     self.doc_term_matrix = dtm
     self.id2term = id2term

     topic_model = TopicModel('nmf', n_topics=5)
     self.model = topic_model
     topic_model.fit(dtm)

     # The scratch directory is created beside this test module.
     here = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_topic_model', dir=here)

Example #2

0

Show file

def test_save_load(tmpdir, model):
    """Save the fitted model to a file, load it back, compare components.

    The reloaded model's ``components_`` must have the same shape and
    exactly the same values as the array held before saving.
    """
    target = str(tmpdir.join("model.pkl"))
    before = model.model.components_
    model.save(target)
    after = TopicModel.load(target).model.components_
    assert after.shape == before.shape
    # Shapes are already known to match, so this is an exact elementwise
    # comparison.
    assert np.array_equal(after, before)

Example #3

0

Show file

File: test_topic_model.py Project: jakemcc/textacy

 def test_save_load(self):
     # Round-trip the fitted model through a file in the scratch
     # directory; the reloaded components_ must match exactly.
     saved_path = os.path.join(self.tempdir, 'model.pkl')
     before = self.model.model.components_
     self.model.save(saved_path)
     after = TopicModel.load(saved_path).model.components_
     self.assertEqual(after.shape, before.shape)
     self.assertTrue(np.array_equal(after, before))

Example #4

0

Show file

File: test_topic_model.py Project: EricSchles/textacy

 def test_save_load(self):
     # Persist the model under the scratch directory, load a fresh copy,
     # and require identical shape and values for components_.
     model_file = os.path.join(self.tempdir, 'model.pkl')
     reference = self.model.model.components_
     self.model.save(model_file)
     restored = TopicModel.load(model_file)
     candidate = restored.model.components_
     self.assertEqual(candidate.shape, reference.shape)
     values_match = (candidate == reference).all()
     self.assertTrue(values_match)

Example #5

0

Show file

def test_duck_typing():
    """Check that TopicModel can wrap an arbitrary pre-built model object.

    A minimal stand-in object is passed to ``TopicModel`` in place of a
    model name; the wrapper must report the stand-in's ``n_topics`` and
    expose its ``transform`` and ``components_`` through ``.model``.
    """
    class TrainedDummyModel:
        # Only carries the attributes the assertions below read.
        def __init__(self):
            self.n_topics = 5
            self.components_ = np.array([[0, 0, 0, 1], [1, 0, 0, 0]])

        def transform(self, text):
            return text

    stub = TrainedDummyModel()
    wrapped = TopicModel(stub)

    assert wrapped.n_topics == stub.n_topics
    assert wrapped.model.transform == stub.transform
    np.testing.assert_array_equal(
        wrapped.model.components_,
        stub.components_,
    )

Example #6

0

Show file

File: test_topic_model.py Project: EricSchles/textacy

 def setUp(self):
     """Create the per-test fixtures.

     Stores on ``self``: the document-term matrix and ``id2term`` returned
     by ``build_doc_term_matrix``, a 5-topic 'nmf' TopicModel fitted on
     that matrix, and a scratch directory created beside this test file.
     """
     texts = [
         "Mary had a little lamb. Its fleece was white as snow.",
         "Everywhere that Mary went the lamb was sure to go.",
         "It followed her to school one day, which was against the rule.",
         "It made the children laugh and play to see a lamb at school.",
         "And so the teacher turned it out, but still it lingered near.",
         "It waited patiently about until Mary did appear.",
         "Why does the lamb love Mary so? The eager children cry.",
         "Mary loves the lamb, you know, the teacher did reply.",
     ]

     def terms_of(doc):
         # Words only; n-grams and named entities are switched off.
         return doc.as_terms_list(
             words=True, ngrams=False, named_entities=False)

     term_lists = list(map(terms_of, TextCorpus.from_texts('en', texts)))

     built = build_doc_term_matrix(
         term_lists,
         weighting='tf',
         normalize=False,
         sublinear_tf=False,
         smooth_idf=True,
         min_df=1,
         max_df=1.0,
         min_ic=0.0,
         max_n_terms=None,
     )
     self.doc_term_matrix, self.id2term = built

     self.model = TopicModel('nmf', n_topics=5)
     self.model.fit(self.doc_term_matrix)

     test_dir = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_topic_model', dir=test_dir)

Example #7

0

Show file

File: test_readme.py Project: zf109/textacy

def test_vectorization_and_topic_modeling_functionality(corpus):
    """Vectorize ``corpus``, fit an 'nmf' TopicModel, and check the outputs.

    Verifies that the document-term matrix is a ``scipy.sparse.csr_matrix``,
    that the document-topic matrix is a NumPy array with one column per
    topic, and that ``top_topic_terms`` yields an integer topic index with
    exactly ``top_n`` terms for each topic it reports.
    """
    topic_count = 10
    terms_per_topic = 10

    vectorizer_options = {
        "tf_type": "linear",
        "apply_idf": True,
        "idf_type": "smooth",
        "norm": None,
        "min_df": 2,
        "max_df": 0.95,
    }
    vectorizer = Vectorizer(**vectorizer_options)

    # Lazily produce one terms list per document for the vectorizer.
    terms_by_doc = (
        doc._.to_terms_list(ngrams=1, entities=True, as_strings=True)
        for doc in corpus
    )
    doc_term_matrix = vectorizer.fit_transform(terms_by_doc)

    model = TopicModel("nmf", n_topics=topic_count)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    assert isinstance(doc_term_matrix, sp.csr_matrix)
    assert isinstance(doc_topic_matrix, np.ndarray)
    assert doc_topic_matrix.shape[1] == topic_count

    ranked_terms = model.top_topic_terms(
        vectorizer.id_to_term, top_n=terms_per_topic)
    for topic_idx, top_terms in ranked_terms:
        assert isinstance(topic_idx, int)
        assert len(top_terms) == terms_per_topic

Example #8

0

Show file

def test_init_model():
    """Each short model name must build a ``.model`` of the paired class."""
    name_and_class = (
        ("nmf", NMF),
        ("lda", LatentDirichletAllocation),
        ("lsa", TruncatedSVD),
    )
    for name, expected_cls in name_and_class:
        built = TopicModel(name).model
        assert isinstance(built, expected_cls)

Example #9

0

Show file

def test_n_topics():
    """``n_topics`` given to the constructor must be reported unchanged."""
    requested = 20
    for name in ("nmf", "lda", "lsa"):
        topic_model = TopicModel(name, n_topics=requested)
        assert topic_model.n_topics == requested

Example #10

0

Show file

def model(doc_term_matrix):
    """Return a 5-topic 'nmf' TopicModel fitted on ``doc_term_matrix``."""
    fitted = TopicModel("nmf", n_topics=5)
    fitted.fit(doc_term_matrix)
    return fitted

Example #11

0

Show file

File: test_topic_model.py Project: jakemcc/textacy

 def test_init_model(self):
     # Each short model name must yield a .model of the paired class.
     pairs = (
         ('nmf', NMF),
         ('lda', LatentDirichletAllocation),
         ('lsa', TruncatedSVD),
     )
     for name, expected_cls in pairs:
         built = TopicModel(name).model
         self.assertTrue(isinstance(built, expected_cls))

Example #12

0

Show file

File: test_topic_model.py Project: jakemcc/textacy

 def test_n_topics(self):
     # The n_topics passed to the constructor must be reported unchanged.
     requested = 20
     for name in ('nmf', 'lda', 'lsa'):
         topic_model = TopicModel(name, n_topics=requested)
         self.assertEqual(topic_model.n_topics, requested)

Example #13

0

Show file

File: test_topic_model.py Project: jakemcc/textacy

class TopicModelTestCase(unittest.TestCase):
    """Tests for TopicModel fitted on a tiny nursery-rhyme corpus.

    ``setUp`` builds a document-term matrix from eight sentences, fits a
    5-topic 'nmf' model on it and creates a scratch directory; ``tearDown``
    removes that directory again.
    """

    def setUp(self):
        """Build the corpus fixtures, fit the model, create a scratch dir."""
        texts = [
            "Mary had a little lamb. Its fleece was white as snow.",
            "Everywhere that Mary went the lamb was sure to go.",
            "It followed her to school one day, which was against the rule.",
            "It made the children laugh and play to see a lamb at school.",
            "And so the teacher turned it out, but still it lingered near.",
            "It waited patiently about until Mary did appear.",
            "Why does the lamb love Mary so? The eager children cry.",
            "Mary loves the lamb, you know, the teacher did reply."
        ]
        textcorpus = TextCorpus.from_texts('en', texts)
        term_lists = [
            doc.as_terms_list(words=True, ngrams=False, named_entities=False)
            for doc in textcorpus
        ]
        self.doc_term_matrix, self.id2term = build_doc_term_matrix(
            term_lists,
            weighting='tf',
            normalize=False,
            sublinear_tf=False,
            smooth_idf=True,
            min_df=1,
            max_df=1.0,
            min_ic=0.0,
            max_n_terms=None)
        self.model = TopicModel('nmf', n_topics=5)
        self.model.fit(self.doc_term_matrix)
        # NOTE(review): the scratch directory is created next to this test
        # file rather than in the system temp location; kept as-is.
        self.tempdir = tempfile.mkdtemp(prefix='test_topic_model',
                                        dir=os.path.dirname(
                                            os.path.abspath(__file__)))

    def test_n_topics(self):
        # The n_topics passed to the constructor must be reported unchanged.
        for model in ['nmf', 'lda', 'lsa']:
            self.assertEqual(TopicModel(model, n_topics=20).n_topics, 20)

    def test_init_model(self):
        # Each short model name must yield a .model of the paired class.
        expecteds = (NMF, LatentDirichletAllocation, TruncatedSVD)
        models = ('nmf', 'lda', 'lsa')
        for model, expected in zip(models, expecteds):
            # assertIsInstance reports the offending object on failure.
            self.assertIsInstance(TopicModel(model).model, expected)

    def test_save_load(self):
        # A save/load round trip must reproduce components_ exactly.
        filename = os.path.join(self.tempdir, 'model.pkl')
        expected = self.model.model.components_
        self.model.save(filename)
        tmp_model = TopicModel.load(filename)
        observed = tmp_model.model.components_
        self.assertEqual(observed.shape, expected.shape)
        self.assertTrue(np.equal(observed, expected).all())

    def test_transform(self):
        # transform() must return one row per document, one column per topic.
        expected = (self.doc_term_matrix.shape[0], self.model.n_topics)
        observed = self.model.transform(self.doc_term_matrix).shape
        self.assertEqual(observed, expected)

    def test_get_doc_topic_matrix(self):
        # With normalize=True each document's topic weights must sum to 1.
        # The sums are floating-point results, so compare with a tolerance:
        # exact equality can fail on harmless rounding error.
        expected = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
        observed = self.model.get_doc_topic_matrix(self.doc_term_matrix,
                                                   normalize=True).sum(axis=1)
        self.assertTrue(np.allclose(observed, expected))

    def test_get_doc_topic_matrix_nonnormalized(self):
        # With normalize=False the result must equal transform()'s output.
        expected = self.model.transform(self.doc_term_matrix)
        observed = self.model.get_doc_topic_matrix(self.doc_term_matrix,
                                                   normalize=False)
        self.assertTrue(np.equal(observed, expected).all())

    def test_top_topic_terms_topics(self):
        # topics=-1 yields every topic, an int yields one topic, and a
        # tuple yields exactly the requested topic indexes in order.
        self.assertEqual(
            len(list(self.model.top_topic_terms(self.id2term, topics=-1))),
            self.model.n_topics)
        self.assertEqual(
            len(list(self.model.top_topic_terms(self.id2term, topics=0))), 1)
        self.assertEqual([
            topic_idx
            for topic_idx, _ in self.model.top_topic_terms(self.id2term,
                                                           topics=(1, 2, 3))
        ], [1, 2, 3])

    def test_top_topic_terms_top_n(self):
        # top_n controls how many terms are returned for a topic.
        for top_n in (10, 5):
            topic_terms = list(
                self.model.top_topic_terms(self.id2term,
                                           topics=0,
                                           top_n=top_n))
            self.assertEqual(len(topic_terms[0][1]), top_n)

    def test_top_topic_terms_weights(self):
        # weights=True yields (term, weight) tuples, ordered so that the
        # weights never increase within a topic.
        observed = list(
            self.model.top_topic_terms(self.id2term,
                                       topics=-1,
                                       top_n=10,
                                       weights=True))
        self.assertIsInstance(observed[0][1][0], tuple)
        for topic_idx, term_weights in observed:
            # Compare each entry with its successor.
            for current, following in zip(term_weights, term_weights[1:]):
                self.assertGreaterEqual(current[1], following[1])

    def tearDown(self):
        """Remove the scratch directory and everything inside it."""
        # Local import keeps this class independent of the module's
        # import block. rmtree also copes with nested directories, which
        # the previous remove-files-then-rmdir sequence did not.
        import shutil
        shutil.rmtree(self.tempdir)

Example #14

0

Show file

File: test_topic_model.py Project: EricSchles/textacy

class TopicModelTestCase(unittest.TestCase):
    """Exercise TopicModel on a small eight-sentence corpus.

    Every test gets a fresh document-term matrix, a fitted 5-topic 'nmf'
    model and a scratch directory from ``setUp``; ``tearDown`` deletes the
    scratch directory.
    """

    def setUp(self):
        """Build the document-term matrix, fit the model, make a temp dir."""
        texts = ["Mary had a little lamb. Its fleece was white as snow.",
                 "Everywhere that Mary went the lamb was sure to go.",
                 "It followed her to school one day, which was against the rule.",
                 "It made the children laugh and play to see a lamb at school.",
                 "And so the teacher turned it out, but still it lingered near.",
                 "It waited patiently about until Mary did appear.",
                 "Why does the lamb love Mary so? The eager children cry.",
                 "Mary loves the lamb, you know, the teacher did reply."]
        textcorpus = TextCorpus.from_texts('en', texts)
        term_lists = [doc.as_terms_list(words=True, ngrams=False, named_entities=False)
                      for doc in textcorpus]
        self.doc_term_matrix, self.id2term = build_doc_term_matrix(
            term_lists,
            weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
            min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
        self.model = TopicModel('nmf', n_topics=5)
        self.model.fit(self.doc_term_matrix)
        # NOTE(review): the temp dir is created beside this test file, not
        # in the system temp location; left unchanged.
        self.tempdir = tempfile.mkdtemp(
            prefix='test_topic_model', dir=os.path.dirname(os.path.abspath(__file__)))

    def test_n_topics(self):
        # The constructor's n_topics must be readable back from the model.
        for model in ['nmf', 'lda', 'lsa']:
            self.assertEqual(TopicModel(model, n_topics=20).n_topics, 20)

    def test_init_model(self):
        # 'nmf', 'lda' and 'lsa' must each build the paired class.
        expecteds = (NMF, LatentDirichletAllocation, TruncatedSVD)
        models = ('nmf', 'lda', 'lsa')
        for model, expected in zip(models, expecteds):
            # assertIsInstance names the actual object when it fails.
            self.assertIsInstance(TopicModel(model).model, expected)

    def test_save_load(self):
        # Saving then loading must give back identical components_.
        filename = os.path.join(self.tempdir, 'model.pkl')
        expected = self.model.model.components_
        self.model.save(filename)
        tmp_model = TopicModel.load(filename)
        observed = tmp_model.model.components_
        self.assertEqual(observed.shape, expected.shape)
        self.assertTrue(np.equal(observed, expected).all())

    def test_transform(self):
        # Output shape is (number of documents, number of topics).
        expected = (self.doc_term_matrix.shape[0], self.model.n_topics)
        observed = self.model.transform(self.doc_term_matrix).shape
        self.assertEqual(observed, expected)

    def test_get_doc_topic_matrix(self):
        # Normalized topic weights must sum to 1 for every document.
        # Use a tolerance: the row sums are floating-point results and
        # may differ from 1.0 by rounding error.
        expected = np.array([1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0])
        observed = self.model.get_doc_topic_matrix(self.doc_term_matrix,
                                                   normalize=True).sum(axis=1)
        self.assertTrue(np.allclose(observed, expected))

    def test_get_doc_topic_matrix_nonnormalized(self):
        # Without normalization the matrix must match transform() exactly.
        expected = self.model.transform(self.doc_term_matrix)
        observed = self.model.get_doc_topic_matrix(self.doc_term_matrix,
                                                   normalize=False)
        self.assertTrue(np.equal(observed, expected).all())

    def test_top_topic_terms_topics(self):
        # topics=-1 -> all topics; topics=0 -> one topic; a tuple -> exactly
        # those topic indexes, in the order given.
        self.assertEqual(
            len(list(self.model.top_topic_terms(self.id2term, topics=-1))),
            self.model.n_topics)
        self.assertEqual(
            len(list(self.model.top_topic_terms(self.id2term, topics=0))), 1)
        self.assertEqual(
            [topic_idx for topic_idx, _
             in self.model.top_topic_terms(self.id2term, topics=(1, 2, 3))],
            [1, 2, 3])

    def test_top_topic_terms_top_n(self):
        # The number of returned terms must equal top_n.
        for top_n in (10, 5):
            topic_terms = list(
                self.model.top_topic_terms(self.id2term, topics=0, top_n=top_n))
            self.assertEqual(len(topic_terms[0][1]), top_n)

    def test_top_topic_terms_weights(self):
        # With weights=True each entry is a (term, weight) tuple and the
        # weights within a topic are in non-increasing order.
        observed = list(self.model.top_topic_terms(self.id2term, topics=-1,
                                                   top_n=10, weights=True))
        self.assertIsInstance(observed[0][1][0], tuple)
        for topic_idx, term_weights in observed:
            weights = [entry[1] for entry in term_weights]
            for higher, lower in zip(weights, weights[1:]):
                self.assertGreaterEqual(higher, lower)

    def tearDown(self):
        """Delete the temp dir created in setUp, including its contents."""
        # Imported locally so the class does not rely on the module's
        # import block. Unlike remove-then-rmdir, rmtree also handles
        # sub-directories left behind in the temp dir.
        import shutil
        shutil.rmtree(self.tempdir)

Example #15

0

Show file

File: topic_analysis_hits.py Project: mlapierre/SoDS17

                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
# Fit the `lda` estimator on `tf`. Both names are defined above this
# excerpt.
# NOTE(review): `tf` is presumably a term-frequency document-term matrix
# produced by `tf_vectorizer` -- confirm against the earlier cells.
lda.fit(tf)

# In[63]:

# Print a heading, then hand the fitted estimator and the vectorizer's
# feature names to `print_top_words` (defined outside this excerpt).
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

# In[78]:

# Fit a textacy TopicModel ('lda', 10 topics) on the same `tf` input.
from textacy.tm import TopicModel
model = TopicModel('lda', n_topics=10)
model.fit(tf)
# Bare expression: shows the object's repr in a notebook cell; it has no
# effect when this file is run as a plain script.
model

# In[80]:

# Call the model's termite_plot with the same matrix and feature names,
# all topics (topics=-1), 50 terms, and topics 2, 3, 4 and 8 highlighted,
# then display the figure.
import matplotlib.pyplot as plt
model.termite_plot(tf,
                   tf_feature_names,
                   topics=-1,
                   n_terms=50,
                   highlight_topics=[2, 3, 4, 8])
plt.show()

# In[82]: