Example #1
0
 def setUp(self):
     """Prepare the fixtures shared by the tests in this case.

     Sets ``self.doc_term_matrix`` and ``self.id2term`` (both returned by
     ``build_doc_term_matrix``), ``self.model`` (a 5-topic NMF ``TopicModel``
     fitted on that matrix) and ``self.tempdir`` (a fresh scratch directory
     created in the directory containing this file).
     """
     rhyme_lines = [
         "Mary had a little lamb. Its fleece was white as snow.",
         "Everywhere that Mary went the lamb was sure to go.",
         "It followed her to school one day, which was against the rule.",
         "It made the children laugh and play to see a lamb at school.",
         "And so the teacher turned it out, but still it lingered near.",
         "It waited patiently about until Mary did appear.",
         "Why does the lamb love Mary so? The eager children cry.",
         "Mary loves the lamb, you know, the teacher did reply.",
     ]
     corpus = TextCorpus.from_texts('en', rhyme_lines)

     # One term list per document: words only, no n-grams or named entities.
     terms_per_doc = []
     for document in corpus:
         terms_per_doc.append(
             document.as_terms_list(words=True,
                                    ngrams=False,
                                    named_entities=False))

     # Raw term-frequency weighting with no document/term filtering.
     matrix_options = dict(
         weighting='tf',
         normalize=False,
         sublinear_tf=False,
         smooth_idf=True,
         min_df=1,
         max_df=1.0,
         min_ic=0.0,
         max_n_terms=None,
     )
     matrix_and_vocab = build_doc_term_matrix(terms_per_doc, **matrix_options)
     self.doc_term_matrix, self.id2term = matrix_and_vocab

     self.model = TopicModel('nmf', n_topics=5)
     self.model.fit(self.doc_term_matrix)

     # The scratch directory lives alongside this test module.
     module_dir = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_topic_model',
                                     dir=module_dir)
Example #2
0
def test_save_load(tmpdir, model):
    """Check that a saved model reloads with identical ``components_``.

    Args:
        tmpdir: Scratch-directory object; only its ``join`` method is used
            (presumably pytest's built-in ``tmpdir`` fixture).
        model: A fitted ``TopicModel`` whose wrapped estimator exposes
            ``components_``.
    """
    filepath = str(tmpdir.join("model.pkl"))
    expected = model.model.components_
    model.save(filepath)
    reloaded = TopicModel.load(filepath)
    observed = reloaded.model.components_
    assert observed.shape == expected.shape
    # assert_array_equal reports which elements differ on failure, unlike a
    # bare ``assert np.equal(...).all()``; it also matches the comparison
    # style used by test_duck_typing.
    np.testing.assert_array_equal(observed, expected)
Example #3
0
 def test_save_load(self):
     """Round-trip the fitted model through save/load and compare components."""
     pickle_path = os.path.join(self.tempdir, 'model.pkl')
     components_before = self.model.model.components_
     self.model.save(pickle_path)
     components_after = TopicModel.load(pickle_path).model.components_
     # Shapes are compared first, then every element must match exactly.
     self.assertEqual(components_after.shape, components_before.shape)
     elementwise_match = np.equal(components_after, components_before)
     self.assertTrue(elementwise_match.all())
Example #4
0
 def test_save_load(self):
     """Saving then loading the model must preserve ``components_`` exactly."""
     target = os.path.join(self.tempdir, 'model.pkl')
     original_components = self.model.model.components_
     self.model.save(target)
     restored = TopicModel.load(target)
     restored_components = restored.model.components_
     self.assertEqual(restored_components.shape, original_components.shape)
     # Element-wise exact comparison of the reloaded and original arrays.
     all_equal = np.equal(restored_components, original_components).all()
     self.assertTrue(all_equal)
Example #5
0
def test_duck_typing():
    """``TopicModel`` should accept an arbitrary model-like object.

    A minimal stand-in exposing ``n_topics``, ``components_`` and
    ``transform`` is passed to ``TopicModel``; the wrapper must expose the
    same ``n_topics`` and hold the stand-in's ``transform`` and
    ``components_`` on its ``model`` attribute.
    """

    class TrainedDummyModel:
        """Bare-bones object mimicking an already-trained model."""

        def __init__(self):
            self.n_topics = 5
            self.components_ = np.array([[0, 0, 0, 1], [1, 0, 0, 0]])

        def transform(self, text):
            """Identity transform: hand back the input unchanged."""
            return text

    stand_in = TrainedDummyModel()
    wrapper = TopicModel(stand_in)

    assert wrapper.n_topics == stand_in.n_topics
    assert wrapper.model.transform == stand_in.transform
    np.testing.assert_array_equal(wrapper.model.components_,
                                  stand_in.components_)
Example #6
0
 def setUp(self):
     """Create the fixtures used by every test method.

     After this runs, ``self.doc_term_matrix`` / ``self.id2term`` hold the
     output of ``build_doc_term_matrix``, ``self.model`` is a 5-topic NMF
     ``TopicModel`` fitted on that matrix, and ``self.tempdir`` is a new
     scratch directory located beside this file.
     """
     sentences = [
         "Mary had a little lamb. Its fleece was white as snow.",
         "Everywhere that Mary went the lamb was sure to go.",
         "It followed her to school one day, which was against the rule.",
         "It made the children laugh and play to see a lamb at school.",
         "And so the teacher turned it out, but still it lingered near.",
         "It waited patiently about until Mary did appear.",
         "Why does the lamb love Mary so? The eager children cry.",
         "Mary loves the lamb, you know, the teacher did reply.",
     ]
     corpus = TextCorpus.from_texts('en', sentences)

     # Words only; n-grams and named entities are switched off.
     doc_terms = [
         document.as_terms_list(words=True, ngrams=False, named_entities=False)
         for document in corpus
     ]

     built = build_doc_term_matrix(
         doc_terms,
         weighting='tf',
         normalize=False,
         sublinear_tf=False,
         smooth_idf=True,
         min_df=1,
         max_df=1.0,
         min_ic=0.0,
         max_n_terms=None,
     )
     self.doc_term_matrix, self.id2term = built

     self.model = TopicModel('nmf', n_topics=5)
     self.model.fit(self.doc_term_matrix)

     # Scratch directory is created in the same directory as this module.
     here = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_topic_model', dir=here)
Example #7
0
def test_vectorization_and_topic_modeling_functionality(corpus):
    """End-to-end check: vectorize ``corpus``, fit NMF, inspect the outputs.

    Args:
        corpus: Iterable of documents exposing ``doc._.to_terms_list``
            (supplied from outside this function).
    """
    topic_count = 10
    terms_per_topic = 10

    vectorizer_options = dict(
        tf_type="linear",
        apply_idf=True,
        idf_type="smooth",
        norm=None,
        min_df=2,
        max_df=0.95,
    )
    vectorizer = Vectorizer(**vectorizer_options)

    # Lazily produce one term list per document for the vectorizer.
    tokenized_docs = (
        doc._.to_terms_list(ngrams=1, entities=True, as_strings=True)
        for doc in corpus
    )
    doc_term_matrix = vectorizer.fit_transform(tokenized_docs)

    topic_model = TopicModel("nmf", n_topics=topic_count)
    topic_model.fit(doc_term_matrix)
    doc_topic_matrix = topic_model.transform(doc_term_matrix)

    assert isinstance(doc_term_matrix, sp.csr_matrix)
    assert isinstance(doc_topic_matrix, np.ndarray)
    assert doc_topic_matrix.shape[1] == topic_count

    ranked_topics = topic_model.top_topic_terms(vectorizer.id_to_term,
                                                top_n=terms_per_topic)
    for topic_idx, top_terms in ranked_topics:
        assert isinstance(topic_idx, int)
        assert len(top_terms) == terms_per_topic
Example #8
0
def test_init_model():
    """Each model-name string must yield a ``.model`` of the matching class."""
    name_and_class = (
        ("nmf", NMF),
        ("lda", LatentDirichletAllocation),
        ("lsa", TruncatedSVD),
    )
    for name, expected_class in name_and_class:
        wrapped = TopicModel(name).model
        assert isinstance(wrapped, expected_class)
Example #9
0
def test_n_topics():
    """The ``n_topics`` constructor argument is exposed as ``.n_topics``."""
    requested = 20
    for name in ("nmf", "lda", "lsa"):
        topic_model = TopicModel(name, n_topics=requested)
        assert topic_model.n_topics == requested
Example #10
0
def model(doc_term_matrix):
    """Return a 5-topic NMF ``TopicModel`` fitted on ``doc_term_matrix``.

    NOTE(review): presumably registered as a pytest fixture; the decorator is
    not visible here.
    """
    fitted = TopicModel("nmf", n_topics=5)
    # fit() is called for its side effect; its return value is not used.
    fitted.fit(doc_term_matrix)
    return fitted
Example #11
0
 def test_init_model(self):
     """Each model-name string must yield a ``.model`` of the matching class."""
     name_and_class = (
         ('nmf', NMF),
         ('lda', LatentDirichletAllocation),
         ('lsa', TruncatedSVD),
     )
     for name, expected_class in name_and_class:
         wrapped = TopicModel(name).model
         self.assertTrue(isinstance(wrapped, expected_class))
Example #12
0
 def test_n_topics(self):
     """The ``n_topics`` constructor argument is exposed as ``.n_topics``."""
     requested = 20
     for name in ('nmf', 'lda', 'lsa'):
         topic_model = TopicModel(name, n_topics=requested)
         self.assertEqual(topic_model.n_topics, requested)
Example #13
0
class TopicModelTestCase(unittest.TestCase):
    """Tests for ``TopicModel`` using a 5-topic NMF model on a tiny corpus."""

    def setUp(self):
        """Build the doc-term matrix, fit the model and create a scratch dir.

        Sets ``self.doc_term_matrix``, ``self.id2term``, ``self.model`` and
        ``self.tempdir`` (created in the directory containing this file and
        removed again in ``tearDown``).
        """
        texts = [
            "Mary had a little lamb. Its fleece was white as snow.",
            "Everywhere that Mary went the lamb was sure to go.",
            "It followed her to school one day, which was against the rule.",
            "It made the children laugh and play to see a lamb at school.",
            "And so the teacher turned it out, but still it lingered near.",
            "It waited patiently about until Mary did appear.",
            "Why does the lamb love Mary so? The eager children cry.",
            "Mary loves the lamb, you know, the teacher did reply."
        ]
        textcorpus = TextCorpus.from_texts('en', texts)
        term_lists = [
            doc.as_terms_list(words=True, ngrams=False, named_entities=False)
            for doc in textcorpus
        ]
        self.doc_term_matrix, self.id2term = build_doc_term_matrix(
            term_lists,
            weighting='tf',
            normalize=False,
            sublinear_tf=False,
            smooth_idf=True,
            min_df=1,
            max_df=1.0,
            min_ic=0.0,
            max_n_terms=None)
        self.model = TopicModel('nmf', n_topics=5)
        self.model.fit(self.doc_term_matrix)
        self.tempdir = tempfile.mkdtemp(prefix='test_topic_model',
                                        dir=os.path.dirname(
                                            os.path.abspath(__file__)))

    def test_n_topics(self):
        """``n_topics`` given to the constructor is exposed on the instance."""
        for model in ['nmf', 'lda', 'lsa']:
            self.assertEqual(TopicModel(model, n_topics=20).n_topics, 20)

    def test_init_model(self):
        """Each model-name string yields a ``.model`` of the matching class."""
        expecteds = (NMF, LatentDirichletAllocation, TruncatedSVD)
        models = ('nmf', 'lda', 'lsa')
        for model, expected in zip(models, expecteds):
            # assertIsInstance names the offending object/class on failure.
            self.assertIsInstance(TopicModel(model).model, expected)

    def test_save_load(self):
        """A saved-then-loaded model has identical ``components_``."""
        filename = os.path.join(self.tempdir, 'model.pkl')
        expected = self.model.model.components_
        self.model.save(filename)
        tmp_model = TopicModel.load(filename)
        observed = tmp_model.model.components_
        self.assertEqual(observed.shape, expected.shape)
        # Reports the mismatching elements instead of "False is not true".
        np.testing.assert_array_equal(observed, expected)

    def test_transform(self):
        """``transform`` returns an (n_docs, n_topics)-shaped result."""
        expected = (self.doc_term_matrix.shape[0], self.model.n_topics)
        observed = self.model.transform(self.doc_term_matrix).shape
        self.assertEqual(observed, expected)

    def test_get_doc_topic_matrix(self):
        """With ``normalize=True`` each document's topic weights sum to 1."""
        # Derive the expected length from the fixture rather than hard-coding
        # the number of documents.
        expected = np.ones(self.doc_term_matrix.shape[0])
        observed = self.model.get_doc_topic_matrix(self.doc_term_matrix,
                                                   normalize=True).sum(axis=1)
        # Row sums are floating-point results; compare with a tolerance
        # rather than exact equality, which can fail on rounding error.
        np.testing.assert_allclose(observed, expected)

    def test_get_doc_topic_matrix_nonnormalized(self):
        """With ``normalize=False`` the result equals ``transform``'s output."""
        expected = self.model.transform(self.doc_term_matrix)
        observed = self.model.get_doc_topic_matrix(self.doc_term_matrix,
                                                   normalize=False)
        np.testing.assert_array_equal(observed, expected)

    def test_top_topic_terms_topics(self):
        """``topics`` accepts -1, a single index, or a sequence of indexes."""
        self.assertEqual(
            len(list(self.model.top_topic_terms(self.id2term, topics=-1))),
            self.model.n_topics)
        self.assertEqual(
            len(list(self.model.top_topic_terms(self.id2term, topics=0))), 1)
        self.assertEqual([
            topic_idx
            for topic_idx, _ in self.model.top_topic_terms(self.id2term,
                                                           topics=(1, 2, 3))
        ], [1, 2, 3])

    def test_top_topic_terms_top_n(self):
        """``top_n`` controls how many terms are returned for a topic."""
        for top_n in (10, 5):
            results = list(
                self.model.top_topic_terms(self.id2term,
                                           topics=0,
                                           top_n=top_n))
            self.assertEqual(len(results[0][1]), top_n)

    def test_top_topic_terms_weights(self):
        """With ``weights=True`` terms are tuples sorted by descending weight."""
        observed = list(
            self.model.top_topic_terms(self.id2term,
                                       topics=-1,
                                       top_n=10,
                                       weights=True))
        self.assertIsInstance(observed[0][1][0], tuple)
        for _, term_weights in observed:
            # Compare each entry with its successor; index 1 is the weight.
            for current, following in zip(term_weights, term_weights[1:]):
                self.assertGreaterEqual(current[1], following[1])

    def tearDown(self):
        """Delete every file in the scratch directory, then the directory."""
        for fname in os.listdir(self.tempdir):
            os.remove(os.path.join(self.tempdir, fname))
        os.rmdir(self.tempdir)
Example #14
0
class TopicModelTestCase(unittest.TestCase):
    """Tests for ``TopicModel`` using a 5-topic NMF model on a tiny corpus."""

    def setUp(self):
        """Build the doc-term matrix, fit the model and create a scratch dir.

        Sets ``self.doc_term_matrix``, ``self.id2term``, ``self.model`` and
        ``self.tempdir`` (created in the directory containing this file and
        removed again in ``tearDown``).
        """
        texts = ["Mary had a little lamb. Its fleece was white as snow.",
                 "Everywhere that Mary went the lamb was sure to go.",
                 "It followed her to school one day, which was against the rule.",
                 "It made the children laugh and play to see a lamb at school.",
                 "And so the teacher turned it out, but still it lingered near.",
                 "It waited patiently about until Mary did appear.",
                 "Why does the lamb love Mary so? The eager children cry.",
                 "Mary loves the lamb, you know, the teacher did reply."]
        textcorpus = TextCorpus.from_texts('en', texts)
        term_lists = [doc.as_terms_list(words=True, ngrams=False, named_entities=False)
                      for doc in textcorpus]
        self.doc_term_matrix, self.id2term = build_doc_term_matrix(
            term_lists,
            weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
            min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
        self.model = TopicModel('nmf', n_topics=5)
        self.model.fit(self.doc_term_matrix)
        self.tempdir = tempfile.mkdtemp(
            prefix='test_topic_model', dir=os.path.dirname(os.path.abspath(__file__)))

    def test_n_topics(self):
        """``n_topics`` given to the constructor is exposed on the instance."""
        for model in ['nmf', 'lda', 'lsa']:
            self.assertEqual(TopicModel(model, n_topics=20).n_topics, 20)

    def test_init_model(self):
        """Each model-name string yields a ``.model`` of the matching class."""
        expecteds = (NMF, LatentDirichletAllocation, TruncatedSVD)
        models = ('nmf', 'lda', 'lsa')
        for model, expected in zip(models, expecteds):
            # assertIsInstance names the offending object/class on failure.
            self.assertIsInstance(TopicModel(model).model, expected)

    def test_save_load(self):
        """A saved-then-loaded model has identical ``components_``."""
        filename = os.path.join(self.tempdir, 'model.pkl')
        expected = self.model.model.components_
        self.model.save(filename)
        tmp_model = TopicModel.load(filename)
        observed = tmp_model.model.components_
        self.assertEqual(observed.shape, expected.shape)
        # Reports the mismatching elements instead of "False is not true".
        np.testing.assert_array_equal(observed, expected)

    def test_transform(self):
        """``transform`` returns an (n_docs, n_topics)-shaped result."""
        expected = (self.doc_term_matrix.shape[0], self.model.n_topics)
        observed = self.model.transform(self.doc_term_matrix).shape
        self.assertEqual(observed, expected)

    def test_get_doc_topic_matrix(self):
        """With ``normalize=True`` each document's topic weights sum to 1."""
        # Derive the expected length from the fixture rather than hard-coding
        # the number of documents.
        expected = np.ones(self.doc_term_matrix.shape[0])
        observed = self.model.get_doc_topic_matrix(self.doc_term_matrix,
                                                   normalize=True).sum(axis=1)
        # Row sums are floating-point results; compare with a tolerance
        # rather than exact equality, which can fail on rounding error.
        np.testing.assert_allclose(observed, expected)

    def test_get_doc_topic_matrix_nonnormalized(self):
        """With ``normalize=False`` the result equals ``transform``'s output."""
        expected = self.model.transform(self.doc_term_matrix)
        observed = self.model.get_doc_topic_matrix(self.doc_term_matrix,
                                                   normalize=False)
        np.testing.assert_array_equal(observed, expected)

    def test_top_topic_terms_topics(self):
        """``topics`` accepts -1, a single index, or a sequence of indexes."""
        self.assertEqual(
            len(list(self.model.top_topic_terms(self.id2term, topics=-1))),
            self.model.n_topics)
        self.assertEqual(
            len(list(self.model.top_topic_terms(self.id2term, topics=0))), 1)
        self.assertEqual(
            [topic_idx for topic_idx, _
             in self.model.top_topic_terms(self.id2term, topics=(1, 2, 3))],
            [1, 2, 3])

    def test_top_topic_terms_top_n(self):
        """``top_n`` controls how many terms are returned for a topic."""
        for top_n in (10, 5):
            results = list(
                self.model.top_topic_terms(self.id2term, topics=0, top_n=top_n))
            self.assertEqual(len(results[0][1]), top_n)

    def test_top_topic_terms_weights(self):
        """With ``weights=True`` terms are tuples sorted by descending weight."""
        observed = list(self.model.top_topic_terms(self.id2term, topics=-1,
                                                   top_n=10, weights=True))
        self.assertIsInstance(observed[0][1][0], tuple)
        for _, term_weights in observed:
            # Compare each entry with its successor; index 1 is the weight.
            for current, following in zip(term_weights, term_weights[1:]):
                self.assertGreaterEqual(current[1], following[1])

    def tearDown(self):
        """Delete every file in the scratch directory, then the directory."""
        for fname in os.listdir(self.tempdir):
            os.remove(os.path.join(self.tempdir, fname))
        os.rmdir(self.tempdir)
Example #15
0
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
# Fit `lda` on `tf`; both are defined earlier in this script (`lda` presumably
# by the call whose closing arguments appear just above).
lda.fit(tf)

# In[63]:

# Show the LDA topics via `print_top_words`, a helper defined elsewhere.
print("\nTopics in LDA model:")
# NOTE(review): if `tf_vectorizer` is a scikit-learn vectorizer,
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()` — confirm the installed version.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

# In[78]:

# Fit textacy's TopicModel wrapper (LDA, 10 topics) on the same `tf` matrix.
from textacy.tm import TopicModel
model = TopicModel('lda', n_topics=10)
model.fit(tf)
# Bare expression: displays the model's repr in a notebook cell; it has no
# visible effect when this file is run as a plain script.
model

# In[80]:

# Draw a termite plot of the fitted model using the same matrix and feature
# names; `topics=-1` presumably selects all topics (TODO confirm), and topics
# 2, 3, 4 and 8 are passed as `highlight_topics`.
import matplotlib.pyplot as plt
model.termite_plot(tf,
                   tf_feature_names,
                   topics=-1,
                   n_terms=50,
                   highlight_topics=[2, 3, 4, 8])
plt.show()

# In[82]: