Example #1
0
 def test_random_state(self):
     """Fixing `random_state` must make matrix generation reproducible."""
     kwargs = dict(
         n_topics=10,
         words_per_topic=20,
         docs_per_topic=1,
         words_per_doc=100,
         random_state=0,
     )
     # Two independent calls with the same seed must agree element-wise.
     first = make_doc_word_matrix(**kwargs)
     second = make_doc_word_matrix(**kwargs)
     assert_array_equal(first.toarray(), second.toarray())
Example #2
0
def test_hdp_score():
    """HDP score should not decrease with more training iterations."""
    shared = dict(n_topic_truncate=10, n_doc_truncate=3, random_state=0)
    tf = make_doc_word_matrix(
        n_topics=3,
        words_per_topic=10,
        docs_per_topic=50,
        words_per_doc=50,
        shuffle=True,
        random_state=0,
    )

    # Fit one model per iteration budget and collect its score on the
    # training matrix; more iterations must score at least as well.
    scores = []
    for iterations in (1, 5):
        model = HierarchicalDirichletProcess(max_iter=iterations, **shared)
        model.fit(tf)
        scores.append(model.score(tf))

    assert_greater_equal(scores[1], scores[0])
Example #3
0
 def test_diag_matrix(self):
     """With one word/doc per topic the matrix is an identity matrix."""
     size = self.rand.randint(100, 200)
     matrix = make_doc_word_matrix(
         n_topics=size,
         words_per_topic=1,
         docs_per_topic=1,
         words_per_doc=1,
         random_state=self.rand,
     )
     # Each doc holds exactly the single word of its own topic.
     assert_array_equal(matrix.toarray(), np.eye(size))
Example #4
0
    def test_make_matrix_simple(self):
        """Matrix shape and per-document word counts match the parameters."""
        n_topics = self.rand.randint(100, 200)
        words_per_topic = 30
        words_per_doc = self.rand.randint(10, 20)

        dense = make_doc_word_matrix(
            n_topics=n_topics,
            words_per_topic=words_per_topic,
            docs_per_topic=1,
            words_per_doc=words_per_doc,
        ).toarray()

        n_docs, vocab_size = dense.shape
        # One doc per topic; vocabulary is words_per_topic ids per topic.
        assert_equal(n_docs, n_topics)
        assert_equal(vocab_size, words_per_topic * n_topics)
        # Every document contains exactly words_per_doc word occurrences.
        assert_array_equal(np.sum(dense, axis=1),
                           np.repeat(words_per_doc, n_docs))
Example #5
0
    def test_make_matrix_words(self):
        """All words within a document belong to a single topic's block."""
        n_topics = self.rand.randint(100, 200)
        words_per_topic = 30
        words_per_doc = self.rand.randint(10, 20)

        dense = make_doc_word_matrix(
            n_topics=n_topics,
            words_per_topic=words_per_topic,
            docs_per_topic=100,
            words_per_doc=words_per_doc,
        ).toarray()

        for row in dense:
            used = np.where(row > 0)[0]
            # The span of used word ids must fit inside one topic's
            # contiguous block of words_per_topic vocabulary ids.
            assert_less(np.max(used) - np.min(used), words_per_topic)
Example #6
0
def test_hdp_partial_fit_with_fake_data():
    """HDP recovers planted topics when trained via `partial_fit`.

    Mirrors `test_hdp_fit_topics_with_fake_data`, but performs five
    `partial_fit` passes instead of a single `fit` call.
    """
    n_topics = 3
    words_per_topic = 10
    threshold = 0.1

    tf = make_doc_word_matrix(
        n_topics=n_topics,
        words_per_topic=words_per_topic,
        docs_per_topic=100,
        words_per_doc=50,
        shuffle=True,
        random_state=0,
    )

    model = HierarchicalDirichletProcess(
        n_topic_truncate=10,
        n_doc_truncate=3,
        random_state=0,
    )
    # Five incremental passes over the same corpus.
    for _ in range(5):
        model.partial_fit(tf)

    _hdp_topic_check(model, n_topics, words_per_topic, threshold)
Example #7
0
def test_hdp_fit_topics_with_fake_data():
    """HDP `fit` should group the top words of the large planted topics.

    Topics with small weight may be ignored by the check.
    """
    n_topics = 3
    words_per_topic = 10
    threshold = 0.1

    tf = make_doc_word_matrix(
        n_topics=n_topics,
        words_per_topic=words_per_topic,
        docs_per_topic=100,
        words_per_doc=50,
        shuffle=True,
        random_state=0,
    )

    model = HierarchicalDirichletProcess(
        n_topic_truncate=10,
        n_doc_truncate=3,
        max_iter=5,
        random_state=0,
    )
    model.fit(tf)

    _hdp_topic_check(model, n_topics, words_per_topic, threshold)
Example #8
0
def print_top_words(model, n_words):
    """Print each topic's weight and its `n_words` highest-weighted word ids."""
    weights = model.topic_distribution()
    # model.lambda_ rows are per-topic word weight vectors; iterate them
    # directly instead of indexing by row number.
    for idx, topic in enumerate(model.lambda_):
        top_ids = topic.argsort()[:-n_words - 1:-1]
        header = "Topic #%d (%.3f): " % (idx, weights[idx])
        print(header + " ".join(str(word_id) for word_id in top_ids))
    print()


# Seeded RNG so the synthetic corpus below is reproducible.
rs = RandomState(100)

# Synthetic document-word matrix: 5 planted topics, 10 words per topic,
# 500 documents per topic, 50 word occurrences per document.
tf = make_doc_word_matrix(n_topics=5,
                          words_per_topic=10,
                          docs_per_topic=500,
                          words_per_doc=50,
                          shuffle=True,
                          random_state=rs)

# NOTE(review): n_topic_truncate / n_doc_truncate are assumed to be defined
# earlier in this file (not visible in this chunk) — confirm.
hdp = HierarchicalDirichletProcess(n_topic_truncate=n_topic_truncate,
                                   n_doc_truncate=n_doc_truncate,
                                   omega=2.0,
                                   alpha=1.0,
                                   max_iter=5,
                                   verbose=1,
                                   max_doc_update_iter=200,
                                   random_state=100)

hdp.fit(tf)

print("\nTopics in HDP model:")