Example #1
0
def test_hdp_score():
    """More training iterations must not lower the HDP score.

    Fit two models that differ only in ``max_iter`` (1 vs 5) on the
    same synthetic corpus and check the longer run scores at least as
    well on the training data.
    """
    planted_topics = 3
    truncate = 10
    vocab_per_topic = 10
    counts = make_doc_word_matrix(n_topics=planted_topics,
                                  words_per_topic=vocab_per_topic,
                                  docs_per_topic=50,
                                  words_per_doc=50,
                                  shuffle=True,
                                  random_state=0)

    shared = dict(n_topic_truncate=truncate,
                  n_doc_truncate=3,
                  random_state=0)
    short_run = HierarchicalDirichletProcess(max_iter=1, **shared)
    long_run = HierarchicalDirichletProcess(max_iter=5, **shared)

    short_run.fit(counts)
    long_run.fit(counts)
    assert_greater_equal(long_run.score(counts), short_run.score(counts))
Example #2
0
def test_hdp_dense_input():
    """Dense and sparse input should be the same"""

    sparse_counts = make_uniform_doc_word_matrix(n_topics=10,
                                                 words_per_topic=3,
                                                 docs_per_topic=3)
    matrix_counts = sparse_counts.todense()
    array_counts = sparse_counts.toarray()

    settings = dict(n_topic_truncate=20,
                    n_doc_truncate=5,
                    learning_method='batch',
                    max_iter=10,
                    random_state=1)

    # Identical settings + seed: every input representation must
    # produce the same document-topic matrix.
    out_matrix = HierarchicalDirichletProcess(**settings).fit_transform(matrix_counts)
    out_array = HierarchicalDirichletProcess(**settings).fit_transform(array_counts)
    out_sparse = HierarchicalDirichletProcess(**settings).fit_transform(sparse_counts)

    assert_almost_equal(out_matrix, out_array)
    assert_almost_equal(out_array, out_sparse)
Example #3
0
def test_hdp_transform():
    """``transform`` requires a fitted model and keeps the doc count."""

    docs = make_uniform_doc_word_matrix(n_topics=10,
                                        words_per_topic=3,
                                        docs_per_topic=3)

    model = HierarchicalDirichletProcess(n_topic_truncate=20,
                                         n_doc_truncate=5,
                                         learning_method='batch',
                                         max_iter=10)

    # Transforming before fit must raise NotFittedError.
    assert_raises_regexp(NotFittedError, r"^no 'lambda_' attribute",
                         model.transform, docs)

    model.fit(docs)
    doc_topic = model.transform(docs)
    # One row per document, one column per truncated topic.
    assert_equal(doc_topic.shape[0], docs.shape[0])
    assert_equal(doc_topic.shape[1], 20)
Example #4
0
def test_hdp_invalid_parameters():
    """Invalid HDP parameters should raise ValueError at fit time."""
    docs = make_uniform_doc_word_matrix(n_topics=10,
                                        words_per_topic=3,
                                        docs_per_topic=3)

    # A negative iteration count is rejected.
    bad_iter = HierarchicalDirichletProcess(n_topic_truncate=10,
                                            n_doc_truncate=3,
                                            max_iter=-1,
                                            random_state=0)
    assert_raises_regexp(ValueError, r"^Invalid ", bad_iter.fit, docs)

    # An unknown learning method is rejected.
    bad_method = HierarchicalDirichletProcess(n_topic_truncate=10,
                                              n_doc_truncate=3,
                                              learning_method='na',
                                              random_state=0)
    assert_raises_regexp(ValueError, r"^Invalid 'learning_method'",
                         bad_method.fit, docs)
Example #5
0
def test_hdp_topic_distribution():
    """``topic_distribution`` needs a fitted model and must sum to 1."""

    docs = make_uniform_doc_word_matrix(n_topics=10,
                                        words_per_topic=3,
                                        docs_per_topic=3)

    model = HierarchicalDirichletProcess(n_topic_truncate=20,
                                         n_doc_truncate=5,
                                         learning_method='batch',
                                         max_iter=10,
                                         random_state=1)

    # Querying the distribution before fit must raise NotFittedError.
    assert_raises_regexp(NotFittedError, r"^no 'lambda_' attribute",
                         model.topic_distribution)

    model.fit(docs)
    distribution = model.topic_distribution()
    # Topic weights form a probability distribution.
    assert_almost_equal(np.sum(distribution), 1.0)
Example #6
0
def test_likelihood_check():
    """The ``check_doc_likelihood`` flag must not change the result.

    Fitting with the per-document likelihood check enabled or disabled
    should yield identical document-topic matrices.
    """
    docs = make_uniform_doc_word_matrix(n_topics=10,
                                        words_per_topic=3,
                                        docs_per_topic=3)

    def run(check_flag):
        # One model per flag value; everything else is held fixed.
        model = HierarchicalDirichletProcess(n_topic_truncate=20,
                                             n_doc_truncate=5,
                                             learning_method='batch',
                                             max_iter=10,
                                             random_state=1,
                                             check_doc_likelihood=check_flag,
                                             evaluate_every=1)
        return model.fit_transform(docs)

    assert_almost_equal(run(True), run(False))
Example #7
0
def test_hdp_partial_fit_with_fake_data():
    """Test HDP partial_fit with fake data

    Same as `test_hdp_fit_topics_with_fake_data` but
    use `partial_fit` to replace `fit`
    """

    n_topics = 3
    n_topic_truncate = 10
    topics_threshold = 0.1
    words_per_topic = 10
    tf = make_doc_word_matrix(n_topics=n_topics,
                              words_per_topic=words_per_topic,
                              docs_per_topic=100,
                              words_per_doc=50,
                              shuffle=True,
                              random_state=0)

    hdp = HierarchicalDirichletProcess(n_topic_truncate=n_topic_truncate,
                                       n_doc_truncate=3,
                                       random_state=0)
    # Five incremental passes over the same corpus; `range` replaces the
    # Python-2-only `xrange` (the file already uses py3-style print()).
    for _ in range(5):
        hdp.partial_fit(tf)
    _hdp_topic_check(hdp, n_topics, words_per_topic, topics_threshold)
Example #8
0
def test_hdp_fit_topics_with_fake_data():
    """Large fitted topics should recover the planted word groups.

    Small spurious topics below the weight threshold may be ignored.
    """
    planted_topics = 3
    truncate = 10
    weight_threshold = 0.1
    vocab_per_topic = 10
    counts = make_doc_word_matrix(n_topics=planted_topics,
                                  words_per_topic=vocab_per_topic,
                                  docs_per_topic=100,
                                  words_per_doc=50,
                                  shuffle=True,
                                  random_state=0)

    model = HierarchicalDirichletProcess(n_topic_truncate=truncate,
                                         n_doc_truncate=3,
                                         max_iter=5,
                                         random_state=0)
    model.fit(counts)
    _hdp_topic_check(model, planted_topics, vocab_per_topic,
                     weight_threshold)
Example #9
0
def test_hdp_fit_transform():
    """``fit_transform`` must equal ``fit`` followed by ``transform``."""

    docs = make_uniform_doc_word_matrix(n_topics=10,
                                        words_per_topic=3,
                                        docs_per_topic=3)

    config = dict(n_topic_truncate=20,
                  n_doc_truncate=5,
                  learning_method='batch',
                  max_iter=10,
                  random_state=1)

    # Two separate calls...
    two_step = HierarchicalDirichletProcess(**config)
    two_step.fit(docs)
    via_two_calls = two_step.transform(docs)

    # ...versus the fused call, with an identical seed.
    via_one_call = HierarchicalDirichletProcess(**config).fit_transform(docs)
    assert_almost_equal(via_two_calls, via_one_call)
Example #10
0
                                stop_words='english')
# NOTE(review): the statement above is the tail of a vectorizer
# construction that is cut off in this excerpt; `tf_vectorizer`,
# `train_samples`, `n_features`, `n_topic_truncate`, `n_doc_truncate`
# and `rs` are all defined outside this view.
t0 = time()
# Build the document-word count matrix from the training samples.
tf = tf_vectorizer.fit_transform(train_samples)
print("done in %0.3fs." % (time() - t0))
print()

print("Fitting HDP models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (tf.shape[0], n_features))
# Online (mini-batch) HDP fit; kappa/tau presumably control the
# stochastic learning-rate schedule and total_samples scales the
# online updates -- TODO confirm against the estimator's docs.
hdp = HierarchicalDirichletProcess(n_topic_truncate=n_topic_truncate,
                                   n_doc_truncate=n_doc_truncate,
                                   omega=2.0,
                                   alpha=1.0,
                                   kappa=0.7,
                                   tau=64.,
                                   max_iter=10,
                                   learning_method='online',
                                   batch_size=250,
                                   total_samples=1e6,
                                   max_doc_update_iter=200,
                                   verbose=1,
                                   mean_change_tol=1e-3,
                                   random_state=100)

# Five passes: reshuffle the corpus each time, then update the model
# incrementally with partial_fit.
for i in range(5):
    t0 = time()
    print("iter %d" % i)
    # NOTE(review): 'suffled_tf' looks like a typo for 'shuffled_tf';
    # harmless here since the name is used consistently.
    suffled_tf = shuffle(tf, random_state=rs)
    hdp.partial_fit(suffled_tf)
    print("done in %0.3fs." % (time() - t0))

print("\nTopics in HDP model:")
Example #11
0
def test_partial_fit_after_fit():
    """``partial_fit`` after ``fit`` should reset global parameters.

    A model that was fully fit and then partially refit must end up
    identical to one that only ever saw ``partial_fit``.
    """

    docs = make_uniform_doc_word_matrix(n_topics=10,
                                        words_per_topic=3,
                                        docs_per_topic=3)

    config = dict(n_topic_truncate=20,
                  n_doc_truncate=5,
                  learning_method='batch',
                  max_iter=10,
                  random_state=1)

    refit = HierarchicalDirichletProcess(**config)
    refit.fit(docs)
    refit.partial_fit(docs)

    fresh = HierarchicalDirichletProcess(**config)
    fresh.partial_fit(docs)

    assert_almost_equal(refit.transform(docs), fresh.transform(docs))
Example #12
0
        # NOTE(review): these indented lines are the tail of a helper
        # (presumably `print_top_words`, judging by the call at the end
        # of this excerpt) whose `def` line is outside this view.
        topic = model.lambda_[topic_idx, :]
        # Header: topic index plus its overall weight, followed by the
        # indices of the n_words largest-weight words in descending order.
        message = "Topic #%d (%.3f): " % (topic_idx, topic_distr[topic_idx])
        message += " ".join(
            [str(i) for i in topic.argsort()[:-n_words - 1:-1]])
        print(message)
    print()


# Fixed seed so the synthetic corpus below is reproducible.
rs = RandomState(100)

# Synthetic corpus: 5 disjoint topics, 10 words each, 500 docs per topic.
tf = make_doc_word_matrix(n_topics=5,
                          words_per_topic=10,
                          docs_per_topic=500,
                          words_per_doc=50,
                          shuffle=True,
                          random_state=rs)

# Batch HDP fit (default learning method); n_topic_truncate and
# n_doc_truncate are defined outside this excerpt.
hdp = HierarchicalDirichletProcess(n_topic_truncate=n_topic_truncate,
                                   n_doc_truncate=n_doc_truncate,
                                   omega=2.0,
                                   alpha=1.0,
                                   max_iter=5,
                                   verbose=1,
                                   max_doc_update_iter=200,
                                   random_state=100)

hdp.fit(tf)

print("\nTopics in HDP model:")
print_top_words(hdp, n_top_words)