def test_hdp_score():
    """Test HDP score function

    More fitting iterations should never lower the score, so the
    5-iteration model must score at least as high as the 1-iteration one
    on the same data.
    """
    tf = make_doc_word_matrix(n_topics=3,
                              words_per_topic=10,
                              docs_per_topic=50,
                              words_per_doc=50,
                              shuffle=True,
                              random_state=0)
    # Fit one model per iteration budget; each uses the same seed so the
    # only difference is how long it trains.
    scores = []
    for n_iter in (1, 5):
        model = HierarchicalDirichletProcess(n_topic_truncate=10,
                                             n_doc_truncate=3,
                                             max_iter=n_iter,
                                             random_state=0)
        model.fit(tf)
        scores.append(model.score(tf))
    assert_greater_equal(scores[1], scores[0])
def test_hdp_dense_input():
    """Dense and sparse input should be the same"""
    X = make_uniform_doc_word_matrix(n_topics=10,
                                     words_per_topic=3,
                                     docs_per_topic=3)
    params = {
        'n_topic_truncate': 20,
        'n_doc_truncate': 5,
        'learning_method': 'batch',
        'max_iter': 10,
        'random_state': 1,
    }
    # Fit a fresh model per input representation: numpy matrix,
    # numpy ndarray, and the original sparse matrix.
    results = []
    for data in (X.todense(), X.toarray(), X):
        model = HierarchicalDirichletProcess(**params)
        results.append(model.fit_transform(data))
    assert_almost_equal(results[0], results[1])
    assert_almost_equal(results[1], results[2])
def test_hdp_transform():
    """Test HDP transform"""
    X = make_uniform_doc_word_matrix(n_topics=10,
                                     words_per_topic=3,
                                     docs_per_topic=3)
    n_truncate = 20
    hdp = HierarchicalDirichletProcess(n_topic_truncate=n_truncate,
                                       n_doc_truncate=5,
                                       learning_method='batch',
                                       max_iter=10)
    # Calling transform before fit must raise NotFittedError.
    assert_raises_regexp(NotFittedError, r"^no 'lambda_' attribute",
                         hdp.transform, X)
    hdp.fit(X)
    doc_topic = hdp.transform(X)
    # One row per document, one column per truncated topic.
    assert_equal(doc_topic.shape[0], X.shape[0])
    assert_equal(doc_topic.shape[1], n_truncate)
def test_hdp_invalid_parameters():
    """Test that HDP rejects invalid parameters at fit time"""
    X = make_uniform_doc_word_matrix(n_topics=10,
                                     words_per_topic=3,
                                     docs_per_topic=3)
    # A negative max_iter is invalid.
    bad_iter = HierarchicalDirichletProcess(n_topic_truncate=10,
                                            n_doc_truncate=3,
                                            max_iter=-1,
                                            random_state=0)
    assert_raises_regexp(ValueError, r"^Invalid ", bad_iter.fit, X)
    # An unknown learning_method is invalid.
    bad_method = HierarchicalDirichletProcess(n_topic_truncate=10,
                                              n_doc_truncate=3,
                                              learning_method='na',
                                              random_state=0)
    assert_raises_regexp(ValueError, r"^Invalid 'learning_method'",
                         bad_method.fit, X)
def test_hdp_topic_distribution():
    """Test HDP topic_distribution"""
    X = make_uniform_doc_word_matrix(n_topics=10,
                                     words_per_topic=3,
                                     docs_per_topic=3)
    hdp = HierarchicalDirichletProcess(n_topic_truncate=20,
                                       n_doc_truncate=5,
                                       learning_method='batch',
                                       max_iter=10,
                                       random_state=1)
    # Asking for the distribution before fit must raise NotFittedError.
    assert_raises_regexp(NotFittedError, r"^no 'lambda_' attribute",
                         hdp.topic_distribution)
    hdp.fit(X)
    # The corpus-level topic weights form a probability distribution.
    assert_almost_equal(np.sum(hdp.topic_distribution()), 1.0)
def test_likelihood_check():
    """Test enable doc_likelihood check

    The result should be the same no matter it is True or False.
    """
    X = make_uniform_doc_word_matrix(n_topics=10,
                                     words_per_topic=3,
                                     docs_per_topic=3)
    common = {
        'n_topic_truncate': 20,
        'n_doc_truncate': 5,
        'learning_method': 'batch',
        'max_iter': 10,
        'random_state': 1,
        'evaluate_every': 1,
    }
    # Fit once with the likelihood check enabled, once disabled; the
    # transformed output must not change.
    results = {}
    for flag in (True, False):
        model = HierarchicalDirichletProcess(check_doc_likelihood=flag,
                                             **common)
        results[flag] = model.fit_transform(X)
    assert_almost_equal(results[True], results[False])
def test_hdp_partial_fit_with_fake_data():
    """Test HDP partial_fit with fake data

    Same as `test_hdp_fit_topics_with_fake_data` but use `partial_fit`
    to replace `fit`
    """
    n_topics = 3
    n_topic_truncate = 10
    topics_threshold = 0.1
    words_per_topic = 10
    tf = make_doc_word_matrix(n_topics=n_topics,
                              words_per_topic=words_per_topic,
                              docs_per_topic=100,
                              words_per_doc=50,
                              shuffle=True,
                              random_state=0)
    hdp = HierarchicalDirichletProcess(n_topic_truncate=n_topic_truncate,
                                       n_doc_truncate=3,
                                       random_state=0)
    # BUGFIX: use `range` instead of the Python 2-only `xrange`, matching
    # the `range(...)` usage elsewhere in this file and keeping the test
    # runnable on Python 3.
    for _ in range(5):
        hdp.partial_fit(tf)
    # Recovered topics should match the planted ones above the threshold.
    _hdp_topic_check(hdp, n_topics, words_per_topic, topics_threshold)
def test_hdp_fit_topics_with_fake_data():
    """Test HDP fit with fake data

    Top words in large topics should be grouped correctly (small topic
    can be ignored.)
    """
    planted_topics = 3
    words_per_topic = 10
    # Synthetic corpus with a known block-diagonal topic structure.
    tf = make_doc_word_matrix(n_topics=planted_topics,
                              words_per_topic=words_per_topic,
                              docs_per_topic=100,
                              words_per_doc=50,
                              shuffle=True,
                              random_state=0)
    hdp = HierarchicalDirichletProcess(n_topic_truncate=10,
                                       n_doc_truncate=3,
                                       max_iter=5,
                                       random_state=0)
    hdp.fit(tf)
    _hdp_topic_check(hdp, planted_topics, words_per_topic, 0.1)
def test_hdp_fit_transform():
    """Test HDP fit_transform

    `fit_transform` should give the same result as `fit` followed by
    `transform` when both models share the same parameters and seed.
    """
    X = make_uniform_doc_word_matrix(n_topics=10,
                                     words_per_topic=3,
                                     docs_per_topic=3)
    params = {
        'n_topic_truncate': 20,
        'n_doc_truncate': 5,
        'learning_method': 'batch',
        'max_iter': 10,
        'random_state': 1,
    }
    two_step = HierarchicalDirichletProcess(**params)
    two_step.fit(X)
    expected = two_step.transform(X)
    combined = HierarchicalDirichletProcess(**params).fit_transform(X)
    assert_almost_equal(expected, combined)
stop_words='english') t0 = time() tf = tf_vectorizer.fit_transform(train_samples) print("done in %0.3fs." % (time() - t0)) print() print("Fitting HDP models with tf features, " "n_samples=%d and n_features=%d..." % (tf.shape[0], n_features)) hdp = HierarchicalDirichletProcess(n_topic_truncate=n_topic_truncate, n_doc_truncate=n_doc_truncate, omega=2.0, alpha=1.0, kappa=0.7, tau=64., max_iter=10, learning_method='online', batch_size=250, total_samples=1e6, max_doc_update_iter=200, verbose=1, mean_change_tol=1e-3, random_state=100) for i in range(5): t0 = time() print("iter %d" % i) suffled_tf = shuffle(tf, random_state=rs) hdp.partial_fit(suffled_tf) print("done in %0.3fs." % (time() - t0)) print("\nTopics in HDP model:")
def test_partial_fit_after_fit():
    """Test run partial_fit after fit

    partial_fit should reset global parameters
    """
    X = make_uniform_doc_word_matrix(n_topics=10,
                                     words_per_topic=3,
                                     docs_per_topic=3)
    params = {
        'n_topic_truncate': 20,
        'n_doc_truncate': 5,
        'learning_method': 'batch',
        'max_iter': 10,
        'random_state': 1,
    }
    # fit then partial_fit: partial_fit should discard what fit learned.
    refitted = HierarchicalDirichletProcess(**params)
    refitted.fit(X)
    refitted.partial_fit(X)
    # partial_fit alone on a fresh model with identical settings.
    fresh = HierarchicalDirichletProcess(**params)
    fresh.partial_fit(X)
    # Both models must end up in the same state.
    assert_almost_equal(refitted.transform(X), fresh.transform(X))
topic = model.lambda_[topic_idx, :] message = "Topic #%d (%.3f): " % (topic_idx, topic_distr[topic_idx]) message += " ".join( [str(i) for i in topic.argsort()[:-n_words - 1:-1]]) print(message) print() rs = RandomState(100) tf = make_doc_word_matrix(n_topics=5, words_per_topic=10, docs_per_topic=500, words_per_doc=50, shuffle=True, random_state=rs) hdp = HierarchicalDirichletProcess(n_topic_truncate=n_topic_truncate, n_doc_truncate=n_doc_truncate, omega=2.0, alpha=1.0, max_iter=5, verbose=1, max_doc_update_iter=200, random_state=100) hdp.fit(tf) print("\nTopics in HDP model:") print_top_words(hdp, n_top_words)