def test_partial_fit_after_fit():
    """Check that partial_fit on an already-fitted model resets it.

    Running ``fit`` followed by ``partial_fit`` should leave the
    estimator in the same state as a single ``partial_fit`` on a fresh
    instance, because ``partial_fit`` after ``fit`` resets the global
    parameters.
    """
    doc_word = make_uniform_doc_word_matrix(
        n_topics=10, words_per_topic=3, docs_per_topic=3)

    def _make_model():
        # Identical hyper-parameters (fixed seed) for both estimators so
        # their results are comparable.
        return HierarchicalDirichletProcess(
            n_topic_truncate=20,
            n_doc_truncate=5,
            learning_method='batch',
            max_iter=10,
            random_state=1,
        )

    refitted = _make_model()
    refitted.fit(doc_word)
    refitted.partial_fit(doc_word)

    fresh = _make_model()
    fresh.partial_fit(doc_word)

    assert_almost_equal(refitted.transform(doc_word),
                        fresh.transform(doc_word))
def test_hdp_transform():
    """Check HDP ``transform`` behaviour before and after fitting.

    Before ``fit`` it must raise ``NotFittedError``; afterwards it must
    return a document-topic matrix of shape
    ``(n_documents, n_topic_truncate)``.
    """
    doc_word = make_uniform_doc_word_matrix(
        n_topics=10, words_per_topic=3, docs_per_topic=3)
    hdp = HierarchicalDirichletProcess(
        n_topic_truncate=20,
        n_doc_truncate=5,
        learning_method='batch',
        max_iter=10,
    )

    # Calling transform on an unfitted model must fail loudly.
    assert_raises_regexp(NotFittedError, r"^no 'lambda_' attribute",
                         hdp.transform, doc_word)

    hdp.fit(doc_word)
    doc_topic = hdp.transform(doc_word)

    # One row per document, one column per truncated topic.
    assert_equal(doc_topic.shape[0], doc_word.shape[0])
    assert_equal(doc_topic.shape[1], 20)
def test_hdp_fit_transform():
    """Check that ``fit_transform`` matches ``fit`` then ``transform``.

    With the same fixed ``random_state``, the two call patterns must
    produce numerically identical document-topic matrices.
    """
    doc_word = make_uniform_doc_word_matrix(
        n_topics=10, words_per_topic=3, docs_per_topic=3)
    kwargs = dict(
        n_topic_truncate=20,
        n_doc_truncate=5,
        learning_method='batch',
        max_iter=10,
        random_state=1,
    )

    # Path 1: explicit fit followed by transform.
    model_a = HierarchicalDirichletProcess(**kwargs)
    model_a.fit(doc_word)
    via_fit_then_transform = model_a.transform(doc_word)

    # Path 2: single fit_transform call on a fresh estimator.
    model_b = HierarchicalDirichletProcess(**kwargs)
    via_fit_transform = model_b.fit_transform(doc_word)

    assert_almost_equal(via_fit_then_transform, via_fit_transform)
total_samples=1e6, max_doc_update_iter=200, verbose=1, mean_change_tol=1e-3, random_state=100) for i in range(5): t0 = time() print("iter %d" % i) suffled_tf = shuffle(tf, random_state=rs) hdp.partial_fit(suffled_tf) print("done in %0.3fs." % (time() - t0)) print("\nTopics in HDP model:") tf_feature_names = tf_vectorizer.get_feature_names() print_top_words(hdp, tf_feature_names, n_top_words) # top topics in each group print("\nTop topics in each group:") train_topics = hdp.transform(tf) # normalize train_topics = train_topics / np.sum(train_topics, axis=1)[:, np.newaxis] for grp_idx, group_name in enumerate(target_names): doc_idx = np.where(train_targets == grp_idx)[0] mean_doc_topics = np.mean(train_topics[doc_idx, :], axis=0) top_idx = mean_doc_topics.argsort()[:-n_top_topics - 1:-1] print("group: %s:" % group_name) print("top topics: %s" % (", ".join(["#%d (%.3f)" % (idx, mean_doc_topics[idx]) for idx in top_idx]))) print()