def test_lda_transform_before_fit():
    """
    test `transform` before `fit`
    """
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    lda = OnlineLDA()
    lda.transform(X)

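# A minimal sketch of the `_build_sparse_mtx` helper the tests below call,
# assuming three topics with three disjoint vocabulary indices each so that
# the learned topics match the `correct_idx_grps` groups (0, 1, 2), (3, 4, 5),
# (6, 7, 8) asserted later; the exact helper used by the test suite may
# differ. Imports of numpy (`np`), `OnlineLDA`, `fetch_20newsgroups`,
# `CountVectorizer`, and the testing asserts are assumed to appear at the top
# of the module.
import numpy as np
from scipy.sparse import csr_matrix


def _build_sparse_mtx():
    # 3 topics, each owning 3 distinct words; every document draws its words
    # from a single topic, so LDA should recover the block structure.
    n_topics = 3
    alpha = 1. / n_topics
    eta = 1. / n_topics
    block = n_topics * np.ones((3, 3))
    X = csr_matrix(np.kron(np.eye(3), block))
    return n_topics, alpha, eta, X
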
def lda_online_example():
    """
    Example for LDA online update
    """
    def chunks(l, n):
        for i in xrange(0, len(l), n):
            yield l[i:i + n]

    # By default, we set the number of topics to 10, and both
    # hyperparameters eta and alpha to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    # chunk_size is how many records we want to use
    # in each online iteration
    chunk_size = 2000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with online update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                 min_df=3, stop_words='english')

    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, kappa=0.7,
                    tau=512., n_jobs=-1, n_docs=1e4, random_state=0,
                    verbose=0)

    for chunk_no, doc_list in enumerate(chunks(dataset.data, chunk_size)):
        if chunk_no == 0:
            doc_mtx = vectorizer.fit_transform(doc_list)
            feature_names = vectorizer.get_feature_names()
        else:
            doc_mtx = vectorizer.transform(doc_list)

        # fit model
        print("\nFitting LDA model with online update on chunk %d..."
              % chunk_no)
        lda.partial_fit(doc_mtx)
        print("Topics after training chunk %d:" % chunk_no)
        for topic_idx, topic in enumerate(lda.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))

def test_lda_normalize_docs():
    """
    test that the topic distribution sums to 1 for each doc
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=rng)
    X_fit = lda.fit_transform(X)
    assert_array_almost_equal(X_fit.sum(axis=1), np.ones(X.shape[0]))

def test_lda_fit_transform():
    """
    Test LDA fit_transform & transform

    fit_transform and transform result should be the same
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=rng)
    X_fit = lda.fit_transform(X)
    X_trans = lda.transform(X)
    assert_array_almost_equal(X_fit, X_trans, 4)

def test_lda_batch():
    """
    Test LDA batch training (`fit` method)
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=rng)
    lda.fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)

def test_lda_transform_mismatch():
    """
    test n_vocab mismatch in fit and transform
    """
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    X_2 = rng.randint(4, size=(10, 8))

    n_topics = rng.randint(3, 6)
    alpha0 = eta0 = 1.0 / n_topics
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha0, eta=eta0,
                    random_state=rng)
    lda.partial_fit(X)
    lda.transform(X_2)

def test_lda_dense_input():
    """
    Test LDA with dense input.

    Similar to test_lda()
    """
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_topics = 3
    alpha0 = eta0 = 1.0 / n_topics
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha0, eta=eta0,
                    random_state=rng)
    X_trans = lda.fit_transform(X)
    assert_true((X_trans > 0.0).any())

def test_lda_partial_fit_dim_mismatch():
    """
    test n_vocab mismatch in partial_fit
    """
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    alpha0 = eta0 = 1.0 / n_topics

    n_col = rng.randint(6, 10)
    X_1 = rng.randint(4, size=(10, n_col))
    X_2 = rng.randint(4, size=(10, n_col + 1))
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha0, eta=eta0,
                    tau=5.0, n_docs=20, random_state=rng)
    for X in [X_1, X_2]:
        lda.partial_fit(X)

def test_lda_online_multi_jobs():
    """
    Test LDA online training with multiple CPUs
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, n_jobs=2,
                    tau=5.0, n_docs=30, random_state=rng)
    for i in xrange(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)

def _lda_simple_example():
    """
    This is for debugging
    """
    from sklearn.feature_extraction.text import CountVectorizer

    test_words = ['aa', 'bb', 'cc', 'dd', 'ee', 'ff', 'gg', 'hh', 'ii', 'jj']
    test_vocab = {}
    for idx, word in enumerate(test_words):
        test_vocab[word] = idx

    # group 1: aa bb cc dd
    # group 2: ee ff gg
    # group 3: hh ii jj
    test_docs = ['aa bb cc dd aa aa',
                 'ee ee ff ff gg gg',
                 'hh ii hh ii jj jj jj jj',
                 'aa bb cc dd aa aa dd aa bb cc',
                 'ee ee ff ff gg gg',
                 'hh ii hh ii jj jj jj jj',
                 'aa bb cc dd aa aa',
                 'ee ee ff ff gg gg',
                 'hh ii hh ii jj jj jj jj',
                 'aa bb cc dd aa aa dd aa bb cc',
                 'ee ee ff ff gg gg',
                 'hh ii hh ii jj jj jj jj']

    vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\d\W]\w+\b",
                                 max_df=0.9, min_df=1,
                                 vocabulary=test_vocab)
    doc_word_count = vectorizer.fit_transform(test_docs)

    # LDA setting
    n_topics = 3
    alpha = 1. / n_topics
    eta = 1. / n_topics
    n_top_words = 3

    lda = OnlineLDA(n_topics=n_topics, eta=eta, alpha=alpha,
                    random_state=0, n_jobs=1, verbose=0)
    lda.fit(doc_word_count)
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

def test_lda_online():
    """
    Test LDA online training (`partial_fit` method)
    (same as test_lda_batch)
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, tau=30.0,
                    random_state=rng)
    for i in xrange(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)

def lda_batch_example():
    """
    Example for LDA batch update
    """
    # By default, we set the number of topics to 10, and both
    # hyperparameters eta and alpha to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    # batch update is slow, so only use the top 4000 records
    # from the 20 news groups dataset
    n_samples = 4000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with batch update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    print("Converting text into a sparse matrix...")
    vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                 min_df=3, stop_words='english')
    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])

    print("Fitting LDA model with batch update...")
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta,
                    n_jobs=-1, random_state=0, verbose=1)
    feature_names = vectorizer.get_feature_names()
    lda.fit(doc_word_count, max_iters=10)

    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

def test_lda_preplexity():
    """
    Test LDA perplexity for batch training

    perplexity should be lower after each iteration
    """
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda_1 = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=0)
    lda_2 = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=0)

    distr_1 = lda_1.fit_transform(X, max_iters=1)
    prep_1 = lda_1.preplexity(X, distr_1, sub_sampling=False)

    distr_2 = lda_2.fit_transform(X, max_iters=10)
    prep_2 = lda_2.preplexity(X, distr_2, sub_sampling=False)

    assert_greater_equal(prep_1, prep_2)

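# Optional entry point, assuming the module is run directly as a script; only
# the example functions defined above are invoked here, while the test_*
# functions are left to the test runner.
if __name__ == '__main__':
    lda_batch_example()
    lda_online_example()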