def test_lda_batch():
    """
    Test LDA batch training (`fit` method)
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=rng)
    lda.fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
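# The test above relies on a `_build_sparse_mtx` helper that is not shown in
# this section. Below is a minimal sketch of what it could look like, assuming
# it returns `(n_topics, alpha, eta, X)` where `X` is a block-diagonal
# document-word matrix with 3 topics of 3 distinct words each (matching
# `correct_idx_grps` above). The exact construction is an assumption, not the
# canonical helper; it needs `import numpy as np` and
# `import scipy.sparse as sp`.
def _build_sparse_mtx():
    # 3 topics, each owning 3 distinct words (9 words total), so the
    # expected top-3 word indices per topic are (0, 1, 2), (3, 4, 5)
    # and (6, 7, 8)
    n_topics = 3
    alpha = 1. / n_topics
    eta = 1. / n_topics
    block = n_topics * np.ones((3, 3))
    X = sp.block_diag([block] * n_topics).tocsr()
    return n_topics, alpha, eta, X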
def _lda_simple_example():
    """
    This is for debugging
    """
    from sklearn.feature_extraction.text import CountVectorizer

    test_words = ['aa', 'bb', 'cc', 'dd', 'ee',
                  'ff', 'gg', 'hh', 'ii', 'jj']
    test_vocab = {}
    for idx, word in enumerate(test_words):
        test_vocab[word] = idx

    # group 1: aa, bb, cc, dd
    # group 2: ee, ff, gg
    # group 3: hh, ii, jj
    test_docs = ['aa bb cc dd aa aa',
                 'ee ee ff ff gg gg',
                 'hh ii hh ii jj jj jj jj',
                 'aa bb cc dd aa aa dd aa bb cc',
                 'ee ee ff ff gg gg',
                 'hh ii hh ii jj jj jj jj',
                 'aa bb cc dd aa aa',
                 'ee ee ff ff gg gg',
                 'hh ii hh ii jj jj jj jj',
                 'aa bb cc dd aa aa dd aa bb cc',
                 'ee ee ff ff gg gg',
                 'hh ii hh ii jj jj jj jj']

    vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\d\W]\w+\b",
                                 max_df=0.9, min_df=1,
                                 vocabulary=test_vocab)
    doc_word_count = vectorizer.fit_transform(test_docs)

    # LDA setting
    n_topics = 3
    alpha = 1. / n_topics
    eta = 1. / n_topics
    n_top_words = 3

    lda = OnlineLDA(n_topics=n_topics, eta=eta, alpha=alpha,
                    random_state=0, n_jobs=1, verbose=0)
    lda.fit(doc_word_count)

    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
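# A quick way to inspect the fitted model from the document side rather than
# the topic side, assuming `OnlineLDA.transform` returns a document-topic
# weight matrix (this helper and that assumption are illustrative, not part
# of the example above); requires `import numpy as np`.
def _print_doc_topics(lda, doc_word_count):
    # row-normalize so each document's topic weights sum to 1
    doc_topic = lda.transform(doc_word_count)
    doc_topic = doc_topic / doc_topic.sum(axis=1)[:, np.newaxis]
    for doc_idx, dist in enumerate(doc_topic):
        print("doc #%d: top topic = %d" % (doc_idx, dist.argmax()))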
def lda_batch_example():
    """
    Example of LDA batch update
    """
    # By default, we set the number of topics to 10 and both
    # hyperparameters eta and alpha to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    # batch update is slow, so only use the first 4000 records
    # in 20 news groups
    n_samples = 4000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with batch update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    print("Converting text into a sparse matrix...")
    vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                 min_df=3, stop_words='english')
    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])

    print("Fitting LDA model with batch update...")
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta,
                    n_jobs=-1, random_state=0, verbose=1)
    feature_names = vectorizer.get_feature_names()
    lda.fit(doc_word_count, max_iters=10)

    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
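# A mini-batch counterpart to the batch example above, assuming
# `OnlineLDA.partial_fit` performs an online variational update on each
# mini-batch. This is a sketch that mirrors `lda_batch_example`, not a
# definitive API reference; the `chunk_size` value is an arbitrary choice.
def lda_online_example():
    """
    Example of LDA online (mini-batch) update
    """
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics
    n_samples = 4000
    n_features = 1000
    n_top_words = 15
    chunk_size = 1000

    print('Example of LDA with online update')
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                 min_df=3, stop_words='english')
    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])

    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta,
                    n_jobs=-1, random_state=0, verbose=1)
    # feed the corpus to the model one mini-batch at a time
    for chunk_start in range(0, doc_word_count.shape[0], chunk_size):
        lda.partial_fit(doc_word_count[chunk_start:chunk_start + chunk_size])

    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))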