def main():
    """
    Applies streamLDA to test data, currently either 20 newsgroups or
    Wikipedia. The Wikipedia option downloads and analyzes a batch of
    random Wikipedia articles using online VB for LDA. This gives good
    breadth of examples, but is not precisely repeatable since the
    articles are random. 20 newsgroups provides data on which a
    repeatable run can be performed.
    """
    # Assumes module-level imports of sys, streamlda, and the corpus
    # helpers (WikipediaCorpus, TwentyNewsCorpus, print_topics).

    # The number of documents to analyze each iteration
    batchsize = 10  # 64
    # The number of topics
    K = 10

    assert len(sys.argv) == 3, \
        "usage: ./stream_corpus corpus_name num_runs\ncorpus options: 20news, wikipedia"

    if sys.argv[1] == 'wikipedia':
        corpus = WikipediaCorpus()
    elif sys.argv[1] == '20news':
        corpus = TwentyNewsCorpus("20_news", "data/20_news_date")
    else:
        print 'option not supported. please try again.'
        sys.exit()

    runs = int(sys.argv[2])

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1. / K, 1. / K, 1., 0.7)

    # Hold out a fixed test set, five batches' worth, for evaluation.
    (test_set, test_names) = corpus.docs(batchsize * 5, False)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'
        # Get some new articles from the selected corpus
        (docset, articlenames) = corpus.docs(batchsize)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)

        # The parsed form of the most recent batch, kept by StreamLDA
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        # (wordids, wordcts) = slda.parse_new_docs(docset)

        # Every 10 iterations, estimate the variational bound on the
        # held-out set as a proxy for held-out perplexity.
        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            new_lambda = None  # discard; only gamma_test is needed
            lhood = slda.batch_bound(gamma_test)
            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
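
# NOTE (illustrative sketch, not part of the original script): the loop
# above prints the raw variational bound from batch_bound(), which the
# comments describe as a held-out perplexity estimate. Converting the
# bound into per-word perplexity would look like the helper below; the
# function and argument names here are assumptions, and an invocation of
# the script itself looks like: ./stream_corpus 20news 100
import numpy

def perplexity_from_bound(lhood, total_test_words):
    # A lower bound `lhood` on the log likelihood of N tokens gives the
    # standard perplexity estimate exp(-lhood / N); lower is better.
    return numpy.exp(-lhood / float(total_test_words))

# Example (assuming the test set's total token count is available):
#   print perplexity_from_bound(lhood, n_test_tokens)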
def main():
    """
    Downloads and analyzes a batch of random Wikipedia articles using
    online VB for LDA; optionally runs on 20 newsgroups instead.
    """
    # The number of documents to analyze each iteration
    batchsize = 10  # 64
    # The number of topics
    K = 10

    if len(sys.argv) < 3:
        corpus = WikipediaCorpus()
    else:
        assert sys.argv[2] == "20", \
            "Only non-wikipedia corpus supported is 20 newsgroups"
        corpus = TwentyNewsCorpus("20_news", "data/20_news_date")

    if len(sys.argv) < 2:
        runs = 50
    else:
        runs = int(sys.argv[1])

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1. / K, 1. / K, 1., 0.7)

    # Hold out a fixed test set, five batches' worth, for evaluation.
    (test_set, test_names) = corpus.docs(batchsize * 5, False)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'
        # Download some articles
        (docset, articlenames) = corpus.docs(batchsize)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)

        # The parsed form of the most recent batch, kept by StreamLDA
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        # (wordids, wordcts) = slda.parse_new_docs(docset)

        # Every 10 iterations, estimate the bound on the held-out set.
        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            new_lambda = None  # discard; only gamma_test is needed
            lhood = slda.batch_bound(gamma_test)
            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
# Tail of the threaded Wikipedia downloader. The opening of this
# function was truncated in this file; the header, the shared-list
# reset, and the outer batching loop are reconstructed from how n,
# maxthreads, and wtlist are used below, and maxthreads=8 is an
# assumption. Assumes module-level imports of time and the Corpus
# base class.
def get_random_wikipedia_articles(n):
    maxthreads = 8
    WikiThread.articles = list()
    WikiThread.articlenames = list()
    wtlist = list()
    for i in range(0, n, maxthreads):
        # Start a batch of downloader threads...
        for j in range(i, min(i + maxthreads, n)):
            wtlist.append(WikiThread())
            wtlist[len(wtlist) - 1].start()
        # ...and wait for every thread in the batch to finish.
        for j in range(i, min(i + maxthreads, n)):
            wtlist[j].join()
    return (WikiThread.articles, WikiThread.articlenames)


class WikipediaCorpus(Corpus):

    def __init__(self, name="wiki"):
        Corpus.__init__(self, name)

    def docs(self, num_docs, train=True):
        # The train flag is ignored: every call downloads fresh random
        # articles, so there is no fixed train/test split.
        return get_random_wikipedia_articles(num_docs)


# Quick timing check: download a single article and print its title.
if __name__ == '__main__':
    c = WikipediaCorpus()
    t0 = time.time()
    (articles, articlenames) = c.docs(1)
    for i in range(0, len(articles)):
        print articlenames[i]
    t1 = time.time()
    print 'took %f' % (t1 - t0)
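
# NOTE (illustrative sketch): the downloader above relies on a
# WikiThread class that is not defined in this fragment. The pattern it
# assumes is a thread that fetches one random article and appends the
# result to shared class-level lists, guarded by a lock.
# get_random_wikipedia_article is an assumed helper name for whatever
# routine fetches one random article and returns (text, title).
import threading

class WikiThread(threading.Thread):
    articles = list()        # shared results, read after join()
    articlenames = list()
    lock = threading.Lock()  # guards concurrent appends

    def run(self):
        (article, articlename) = get_random_wikipedia_article()
        WikiThread.lock.acquire()
        WikiThread.articles.append(article)
        WikiThread.articlenames.append(articlename)
        WikiThread.lock.release()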