Example #1
def main():
    """
    Applies streamLDA to test data, currently either 20 newsgroups or
    wikipedia. The wikipedia option downloads and analyzes a bunch of random
    Wikipedia articles using online VB for LDA. This is nice for breadth of
    examples, but is not precisely repeatable since the articles are random. 20
    newsgroups provides data on which a repeatable run can be performed.
    """

    # The number of documents to analyze each iteration
    batchsize = 10  #64
    # The number of topics
    K = 10

    assert len(sys.argv) == 3, \
        "usage: ./stream_corpus corpus_name num_runs\ncorpus options: 20news, wikipedia"
    if sys.argv[1] == 'wikipedia':
        corpus = WikipediaCorpus()
    elif sys.argv[1] == '20news':
        corpus = TwentyNewsCorpus("20_news", "data/20_news_date")
    else:
        print 'Corpus option not supported; choose 20news or wikipedia.'
        sys.exit()
    runs = int(sys.argv[2])

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1. / K, 1. / K, 1., 0.7)

    # Request a held-out evaluation set (train=False) for tracking the bound
    (test_set, test_names) = corpus.docs(batchsize * 5, False)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'

        # Get some new articles from the selected corpus
        (docset, articlenames) = corpus.docs(batchsize)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)
        # Keep the word ids/counts parsed from the most recent batch around
        # (not used below, but useful for a held-out perplexity estimate)
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        #(wordids, wordcts) = slda.parse_new_docs(docset)

        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            # Only gamma is needed to evaluate the bound; discard the lambda
            # sufficient statistics from the test-set E-step.
            new_lambda = None
            lhood = slda.batch_bound(gamma_test)

            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
Example #2
def main():
    """
    Applies streamLDA to test data, currently either 20 newsgroups or
    wikipedia. The wikipedia option downloads and analyzes a bunch of random
    Wikipedia articles using online VB for LDA. This is nice for breadth of
    examples, but is not precisely repeatable since the articles are random. 20
    newsgroups provides data on which a repeatable run can be performed.
    """

    # The number of documents to analyze each iteration
    batchsize = 10 #64
    # The number of topics
    K = 10

    assert len(sys.argv) == 3, "usage: ./stream_corpus corpus_name num_runs\ncorpus options: 20news, wikipedia"
    if sys.argv[1] == 'wikipedia':
        corpus = WikipediaCorpus()
    elif sys.argv[1] == '20news':
        corpus = TwentyNewsCorpus("20_news", "data/20_news_date")
    else:
        print 'Corpus option not supported; choose 20news or wikipedia.'
        sys.exit()
    runs = int(sys.argv[2])        

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1./K, 1./K, 1., 0.7)

    (test_set, test_names) = corpus.docs(batchsize * 5, False)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'
        
        # Get some new articles from the selected corpus
        (docset, articlenames) = \
            corpus.docs(batchsize)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)
        # Keep the word ids/counts parsed from the most recent batch around
        # (not used below, but useful for a held-out perplexity estimate)
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        #(wordids, wordcts) = slda.parse_new_docs(docset)

        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            # Only gamma is needed to evaluate the bound; discard the lambda
            # sufficient statistics from the test-set E-step.
            new_lambda = None
            lhood = slda.batch_bound(gamma_test)

            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
Example #3
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 10 #64
    # The number of topics
    K = 10

    if len(sys.argv) < 3:
        corpus = WikipediaCorpus()
    else:
        assert sys.argv[2] == "20", "Only non-wikipedia corpus supported is 20 newsgroups"
        corpus = TwentyNewsCorpus("20_news", "data/20_news_date")

    if len(sys.argv) < 2:
        runs = 50
    else:
        runs = int(sys.argv[1])

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1./K, 1./K, 1., 0.7)

    (test_set, test_names) = corpus.docs(batchsize * 5, False)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'
        
        # Get the next batch of documents from the selected corpus
        (docset, articlenames) = corpus.docs(batchsize)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)
        # Keep the word ids/counts parsed from the most recent batch around
        # (not used below, but useful for a held-out perplexity estimate)
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        #(wordids, wordcts) = slda.parse_new_docs(docset)

        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            # Only gamma is needed to evaluate the bound; discard the lambda
            # sufficient statistics from the test-set E-step.
            new_lambda = None
            lhood = slda.batch_bound(gamma_test)

            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
Example #4
        for j in range(i, min(i + maxthreads, n)):
            wtlist.append(WikiThread())
            wtlist[len(wtlist) - 1].start()
        for j in range(i, min(i + maxthreads, n)):
            wtlist[j].join()
    return (WikiThread.articles, WikiThread.articlenames)
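# A minimal, hypothetical reconstruction of the context the fragment above
# implies (the outer loop and the WikiThread class are not shown in the
# excerpt): threads are started in groups of at most `maxthreads` and then
# joined, so no more than `maxthreads` downloads run concurrently, while
# `wtlist` keeps one entry per requested article so that `wtlist[j].join()`
# lines up with article index j.
#
#     wtlist = list()
#     for i in range(0, n, maxthreads):
#         for j in range(i, min(i + maxthreads, n)):
#             wtlist.append(WikiThread())
#             wtlist[len(wtlist) - 1].start()
#         for j in range(i, min(i + maxthreads, n)):
#             wtlist[j].join()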


class WikipediaCorpus(Corpus):
    def __init__(self, name="wiki"):
        Corpus.__init__(self, name)

    def docs(self, num_docs, train=True):
        # The train flag is ignored: every call downloads a fresh set of random
        # articles, so there is no fixed held-out split for this corpus.
        return get_random_wikipedia_articles(num_docs)


if __name__ == '__main__':

    # Quick smoke test: fetch a single random article and time the download.
    c = WikipediaCorpus()

    t0 = time.time()

    (articles, articlenames) = c.docs(1)
    for i in range(0, len(articles)):
        print articlenames[i]

    t1 = time.time()
    print 'took %f' % (t1 - t0)
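Examples #1 through #3 assume that any corpus object exposes the same small interface WikipediaCorpus implements above: a name handed to the Corpus base class, and a docs(num_docs, train=True) method returning parallel lists of document texts and document titles. A minimal sketch of that implied interface; this Corpus class is a stand-in for illustration, not the project's actual base class:

class Corpus(object):
    """Stand-in base class capturing the interface the examples rely on."""

    def __init__(self, name):
        self.name = name

    def docs(self, num_docs, train=True):
        """Return (documents, names): num_docs raw texts plus their titles.

        train=True should draw a fresh training batch; train=False should
        return held-out documents, as in corpus.docs(batchsize * 5, False).
        """
        raise NotImplementedError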
