Example #1
def process_corpus(batchsize, K, corpus, runs):
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1./K, 1./K, 1., 0.7)

    (test_set, test_names) = corpus.docs(batchsize * 5, False)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'
        
        # Get some new articles from the selected corpus
        (docset, articlenames) = \
            corpus.docs(batchsize)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)
        # Keep the parsed batch (word ids and counts) around for a held-out
        # perplexity estimate (see the sketch after this example)
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        #(wordids, wordcts) = slda.parse_new_docs(docset)

        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            new_lambda = None
            lhood = slda.batch_bound(gamma_test)

            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
Example #2
def main():
    """
    Applies streamLDA to test data, currently either 20 newsgroups or
    wikipedia. The wikipedia option downloads and analyzes a bunch of random
    Wikipedia articles using online VB for LDA. This is nice for breadth of
    examples, but is not precisely repeatable since the articles are random. 20
    newsgroups provides data on which a repeatable run can be performed.
    """

    # The number of documents to analyze each iteration
    batchsize = 10  #64
    # The number of topics
    K = 10

    assert len(
        sys.argv
    ) == 3, "usage: ./stream_corpus corpus_name num_runs\ncorpus options: 20news, wikipedia"
    if sys.argv[1] == 'wikipedia':
        corpus = WikipediaCorpus()
    elif sys.argv[1] == '20news':
        corpus = TwentyNewsCorpus(
            "20_news",
            "data/20_news_date",
        )
    else:
        print 'Corpus option not supported. Please choose 20news or wikipedia.'
        sys.exit()
    runs = int(sys.argv[2])

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1. / K, 1. / K, 1., 0.7)

    (test_set, test_names) = corpus.docs(batchsize * 5, False)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'

        # Get some new articles from the selected corpus
        (docset, articlenames) = \
            corpus.docs(batchsize)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)
        # Keep the parsed batch (word ids and counts) in case a held-out
        # perplexity estimate is wanted
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        #(wordids, wordcts) = slda.parse_new_docs(docset)

        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            new_lambda = None
            lhood = slda.batch_bound(gamma_test)

            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
Example #3
def main():
    """
    Applies streamLDA to test data, currently either 20 newsgroups or
    wikipedia. The wikipedia option downloads and analyzes a bunch of random
    Wikipedia articles using online VB for LDA. This is nice for breadth of
    examples, but is not precisely repeatable since the articles are random. 20
    newsgroups provides data on which a repeatable run can be performed.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The number of topics
    K = 50

    tweets = open(sys.argv[1], 'r')
    runs = int(sys.argv[2])

    parser = MeCabParser.Parser()

    def parse(d):
        return parser.parse(d, to_unicode=True)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1. / K, 1. / K, 1., 0.7, parse=parse)

    test_set = []
    for i in range(batchsize):
        t = json.loads(tweets.readline())['text']
        test_set.append(t)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'

        docset = []
        for i in range(batchsize):
            tweet = json.loads(tweets.readline())['text']
            docset.append(tweet)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)
        # Keep the parsed batch (word ids and counts) in case a held-out
        # perplexity estimate is wanted
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        #(wordids, wordcts) = slda.parse_new_docs(docset)

        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            new_lambda = None
            lhood = slda.batch_bound(gamma_test)

            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
Example #4
def main():
    """
    Applies streamLDA to test data, currently either 20 newsgroups or
    wikipedia. The wikipedia option downloads and analyzes a bunch of random
    Wikipedia articles using online VB for LDA. This is nice for breadth of
    examples, but is not precisely repeatable since the articles are random. 20
    newsgroups provides data on which a repeatable run can be performed.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The number of topics
    K = 50

    tweets = open(sys.argv[1], 'r')
    runs = int(sys.argv[2])        
    
    parser = MeCabParser.Parser()
    
    def parse(d):
        return parser.parse(d, to_unicode=True)
    
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1./K, 1./K, 1., 0.7, parse=parse)

    test_set = []
    for i in range(batchsize):
        t = json.loads(tweets.readline())['text']
        test_set.append(t)
    
    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'
        
        docset = []
        for i in range(batchsize):
            tweet = json.loads(tweets.readline())['text']
            docset.append(tweet)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)
        # Keep the parsed batch (word ids and counts) in case a held-out
        # perplexity estimate is wanted
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        #(wordids, wordcts) = slda.parse_new_docs(docset)

        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            new_lambda = None
            lhood = slda.batch_bound(gamma_test)

            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
Example #5
def main():
    """
    Applies streamLDA to test data, currently either 20 newsgroups or
    wikipedia. The wikipedia option downloads and analyzes a bunch of random
    Wikipedia articles using online VB for LDA. This is nice for breadth of
    examples, but is not precisely repeatable since the articles are random. 20
    newsgroups provides data on which a repeatable run can be performed.
    """

    # The number of documents to analyze each iteration
    batchsize = 10 #64
    # The number of topics
    K = 10

    assert len(sys.argv) == 3, "usage: ./stream_corpus corpus_name num_runs\ncorpus options: 20news, wikipedia"
    if sys.argv[1] == 'wikipedia':
        corpus = WikipediaCorpus()
    elif sys.argv[1] == '20news':
        corpus = TwentyNewsCorpus("20_news", "data/20_news_date")
    else:
        print 'Corpus option not supported. Please choose 20news or wikipedia.'
        sys.exit()
    runs = int(sys.argv[2])        

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1./K, 1./K, 1., 0.7)

    (test_set, test_names) = corpus.docs(batchsize * 5, False)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'
        
        # Get some new articles from the selected corpus
        (docset, articlenames) = \
            corpus.docs(batchsize)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)
        # Keep the parsed batch (word ids and counts) in case a held-out
        # perplexity estimate is wanted
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        #(wordids, wordcts) = slda.parse_new_docs(docset)

        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            new_lambda = None
            lhood = slda.batch_bound(gamma_test)

            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
Example #6
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 10 #64
    # The number of topics
    K = 10

    if len(sys.argv) < 3:
        corpus = WikipediaCorpus()
    else:
        assert sys.argv[2] == "20", "The only non-wikipedia corpus supported is 20 newsgroups"
        corpus = TwentyNewsCorpus("20_news", "data/20_news_date")

    if len(sys.argv) < 2:
        runs = 50
    else:
        runs = int(sys.argv[1])

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    slda = streamlda.StreamLDA(K, 1./K, 1./K, 1., 0.7)

    (test_set, test_names) = corpus.docs(batchsize * 5, False)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print '         Iteration %d              ' % iteration
        print '-----------------------------------'
        
        # Download some articles
        (docset, articlenames) = \
            corpus.docs(batchsize)
        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)
        # Keep the parsed batch (word ids and counts) in case a held-out
        # perplexity estimate is wanted
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        #(wordids, wordcts) = slda.parse_new_docs(docset)

        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            new_lambda = None
            lhood = slda.batch_bound(gamma_test)

            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
Example #7
def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    parser = argparse.ArgumentParser()
    parser.add_argument("--K", type=int, default=5, help="Number of topics")
    parser.add_argument(
        "--model",
        choices=["slda", "pfslda"],
        default="pfslda",
        help="Specify which model to train",
    )
    parser.add_argument("--p",
                        type=float,
                        default=0.15,
                        help="Value for the switch prior for pf-sLDA")
    parser.add_argument("--alpha",
                        type=bool,
                        default=True,
                        help="Specify if alpha is fixed")
    parser.add_argument(
        "--path",
        type=str,
        default=None,
        help="Path to saved model to load before training",
    )
    parser.add_argument("--lr",
                        type=float,
                        default=0.025,
                        help="Initial learning rate")
    parser.add_argument("--lambd",
                        type=float,
                        default=0,
                        help="Supervised task regularizer weight")
    parser.add_argument("--num_epochs",
                        type=int,
                        default=500,
                        help="Number of epochs to train")
    parser.add_argument(
        "--check",
        type=int,
        default=10,
        help="Number of epochs per stats check (print/save)",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=100,
    )
    parser.add_argument(
        "--y_thresh",
        type=float,
        default=None,
        help="Threshold for yscore (RMSE or AUC) to save model.",
    )
    parser.add_argument(
        "--c_thresh",
        type=float,
        default=None,
        help="Threshold for topic coherence to save model.",
    )

    args = parser.parse_args()

    # make sure args valid
    if args.K < 1:
        raise ValueError("Invalid number of topics.")

    p = args.p
    if p > 1 or p < 0:
        raise ValueError("Invalid switch prior p.")
    p = torch.tensor(p).to(device)
    p = torch.log(p / (1 - p))

    # load dataset and specify target type
    d = load_Pang_Lee()
    W = d["W"]
    W_val = d["W_val"]
    y = d["y"]
    y_val = d["y_val"]
    W_test = d["W_test"]
    y_test = d["y_test"]
    vocab = d["vocab"]
    version = "real"

    V = W.shape[1]
    M = W.shape[0]
    M_val = W_val.shape[0]

    # instantiate model
    if args.model == "slda":
        model = sLDA(args.K, V, M, M_val, args.alpha, device)
    elif args.model == "pfslda":
        model = pfsLDA(args.K, V, M, M_val, p, args.alpha, device)
    model.to(device)

    # load saved model if path specified
    if args.path:
        state_dict = torch.load(args.path, map_location=device)
        model.load_state_dict(state_dict)

    kwargs = {
        "W": W,
        "y": y,
        "lr": args.lr,
        "lambd": args.lambd,
        "num_epochs": args.num_epochs,
        "check": args.check,
        "batch_size": args.batch_size,
        "version": version,
        "W_val": W_val,
        "y_val": y_val,
        "device": device,
        "y_thresh": args.y_thresh,
        "c_thresh": args.c_thresh,
    }

    fit(model, **kwargs)
    print_topics(model, 10, vocab)
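A hypothetical command line for this pf-sLDA trainer (the script name is assumed; only flags defined above are used). One caveat: --alpha is declared with type=bool, and argparse converts any non-empty string (including "False") to True, so in practice the flag only preserves its default.

#   python train_pfslda.py --model pfslda --K 5 --p 0.15 --lr 0.025 --lambd 1.0 --num_epochs 500 --batch_size 100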
Example #8
    (wordids, wordcts) = slda.parse_new_docs(batch_docs)
    perwordbound = bound * len(batch_docs) / (slda._D * sum(map(sum, wordcts)))
    perplexity = n.exp(-perwordbound)
    perplexities.append(perplexity)

#    if (this_run % 10 == 0):                                                         
#        n.savetxt('lambda-%d.dat' % this_run, slda._lambda.as_matrix())
#        n.savetxt('gamma-%d.dat' % this_run, gamma)

    print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
        (this_run, slda._rhot, perplexity)
    perp.write("%d,%f\n" % (this_run, perplexity))
    perp.flush()
    this_run += 1
perp.close()
print_topics(slda._lambda, 50)


# set up a plot and show the results
xlabel('Run')
ylabel('Perplexity')
title('Perplexity Values - Sanity Check')
plot(range(num_runs), perplexities)
show()
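The bare xlabel/ylabel/title/plot/show calls above presumably come from a pylab-style star import; an equivalent explicit-import sketch of the same sanity-check plot:

import matplotlib.pyplot as plt

def plot_perplexities(perplexities):
    # same plot as above, with explicit imports rather than bare pylab names
    plt.xlabel('Run')
    plt.ylabel('Perplexity')
    plt.title('Perplexity Values - Sanity Check')
    plt.plot(range(len(perplexities)), perplexities)
    plt.show()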



Example #9
    (wordids, wordcts) = slda.parse_new_docs(batch_docs)
    perwordbound = bound * len(batch_docs) / (slda._D * sum(map(sum, wordcts)))
    perplexity = n.exp(-perwordbound)
    perplexities.append(perplexity)

    #    if (this_run % 10 == 0):
    #        n.savetxt('lambda-%d.dat' % this_run, slda._lambda.as_matrix())
    #        n.savetxt('gamma-%d.dat' % this_run, gamma)

    print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
        (this_run, slda._rhot, perplexity)
    perp.write("%d,%f\n" % (this_run, perplexity))
    perp.flush()
    this_run += 1
perp.close()
print_topics(slda._lambda, 50)

# set up a plot and show the results
xlabel('Run')
ylabel('Perplexity')
title('Perplexity Values - Sanity Check')
plot(range(num_runs), perplexities)
show()
