def process_corpus(batchsize, K, corpus, runs):
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    slda = streamlda.StreamLDA(K, 1./K, 1./K, 1., 0.7)

    # Hold out a fixed set of documents for periodic likelihood checks
    (test_set, test_names) = corpus.docs(batchsize * 5, False)

    for iteration in xrange(0, runs):
        print '-----------------------------------'
        print ' Iteration %d ' % iteration
        print '-----------------------------------'

        # Get some new articles from the selected corpus
        (docset, articlenames) = corpus.docs(batchsize)

        # Give them to online LDA
        (gamma, bound) = slda.update_lambda(docset)

        # Compute an estimate of held-out perplexity
        wordids = slda.recentbatch['wordids']
        wordcts = slda.recentbatch['wordcts']
        #(wordids, wordcts) = slda.parse_new_docs(docset)

        # Every 10 mini-batches, estimate the variational bound on the
        # held-out set and print the top words in each topic
        if iteration % 10 == 0:
            gamma_test, new_lambda = slda.do_e_step(test_set)
            new_lambda = None
            lhood = slda.batch_bound(gamma_test)
            print_topics(slda._lambda, 10)
            print "Held-out likelihood", lhood
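# A minimal sketch (not part of the original script) of how the variational
# bound returned by update_lambda can be turned into a held-out perplexity
# estimate; the formula mirrors the perplexity code later in this file.
# `total_docs` corresponds to slda._D and `wordcts` to the per-document count
# vectors of the mini-batch.
import numpy as n

def perplexity_from_bound(bound, batch_docs, total_docs, wordcts):
    # Rescale the mini-batch bound to a per-word figure over the whole corpus,
    # then exponentiate its negative to get a perplexity estimate
    perwordbound = bound * len(batch_docs) / (total_docs * sum(map(sum, wordcts)))
    return n.exp(-perwordbound)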
def main(): """ Applies streamLDA to test data, currently either 20 newsgroups or wikipedia. The wikipedia option downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. This is nice for breadth of examples, but is not precisely repeatable since the articles are random. 20 newsgroups provides data on which a repeatable run can be performed. """ # The number of documents to analyze each iteration batchsize = 10 #64 # The number of topics K = 10 assert len( sys.argv ) == 3, "usage: ./stream_corpus corpus_name num_runs\ncorpus options: 20news, wikipedia" if sys.argv[1] == 'wikipedia': corpus = WikipediaCorpus() elif sys.argv[1] == '20news': corpus = TwentyNewsCorpus( "20_news", "data/20_news_date", ) else: print 'options not supported. please try again.' sys.exit() runs = int(sys.argv[2]) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 slda = streamlda.StreamLDA(K, 1. / K, 1. / K, 1., 0.7) (test_set, test_names) = corpus.docs(batchsize * 5, False) for iteration in xrange(0, runs): print '-----------------------------------' print ' Iteration %d ' % iteration print '-----------------------------------' # Get some new articles from the selected corpus (docset, articlenames) = \ corpus.docs(batchsize) # Give them to online LDA (gamma, bound) = slda.update_lambda(docset) # Compute an estimate of held-out perplexity wordids = slda.recentbatch['wordids'] wordcts = slda.recentbatch['wordcts'] #(wordids, wordcts) = slda.parse_new_docs(docset) if iteration % 10 == 0: gamma_test, new_lambda = slda.do_e_step(test_set) new_lambda = None lhood = slda.batch_bound(gamma_test) print_topics(slda._lambda, 10) print "Held-out likelihood", lhood
def main(): """ Applies streamLDA to test data, currently either 20 newsgroups or wikipedia. The wikipedia option downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. This is nice for breadth of examples, but is not precisely repeatable since the articles are random. 20 newsgroups provides data on which a repeatable run can be performed. """ # The number of documents to analyze each iteration batchsize = 64 # The number of topics K = 50 tweets = open(sys.argv[1], 'r') runs = int(sys.argv[2]) parser = MeCabParser.Parser() def parse(d): return parser.parse(d, to_unicode=True) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 slda = streamlda.StreamLDA(K, 1. / K, 1. / K, 1., 0.7, parse=parse) test_set = [] for i in range(batchsize): t = json.loads(tweets.readline())['text'] test_set.append(t) for iteration in xrange(0, runs): print '-----------------------------------' print ' Iteration %d ' % iteration print '-----------------------------------' docset = [] for i in range(batchsize): tweet = json.loads(tweets.readline())['text'] docset.append(tweet) # Give them to online LDA (gamma, bound) = slda.update_lambda(docset) # Compute an estimate of held-out perplexity wordids = slda.recentbatch['wordids'] wordcts = slda.recentbatch['wordcts'] #(wordids, wordcts) = slda.parse_new_docs(docset) if iteration % 10 == 0: gamma_test, new_lambda = slda.do_e_step(test_set) new_lambda = None lhood = slda.batch_bound(gamma_test) print_topics(slda._lambda, 10) print "Held-out likelihood", lhood
def main(): """ Applies streamLDA to test data, currently either 20 newsgroups or wikipedia. The wikipedia option downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. This is nice for breadth of examples, but is not precisely repeatable since the articles are random. 20 newsgroups provides data on which a repeatable run can be performed. """ # The number of documents to analyze each iteration batchsize = 64 # The number of topics K = 50 tweets = open(sys.argv[1], 'r') runs = int(sys.argv[2]) parser = MeCabParser.Parser() def parse(d): return parser.parse(d, to_unicode=True) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 slda = streamlda.StreamLDA(K, 1./K, 1./K, 1., 0.7, parse=parse) test_set = [] for i in range(batchsize): t = json.loads(tweets.readline())['text'] test_set.append(t) for iteration in xrange(0, runs): print '-----------------------------------' print ' Iteration %d ' % iteration print '-----------------------------------' docset = [] for i in range(batchsize): tweet = json.loads(tweets.readline())['text'] docset.append(tweet) # Give them to online LDA (gamma, bound) = slda.update_lambda(docset) # Compute an estimate of held-out perplexity wordids = slda.recentbatch['wordids'] wordcts = slda.recentbatch['wordcts'] #(wordids, wordcts) = slda.parse_new_docs(docset) if iteration % 10 == 0: gamma_test, new_lambda = slda.do_e_step(test_set) new_lambda = None lhood = slda.batch_bound(gamma_test) print_topics(slda._lambda, 10) print "Held-out likelihood", lhood
def main(): """ Applies streamLDA to test data, currently either 20 newsgroups or wikipedia. The wikipedia option downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. This is nice for breadth of examples, but is not precisely repeatable since the articles are random. 20 newsgroups provides data on which a repeatable run can be performed. """ # The number of documents to analyze each iteration batchsize = 10 #64 # The number of topics K = 10 assert len(sys.argv) == 3, "usage: ./stream_corpus corpus_name num_runs\ncorpus options: 20news, wikipedia" if sys.argv[1] == 'wikipedia': corpus = WikipediaCorpus() elif sys.argv[1] == '20news': corpus = TwentyNewsCorpus("20_news", "data/20_news_date", ) else: print 'options not supported. please try again.' sys.exit() runs = int(sys.argv[2]) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 slda = streamlda.StreamLDA(K, 1./K, 1./K, 1., 0.7) (test_set, test_names) = corpus.docs(batchsize * 5, False) for iteration in xrange(0, runs): print '-----------------------------------' print ' Iteration %d ' % iteration print '-----------------------------------' # Get some new articles from the selected corpus (docset, articlenames) = \ corpus.docs(batchsize) # Give them to online LDA (gamma, bound) = slda.update_lambda(docset) # Compute an estimate of held-out perplexity wordids = slda.recentbatch['wordids'] wordcts = slda.recentbatch['wordcts'] #(wordids, wordcts) = slda.parse_new_docs(docset) if iteration % 10 == 0: gamma_test, new_lambda = slda.do_e_step(test_set) new_lambda = None lhood = slda.batch_bound(gamma_test) print_topics(slda._lambda, 10) print "Held-out likelihood", lhood
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 10 #64 # The number of topics K = 10 if (len(sys.argv) < 3): corpus = WikipediaCorpus() else: assert sys.argv[2] == "20", "Only non-wikipedia corpus supported is 20 newsgroups" corpus = TwentyNewsCorpus("20_news", "data/20_news_date", ) if (len(sys.argv) < 2): runs = 50 else: runs = int(sys.argv[1]) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 slda = streamlda.StreamLDA(K, 1./K, 1./K, 1., 0.7) (test_set, test_names) = corpus.docs(batchsize * 5, False) for iteration in xrange(0, runs): print '-----------------------------------' print ' Iteration %d ' % iteration print '-----------------------------------' # Download some articles (docset, articlenames) = \ corpus.docs(batchsize) # Give them to online LDA (gamma, bound) = slda.update_lambda(docset) # Compute an estimate of held-out perplexity wordids = slda.recentbatch['wordids'] wordcts = slda.recentbatch['wordcts'] #(wordids, wordcts) = slda.parse_new_docs(docset) if iteration % 10 == 0: gamma_test, new_lambda = slda.do_e_step(test_set) new_lambda = None lhood = slda.batch_bound(gamma_test) print_topics(slda._lambda, 10) print "Held-out likelihood", lhood
def main(): device = "cuda" if torch.cuda.is_available() else "cpu" parser = argparse.ArgumentParser() parser.add_argument("--K", type=int, default=5, help="Number of topics") parser.add_argument( "--model", choices=["slda", "pfslda"], default="pfslda", help="Specify which model to train", ) parser.add_argument("--p", type=float, default=0.15, help="Value for the switch prior for pf-sLDA") parser.add_argument("--alpha", type=bool, default=True, help="Specify if alpha is fixed") parser.add_argument( "--path", type=str, default=None, help="Path to saved model to load before training", ) parser.add_argument("--lr", type=float, default=0.025, help="Initial learning rate") parser.add_argument("--lambd", type=float, default=0, help="Supervised task regularizer weight") parser.add_argument("--num_epochs", type=int, default=500, help="Number of epochs to train") parser.add_argument( "--check", type=int, default=10, help="Number of epochs per stats check (print/save)", ) parser.add_argument( "--batch_size", type=int, default=100, ) parser.add_argument( "--y_thresh", type=float, default=None, help="Threshold for yscore (RMSE or AUC) to save model.", ) parser.add_argument( "--c_thresh", type=float, default=None, help="Threshold for topic coherence to save model.", ) args = parser.parse_args() # make sure args valid if args.K < 1: raise ValueError("Invalid number of topics.") p = args.p if p > 1 or p < 0: raise ValueError("Invalid switch prior p.") p = torch.tensor(p).to(device) p = torch.log(p / (1 - p)) # load dataset and specify target type d = load_Pang_Lee() W = d["W"] W_val = d["W_val"] y = d["y"] y_val = d["y_val"] W_test = d["W_test"] y_test = d["y_test"] vocab = d["vocab"] version = "real" V = W.shape[1] M = W.shape[0] M_val = W_val.shape[0] # instantiate model if args.model == "slda": model = sLDA(args.K, V, M, M_val, args.alpha, device) elif args.model == "pfslda": model = pfsLDA(args.K, V, M, M_val, p, args.alpha, device) model.to(device) # load saved model if path specified if args.path: state_dict = torch.load(args["path"], map_location=device) model.load_state_dict(state_dict) kwargs = { "W": W, "y": y, "lr": args.lr, "lambd": args.lambd, "num_epochs": args.num_epochs, "check": args.check, "batch_size": args.batch_size, "version": version, "W_val": W_val, "y_val": y_val, "device": device, "y_thresh": args.y_thresh, "c_thresh": args.c_thresh, } fit(model, **kwargs) print_topics(model, 10, vocab)
        (wordids, wordcts) = slda.parse_new_docs(batch_docs)
        perwordbound = bound * len(batch_docs) / (slda._D * sum(map(sum, wordcts)))
        perplexity = n.exp(-perwordbound)
        perplexities.append(perplexity)

        # if (this_run % 10 == 0):
        #     n.savetxt('lambda-%d.dat' % this_run, slda._lambda.as_matrix())
        #     n.savetxt('gamma-%d.dat' % this_run, gamma)

        print '%d: rho_t = %f, held-out perplexity estimate = %f' % \
            (this_run, slda._rhot, perplexity)
        perp.write("%d,%f\n" % (this_run, perplexity))
        perp.flush()
        this_run += 1

    perp.close()
    print_topics(slda._lambda, 50)

    # set up a plot and show the results (xlabel/ylabel/title/plot/show
    # assume a pylab-style import)
    xlabel('Run')
    ylabel('Perplexity')
    title('Perplexity Values - Sanity Check')
    plot(range(num_runs), perplexities)
    show()

########NEW FILE########
__FILENAME__ = twenty_news
# onlineldavb.py: Package of functions for fitting Latent Dirichlet
# Allocation (LDA) with online variational Bayes (VB).
#
# Copyright (C) 2011 Jordan Boyd-Graber
#