def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() # The number of documents to analyze each iteration D = 1000 batchsize = 100 # The number of topics K = 30 # Our vocabulary vocab = file('./com_all_words.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(D / batchsize / size): # slaves # Download some articles docset = [] linecache.clearcache() startpoint = iteration * batchsize * size + batchsize * rank + 1 if startpoint > D: # search to the end break # stop # get the paper keywords in batches for i in range(batchsize): f1 = open('com_all_key.txt', 'r') docset.append( linecache.getline('com_all_key.txt', min(D, startpoint))[:-1]) startpoint = startpoint + 1 # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration+1, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. gammas = comm.gather(gamma, root=0) lambdas = comm.gather(olda._lambda, root=0) if rank == 0: gamma_result = numpy.vstack((x for x in gammas)) lambda_result = numpy.vstack((x for x in lambdas)) numpy.savetxt('lambda_parallel.dat', olda._lambda) numpy.savetxt('gamma_parallel.dat', gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() # The number of documents to analyze each iteration D = 1000 batchsize = 100 # The number of topics K = 30 # Our vocabulary vocab = file('./com_all_words.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(D/batchsize/size): # slaves # Download some articles docset = [] linecache.clearcache() startpoint = iteration * batchsize * size + batchsize * rank + 1 if startpoint > D: # search to the end break # stop # get the paper keywords in batches for i in range(batchsize): f1 = open('com_all_key.txt','r') docset.append(linecache.getline('com_all_key.txt', min(D, startpoint))[:-1]) startpoint = startpoint + 1 # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration+1, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. gammas = comm.gather(gamma, root = 0) lambdas = comm.gather(olda._lambda, root = 0) if rank == 0: gamma_result = numpy.vstack((x for x in gammas)) lambda_result = numpy.vstack((x for x in lambdas)) numpy.savetxt('lambda_parallel.dat', olda._lambda) numpy.savetxt('gamma_parallel.dat', gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() # The number of documents to analyze each iteration D = 3.3e6 batchsize = 10 # The number of topics K = 100 # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(3): # slaves # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration+1, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. gammas = comm.gather(gamma, root = 0) lambdas = comm.gather(olda._lambda, root = 0) if rank == 0: gamma_result = numpy.vstack((x for x in gammas)) lambda_result = numpy.vstack((x for x in lambdas)) numpy.savetxt('lambda_parallel.dat', olda._lambda) numpy.savetxt('gamma_parallel.dat', gamma)