def main(): """ Analyzes scraped pages using scikit-learn.LDA """ # The number of topics K = 10 # no of documents D = 300 n_features = 1000 # Our vocabulary vocab = list(set(file('./vocab').readlines())) W = len(vocab) # Add terms and topics to the DB db.init() db.add_terms(vocab) db.add_topics(K) olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # grab documents ### Load your scraped pages, re-tokenize, and vectorize result. docset, docnames = [], [] for filename in os.listdir(os.getcwd()): if filename.endswith('.html'): tree = html.parse(filename) try: encoding = tree.xpath('//meta/@charset')[0] except IndexError: encoding = 'utf-8' with open(filename) as page: rawtext = page.read() try: rawtext = rawtext.decode(encoding, errors='backslashreplace') except TypeError: continue # encoding issues, see http://stackoverflow.com/questions/19527279/python-unicode-to-ascii-conversion docset += [clean_html(rawtext)] docnames += [filename[:-5]] if not(len(docset) % 10): print("loaded " + str(len(docset)) + " documents") # Give them to online LDA # Also computes an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) (gamma, bound) = olda.update_lambda(wordids, wordcts) # Arrays for adding batches of data to the DB # doc_array = [] # doc_term_array = [] # for d in range(len(docnames)): # doc_array.append((docnames[d], docset[d])) doc_array = zip(docnames, docset) # Add a batch of docs to the DB; this is the one DB task that is not in # the separate DB write thread since later tasks depend on having doc ids. # Since writes take so long, this also balaces the two threads time-wise. doc_ids = db.add_docs(doc_array) doc_topic_array = [] for d in range(len(gamma)): doc_size = len(docset[d]) for k in range(len(gamma[d])): doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size)) db.add_doc_topics(doc_topic_array) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (1, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. numpy.savetxt('lambda-%d.dat' % 1, olda._lambda) numpy.savetxt('gamma-%d.dat' % 1, gamma) topic_terms_array = [] for topic in range(len(olda._lambda)): lambda_sum = sum(olda._lambda[topic]) for term in range(len(olda._lambda[topic])): topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum)) db.update_topic_terms(K, topic_terms_array) gc.collect() # probably not necesary, but precautionary for long runs db.print_task_update() # The DB thread ends only when it has both run out of tasks and it has been # signaled that it will not be recieving any more tasks db.increment_batch_count() db.signal_end()
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Add terms and topics to the DB db.init() db.add_terms(vocab) db.add_topics(K) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) # Arrays for adding batches of data to the DB doc_array = [] doc_term_array = [] for d in range(len(articlenames)): doc_array.append((articlenames[d], docset[d])) # Add a batch of docs to the DB; this is the one DB task that is not in # the separate DB write thread since later tasks depend on having doc ids. # Since writes take so long, this also balaces the two threads time-wise. doc_ids = db.add_docs(doc_array) doc_topic_array = [] for d in range(len(gamma)): doc_size = len(docset[d]) for k in range(len(gamma[d])): doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size)) db.add_doc_topics(doc_topic_array) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma) topic_terms_array =[] for topic in range(len(olda._lambda)): lambda_sum = sum(olda._lambda[topic]) for term in range(len(olda._lambda[topic])): topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum)) db.update_topic_terms(K, topic_terms_array) gc.collect() # probably not necesary, but precautionary for long runs db.print_task_update() db.increment_batch_count() # The DB thread ends only when it has both run out of tasks and it has been # signaled that it will not be recieving any more tasks db.signal_end()
# --- Fragment: body of a loop over scraped documents (enclosing loop headers
# not shown). `vocab`, `filename`, `title`, `subtitle`, `alltxt`, `olda`,
# `per_time`, `db`, `i`, `s`, and `D` are defined in the enclosing scope;
# `time` is the standard-library module.
            # Count in-vocabulary words to get the document length
            if word in vocab:
                length += 1

        # TODO: this should be done less hackishly
        t = int(filename.split('/')[6])
        if length == 0:
            continue
        db.add_doc(title, subtitle, length, filename, t)
        (gamma, ss) = olda.do_e_step(alltxt)
        if t not in per_time:
            per_time[t] = ss
        else:
            per_time[t] += ss
        db.add_doc_topics(filename, gamma.tolist()[0])

        # Periodically report progress and a rough remaining-time estimate
        if i % 100 == 0:
            tn = (time.time() - s) / 3600
            rem = D - i
            time_rem = rem * tn / (i+1)
            print "doc %d (%d)" % (i, t), tn, str(time_rem)+'h', \
                str(time_rem/24)+'d'
        i += 1

    # slice up topics by time
    print "calculating time-slice topics"
    for t in per_time:
        per_time[t] += olda._eta
        db.add_time_topics(t, per_time[t])
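
    # After the loop above, per_time[t] holds the accumulated sufficient
    # statistics (a K x W matrix from olda.do_e_step) plus the eta prior for
    # time slice t. The sketch below is a hypothetical helper (not part of
    # this project) that turns each slice into normalized per-topic term
    # weights, mirroring the lambda normalization used in the scripts above.
    def time_slice_topic_terms(per_time):
        normalized = {}
        for t, slice_stats in per_time.items():
            rows = []
            for k in range(len(slice_stats)):
                row_sum = sum(slice_stats[k])
                rows.append(slice_stats[k] / row_sum)   # per-topic term proportions
            normalized[t] = rows
        return normalized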