def process(self, docset):
    # Give them to online LDA
    (gamma, bound) = self.old_alpha.update_lambda_docs(docset)
    # Compute an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, self.old_alpha._vocab)
    perwordbound = bound * len(docset) / (self.D * sum(map(sum, wordcts)))
    print("%d: rho_t = %f, held-out perplexity estimate = %f" %
          (self.iteration, self.old_alpha._rhot, numpy.exp(-perwordbound)))
    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.
    if (self.iteration % 10 == 0):
        numpy.savetxt('lambda-%d.dat' % self.iteration, self.old_alpha._lambda)
        numpy.savetxt('gamma-%d.dat' % self.iteration, gamma)
    # Print the top words of each topic after each mini-batch.
    for k in range(0, len(self.old_alpha._lambda)):
        lambdak = list(self.old_alpha._lambda[k, :])
        lambdak = lambdak / sum(lambdak)
        temp = zip(lambdak, range(0, len(lambdak)))
        temp = sorted(temp, key=lambda x: x[0], reverse=True)
        print('topic %d:' % (k))
        # Feel free to change the "5" here to however many words fit your screen nicely.
        for i in range(0, 5):
            print('%20s \t---\t %.4f' % (self.vocab[temp[i][1]], temp[i][0]))
        print()
    self.iteration = self.iteration + 1
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration #batchsize = 64 batchsize = 32 # The total number of tweets #D=297861 D = 1163 # The number of topics #K = 20 K = 10 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('dictnostops.txt').readlines() W = len(vocab) #open rawdata #train_file = open("congress_train.txt") train_file = open("text.txt") train = train_file.readlines() # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 #olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 128., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) iter = 0 for iteration in range(0, documentstoanalyze): # Download some articles #(docset, articlenames) = \ #wikirandom.get_random_wikipedia_articles(batchsize) docset = train[iter:(iter+batchsize)] iter+=batchsize # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) print wordids print wordcts perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main(doc_list, vocab_file):
    batch_size = 64
    D = len(doc_list)   # number of documents
    K = 100             # number of topics
    vocab = file(vocab_file).readlines()
    W = len(vocab)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents, one batch per iteration.
    for iteration in range(0, int(D / batch_size)):
        docset = doc_list[batch_size*iteration:(iteration+1)*batch_size]
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d: rho_t = %f, held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def runLDA(self):
    for iteration in range(0, self.__documentstoanalyze):
        # Retrieve texts
        docset = self.__doc[iteration*self.__batchsize:iteration*self.__batchsize+self.__batchsize]
        # Give them to online LDA
        (gamma, bound) = self.__ldaObj.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, self.__ldaObj._vocab)
        perwordbound = bound * len(docset) / (self.__documentstoanalyze * sum(map(sum, wordcts)))
        print '%d: rho_t = %f, held-out perplexity estimate = %f' % \
            (iteration, self.__ldaObj._rhot, numpy.exp(-perwordbound))
        # Save a temporary lambda for this iteration
        temp_lambda = self.__ldaObj._lambda
        # Save lambda and gamma
        if (iteration == 0):
            self.__lambda_all = temp_lambda
        else:
            self.__lambda_all = numpy.concatenate((self.__lambda_all, temp_lambda), axis=0)
        if (iteration == 0):
            self.__gamma_all = gamma
        else:
            self.__gamma_all = numpy.concatenate((self.__gamma_all, gamma), axis=0)
    numpy.savetxt('./data/lambda.dat', self.__lambda_all)
    numpy.savetxt('./data/gamma.dat', self.__gamma_all)
def scanAllDocs(cursor): n = 0 row = cursor.fetchone() for iteration in range(0, documentstoanalyze + 1): # Download some articles docset = list() i = 0 while row is not None and i < BATCHSIZE: docset.append(row.MyText) row = cursor.fetchone() i += 1 n += i print ("Docs analyzed: ", n) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print ('%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound))) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 100 == 0): numpy.savetxt('lambda_{0}-{1}.dat'.format(K, iteration), olda._lambda) numpy.savetxt('gamma_{0}-{1}.dat'.format(K, iteration), gamma) if n >= D-1: numpy.savetxt('lambda_{0}-final.dat'.format(K), olda._lambda) numpy.savetxt('gamma_{0}-final.dat'.format(K), gamma) break
def run_lda(meaningful_words_path, batchsize, K, GAMMA_ITER_TIMES): t = Task.create_new_lda_task() global file_counts, g_all_files file_counts = calc_file_counts() D = file_counts t.status = Task.TASK_STATUS_STARTED t.save() try: # Remove the formmer results # TODO: save it in dababases os.popen("rm yls_app/tools/lambda*") os.popen("rm yls_app/tools/gamma*") # How many documents to look at documentstoanalyze = int(D/batchsize) + 1 print 'target iteration %d'%(documentstoanalyze) # Our vocabulary, we didn't use vocabulary. vocab = read_vocab(meaningful_words_path) # Keep track of the last iteration last_iteration_perplexity = 0.0 # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7, GAMMA_ITER_TIMES) # Run until we've seen all documents. for iteration in range(documentstoanalyze): # print 'iteration ... %d'%iteration # Download some articles docset = get_article(iteration * batchsize, batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) t.infomation = '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) last_iteration_perplexity = numpy.exp(-perwordbound) print perwordbound,bound if iteration == documentstoanalyze -1: break print '%d,%f'%(iteration,last_iteration_perplexity) t.status = Task.TASK_STATUS_STARTED t.save() #print 'perplexity: %f'%(last_iteration_perplexity) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. #if (iteration % 10 == 0): #numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) #numpy.savetxt('gamma-%d.dat' % iteration, gamma) # numpy.savetxt(LDARunner.LAMBDA_FILE, olda._lambda) numpy.savetxt(LDARunner.GAMMA_FILE, gamma) except Exception,e: print e t.infomation = "Exception:" + e.message Task.finish_task(t, False)
def main(): # LDA: a documents contains all the keywords of some journal/conference # equivalent to cluster keywords over journals/conferences journal_or_conference = sys.argv[1] num = int(sys.argv[2]) conn = jcke.get_db_conn() # default (num <=0 or > max number): figure out all the journals/conferences keywords if num <= 0 or (num >= 15151 and journal_or_conference == "journal") or (num >= 4545 and journal_or_conference == "conference"): query = """ SELECT COUNT(*) FROM ##journal_or_conference## """ query = query.replace("##journal_or_conference##", journal_or_conference) conn.cursor.execute(query) num = conn.cursor.fetchall() # number of journals/conferences to process # document parsing journal_conf_list = os.listdir("journal_conf_keyword") # check if txt files exist, then generate those docs if not num == len(journal_conf_list): jcke.journal_conf_keyword_generation(conn, num, journal_or_conference) journal_conf_list = os.listdir("journal_conf_keyword") # The number of journal/conference keyword sets in each batch batchsize = lambda num: num if num <= 100 else 100 batch = batchsize(num) iteration_times = int(num/batch) # The total number of journals/conferences DocNum = lambda journal_or_conference: 15151 if journal_or_conference == "journal" else 4545 D = DocNum(journal_or_conference) # The number of topics K = 100 # maybe some other numbers # Our vocabulary : we need some vocabulary set! vocab = dict() # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 online_LDA = lda.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* sooner than this.) for iteration in range(0, iteration_times): # getting documents (keyword sets) if iteration != iteration_times - 1: journal_conf_keyword_list = jcke.input_journal_conf_keywords(journal_conf_list[iteration * batch : (iteration + 1) * batch]) else: journal_conf_keyword_list = jcke.input_journal_conf_keywords(journal_conf_list[iteration * batch :]) # online LDA for keyword sets # here we update the relative function in the package (dangerous!) online_LDA._vocab = jcke.vocabulary_generation(journal_conf_keyword_list, online_LDA._vocab) (gamma, bound) = online_LDA.update_lambda(journal_conf_keyword_list) # Compute an estimate of held-out perplexity (keywordids, keywordcts) = lda.parse_doc_list(journal_conf_keyword_list, online_LDA._vocab) perkeywordbound = bound * len(journal_conf_keyword_list) / (D * sum(map(sum, keywordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, online_LDA._rhot, numpy.exp(-perkeywordbound)) # Save lambda, the parameters to the variational distributions over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in the last iteration. numpy.savetxt('lambda-%s.dat' % journal_or_conference, online_LDA._lambda) numpy.savetxt('gamma-%s.dat' % journal_or_conference, gamma)
def main(): articles = list() artnames = list() for line in file('./jacm/withIDAbstracts.txt').readlines(): combo = line.split('\t') artnames.append(combo[0]) articles.append(combo[1]) """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 1 # The total number of documents in Wikipedia D = len(artnames) # The number of topics K = 54 # How many documents to look at documentstoanalyze = len(artnames) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, D): # Download some articles docset = list() docset.append(articles[iteration]) articlenames = list() articlenames.append(artnames[iteration]) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) print bound perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. numpy.savetxt('./simpleLDA/gamma-%d.dat' % iteration, gamma) if (iteration % 50 == 0 or iteration == 616): numpy.savetxt('./simpleLDA/lambda-%d.dat' % iteration, olda._lambda)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 100 # The total number of documents in Wikipedia D = 1000 # The number of topics K = 100 # How many documents to look at documentstoanalyze = int(D / batchsize) # Our vocabulary vocab = file('./com_all_words.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles docset = [] counts = [] linecache.clearcache() startpoint = iteration * batchsize + 1 # get the paper keywords in batches for i in range(batchsize): f1 = open('com_all_key.txt', 'r') f2 = open('com_all.txt', 'r') docset.append( linecache.getline('com_all_key.txt', min(D, startpoint + i))[:-1]) counts.append( linecache.getline('com_all.txt', min(D, startpoint + i))[:-1]) # Give them to online LDA # print docset[0] (gamma, bound) = olda.update_lambda(docset, counts) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts) # print [olda._vocab[x] for x in docset[0].split(';')], wordids[0], wordcts[0] perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda_paper-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma_paper-%d.dat' % iteration, gamma)
def main(): # The number of documents to analyze each iteration. batchsize = args.batchsize # The total number of documents in the corpus. D = args.num_docs # The number of topics. K = args.num_topics # How many documents to look at documentstoanalyze = int(D / batchsize) # The vocabulary vocab = file(args.vocab_file).readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 alpha = 1. / K # prior on topic weights theta eta = 1. / K # prior on p(w|topic) Beta tau_0 = args.tau_0 # learning parameter to downweight early documents kappa = args.kappa # learning parameter; decay factor for influence of batches olda = onlineldavb.OnlineLDA(vocab, K, D, alpha, 1. / K, tau_0, kappa) dataset_file = open(args.dataset) start = time.time() for iteration in range(0, documentstoanalyze): # Read a batch of articles. docset = batch_read(dataset_file, batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): i = iteration pct = round((i * 1.0 / documentstoanalyze) * 100, 2) elapsed = int(time.time() - start) Printer( "Processed {0} batches. ~ {1}% complete. Elapsed time: {2}s". format(i, pct, elapsed)) if (iteration % args.model_out_freq == 0): numpy.savetxt( '{0}lambda-{1}.dat'.format(args.outdir, iteration), olda._lambda) numpy.savetxt( '{0}gamma-{1}.dat'.format(args.outdir, iteration), gamma) numpy.savetxt('{0}lambda-final.dat'.format(args.outdir), olda._lambda) numpy.savetxt('{0}gamma-final.dat'.format(args.outdir), gamma)
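# The batch-driven variants in this collection call a batch_read(dataset_file, batchsize)
# helper that is not shown. Below is a minimal sketch of what such a helper might look
# like, assuming the dataset file stores one plain-text document per line; the actual
# helper used with these scripts may differ.
def batch_read(dataset_file, batchsize):
    # Read up to `batchsize` documents (lines) from an already-open file handle.
    docset = []
    for _ in range(batchsize):
        line = dataset_file.readline()
        if not line:
            break  # end of file: return a short (possibly empty) batch
        docset.append(line.strip())
    return docset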
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 100 # The total number of documents in Wikipedia D = 1000 # The number of topics K = 100 # How many documents to look at documentstoanalyze = int(D/batchsize) # Our vocabulary vocab = file('./com_all_words.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles docset = [] counts = [] linecache.clearcache() startpoint = iteration * batchsize + 1 # get the paper keywords in batches for i in range(batchsize): f1 = open('com_all_key.txt','r') f2 = open('com_all.txt', 'r') docset.append(linecache.getline('com_all_key.txt', min(D, startpoint + i))[:-1]) counts.append(linecache.getline('com_all.txt', min(D, startpoint + i))[:-1]) # Give them to online LDA # print docset[0] (gamma, bound) = olda.update_lambda(docset, counts) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts) # print [olda._vocab[x] for x in docset[0].split(';')], wordids[0], wordcts[0] perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda_paper-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma_paper-%d.dat' % iteration, gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA print docset[0] print docset[1] print docset[2] (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main(): # The number of documents to analyze each iteration. batchsize = args.batchsize # The total number of documents in the corpus. D = args.num_docs # The number of topics. K = args.num_topics # How many documents to look at documentstoanalyze = int(D/batchsize) # The vocabulary vocab = file(args.vocab_file).readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 alpha = 1./K # prior on topic weights theta eta = 1./K # prior on p(w|topic) Beta tau_0 = args.tau_0 # learning parameter to downweight early documents kappa = args.kappa # learning parameter; decay factor for influence of batches olda = onlineldavb.OnlineLDA(vocab, K, D, alpha, 1./K, tau_0, kappa) dataset_file = open(args.dataset) start = time.time() for iteration in range(0, documentstoanalyze): # Read a batch of articles. docset = batch_read(dataset_file, batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): i = iteration pct = round((i * 1.0 / documentstoanalyze) * 100, 2) elapsed = int(time.time() - start) Printer("Processed {0} batches. ~ {1}% complete. Elapsed time: {2}s" .format(i, pct, elapsed)) if (iteration % args.model_out_freq == 0): numpy.savetxt('{0}lambda-{1}.dat'.format(args.outdir, iteration), olda._lambda) numpy.savetxt('{0}gamma-{1}.dat'.format(args.outdir, iteration), gamma) numpy.savetxt('{0}lambda-final.dat'.format(args.outdir), olda._lambda) numpy.savetxt('{0}gamma-final.dat'.format(args.outdir), gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of questions on Stack Overflow D = 3.3e6 # The number of topics K = 50 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./vocab2.txt').readlines() W = len(vocab) # Our set of questions from Stack Overflow questions = QuestionSet(datafilename) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) print 'processing', documentstoanalyze for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = questions.get_batch(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def updateLda(olda, docset, gamma=None, nIts=10):
    D = olda._D
    for iteration in range(0, nIts):
        # Run online LDA with current document set
        (gamma, bound) = olda.update_lambda(docset, gamma)
        # Compute an estimate of held-out perplexity
        (_, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d: held-out perplexity estimate = %f' % \
            (iteration, numpy.exp(-perwordbound))
    return (olda._lambda, gamma, olda)
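# A possible way to drive updateLda above, mirroring the initialization used elsewhere
# in this collection. The vocabulary path and parameter values are illustrative
# assumptions, and the modified OnlineLDA that accepts gamma in update_lambda may take
# different constructor arguments.
def example_updateLda_usage(docset):
    K = 100
    vocab = file('./dictnostops.txt').readlines()
    # Treat the supplied docset as the whole corpus (D = len(docset)).
    olda = onlineldavb.OnlineLDA(vocab, K, len(docset), 1. / K, 1. / K, 1024., 0.7)
    (lam, gamma, olda) = updateLda(olda, docset, nIts=10)
    return (lam, gamma)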
def main(): """ Loads and analyzes tweets """ # The number of documents to analyze each iteration batchsize = 1000 # The total number of documents in Wikipedia D = 1.0e6 # The number of topics K = 50 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Load documents tweets = file('./tweets_linebyline.txt').readlines() # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.9) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Give some documents to online LDA start = iteration*batchsize end = (iteration+1)*batchsize - 1 (gamma, bound) = olda.update_lambda(tweets[start:end]) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(tweets[start:end], olda._vocab) perwordbound = bound * len(tweets[start:end]) / (D * sum(map(sum, wordcts))) print '%d (%d - %d): rho_t = %f, held-out perplexity estimate = \t %f' % \ (iteration, start, end, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('data_lda/lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('data_lda/gamma-%d.dat' % iteration, gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # How many documents to look at if len(argv) < 2: documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles docset, articlenames = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA gamma, bound = olda.update_lambda(docset) # Compute an estimate of held-out perplexity wordids, wordcts = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if iteration % 10 == 0: print "Iteration: ", iteration numpy.savetxt('lambda.dat', olda._lambda) numpy.savetxt('gamma.dat', gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ doc_files=sys.argv[1] (docset, articlenames) = \ load_documents(doc_files) D=len(docset) # of topics K = int(sys.argv[2]) # Our vocabulary vocab = file('./dictnostops_test.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 '''kappa set to 0 to eliminate decay''' olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print ' rho_t = %f, held-out perplexity estimate = %f' % \ ( olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. print (olda._lambda.shape) print (gamma.shape) numpy.savetxt('lambda.dat', olda._lambda) numpy.savetxt('gamma.dat', gamma)
def make_topic_columns(lda, data, K, D, batchsize):
    questions = QuestionSet(data)
    allgamma = numpy.zeros((len(data), K))
    for iteration in range(0, len(data) / batchsize):
        start = iteration * batchsize
        end = start + batchsize
        # Fetch the next batch of questions
        (docset, articlenames) = questions.get_batch(start, end)
        # Give them to online LDA
        (gamma, bound) = lda.update_lambda(docset)
        allgamma[start:end, :] = gamma
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = parse_doc_list(docset, lda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print "%d: rho_t = %f, held-out perplexity estimate = %f" % (iteration, lda._rhot, numpy.exp(-perwordbound))
    # copy to dataframe
    for k in range(K):
        data["Topic%d" % k] = allgamma[:, k]
def make_topic_columns(lda, data, K, D, batchsize): questions = QuestionSet(data) allgamma = numpy.zeros((len(data), K)) for iteration in range(0, len(data) / batchsize): start = iteration * batchsize end = start + batchsize # Download some articles (docset, articlenames) = questions.get_batch(start, end) # Give them to online LDA (gamma, bound) = lda.update_lambda(docset) allgamma[start:end,:] = gamma # Compute an estimate of held-out perplexity (wordids, wordcts) = parse_doc_list(docset, lda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, lda._rhot, numpy.exp(-perwordbound)) # copy to dataframe for k in range(K): data['Topic%d'%k] = allgamma[:,k]
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ doc_files = sys.argv[1] (docset, articlenames) = \ load_documents(doc_files) D = len(docset) # of topics K = int(sys.argv[2]) # Our vocabulary vocab = file('./dictnostops_test.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 '''kappa set to 0 to eliminate decay''' olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print ' rho_t = %f, held-out perplexity estimate = %f' % \ ( olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. print(olda._lambda.shape) print(gamma.shape) numpy.savetxt('lambda.dat', olda._lambda) numpy.savetxt('gamma.dat', gamma)
def main(num_batches, K): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. Arguments: - num_batches: the number of batchs to take corpus_size = num_batches * batch_size - K : the number of topics, determined from stdin """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, num_batches): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main():
    '''
    Read papers from a CSV file and fit online LDA to their text.
    '''
    papers_ = []
    with open('papers.csv', 'r') as csvfile:
        for line in csv.reader(csvfile, delimiter=',', quotechar='"'):
            papers_.append(line)
    D = len(papers_)
    # The number of topics
    K = 10
    # Our vocabulary
    vocab = open('./dictnostops.txt').readlines()
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    docset = [row[3] for row in papers_]
    #articlenames = [row[0] for row in papers_]
    # Give them to online LDA
    (gamma, bound) = olda.update_lambda_docs(docset)
    # Compute an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print('%d: rho_t = %f, held-out perplexity estimate = %f' %
          (1, olda._rhot, numpy.exp(-perwordbound)))
    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.
    numpy.savetxt('lambda.dat', olda._lambda)
    numpy.savetxt('gamma.dat', gamma)
    # show topics
    printtopics.main(5)
def main(): """ using online VB for LDA on Archive data. """ # The number of documents to analyze each iteration batchsize = 1000 # The total number of documents in Wikipedia D = 7000 # The number of topics K = 10 # How many documents to look at documentstoanalyze = int(D/batchsize) if (len(sys.argv) > 1): K = int(sys.argv[1]) # Our vocabulary vocab = file(data_dir+'/dictionary_all.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): docset= get_abstracts(iteration) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. numpy.savetxt('%darchive-lambda-%d.dat' % (K, iteration), olda._lambda) numpy.savetxt('%darchive-gamma-%d.dat' % (K, iteration), gamma)
def allocate_topics(lda, data, K, batchsize, D):
    n_iterations = len(data) / batchsize
    questions = QuestionSet(data)
    topics = numpy.zeros((len(data), K))
    # derive topics from data in batches
    for iteration in range(0, n_iterations):
        start = iteration * batchsize
        end = start + batchsize
        (docset, _) = questions.get_batch(start, end)
        (gamma, bound) = lda.update_lambda(docset)
        topics[start:end, :] = gamma
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, lda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d: rho_t = %f, held-out perplexity estimate = %f' % \
            (iteration, lda._rhot, numpy.exp(-perwordbound))
    # copy to dataframe
    for k in range(K):
        data['Topic%d' % k] = topics[:, k]
    return topics
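# The helpers above assume a QuestionSet wrapper with a get_batch method. Below is a
# minimal sketch matching the (start, end) call style used by make_topic_columns and
# allocate_topics, assuming `data` is a pandas DataFrame with 'Title' and 'Body' text
# columns; the column names and document layout are illustrative assumptions.
class QuestionSet(object):
    def __init__(self, data):
        self.data = data

    def get_batch(self, start, end):
        # Return the documents and identifiers for rows [start, end).
        rows = self.data.iloc[start:end]
        docset = ['%s %s' % (r['Title'], r['Body']) for _, r in rows.iterrows()]
        names = list(rows.index)
        return (docset, names)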
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ global D global doc_list global last_gamma_file cut_words() print D print len(doc_list) # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia #D = 500 # The number of topics K = int(sys.argv[1]) # How many documents to look at if (len(sys.argv) < 3): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./chineseNoStopWords.txt').readlines() #print vocab W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) print documentstoanalyze perplexity_set = [] iter_set = [] for iteration in range(0, documentstoanalyze): # Download some articles ''' (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) ''' docset = doc_list[iteration*batchsize:(iteration+1)*batchsize] # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) perplexity_set.append(numpy.exp(-perwordbound)) iter_set.append(iteration) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 100 == 0 or iteration==documentstoanalyze-1): numpy.savetxt('./res_'+sys.argv[1]+'/lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('./res_'+sys.argv[1]+'/gamma-%d.dat' % iteration, gamma) last_gamma_file = './res_'+sys.argv[1]+'/lambda-%d.dat'%(documentstoanalyze-1) save_lambda_path = 'last_lambda_'+sys.argv[1]+'.txt' flast = open(save_lambda_path,'w') flast.write(last_gamma_file) flast.close()
def fit_olda_liveparse(doc_path, vocab_file, outdir, K, batch_size, iterations,\ verbose_topics, anchors, tmv_pickle, lemmatize): """ Analyzes a set of documents using online VB for LDA. """ # instance to get random documents docgen = generalrandom.LiveparseDocGen(doc_path) # The total number of documents in Wikipedia D = docgen.getDocCount() # Our vocabulary vocab = [term.strip() for term in file(vocab_file).readlines()] W = len(vocab) # write out general settings to pickle file for use by TMV later if tmv_pickle: # save model settings: vocab, K, docgen f = open(join(outdir, 'settings.pickle'), 'w+') cPickle.dump((vocab, K, docgen), f) f.close() # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7, anchors, \ lem = lemmatize) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) iteration = 0 old_perplexity = 1.0 * sys.maxint delta_perplexity = 1.0 * sys.maxint delta_perplexities = [old_perplexity] * 10 logfile = open(join(outdir, 'log.out'), 'w+') while (iterations != 0 and iteration < iterations) or \ sum(delta_perplexities)/10 > 0.001: # 0.1% change in sample perplexity if iteration > D/batch_size: print "killing due to iteration count" break iter_start = time.time() # Download some articles docset = docgen.get_random_articles(batch_size) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, \ lemmatize) # estimate perpexity with the current batch perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) perplexity = numpy.exp(-perwordbound) delta_perplexity = abs(old_perplexity - perplexity) / perplexity print '%d: rho_t = %f, held-out perplexity estimate = %f (%.2f%%)' % \ (iteration, olda._rhot, perplexity, delta_perplexity * 100) logfile.write('%d: rho_t = %f, held-out perplexity estimate = %f (%.2f%%)\n' % (iteration, olda._rhot, perplexity, delta_perplexity * 100)) old_perplexity = perplexity delta_perplexities.pop(0) delta_perplexities.append(delta_perplexity) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt(join(outdir, 'lambda-%d.dat' % iteration), \ olda._lambda) numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma) if verbose_topics: print_topics(K, 7, vocab, olda._lambda, anchors) iteration += 1 if tmv_pickle: f = open(join(outdir,'olda.pickle'), 'w+') cPickle.dump(olda, f) f.close()
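# fit_olda_liveparse above calls a print_topics(K, n, vocab, lambda, anchors) helper in
# verbose mode. A minimal sketch that mirrors the top-word printing done in the first
# snippet of this collection; the `anchors` argument is accepted but ignored here
# (an assumption about its role in display).
def print_topics(K, n, vocab, lam, anchors=None):
    for k in range(0, K):
        lambdak = list(lam[k, :])
        lambdak = lambdak / sum(lambdak)
        temp = sorted(zip(lambdak, range(0, len(lambdak))),
                      key=lambda x: x[0], reverse=True)
        print 'topic %d:' % (k)
        for i in range(0, n):
            print '%20s \t---\t %.4f' % (vocab[temp[i][1]], temp[i][0])
        print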
def main():
    # unpack input arguments
    # seednum = 1
    # documentstoanalyze = 2000
    # batchsize = 10
    # priv = 1
    # epsilon = 1
    # comp = 2
    # mech = 0
    seednum = int(sys.argv[1])
    documentstoanalyze = int(sys.argv[2])
    batchsize = int(sys.argv[3])
    priv = int(sys.argv[4])  # 1 is private version, 0 is nonprivate version
    # epsilon = float(sys.argv[5])  # total privacy budget
    comp = int(sys.argv[5])
    # mech = int(sys.argv[6])  # 0 for Gaussian, 1 for Laplace

    # The number of topics
    #K = 100
    K = 50  #JF

    # load data
    # the_filename = Data_PATH+'wiki_docsmallset'
    # with open(the_filename, 'rb') as f:
    #     docset = cPickle.load(f)
    #the_filename = Data_PATH+'wiki_docsmallset_D=%s' %(400000)
    the_filename = os.path.join(Data_PATH, 'wiki_docsmallset_D=%s' % (400000))  #JF: Make this work on Windows
    if resampleShortDocs:
        the_filename = the_filename + '_resample_short_docs'
    with open(the_filename, 'rb') as f:
        docset = cPickle.load(f)

    D = len(docset)
    print 'document length: %s' % (D)

    nu = batchsize / float(D)  # sampling rate
    numpy.random.seed(seednum)
    print 'seednum %s mini-batchsize %s and number of iter %s' % (
        seednum, batchsize, documentstoanalyze)

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    """ privacy budget calculation """
    # (1) to set the same level of burned privacy, we first calculate MA composition
    #sigma = 1.00000000000000000000000000000000000001  # a small value to minimize the noise
    #sigma = 1.1   # an intermediate value
    sigma = 1.24  # an intermediate value
    #sigma = 1.5   # an intermediate value
    #sigma = 2     # a larger value, expected to substantially reduce privacy and performance.
    total_del = 1e-4
    J = documentstoanalyze
    total_eps_MA = cal_pri.moments_accountant(sigma, total_del, nu, J)
    print 'total privacy loss is %f' % (total_eps_MA)

    # (2) strong composition
    del_iter = 1e-6
    res = minimize_scalar(cal_pri.strong_composition, bounds=(0, 50),
                          args=(total_eps_MA, total_del, J, nu, del_iter),
                          method='bounded')
    eps_iter = res.x

    gamma_noise = 0  # we don't use this at all.

    if comp == 0:  # MA
        c2 = 2 * np.log(1.25 / del_iter)
        eps_iter = np.sqrt(c2) / sigma
        budget = [eps_iter, del_iter]
    elif comp == 1:  # strong composition
        budget = [eps_iter, del_iter]
    else:
        print "we don't support this composition"

    if priv:
        print 'private version'
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7,
                                 priv, budget, gamma_noise, mech)

    perplexity = numpy.zeros(documentstoanalyze)
    # for iteration in range(0, maxIter):
    for iteration in range(0, documentstoanalyze):
        # subset of data
        rand_perm_nums = numpy.random.permutation(len(docset))
        idx_minibatch = rand_perm_nums[0:batchsize]
        docsubset = list(docset[i] for i in idx_minibatch)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docsubset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docsubset, olda._vocab)
        perwordbound = bound * len(docsubset) / (D * sum(map(sum, wordcts)))
        print '%d: rho_t = %f, held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        perplexity[iteration] = numpy.exp(-perwordbound)

    # save perplexity
    if priv:
        # if gamma_noise:
        #     method = 'private_epsilon_%s_cdp_%s_gamma_noise_%s' % (epsilon, cdp, gamma_noise)
        # else:
        #     method = 'private_epsilon_%s_cdp_%s' % (epsilon, cdp)
        # method = 'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_D=%s' % (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], D)
        #method = Results_PATH+'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_Lap=%s_D=%s' % (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], total_eps_MA, sys.argv[5], sys.argv[6], D)
        method = os.path.join(
            Results_PATH,
            'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_Lap=%s_D=%s'
            % (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4],
               total_eps_MA, sys.argv[5], sys.argv[6], D))
    else:
        #method = Results_PATH+'static_nonprivate_seed=%s_J=%s_S=%s_priv=%s_D=%s' % (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], D)
        method = os.path.join(
            Results_PATH,
            'static_nonprivate_seed=%s_J=%s_S=%s_priv=%s_D=%s'
            % (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], D))

    if resampleShortDocs:
        method = method + '_resample_short_docs'

    numpy.save(method + '.npy', perplexity)
    # method = 'private_epsilon_1'
    # filename = method+'_D=_%s_S=_%s' %(D, batchsize)
    # numpy.save(filename+'.npy', test_log_likelihood)

    # save lambda and gamma
    numpy.savetxt(method + '_lambda.dat', olda._lambda)
    numpy.savetxt(method + '_gamma.dat', gamma)
def main():
    # unpack input arguments
    # seednum = 1
    # documentstoanalyze = 2000
    # batchsize = 1000
    # priv = 0
    # epsilon = 1
    # comp = 2
    seednum = int(sys.argv[1])
    documentstoanalyze = int(sys.argv[2])
    batchsize = int(sys.argv[3])
    priv = int(sys.argv[4])      # 1 is private version, 0 is nonprivate version
    epsilon = float(sys.argv[5])  # total privacy budget
    comp = int(sys.argv[6])      # 0 conventional, 1 advanced, 2 CDP

    # The number of topics
    K = 100

    # D = 1000000
    D = 5000000
    nu = batchsize / float(D)  # sampling rate
    numpy.random.seed(seednum)
    print('seednum %s mini-batchsize %s and number of iter %s' %
          (seednum, batchsize, documentstoanalyze))

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    gamma_noise = 0  # will use Laplace noise all the time

    if comp == 2:
        # budget = numpy.sqrt(epsilon/float(documentstoanalyze))
        # budget = numpy.sqrt(epsilon*D/float(2*batchsize))
        budget = numpy.sqrt(2 * epsilon) / float(
            2 * nu * numpy.sqrt(documentstoanalyze))
    elif comp == 1:
        delta = 0.000001
        budget = epsilon / float(
            4 * nu * numpy.sqrt(2 * documentstoanalyze * numpy.log(1 / delta)))
    else:
        # budget = epsilon/float(documentstoanalyze)
        budget = epsilon / float(2 * documentstoanalyze * nu)

    if priv:
        print('private version')
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7,
                                 priv, budget, gamma_noise)

    # the_filename = Data_PATH+'wiki_data'
    # with open(the_filename, 'rb') as f:
    #     docset = cPickle.load(f)

    # load all the documents
    # docset = []
    # for whichdoc in range(1, 21):
    #     the_filename = Data_PATH+'wikidata_seednum=_%s' %(whichdoc)
    #     with open(the_filename, 'rb') as f:
    #         docset1 = cPickle.load(f)
    #     docset = docset + docset1
    #     print "docset %s is loaded" %(whichdoc)
    #
    # print "docset all loaded"

    perplexity = numpy.zeros(documentstoanalyze)
    # D_test = 10000

    # for iteration in range(0, maxIter):
    for iteration in range(0, documentstoanalyze):
        # subset of data
        # rand_perm_nums = numpy.random.permutation(len(docset))
        # idx_minibatch = rand_perm_nums[0:batchsize]
        # docsubset = list(docset[i] for i in idx_minibatch)

        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print('%d: rho_t = %f, held-out perplexity estimate = %f' %
              (iteration, olda._rhot, numpy.exp(-perwordbound)))

        # # Give them to online LDA
        # (gamma, bound) = olda.update_lambda_docs(docsubset)
        # # Compute an estimate of held-out perplexity
        # (wordids, wordcts) = onlineldavb.parse_doc_list(docsubset, olda._vocab)
        # perwordbound = bound * len(docsubset) / (D * sum(map(sum, wordcts)))
        # print '%d: rho_t = %f, training perplexity estimate = %f' % \
        #     (iteration, olda._rhot, numpy.exp(-perwordbound))

        # compute test perplexity
        # idx_test = rand_perm_nums[batchsize+1:batchsize+1+D_test]
        # doctest = list(docset[i] for i in idx_test)
        #
        # (gamma_test, ss) = olda.do_e_step_docs(doctest)
        # # Estimate held-out likelihood for current values of lambda.
        # bound_test = olda.approx_bound_docs(doctest, gamma_test)
        # (wordids, wordcts_test) = onlineldavb.parse_doc_list(doctest, olda._vocab)
        #
        # # perwordbound_test = bound_test*D_test / float(D*sum(map(sum, wordcts_test)))
        # perword_test_log_likelihood = bound_test / float(sum(map(sum, wordcts_test)))
        # print '%d: rho_t = %f, test perplexity estimate = %f' % \
        #     (iteration, olda._rhot, perword_test_log_likelihood)

        perplexity[iteration] = numpy.exp(-perwordbound)

    # save perplexity
    if priv:
        # if gamma_noise:
        #     method = 'private_epsilon_%s_cdp_%s_gamma_noise_%s' % (epsilon, cdp, gamma_noise)
        # else:
        #     method = 'private_epsilon_%s_cdp_%s' % (epsilon, cdp)
        method = 'private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % (
            sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
            sys.argv[6])
    else:
        method = 'Nonprivate_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % (
            sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
            sys.argv[6])

    numpy.save(method + '.npy', perplexity)
    # method = 'private_epsilon_1'
    # filename = method+'_D=_%s_S=_%s' %(D, batchsize)
    # numpy.save(filename+'.npy', test_log_likelihood)

    # save lambda and gamma
    numpy.savetxt(method + '_lambda.dat', olda._lambda)
    numpy.savetxt(method + '_gamma.dat', gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batch_size = 4 # Total number of documents in the population. For a fixed corpus, # this is the size of the corpus. In the truly online setting number_of_documents = 71 # The number of topics number_of_topics = 1 # establish mysql database connection database = MysqlMessager(database="keyword_app") sql = "select Abstract from PreprocessedAbstracts;" database.excute_sql(sql) row_iteration = database.fetch() abstracts = [row[0] for row in row_iteration] # How many documents to look at if len(sys.argv) < 2: documents_to_analyze = int(number_of_documents / batch_size) else: documents_to_analyze = int(sys.argv[1]) # Our vocabulary all_keywords_file_path = "../../keywords/abstract_109.txt" with read_pickle_file(all_keywords_file_path) as content: vocab = list(content) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, number_of_topics, number_of_documents, 1. / number_of_topics, 1. / number_of_topics, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documents_to_analyze): # set dataset as list that stores all abstracts doc_set = abstracts # Give them to online LDA (gamma, bound) = olda.update_lambda(doc_set) # Compute an estimate of held-out perplexity (word_ids, word_count_times) = onlineldavb.parse_doc_list(doc_set, olda.vocab) per_word_bound = bound * len(doc_set) / ( number_of_documents * sum(map(sum, word_count_times))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda.rhot, numpy.exp(-per_word_bound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if iteration % 10 == 0: numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ wn.ensure_loaded() wiki_pool = wiki_local.WikiPool() # The number of documents to analyze each iteration batchsize = 1 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 30 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) + 1 # Our vocabulary #vocab = file('./dictnostops.txt').readlines() #vocab = file('./wordnet_nouns.txt').readlines() #vocab = file('./synset_dict.txt').readlines() #vocab = file('./wn_ambig_no_stop.txt').readlines() vocab = file('./mixed_wn_dict.txt').readlines() #vocab = [] #for word in words.words(): # word = str(word).lower() # word = re.sub(r'[^a-z]', '', word) # if word != '': # vocab.append(word) ##we get repeats because of upper -> lowercase? #vocab = set(vocab) #vocab = list(vocab) W = len(vocab) print W # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wiki_pool.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda_docs(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 50 == 0): numpy.savetxt( 'data_ground_truth_disambig/lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt( 'data_ground_truth_disambig/gamma-%d.dat' % iteration, gamma) numpy.savetxt('data_ground_truth_disambig/lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('data_ground_truth_disambig/gamma-%d.dat' % iteration, gamma) print "finished iterations" wiki_pool.end()
def main(): articles = list() artnames = list() for line in file('./jacm/withIDAbstracts.txt').readlines(): combo=line.split('\t') artnames.append(combo[0]) articles.append(combo[1]) """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 1 # The total number of documents in Wikipedia D = len(artnames) # The number of topics K = 54 # How many documents to look at documentstoanalyze = len(artnames) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, D): # Download some articles docset=list() docset.append(articles[iteration]) articlenames=list() articlenames.append(artnames[iteration]) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) print bound perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. numpy.savetxt('./simpleLDA/gamma-%d.dat' % iteration, gamma) if (iteration % 50 == 0 or iteration==616): numpy.savetxt('./simpleLDA/lambda-%d.dat' % iteration, olda._lambda)
def main(argv): doc_list = [] argList = handleArgs(argv) #list the docs in pickledDocs folder p = "../data/pickledDocs/" l = listdir(p) fileList = [p + f for f in l] #for each pickled doclist, append all docs to master doclist for fi in fileList: with open(fi, 'rb') as d: docs = cPickle.load(d) for k, x in docs.iteritems(): doc_list.append(x) print len(doc_list) #D is total number of docs to show to the model, K is number of topics goal_its = 80 #number of iterations to run LDA corp_size = len(doc_list) #number of documents in the corpus D = corp_size * goal_its #number of documents expected to see K = 10 #default topic value, if none given in parameters saveModel = False #whether to save LDA model itself desc = "" #for performing non-standard runs version = "" #for having multiple models with same parameters hyper_param = "" #for testing hyperparameters #define the vocabulary file we will be using vocab = helper_funcs.read_dict("../data/dictionary.txt") #default dict #initialize an instance of the OnlineLDA algorithm #parameters - dictionary, num topics, learning rate, beta, tau, kappa #if the path to an OnlineLDA pickle is passed, it re-opens that pickle K = int(argList[0]) vocab = vocab = str.split(file(argList[1]).read()) if not (argList[2] is None): alpha = argList[2] else: alpha = 0.1 if not (argList[3] is None): beta = argList[3] else: beta = 1. saveModel = False lda = onlineldavb.OnlineLDA(vocab, K, D, alpha, beta, 1024, 0.) print "created LDA with parameters:\nnumwords: " + str( len(vocab)) + "\n#topics: " + str(K) + "\nalpha: " + str( alpha) + "\nbeta: " + str(beta) paramTitle = hyper_param + str( len(vocab) / 1000) + "kwords_" + str(K) + "topics" folder = "../data/out/models/" + paramTitle if not isdir(folder): mkdir(folder) W = len(vocab) print "dictionary size: " + str(W) print paramTitle print folder #if desc.find("label") > -1: # with open("../data/out/past_models/"+paramTitle+"/dictionary.txt",'wb') as f: # voc = sorted(vocab.items(),key=operator.itemgetter(1)) # for x in voc: # f.write(x[0]+"\n") #perform LDA on the document list for goal_its iterations, updating lambda for i in range(lda._updatect, goal_its): print doc_list print i (gamma, bound) = lda.update_lambda(doc_list) (wordids, wordcts) = onlineldavb.parse_doc_list(doc_list, lda._vocab) perwordbound = bound * len(doc_list) / (D * sum(map(sum, wordcts))) print np.exp(-perwordbound) #pickle the model and its output occasionally if (i + 1) == goal_its: if not isdir(folder): mkdir(folder) with open(folder + "/gamma.pickle", 'wb') as f: cp2 = cPickle.Pickler(f) cp2.dump(gamma) with open(folder + "/lambda.pickle", 'wb') as f: cp = cPickle.Pickler(f) cp.dump(lda._lambda) np.savetxt(folder + '/lambda.dat', lda._lambda) if saveModel: with open(folder + "/LDA.pickle", 'wb') as f: cp3 = cPickle.Pickler(f) cp3.dump(lda)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() # The number of documents to analyze each iteration batchsize = 100 # The total number of documents in Wikipedia D = 1000 #D = 2129792 for the whole set # The number of topics K = 30 # Our vocabulary vocab = file('./com_all_words.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) iteration = 0 while iteration * batchsize * size <= D: # Download some articles docset = [] counts = [] linecache.clearcache() startpoint = iteration * batchsize * size + batchsize * rank + 1 if startpoint > D: # search to the end break # stop # get the paper keywords in batches for i in range(batchsize): f1 = open('com_all_key.txt', 'r') f2 = open('com_all.txt', 'r') docset.append( linecache.getline('com_all_key.txt', min(D, startpoint))[:-1]) counts.append( linecache.getline('com_all.txt', min(D, startpoint))[:-1]) startpoint = startpoint + 1 # print type(docset), type(docset[0]), docset[0] # Give them to online LDA (gamma, bound) = olda.update_lambda(docset, counts) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts) # print wordcts[0:5] perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) iteration = iteration + 1 # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. # print olda._lambda[0] gammas = comm.gather(gamma, root=0) lambdas = comm.gather(olda._lambda, root=0) if rank == 0: gamma_result = numpy.vstack((x for x in gammas)) lambda_result = numpy.vstack((x for x in lambdas)) numpy.savetxt('lambda_parallel.dat', olda._lambda) numpy.savetxt('gamma_parallel.dat', gamma)
def main(argv): doc_list = [] K, folder, alpha, beta, saveModel = handleArgs(argv) #list the docs in pickledDocs folder p = "../data/pickledDocs/" l = listdir(p) fileList = [p+f for f in l] #for each pickled doclist, append all docs to master doclist with open(folder.replace("dictionary","filelist"),'wb') as f: for fi in fileList: with open(fi,'rb') as d: docs = cPickle.load(d) for k,x in docs.iteritems(): doc_list.append(x) f.write(k+"\n") print len(doc_list) #D is total number of docs to show to the model, K is number of topics goal_its = 40 #number of iterations to run LDA corp_size = len(doc_list) #number of documents in the corpus D = corp_size*goal_its #number of documents expected to see #K = 10 #default topic value, if none given in parameters #saveModel = False #whether to save LDA model itself desc = "" #for performing non-standard runs version = "" #for having multiple models with same parameters hyper_param = "" #for testing hyperparameters #initialize an instance of the OnlineLDA algorithm #parameters - dictionary, num topics, learning rate, beta, tau, kappa #if the path to an OnlineLDA pickle is passed, it re-opens that pickle #K = int(argList[0]) vocab = vocab = str.split(file(folder).read()) # if not (argList[2] is None): # alpha = argList[2] # else: # alpha = 0.1 # if not (argList[3] is None): # beta = argList[3] # else: # beta = 1. # # saveModel = argList[4] lda = onlineldavb.OnlineLDA(vocab,K,D,alpha,beta,1024,0.) print "created LDA with parameters:\nnumwords: "+str(len(vocab))+"\n#topics: "+str(K)+"\nalpha: "+str(alpha)+"\nbeta: "+str(beta) paramTitle = hyper_param+str(len(vocab)/1000)+"kwords_"+str(K)+"topics" folder = "../data/out/models/"+paramTitle if not isdir(folder): mkdir(folder) W = len(vocab) print "dictionary size: " + str(W) print paramTitle print folder #if desc.find("label") > -1: # with open("../data/out/past_models/"+paramTitle+"/dictionary.txt",'wb') as f: # voc = sorted(vocab.items(),key=operator.itemgetter(1)) # for x in voc: # f.write(x[0]+"\n") #perform LDA on the document list for goal_its iterations, updating lambda for i in range(lda._updatect,goal_its): print i (gamma, bound) = lda.update_lambda(doc_list) (wordids, wordcts) = onlineldavb.parse_doc_list(doc_list,lda._vocab) perwordbound = bound * len(doc_list) / (D*sum(map(sum,wordcts))) print np.exp(-perwordbound) #pickle the model and its output occasionally if (i+1) == goal_its: if not isdir(folder): mkdir(folder) with open(folder+"/gamma.pickle",'wb') as f: cp2 = cPickle.Pickler(f) cp2.dump(gamma) with open(folder+"/lambda.pickle",'wb') as f: cp = cPickle.Pickler(f) cp.dump(lda._lambda) np.savetxt(folder+'/lambda.dat', lda._lambda) if not (saveModel is None): with open(folder+"/LDA.pickle",'wb') as f: cp3 = cPickle.Pickler(f) cp3.dump(lda)
# Fragment: assumes `paramTitle`, `lda`, `goal_its`, `doc_list`, and `D` are
# defined earlier in the script.
if not len(sys.argv) == 4:
    folder = "../data/out/" + paramTitle
print folder

#if desc.find("label") > -1:
#    with open("../data/out/past_models/"+paramTitle+"/dictionary.txt",'wb') as f:
#        voc = sorted(vocab.items(), key=operator.itemgetter(1))
#        for x in voc:
#            f.write(x[0]+"\n")

# Perform LDA on the document list for goal_its iterations, updating lambda
for i in range(lda._updatect, goal_its):
    print i
    (gamma, bound) = lda.update_lambda(doc_list)
    (wordids, wordcts) = onlineldavb.parse_doc_list(doc_list, lda._vocab)
    perwordbound = bound * len(doc_list) / (D * sum(map(sum, wordcts)))
    print np.exp(-perwordbound)
    # Pickle the model output on the final iteration
    if (i + 1) == goal_its:
        print doc_list[0]
        print gamma[0]
        if not isdir(folder):
            mkdir(folder)
        with open(folder + "/gamma.pickle", 'wb') as f:
            cp2 = cPickle.Pickler(f)
            cp2.dump(gamma)
        with open(folder + "/lambda.pickle", 'wb') as f:
            cp = cPickle.Pickler(f)
            cp.dump(lda._lambda)
def main(): # LDA: a documents contains all the keywords of some journal/conference # equivalent to cluster keywords over journals/conferences journal_or_conference = sys.argv[1] num = int(sys.argv[2]) conn = jcke.get_db_conn() # default (num <=0 or > max number): figure out all the journals/conferences keywords if num <= 0 or (num >= 15151 and journal_or_conference == "journal") or ( num >= 4545 and journal_or_conference == "conference"): query = """ SELECT COUNT(*) FROM ##journal_or_conference## """ query = query.replace("##journal_or_conference##", journal_or_conference) conn.cursor.execute(query) num = conn.cursor.fetchall( ) # number of journals/conferences to process # document parsing journal_conf_list = os.listdir("journal_conf_keyword") # check if txt files exist, then generate those docs if not num == len(journal_conf_list): jcke.journal_conf_keyword_generation(conn, num, journal_or_conference) journal_conf_list = os.listdir("journal_conf_keyword") # The number of journal/conference keyword sets in each batch batchsize = lambda num: num if num <= 100 else 100 batch = batchsize(num) iteration_times = int(num / batch) # The total number of journals/conferences DocNum = lambda journal_or_conference: 15151 if journal_or_conference == "journal" else 4545 D = DocNum(journal_or_conference) # The number of topics K = 100 # maybe some other numbers # Our vocabulary : we need some vocabulary set! vocab = dict() # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 online_LDA = lda.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* sooner than this.) for iteration in range(0, iteration_times): # getting documents (keyword sets) if iteration != iteration_times - 1: journal_conf_keyword_list = jcke.input_journal_conf_keywords( journal_conf_list[iteration * batch:(iteration + 1) * batch]) else: journal_conf_keyword_list = jcke.input_journal_conf_keywords( journal_conf_list[iteration * batch:]) # online LDA for keyword sets # here we update the relative function in the package (dangerous!) online_LDA._vocab = jcke.vocabulary_generation( journal_conf_keyword_list, online_LDA._vocab) (gamma, bound) = online_LDA.update_lambda(journal_conf_keyword_list) # Compute an estimate of held-out perplexity (keywordids, keywordcts) = lda.parse_doc_list(journal_conf_keyword_list, online_LDA._vocab) perkeywordbound = bound * len(journal_conf_keyword_list) / ( D * sum(map(sum, keywordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, online_LDA._rhot, numpy.exp(-perkeywordbound)) # Save lambda, the parameters to the variational distributions over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in the last iteration. numpy.savetxt('lambda-%s.dat' % journal_or_conference, online_LDA._lambda) numpy.savetxt('gamma-%s.dat' % journal_or_conference, gamma)
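# Illustrative sketch (hypothetical; not the actual code of the jcke module
# used above): the script starts with an empty vocab and grows it from each
# batch of keyword sets. A minimal incremental vocabulary builder could look
# like this; the reference onlineldavb implementation expects a dict mapping
# each (lower-cased) term to a unique integer id. Note that lambda's width is
# fixed to len(vocab) when OnlineLDA is constructed, which is why growing the
# vocabulary mid-run is flagged as "dangerous" in the script above.
def grow_vocabulary(keyword_docs, vocab):
    """Add unseen keywords to the word->id dictionary in place."""
    for doc in keyword_docs:
        for word in doc.split():
            word = word.lower()
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab

# Example: vocab = grow_vocabulary(["online lda topic model"], dict())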
def fit_olda(parse, doc_path, doc_file, vocab_file, outdir, K, batch_size, \ iterations, verbose_topics, anchors, tmv_pickle, lemmatize, final_pass, \ full_doc_topics): """ Analyzes a set of documents using online VB for LDA. """ # instance to generate radom documents if parse == "live": # read and parse docs on the fly using vocab docgen = generalrandom.LiveparseDocGen(doc_path) else: # alternative: preparsed docgen = generalrandom.PreparseDocGen(doc_file) # The total number of documents in Wikipedia D = docgen.getDocCount() if iterations == 0: iterations = max(D / batch_size, 10) # Our vocabulary if parse == "live" or verbose_topics: vocab = [term.strip() for term in file(vocab_file).readlines()] W = len(vocab) else: W = docgen.getTermCount() vocab = ["term " + str(w) for w in range(W)] # write out general settings to pickle file for use by TMV later if tmv_pickle: # save model settings: vocab, K, docgen f = open(join(outdir, 'settings.pickle'), 'w+') cPickle.dump((vocab, K, docgen, lemmatize), f) f.close() # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7, anchors, \ lem = lemmatize, preparsed = (parse == "preparsed")) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) iteration = 0 old_perplexity = 1.0 * sys.maxint delta_perplexity = 1.0 * sys.maxint delta_perplexities = [old_perplexity] * 10 logfile = open(join(outdir, 'log.out'), 'w+') while iteration < iterations and sum( delta_perplexities ) / 10 > 0.001: # 0.1% change in sample perplexity iter_start = time.time() # Download some articles docset = docgen.get_random_articles(batch_size) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity if parse == "live": (wordids, wordcts) = onlineldavb.parse_doc_list(docset, \ olda._vocab, lemmatize) else: (wordids, wordcts) = docset # estimate perpexity with the current batch perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) perplexity = numpy.exp(-perwordbound) delta_perplexity = abs(old_perplexity - perplexity) / perplexity print '%d: rho_t = %f, held-out perplexity estimate = %f (%.2f%%)' % \ (iteration, olda._rhot, perplexity, delta_perplexity * 100) logfile.write( '%d: rho_t = %f, held-out perplexity estimate = %f (%.2f%%)\n' % (iteration, olda._rhot, perplexity, delta_perplexity * 100)) old_perplexity = perplexity delta_perplexities.pop(0) delta_perplexities.append(delta_perplexity) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt(join(outdir, 'lambda-%d.dat' % iteration), \ olda._lambda) numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma) if verbose_topics: print_topics(K, 7, vocab, olda._lambda, anchors) iteration += 1 logfile.close() if tmv_pickle: f = open(join(outdir, 'olda.pickle'), 'w+') cPickle.dump(olda, f) f.close() # save final iters numpy.savetxt(join(outdir, 'lambda-final.dat'), olda._lambda) numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma) # do a final pass on all documents if (final_pass): fout = open(join(outdir, "gamma-final.dat"), 'w+') if not full_doc_topics: fout.write("doc.lda.id\ttopic.id\tscore\n") i = 0 for doc in docgen: if parse == 'live': #TODO: the parsers should return same order... 
doc = doc[1] (gamma, ss) = olda.do_e_step(doc) j = 0 if not full_doc_topics: for g in gamma.tolist()[0]: if g > 0.051: fout.write("%d\t%d\t%f\n" % (i, j, g)) j += 1 i += 1 else: gf = gamma.tolist()[0] fout.write(('\t'.join(["%f"] * len(gf)) + '\n') % tuple(gf)) fout.close()
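# Illustrative sketch (not part of fit_olda above): the stopping rule keeps a
# sliding window of the last 10 relative changes in batch perplexity and stops
# once their mean falls below 0.1%. Isolated, the check looks like this.
from collections import deque

def converged(delta_window, new_perplexity, old_perplexity, tol=0.001):
    """Update the sliding window of relative changes and test convergence."""
    delta = abs(old_perplexity - new_perplexity) / new_perplexity
    delta_window.append(delta)          # deque(maxlen=10) drops the oldest entry
    return sum(delta_window) / delta_window.maxlen < tol

# Usage: window = deque([float('inf')] * 10, maxlen=10)
#        stop = converged(window, perplexity, old_perplexity)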
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 5 # The total number of documents in Wikipedia #D = 10 # The number of topics K = 20 #load my own dataset f = open('../annotated/odata.dat', 'rb') data = cPickle.load(f) f.close() if '-S' in sys.argv: catog = 'pos' else: catog = 'neg' docset = [] for each in data: if each[1] == catog: docset.append(each[0][0][0]) D = len(docset) #number of docset print D # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = open('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles #(docset, articlenames) = \ # wikirandom.get_random_wikipedia_articles(batchsize) #print type(docset[1]), type(docset[0]) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main(): """ Retrieves the content of a set of text files whose content is obtained from SOAP API descriptors. """ path = sys.argv[1] docs = os.listdir(path) # The number of documents to analyze each iteration rest = 1 #batchsize = int(math.ceil(len(docs)/100)) batchsize = 15 #print len(docs) #while rest != 0: # rest = len(docs) % batchsize # if (rest != 0): # batchsize = batchsize + 1 # The total number of documents (is supposed to be a huge/infinite number in an online setting) D = 3.3e6 #D = len(docs) # The number of topics K = 40 #K = 50 # How many documents to look at #print batchsize #print sys.argv[0] if (len(sys.argv) == 2): #print 'Got into IF...' documentstoanalyze = int(math.ceil(len(docs)/float(batchsize))) elif (len(sys.argv) == 3): documentstoanalyze = int(sys.argv[2]) elif (len(sys.argv) == 4): documentstoanalyze = int(sys.argv[2]) K = int(sys.argv[3]) #print documentstoanalyze # Our vocabulary #vocab = file('./dictnostops.txt').readlines() vocab = file('./wlist_match10.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 0.5, 0.5, 1024., 0.7) #olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Dictionary for storing gamma values of the processed text files gamma_all = dict() #olda = onlineldavb.OnlineLDA(vocab, K, D, 0.01, 0.01, 1024., 0.7) for iteration in range(1, documentstoanalyze+1): # Download some articles (docset, operation_id) = \ get_file_content(iteration, batchsize, path) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) #print ('iteration %d: rho_t = %f, held-out perplexity estimate = %f ' % \ # (iteration, olda._rhot, numpy.exp(-perwordbound))) sys.stdout.write('\rBatchs of document analyzed: %d/%d' % (iteration, documentstoanalyze)) sys.stdout.flush() # Store the gamma values into the gamma_all for each one of the text files # in the current iteration for i in range(len(operation_id)): gamma_all[operation_id[i]] = list(gamma[i]) print(iteration, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the text files analyzed in # the last iteration. #if (iteration % 10 == 0): # numpy.savetxt('parameters/lambda-%d.dat' % iteration, olda._lambda) # numpy.savetxt('parameters/gamma-%d.dat' % iteration, gamma) if (iteration == documentstoanalyze): numpy.savetxt('parameters/lambda-all.dat', olda._lambda) # Save gamma_all for all the processed text files print '\n' temp = gamma_all.items() temp = sorted(temp, key = lambda x: x[0]) numpy.savetxt('parameters/gamma-all.dat' , [item[1] for item in temp])
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 100 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 rho_t_vector = [] perplexity_vector = [] time_vector = [] time1_vector = [] # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 kappa = 0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., kappa) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) t1 = time.time() for iteration in tqdm(range(0, documentstoanalyze)): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda_docs(docset) # Compute an estimate of held-out perplexity t = time.time() (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) t2 = time.time() time_vector.append(t2 - t1) if len(time1_vector) == 0: time1_vector.append(t2 - t) else: time1_vector.append(time1_vector[-1] + t2 - t) rho_t_vector.append(olda._rhot) perplexity_vector.append(perwordbound) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma) numpy.savetxt('time_%.1f_%d' % (kappa, batchsize), numpy.array(time_vector)) numpy.savetxt('rho_%.1f_%d' % (kappa, batchsize), numpy.array(rho_t_vector)) numpy.savetxt('perplexity_%.1f_%d' % (kappa, batchsize), numpy.array(perplexity_vector)) numpy.savetxt('time1_%.1f_%d' % (kappa, batchsize), numpy.array(time1_vector))
def main(): """ Analyzes scraped pages using scikit-learn.LDA """ # The number of topics K = 10 # no of documents D = 300 n_features = 1000 # Our vocabulary vocab = list(set(file('./vocab').readlines())) W = len(vocab) # Add terms and topics to the DB db.init() db.add_terms(vocab) db.add_topics(K) olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # grab documents ### Load your scraped pages, re-tokenize, and vectorize result. docset, docnames = [], [] for filename in os.listdir(os.getcwd()): if filename.endswith('.html'): tree = html.parse(filename) try: encoding = tree.xpath('//meta/@charset')[0] except IndexError: encoding = 'utf-8' with open(filename) as page: rawtext = page.read() try: rawtext = rawtext.decode(encoding, errors='backslashreplace') except TypeError: continue # encoding issues, see http://stackoverflow.com/questions/19527279/python-unicode-to-ascii-conversion docset += [clean_html(rawtext)] docnames += [filename[:-5]] if not(len(docset) % 10): print("loaded " + str(len(docset)) + " documents") # Give them to online LDA # Also computes an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) (gamma, bound) = olda.update_lambda(wordids, wordcts) # Arrays for adding batches of data to the DB # doc_array = [] # doc_term_array = [] # for d in range(len(docnames)): # doc_array.append((docnames[d], docset[d])) doc_array = zip(docnames, docset) # Add a batch of docs to the DB; this is the one DB task that is not in # the separate DB write thread since later tasks depend on having doc ids. # Since writes take so long, this also balaces the two threads time-wise. doc_ids = db.add_docs(doc_array) doc_topic_array = [] for d in range(len(gamma)): doc_size = len(docset[d]) for k in range(len(gamma[d])): doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size)) db.add_doc_topics(doc_topic_array) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (1, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. numpy.savetxt('lambda-%d.dat' % 1, olda._lambda) numpy.savetxt('gamma-%d.dat' % 1, gamma) topic_terms_array = [] for topic in range(len(olda._lambda)): lambda_sum = sum(olda._lambda[topic]) for term in range(len(olda._lambda[topic])): topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum)) db.update_topic_terms(K, topic_terms_array) gc.collect() # probably not necesary, but precautionary for long runs db.print_task_update() # The DB thread ends only when it has both run out of tasks and it has been # signaled that it will not be recieving any more tasks db.increment_batch_count() db.signal_end()
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() # The number of documents to analyze each iteration batchsize = 100 # The total number of documents in Wikipedia D = 1000 #D = 2129792 for the whole set # The number of topics K = 30 # Our vocabulary vocab = file('./com_all_words.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) iteration = 0 while iteration * batchsize * size <= D: # Download some articles docset = [] counts = [] linecache.clearcache() startpoint = iteration * batchsize * size + batchsize * rank + 1 if startpoint > D: # search to the end break # stop # get the paper keywords in batches for i in range(batchsize): f1 = open('com_all_key.txt','r') f2 = open('com_all.txt', 'r') docset.append(linecache.getline('com_all_key.txt', min(D, startpoint))[:-1]) counts.append(linecache.getline('com_all.txt', min(D, startpoint))[:-1]) startpoint = startpoint + 1 # print type(docset), type(docset[0]), docset[0] # Give them to online LDA (gamma, bound) = olda.update_lambda(docset, counts) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts) # print wordcts[0:5] perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) iteration = iteration + 1 # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. # print olda._lambda[0] gammas = comm.gather(gamma, root = 0) lambdas = comm.gather(olda._lambda, root = 0) if rank == 0: gamma_result = numpy.vstack((x for x in gammas)) lambda_result = numpy.vstack((x for x in lambdas)) numpy.savetxt('lambda_parallel.dat', olda._lambda) numpy.savetxt('gamma_parallel.dat', gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ global D global doc_list global last_gamma_file cut_words() print D print len(doc_list) # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia #D = 500 # The number of topics K = int(sys.argv[1]) # How many documents to look at if (len(sys.argv) < 3): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./chineseNoStopWords.txt').readlines() #print vocab W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) print documentstoanalyze perplexity_set = [] iter_set = [] for iteration in range(0, documentstoanalyze): # Download some articles ''' (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) ''' docset = doc_list[iteration * batchsize:(iteration + 1) * batchsize] # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) perplexity_set.append(numpy.exp(-perwordbound)) iter_set.append(iteration) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 100 == 0 or iteration == documentstoanalyze - 1): numpy.savetxt( './res_' + sys.argv[1] + '/lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('./res_' + sys.argv[1] + '/gamma-%d.dat' % iteration, gamma) last_gamma_file = './res_' + sys.argv[1] + '/lambda-%d.dat' % ( documentstoanalyze - 1) save_lambda_path = 'last_lambda_' + sys.argv[1] + '.txt' flast = open(save_lambda_path, 'w') flast.write(last_gamma_file) flast.close()
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Add terms and topics to the DB db.init() db.add_terms(vocab) db.add_topics(K) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) # Arrays for adding batches of data to the DB doc_array = [] doc_term_array = [] for d in range(len(articlenames)): doc_array.append((articlenames[d], docset[d])) # Add a batch of docs to the DB; this is the one DB task that is not in # the separate DB write thread since later tasks depend on having doc ids. # Since writes take so long, this also balaces the two threads time-wise. doc_ids = db.add_docs(doc_array) doc_topic_array = [] for d in range(len(gamma)): doc_size = len(docset[d]) for k in range(len(gamma[d])): doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size)) db.add_doc_topics(doc_topic_array) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma) topic_terms_array =[] for topic in range(len(olda._lambda)): lambda_sum = sum(olda._lambda[topic]) for term in range(len(olda._lambda[topic])): topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum)) db.update_topic_terms(K, topic_terms_array) gc.collect() # probably not necesary, but precautionary for long runs db.print_task_update() db.increment_batch_count() # The DB thread ends only when it has both run out of tasks and it has been # signaled that it will not be recieving any more tasks db.signal_end()
# Fragment: assumes `pages`, `vocab`, `K`, `batchsize`, and grabAbstracts()
# are defined earlier in the script.
pages = [strs.rstrip() for strs in pages]
D = len(pages)
pageID = range(0, D)
nBatches = D / batchsize

# Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=128, kappa=0.7
lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 128., 0.7)

# Run (note: this overrides the computed nBatches and processes 100 batches)
nBatches = 100
for iteration in range(0, nBatches):
    # Grab abstracts for the next batch
    (docset, pagenames, pages, pageID) = grabAbstracts(pages, batchsize, pageID)
    # Give them to online LDA
    (gamma, bound) = lda.update_lambda(docset)
    # Compute an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, lda._vocab)
    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print '%d: rho_t = %f, held-out perplexity estimate = %f' % (
        iteration, lda._rhot, numpy.exp(-perwordbound))
    # Save to file
    if (iteration % 10 == 0):
        numpy.savetxt('lambda.dat', lda._lambda)
        numpy.savetxt('gamma.dat', gamma)
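# Illustrative sketch (hypothetical; the real grabAbstracts used above is not
# shown in this fragment): one way to pop the next `batchsize` abstracts and
# their ids off the front of the remaining page list, returning the leftovers
# so the caller can reassign them. Here the page ids stand in for page names.
def grabAbstracts(pages, batchsize, pageID):
    """Return (docset, pagenames, remaining_pages, remaining_ids)."""
    docset = pages[:batchsize]
    pagenames = pageID[:batchsize]
    return (docset, pagenames, pages[batchsize:], pageID[batchsize:])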