def main(doc_list, vocab_file):
    batch_size = 64
    D = len(doc_list)  # number of documents
    K = 100            # number of topics
    vocab = file(vocab_file).readlines()
    W = len(vocab)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run over the corpus one mini-batch at a time until we've seen D documents.
    for iteration in range(0, D // batch_size):
        docset = doc_list[batch_size*iteration:(iteration+1)*batch_size]
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d: rho_t = %f, held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
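# The per-word perplexity estimate printed above is reused by nearly every
# snippet in this section. A minimal helper that factors out that formula
# (the function name below is illustrative, not part of the original code):
def perplexity_estimate(bound, docset_len, D, wordcts):
    # bound is the variational bound returned by update_lambda on this batch,
    # rescaled from the mini-batch to the whole corpus of D documents and
    # normalized by the number of observed tokens in the batch.
    import numpy
    n_tokens = sum(map(sum, wordcts))
    perwordbound = bound * docset_len / (D * n_tokens)
    return numpy.exp(-perwordbound)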
def main():
    # OLDA parameters
    D = 3.3e6
    K = 200
    with open('./tweetdict_stemmed.txt', 'rb') as f:
        vocab = f.readlines()
    W = len(vocab)
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Restore lambda and the iteration counter from a previous run, if given
    if len(sys.argv) > 1:
        olda._lambda = numpy.loadtxt(sys.argv[1])
        counter = int(sys.argv[2])
    else:
        counter = 0
    # Collect the top word for each topic, for hashtag prediction
    top_words = {}
    for k in range(0, len(olda._lambda)):
        lambdak = olda._lambda[k, :]       # keep as a numpy array so the
        lambdak = lambdak / sum(lambdak)   # normalization below is elementwise
        temp = zip(lambdak, range(0, len(lambdak)))
        temp = sorted(temp, key=lambda x: x[0], reverse=True)
        top_words[str(k)] = vocab[temp[0][1]]
    # Connect to Mongo
    try:
        c = Connection(host="localhost", port=27017)
        print "Connected successfully"
    except ConnectionFailure, e:
        sys.stderr.write("Could not connect to MongoDB: %s" % e)
        sys.exit(1)
def __init__(self, batchsize, d, k, tau, kappa):
    self.__dp = dataParse.dataParse(os.path.abspath("./data/ideas.txt"))
    self.__result = self.__dp.concatedField(os.path.abspath("./data/fieldList.txt"))
    # documents
    self.__doc = self.__result[0]
    # field data
    self.__fid = self.__result[1]
    # dictionary
    self.__vocab = file(os.path.abspath('./data/vocabulary.txt')).readlines()
    # the number of words in the dictionary
    self.__W = len(self.__vocab)
    # the number of documents to analyze in each iteration
    self.__batchsize = batchsize
    # the total number of documents
    self.__D = d
    # the number of topics
    self.__K = k
    # the number of iterations
    self.__documentstoanalyze = self.__D / self.__batchsize
    # tau
    self.__tau = tau
    # kappa
    self.__kappa = kappa
    # LDA instance (alpha=1/K, eta=1/K, tau_0=tau, kappa=kappa)
    self.__ldaObj = onlineldavb.OnlineLDA(self.__vocab, self.__K, self.__D,
                                          1. / self.__K, 1. / self.__K,
                                          self.__tau * 1.0, self.__kappa)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration #batchsize = 64 batchsize = 32 # The total number of tweets #D=297861 D = 1163 # The number of topics #K = 20 K = 10 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('dictnostops.txt').readlines() W = len(vocab) #open rawdata #train_file = open("congress_train.txt") train_file = open("text.txt") train = train_file.readlines() # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 #olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 128., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) iter = 0 for iteration in range(0, documentstoanalyze): # Download some articles #(docset, articlenames) = \ #wikirandom.get_random_wikipedia_articles(batchsize) docset = train[iter:(iter+batchsize)] iter+=batchsize # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) print wordids print wordcts perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main():
    """
    Analyzes the JACM abstracts, one document at a time, using online VB for LDA.
    """
    articles = list()
    artnames = list()
    for line in file('./jacm/withIDAbstracts.txt').readlines():
        combo = line.split('\t')
        artnames.append(combo[0])
        articles.append(combo[1])
    # The number of documents to analyze each iteration
    batchsize = 1
    # The total number of documents in the corpus
    D = len(artnames)
    # The number of topics
    K = 54
    # How many documents to look at
    documentstoanalyze = len(artnames)
    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, D):
        # Analyze one article per iteration
        docset = list()
        docset.append(articles[iteration])
        articlenames = list()
        articlenames.append(artnames[iteration])
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        print bound
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d: rho_t = %f, held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        numpy.savetxt('./simpleLDA/gamma-%d.dat' % iteration, gamma)
        if (iteration % 50 == 0 or iteration == D - 1):
            numpy.savetxt('./simpleLDA/lambda-%d.dat' % iteration, olda._lambda)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 100 # The total number of documents in Wikipedia D = 1000 # The number of topics K = 100 # How many documents to look at documentstoanalyze = int(D / batchsize) # Our vocabulary vocab = file('./com_all_words.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles docset = [] counts = [] linecache.clearcache() startpoint = iteration * batchsize + 1 # get the paper keywords in batches for i in range(batchsize): f1 = open('com_all_key.txt', 'r') f2 = open('com_all.txt', 'r') docset.append( linecache.getline('com_all_key.txt', min(D, startpoint + i))[:-1]) counts.append( linecache.getline('com_all.txt', min(D, startpoint + i))[:-1]) # Give them to online LDA # print docset[0] (gamma, bound) = olda.update_lambda(docset, counts) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts) # print [olda._vocab[x] for x in docset[0].split(';')], wordids[0], wordcts[0] perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda_paper-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma_paper-%d.dat' % iteration, gamma)
def main(): # The number of documents to analyze each iteration. batchsize = args.batchsize # The total number of documents in the corpus. D = args.num_docs # The number of topics. K = args.num_topics # How many documents to look at documentstoanalyze = int(D / batchsize) # The vocabulary vocab = file(args.vocab_file).readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 alpha = 1. / K # prior on topic weights theta eta = 1. / K # prior on p(w|topic) Beta tau_0 = args.tau_0 # learning parameter to downweight early documents kappa = args.kappa # learning parameter; decay factor for influence of batches olda = onlineldavb.OnlineLDA(vocab, K, D, alpha, 1. / K, tau_0, kappa) dataset_file = open(args.dataset) start = time.time() for iteration in range(0, documentstoanalyze): # Read a batch of articles. docset = batch_read(dataset_file, batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): i = iteration pct = round((i * 1.0 / documentstoanalyze) * 100, 2) elapsed = int(time.time() - start) Printer( "Processed {0} batches. ~ {1}% complete. Elapsed time: {2}s". format(i, pct, elapsed)) if (iteration % args.model_out_freq == 0): numpy.savetxt( '{0}lambda-{1}.dat'.format(args.outdir, iteration), olda._lambda) numpy.savetxt( '{0}gamma-{1}.dat'.format(args.outdir, iteration), gamma) numpy.savetxt('{0}lambda-final.dat'.format(args.outdir), olda._lambda) numpy.savetxt('{0}gamma-final.dat'.format(args.outdir), gamma)
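# The snippet above relies on a batch_read helper that is not shown here. A
# plausible sketch, assuming the dataset file stores one document per line
# (the helper below is an assumption, not the original implementation):
def batch_read(dataset_file, batchsize):
    # Read up to batchsize lines from the open file, one document per line.
    docset = []
    for _ in range(batchsize):
        line = dataset_file.readline()
        if not line:
            break  # end of file: return a short (possibly empty) batch
        docset.append(line.strip())
    return docset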
def __init__(self, K, tau0, kappa):
    # The total number of documents in the corpus
    self.D = 330
    # Our vocabulary
    self.vocab = open('dictnostops.txt', 'rt').readlines()
    self.W = len(self.vocab)
    # Initialize the algorithm with alpha=1/K, eta=1/K, and the given tau0 and kappa
    self.old_alpha = onlineldavb.OnlineLDA(self.vocab, K, self.D,
                                           1. / K, 1. / K, tau0, kappa)
    self.iteration = 0
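# The constructor above only sets up the model; a wrapper like this would
# typically also expose a method that feeds the model one mini-batch at a time.
# A minimal sketch, assuming docset is a list of raw document strings
# (the method name and body are assumptions, not the original class):
def process_batch(self, docset):
    import numpy
    import onlineldavb
    (gamma, bound) = self.old_alpha.update_lambda(docset)
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, self.old_alpha._vocab)
    perwordbound = bound * len(docset) / (self.D * sum(map(sum, wordcts)))
    self.iteration += 1
    return gamma, numpy.exp(-perwordbound)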
def main(): # The number of documents to analyze each iteration batchsize = 100 # The total number of questions on Stack Overflow D = 3.3e6 # The number of topics K = 20 # Make sure the topics are included as features for analysis feature_names.extend('Topic%d' % k for k in range(K)) print("Reading the vocabulary") vocab = [w.strip() for w in file('./vocab4.txt')] # How many words are in the vocabulary W = len(vocab) print("Reading the data") data = cu.get_dataframe(train_file) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) print("Allocating the topics") allocate_topics(lda, data, K, batchsize, D) print("Extracting features") fea = extract_features(feature_names, data) print("Training the model") rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=4) rf.fit(fea, data["OpenStatus"]) print("Reading test file and making predictions") data = cu.get_dataframe(test_file) allocate_topics(lda, data, K, batchsize, D) test_features = extract_features(feature_names, data) probs = rf.predict_proba(test_features) print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of questions on Stack Overflow D = 3.3e6 # The number of topics K = 50 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./vocab2.txt').readlines() W = len(vocab) # Our set of questions from Stack Overflow questions = QuestionSet(datafilename) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) print 'processing', documentstoanalyze for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = questions.get_batch(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # How many documents to look at if len(argv) < 2: documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles docset, articlenames = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA gamma, bound = olda.update_lambda(docset) # Compute an estimate of held-out perplexity wordids, wordcts = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if iteration % 10 == 0: print "Iteration: ", iteration numpy.savetxt('lambda.dat', olda._lambda) numpy.savetxt('gamma.dat', gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ doc_files = sys.argv[1] (docset, articlenames) = \ load_documents(doc_files) D = len(docset) # of topics K = int(sys.argv[2]) # Our vocabulary vocab = file('./dictnostops_test.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 '''kappa set to 0 to eliminate decay''' olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print ' rho_t = %f, held-out perplexity estimate = %f' % \ ( olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. print(olda._lambda.shape) print(gamma.shape) numpy.savetxt('lambda.dat', olda._lambda) numpy.savetxt('gamma.dat', gamma)
def main(num_batches, K): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. Arguments: - num_batches: the number of batchs to take corpus_size = num_batches * batch_size - K : the number of topics, determined from stdin """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, num_batches): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
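# main above takes num_batches and K as arguments; a small driver stanza like
# the following could supply them from the command line (an assumed entry
# point, not part of the original script):
if __name__ == '__main__':
    import sys
    num_batches = int(sys.argv[1]) if len(sys.argv) > 1 else 100
    K = int(sys.argv[2]) if len(sys.argv) > 2 else 100
    main(num_batches, K)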
def main():
    '''Read papers from papers.csv and fit them in a single online LDA update.'''
    papers_ = []
    with open('papers.csv', 'r') as csvfile:
        for line in csv.reader(csvfile, delimiter=',', quotechar='"'):
            papers_.append(line)
    D = len(papers_)
    # The number of topics
    K = 10
    # Our vocabulary
    vocab = open('./dictnostops.txt').readlines()
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    docset = [row[3] for row in papers_]
    #articlenames = [row[0] for row in papers_]
    # Give them to online LDA
    (gamma, bound) = olda.update_lambda_docs(docset)
    # Compute an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print('%d: rho_t = %f, held-out perplexity estimate = %f' %
          (1, olda._rhot, numpy.exp(-perwordbound)))
    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.
    numpy.savetxt('lambda.dat', olda._lambda)
    numpy.savetxt('gamma.dat', gamma)
    # show topics
    printtopics.main(5)
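# The call to printtopics.main(5) above prints the top words of each topic
# from the saved lambda file. A rough sketch of what such a routine does,
# assuming lambda.dat and the vocabulary list used above (this is an
# approximation of printtopics, not the module itself):
def show_top_words(lambda_file, vocab, ntop=5):
    import numpy
    lam = numpy.loadtxt(lambda_file)
    for k in range(len(lam)):
        topic = lam[k, :] / sum(lam[k, :])          # normalize to a distribution
        order = list(numpy.argsort(topic)[::-1])    # largest entries first
        words = [vocab[i].strip() for i in order[:ntop]]
        print '%d: %s' % (k, ' '.join(words))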
def main(): """ using online VB for LDA on Archive data. """ # The number of documents to analyze each iteration batchsize = 1000 # The total number of documents in Wikipedia D = 7000 # The number of topics K = 10 # How many documents to look at documentstoanalyze = int(D/batchsize) if (len(sys.argv) > 1): K = int(sys.argv[1]) # Our vocabulary vocab = file(data_dir+'/dictionary_all.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): docset= get_abstracts(iteration) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. numpy.savetxt('%darchive-lambda-%d.dat' % (K, iteration), olda._lambda) numpy.savetxt('%darchive-gamma-%d.dat' % (K, iteration), gamma)
def main(): # LDA: a documents contains all the keywords of some journal/conference # equivalent to cluster keywords over journals/conferences journal_or_conference = sys.argv[1] num = int(sys.argv[2]) conn = jcke.get_db_conn() # default (num <=0 or > max number): figure out all the journals/conferences keywords if num <= 0 or (num >= 15151 and journal_or_conference == "journal") or ( num >= 4545 and journal_or_conference == "conference"): query = """ SELECT COUNT(*) FROM ##journal_or_conference## """ query = query.replace("##journal_or_conference##", journal_or_conference) conn.cursor.execute(query) num = conn.cursor.fetchall( ) # number of journals/conferences to process # document parsing journal_conf_list = os.listdir("journal_conf_keyword") # check if txt files exist, then generate those docs if not num == len(journal_conf_list): jcke.journal_conf_keyword_generation(conn, num, journal_or_conference) journal_conf_list = os.listdir("journal_conf_keyword") # The number of journal/conference keyword sets in each batch batchsize = lambda num: num if num <= 100 else 100 batch = batchsize(num) iteration_times = int(num / batch) # The total number of journals/conferences DocNum = lambda journal_or_conference: 15151 if journal_or_conference == "journal" else 4545 D = DocNum(journal_or_conference) # The number of topics K = 100 # maybe some other numbers # Our vocabulary : we need some vocabulary set! vocab = dict() # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 online_LDA = lda.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* sooner than this.) for iteration in range(0, iteration_times): # getting documents (keyword sets) if iteration != iteration_times - 1: journal_conf_keyword_list = jcke.input_journal_conf_keywords( journal_conf_list[iteration * batch:(iteration + 1) * batch]) else: journal_conf_keyword_list = jcke.input_journal_conf_keywords( journal_conf_list[iteration * batch:]) # online LDA for keyword sets # here we update the relative function in the package (dangerous!) online_LDA._vocab = jcke.vocabulary_generation( journal_conf_keyword_list, online_LDA._vocab) (gamma, bound) = online_LDA.update_lambda(journal_conf_keyword_list) # Compute an estimate of held-out perplexity (keywordids, keywordcts) = lda.parse_doc_list(journal_conf_keyword_list, online_LDA._vocab) perkeywordbound = bound * len(journal_conf_keyword_list) / ( D * sum(map(sum, keywordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, online_LDA._rhot, numpy.exp(-perkeywordbound)) # Save lambda, the parameters to the variational distributions over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in the last iteration. numpy.savetxt('lambda-%s.dat' % journal_or_conference, online_LDA._lambda) numpy.savetxt('gamma-%s.dat' % journal_or_conference, gamma)
def main(): # unpack input arguments # seednum = 1 # documentstoanalyze = 2000 # batchsize = 10 # priv = 1 # epsilon = 1 # comp = 2 # mech = 0 seednum = int(sys.argv[1]) documentstoanalyze = int(sys.argv[2]) batchsize = int(sys.argv[3]) priv = int(sys.argv[4]) # 1 is private version, 0 is nonprivate version # epsilon = float(sys.argv[5]) # total privacy budget comp = int(sys.argv[5]) # mech = int(sys.argv[6]) # 0 for Gaussian, 1 for Laplace # The number of topics #K = 100 K = 50 #JF # load data # the_filename = Data_PATH+'wiki_docsmallset' # with open(the_filename, 'rb') as f: # docset = cPickle.load(f) #the_filename = Data_PATH+'wiki_docsmallset_D=%s' %(400000) the_filename = os.path.join(Data_PATH, 'wiki_docsmallset_D=%s' % (400000)) #JF: Make this work on Windows if resampleShortDocs: the_filename = the_filename + '_resample_short_docs' with open(the_filename, 'rb') as f: docset = cPickle.load(f) D = len(docset) print 'document length: %s' % (D) nu = batchsize / float(D) # sampling rate numpy.random.seed(seednum) print 'seednum %s mini-batchsize %s and number of iter %s' % ( seednum, batchsize, documentstoanalyze) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) """ privacy budget calculation """ # (1) to set the same level of burned privacy, we first calculate MA composition #sigma = 1.00000000000000000000000000000000000001 #a small value to minimize the noise #sigma = 1.1 #an intermediate value sigma = 1.24 #an intermediate value #sigma = 1.5 #an intermediate value #sigma = 2 #a larger value, expected to substantially reduce privacy and performance. total_del = 1e-4 J = documentstoanalyze total_eps_MA = cal_pri.moments_accountant(sigma, total_del, nu, J) print 'total privacy loss is %f' % (total_eps_MA) #(2) strong composition del_iter = 1e-6 res = minimize_scalar(cal_pri.strong_composition, bounds=(0, 50), args=(total_eps_MA, total_del, J, nu, del_iter), method='bounded') eps_iter = res.x gamma_noise = 0 # we don't use this at all. if comp == 0: #MA c2 = 2 * np.log(1.25 / del_iter) eps_iter = np.sqrt(c2) / sigma budget = [eps_iter, del_iter] elif comp == 1: #strong composition budget = [eps_iter, del_iter] else: print "we don't support this composition" if priv: print 'private version' olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. 
/ K, 1024., 0.7, priv, budget, gamma_noise, mech) perplexity = numpy.zeros(documentstoanalyze) # for iteration in range(0, maxIter): for iteration in range(0, documentstoanalyze): # subset of data rand_perm_nums = numpy.random.permutation(len(docset)) idx_minibatch = rand_perm_nums[0:batchsize] docsubset = list(docset[i] for i in idx_minibatch) # Give them to online LDA (gamma, bound) = olda.update_lambda_docs(docsubset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docsubset, olda._vocab) perwordbound = bound * len(docsubset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) perplexity[iteration] = numpy.exp(-perwordbound) # save perplexity if priv: # if gamma_noise: # method = 'private_epsilon_%s_cdp_%s_gamma_noise_%s' % (epsilon, cdp, gamma_noise) # else: # method = 'private_epsilon_%s_cdp_%s' %(epsilon, cdp) # method = 'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_D=%s' %(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], D) #method = Results_PATH+'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_Lap=%s_D=%s' %(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], total_eps_MA, sys.argv[5], sys.argv[6], D) method = os.path.join( Results_PATH, 'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_Lap=%s_D=%s' % (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], total_eps_MA, sys.argv[5], sys.argv[6], D)) else: #method = Results_PATH+'static_nonprivate_seed=%s_J=%s_S=%s_priv=%s_D=%s' %(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], D) method = os.path.join( Results_PATH, 'static_nonprivate_seed=%s_J=%s_S=%s_priv=%s_D=%s' % (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], D)) if resampleShortDocs: method = method + '_resample_short_docs' numpy.save(method + '.npy', perplexity) # method = 'private_epsilon_1' # filename = method+'_D=_%s_S=_%s' %(D, batchsize) # numpy.save(filename+'.npy', test_log_likelihood) # save lambda and gamma numpy.savetxt(method + '_lambda.dat', olda._lambda) numpy.savetxt(method + '_gamma.dat', gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() # The number of documents to analyze each iteration batchsize = 100 # The total number of documents in Wikipedia D = 1000 #D = 2129792 for the whole set # The number of topics K = 30 # Our vocabulary vocab = file('./com_all_words.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) iteration = 0 while iteration * batchsize * size <= D: # Download some articles docset = [] counts = [] linecache.clearcache() startpoint = iteration * batchsize * size + batchsize * rank + 1 if startpoint > D: # search to the end break # stop # get the paper keywords in batches for i in range(batchsize): f1 = open('com_all_key.txt', 'r') f2 = open('com_all.txt', 'r') docset.append( linecache.getline('com_all_key.txt', min(D, startpoint))[:-1]) counts.append( linecache.getline('com_all.txt', min(D, startpoint))[:-1]) startpoint = startpoint + 1 # print type(docset), type(docset[0]), docset[0] # Give them to online LDA (gamma, bound) = olda.update_lambda(docset, counts) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts) # print wordcts[0:5] perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) iteration = iteration + 1 # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. # print olda._lambda[0] gammas = comm.gather(gamma, root=0) lambdas = comm.gather(olda._lambda, root=0) if rank == 0: gamma_result = numpy.vstack((x for x in gammas)) lambda_result = numpy.vstack((x for x in lambdas)) numpy.savetxt('lambda_parallel.dat', olda._lambda) numpy.savetxt('gamma_parallel.dat', gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ wn.ensure_loaded() wiki_pool = wiki_local.WikiPool() # The number of documents to analyze each iteration batchsize = 1 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 30 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) + 1 # Our vocabulary #vocab = file('./dictnostops.txt').readlines() #vocab = file('./wordnet_nouns.txt').readlines() #vocab = file('./synset_dict.txt').readlines() #vocab = file('./wn_ambig_no_stop.txt').readlines() vocab = file('./mixed_wn_dict.txt').readlines() #vocab = [] #for word in words.words(): # word = str(word).lower() # word = re.sub(r'[^a-z]', '', word) # if word != '': # vocab.append(word) ##we get repeats because of upper -> lowercase? #vocab = set(vocab) #vocab = list(vocab) W = len(vocab) print W # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wiki_pool.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda_docs(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 50 == 0): numpy.savetxt( 'data_ground_truth_disambig/lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt( 'data_ground_truth_disambig/gamma-%d.dat' % iteration, gamma) numpy.savetxt('data_ground_truth_disambig/lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('data_ground_truth_disambig/gamma-%d.dat' % iteration, gamma) print "finished iterations" wiki_pool.end()
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ global D global doc_list global last_gamma_file cut_words() print D print len(doc_list) # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia #D = 500 # The number of topics K = int(sys.argv[1]) # How many documents to look at if (len(sys.argv) < 3): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./chineseNoStopWords.txt').readlines() #print vocab W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) print documentstoanalyze perplexity_set = [] iter_set = [] for iteration in range(0, documentstoanalyze): # Download some articles ''' (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) ''' docset = doc_list[iteration * batchsize:(iteration + 1) * batchsize] # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) perplexity_set.append(numpy.exp(-perwordbound)) iter_set.append(iteration) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 100 == 0 or iteration == documentstoanalyze - 1): numpy.savetxt( './res_' + sys.argv[1] + '/lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('./res_' + sys.argv[1] + '/gamma-%d.dat' % iteration, gamma) last_gamma_file = './res_' + sys.argv[1] + '/lambda-%d.dat' % ( documentstoanalyze - 1) save_lambda_path = 'last_lambda_' + sys.argv[1] + '.txt' flast = open(save_lambda_path, 'w') flast.write(last_gamma_file) flast.close()
def main(): # unpack input arguments # seednum = 1 # documentstoanalyze = 2000 # batchsize = 1000 # priv = 0 # epsilon = 1 # comp = 2 seednum = int(sys.argv[1]) documentstoanalyze = int(sys.argv[2]) batchsize = int(sys.argv[3]) priv = int(sys.argv[4]) # 1 is private version, 0 is nonprivate version epsilon = float(sys.argv[5]) # total privacy budget comp = int(sys.argv[6]) # 0 conventional, 1 advanced, 2 CDP # The number of topics K = 100 # D = 1000000 D = 5000000 nu = batchsize / float(D) # sampling rate numpy.random.seed(seednum) print('seednum %s mini-batchsize %s and number of iter %s' % (seednum, batchsize, documentstoanalyze)) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) gamma_noise = 0 # will use Laplace noise all the time if comp == 2: # budget = numpy.sqrt(epsilon/float(documentstoanalyze)) # budget = numpy.sqrt(epsilon*D/float(2*batchsize)) budget = numpy.sqrt(2 * epsilon) / float( 2 * nu * numpy.sqrt(documentstoanalyze)) elif comp == 1: delta = 0.000001 budget = epsilon / float( 4 * nu * numpy.sqrt(2 * documentstoanalyze * numpy.log(1 / delta))) else: # budget = epsilon/float(documentstoanalyze) budget = epsilon / float(2 * documentstoanalyze * nu) if priv: print('private version') olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7, priv, budget, gamma_noise) # the_filename = Data_PATH+'wiki_data' # with open(the_filename, 'rb') as f: # docset = cPickle.load(f) # load all the documents # docset = [] # for whichdoc in range(1, 21): # the_filename = Data_PATH+'wikidata_seednum=_%s' %(whichdoc) # with open(the_filename, 'rb') as f: # docset1 = cPickle.load(f) # docset = docset + docset1 # print "docset %s is loaded" %(whichdoc) # # print "docset all loaded" perplexity = numpy.zeros(documentstoanalyze) # D_test = 10000 # for iteration in range(0, maxIter): for iteration in range(0, documentstoanalyze): # subset of data # rand_perm_nums = numpy.random.permutation(len(docset)) # idx_minibatch = rand_perm_nums[0:batchsize] # docsubset = list(docset[i] for i in idx_minibatch) # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda_docs(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print('%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound))) # # Give them to online LDA # (gamma, bound) = olda.update_lambda_docs(docsubset) # # Compute an estimate of held-out perplexity # (wordids, wordcts) = onlineldavb.parse_doc_list(docsubset, olda._vocab) # perwordbound = bound * len(docsubset) / (D * sum(map(sum, wordcts))) # print '%d: rho_t = %f, training perplexity estimate = %f' % \ # (iteration, olda._rhot, numpy.exp(-perwordbound)) # compute test perplexity # idx_test = rand_perm_nums[batchsize+1:batchsize+1+D_test] # doctest = list(docset[i] for i in idx_test) # # (gamma_test, ss) = olda.do_e_step_docs(doctest) # # Estimate held-out likelihood for current values of lambda. 
# bound_test = olda.approx_bound_docs(doctest, gamma_test) # (wordids, wordcts_test) = onlineldavb.parse_doc_list(doctest, olda._vocab) # # # perwordbound_test = bound_test*D_test / float(D*sum(map(sum, wordcts_test))) # perword_test_log_likelihood = bound_test / float(sum(map(sum, wordcts_test))) # print '%d: rho_t = %f, test perplexity estimate = %f' % \ # (iteration, olda._rhot, perword_test_log_likelihood) perplexity[iteration] = numpy.exp(-perwordbound) # save perplexity if priv: # if gamma_noise: # method = 'private_epsilon_%s_cdp_%s_gamma_noise_%s' % (epsilon, cdp, gamma_noise) # else: # method = 'private_epsilon_%s_cdp_%s' %(epsilon, cdp) method = 'private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % ( sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6]) else: method = 'Nonprivate_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % ( sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6]) numpy.save(method + '.npy', perplexity) # method = 'private_epsilon_1' # filename = method+'_D=_%s_S=_%s' %(D, batchsize) # numpy.save(filename+'.npy', test_log_likelihood) # save lambda and gamma numpy.savetxt(method + '_lambda.dat', olda._lambda) numpy.savetxt(method + '_gamma.dat', gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batch_size = 4 # Total number of documents in the population. For a fixed corpus, # this is the size of the corpus. In the truly online setting number_of_documents = 71 # The number of topics number_of_topics = 1 # establish mysql database connection database = MysqlMessager(database="keyword_app") sql = "select Abstract from PreprocessedAbstracts;" database.excute_sql(sql) row_iteration = database.fetch() abstracts = [row[0] for row in row_iteration] # How many documents to look at if len(sys.argv) < 2: documents_to_analyze = int(number_of_documents / batch_size) else: documents_to_analyze = int(sys.argv[1]) # Our vocabulary all_keywords_file_path = "../../keywords/abstract_109.txt" with read_pickle_file(all_keywords_file_path) as content: vocab = list(content) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, number_of_topics, number_of_documents, 1. / number_of_topics, 1. / number_of_topics, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documents_to_analyze): # set dataset as list that stores all abstracts doc_set = abstracts # Give them to online LDA (gamma, bound) = olda.update_lambda(doc_set) # Compute an estimate of held-out perplexity (word_ids, word_count_times) = onlineldavb.parse_doc_list(doc_set, olda.vocab) per_word_bound = bound * len(doc_set) / ( number_of_documents * sum(map(sum, word_count_times))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda.rhot, numpy.exp(-per_word_bound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if iteration % 10 == 0: numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main(): """ Analyzes scraped pages using scikit-learn.LDA """ # The number of topics K = 10 # no of documents D = 300 n_features = 1000 # Our vocabulary vocab = list(set(file('./vocab').readlines())) W = len(vocab) # Add terms and topics to the DB db.init() db.add_terms(vocab) db.add_topics(K) olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # grab documents ### Load your scraped pages, re-tokenize, and vectorize result. docset, docnames = [], [] for filename in os.listdir(os.getcwd()): if filename.endswith('.html'): tree = html.parse(filename) try: encoding = tree.xpath('//meta/@charset')[0] except IndexError: encoding = 'utf-8' with open(filename) as page: rawtext = page.read() try: rawtext = rawtext.decode(encoding, errors='backslashreplace') except TypeError: continue # encoding issues, see http://stackoverflow.com/questions/19527279/python-unicode-to-ascii-conversion docset += [clean_html(rawtext)] docnames += [filename[:-5]] if not(len(docset) % 10): print("loaded " + str(len(docset)) + " documents") # Give them to online LDA # Also computes an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) (gamma, bound) = olda.update_lambda(wordids, wordcts) # Arrays for adding batches of data to the DB # doc_array = [] # doc_term_array = [] # for d in range(len(docnames)): # doc_array.append((docnames[d], docset[d])) doc_array = zip(docnames, docset) # Add a batch of docs to the DB; this is the one DB task that is not in # the separate DB write thread since later tasks depend on having doc ids. # Since writes take so long, this also balaces the two threads time-wise. doc_ids = db.add_docs(doc_array) doc_topic_array = [] for d in range(len(gamma)): doc_size = len(docset[d]) for k in range(len(gamma[d])): doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size)) db.add_doc_topics(doc_topic_array) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (1, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. numpy.savetxt('lambda-%d.dat' % 1, olda._lambda) numpy.savetxt('gamma-%d.dat' % 1, gamma) topic_terms_array = [] for topic in range(len(olda._lambda)): lambda_sum = sum(olda._lambda[topic]) for term in range(len(olda._lambda[topic])): topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum)) db.update_topic_terms(K, topic_terms_array) gc.collect() # probably not necesary, but precautionary for long runs db.print_task_update() # The DB thread ends only when it has both run out of tasks and it has been # signaled that it will not be recieving any more tasks db.increment_batch_count() db.signal_end()
def main(): """ Analyzes specified documents. """ options = parse_args() print (options) # we assume there exist three files: # a vocab file (corpus_vocab.dat) # a training file (corpus_train.dat) # a validation file (corpus_test.dat) corpus = options.corpus # vocab file W = len(open(corpus + "_vocab.dat", 'r').readlines()) #print(open(corpus + "_vocab.dat", 'r').readlines()) # validation file validation_filename = corpus + "_test.dat" wikirandom = archived_dataset.Corpus(corpus + "_train.dat") # should be _train.dat # else: # import wikirandom #load a held-out set validation_docs = archived_dataset.loadDocs(validation_filename) algorithmname = options.algorithmname # the second tells us the batch size batchsize = options.batchsize # the third tells us a list of number of threads to run. (each will be run sequentially) numthreads = options.numthreads # number of documents trueD = wikirandom._D if(algorithmname == "hbb"): if options.D == -1: D = trueD # number of documents to know in advance else: D = options.D # #prior for topics (ANDRE: this is now a parameter) # eta = 1. eta = options.eta # The total number of documents #D = 3.3e6 (used to be number in Wikipedia; now an argument) # The number of topics K = options.K alpha = 1./K #* numpy.ones(K) batchsize = options.batchsize if (algorithmname == "hdp_filtering"): alg = filtering.HDPFiltering(W,eta, options.max_iters,options.threshold*1E-6, T = 300, K = 30) if (algorithmname == "ss"): if (numthreads == 1): alg = filtering.Filtering(W, K, alpha, eta, 1, True, 0.1) # note: last two args shouldn't matter else: # NOT REALLY SUPPORTED! alg = parallelfiltering.ParallelFiltering(W, K, alpha, eta, 1, 0.1,True,options.numthreads) if (algorithmname == "filtering"): #maxiters = 15 if (numthreads == 1): alg = filtering.Filtering(W, K, alpha, eta, options.max_iters, options.useHBBBound, options.threshold) else: if (options.async): alg = asynchronous.ParallelFiltering(W, K, alpha, eta, options.max_iters, options.threshold, options.useHBBBound, options.batchsize, options.numthreads) batchsize = batchsize * options.async_batches_per_eval * options.numthreads else: alg = parallelfiltering.ParallelFiltering(W, K, alpha, eta, options.max_iters, options.threshold, options.useHBBBound, options.numthreads, options.batchsize) batchsize = batchsize * options.numthreads if (algorithmname == "hbb"): #default: tau0 = 1024; kappa = 0.7 # paper says: kappa = 0.5; tau0 = 64; S (minibatch size) = 4096 # alg = onlineldavb.OnlineLDA(W, K, D, alpha, 1./K, options.tau0, options.kappa) # the original code for NIPS submission, eta = 1/K alg = onlineldavb.OnlineLDA(W, K, D, alpha, eta, options.tau0, options.kappa) # EP for LDA if (algorithmname == "filtering_ep"): if (numthreads == 1): alg = filtering.FilteringEP(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton) else: if (options.async): alg = asynchronous.ParallelFilteringEP(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.batchsize, options.numthreads) batchsize = batchsize * options.async_batches_per_eval * options.numthreads else: alg = parallelfiltering.ParallelFilteringEP(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.numthreads, options.batchsize) batchsize = batchsize * options.numthreads # Fake EP for LDA (?) 
-- to be removed eventually since it's worse than true EP if (algorithmname == "filtering_ep2"): if (numthreads == 1): alg = filtering.FilteringEP2(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton) else: if (options.async): alg = asynchronous.ParallelFilteringEP2(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.batchsize, options.numthreads) batchsize = batchsize * options.async_batches_per_eval * options.numthreads else: alg = parallelfiltering.ParallelFilteringEP2(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.numthreads, options.batchsize) batchsize = batchsize * options.numthreads # Specify the minimum number of points to be processed before we run the evaluation code, since evaluation is expensive minNumPtsPerEval = options.minNumPtsPerEval expGrowthEval = options.expGrowthEval if (minNumPtsPerEval <= 0): if (corpus == "nature"): # 351K docs minNumPtsPerEval = 512 #1e3 elif (corpus == "wiki"): # 3.6M docs #minNumPtsPerEval = 512 #1e3 #2e4 minNumPtsPerEval = 2 # for toy wiki dataset else: minNumPtsPerEval = int(trueD / 1000) print ("Using algorithm: " + str(alg)) recordedData = [] totalTime = 0.0 totalDownloadingTime = 0.0 iters = int(trueD / batchsize) + 1 #print(iters, batchsize, trueD) numPtsProc = 0 # number of points processed since last evaluation for iteration in range(iters): # Get some articles start = time.time() docset = wikirandom.get_random_docs(batchsize) totalDownloadingTime += time.time() - start start = time.time() (alg_alpha, alg_lam) = alg.update_lambda(docset) iter_time = time.time() - start totalTime += iter_time numPtsProc += batchsize # we have processed this many more points if (numPtsProc >= minNumPtsPerEval or iteration == iters-1): # evaluate if we have processed enough points, or this is the last iteration numPtsProc = 0 # reset the counter # The following is just the usual evaluation code from before start = time.time() (perplex, split) = evaluation.evaluate(validation_docs, alg_alpha, alg_lam, options.usePtEst) testTime = time.time() - start print (str(iteration+1) + "/" + str(iters) + " " + str(alg) + " (%g, %g): held-out perplexity estimate = %f, %f" % (iter_time, testTime, perplex, split)) recordedData += [((iteration+1)*batchsize, totalTime, totalDownloadingTime, perplex, split)] # also save perplexity now! if (algorithmname in ["hbb", "filtering", "filtering_ep", "filtering_ep2"]): outfile = corpus + "_" + str(alg) + "_" + str(batchsize) + "_eta" + str(eta) # need to distinguish eta now else: outfile = corpus + "_" + algorithmname + "_" + str(options.batchsize) + "_" + str(options.numthreads) + "_eta" + str(eta) numpy.save(outfile, recordedData) if (expGrowthEval): # double the number of points to the next evaluation minNumPtsPerEval = minNumPtsPerEval * 2 else: print (str(iteration+1) + "/" + str(iters) + " " + str(alg) + " (%g)" % (iter_time)) if (iteration == iters-1): # save final lambda matrix if (algorithmname in ["hbb", "filtering", "filtering_ep", "filtering_ep2"]): topics_outfile = "topics_" + corpus + "_" + str(alg) + "_" + str(batchsize) + "_eta" + str(eta) # need to distinguish eta now else: topics_outfile = "topics_" + corpus + "_" + algorithmname + "_" + str(options.batchsize) + "_" + str(options.numthreads) numpy.save(topics_outfile, alg_lam) # asynchronous filtering needs to terminate its workers if (algorithmname == "filtering"): if (numthreads > 1): if (options.async): alg.shutdown() print ("DONE!")
def fit_olda(parse, doc_path, doc_file, vocab_file, outdir, K, batch_size, \ iterations, verbose_topics, anchors, tmv_pickle, lemmatize, final_pass, \ full_doc_topics): """ Analyzes a set of documents using online VB for LDA. """ # instance to generate radom documents if parse == "live": # read and parse docs on the fly using vocab docgen = generalrandom.LiveparseDocGen(doc_path) else: # alternative: preparsed docgen = generalrandom.PreparseDocGen(doc_file) # The total number of documents in Wikipedia D = docgen.getDocCount() if iterations == 0: iterations = max(D / batch_size, 10) # Our vocabulary if parse == "live" or verbose_topics: vocab = [term.strip() for term in file(vocab_file).readlines()] W = len(vocab) else: W = docgen.getTermCount() vocab = ["term " + str(w) for w in range(W)] # write out general settings to pickle file for use by TMV later if tmv_pickle: # save model settings: vocab, K, docgen f = open(join(outdir, 'settings.pickle'), 'w+') cPickle.dump((vocab, K, docgen, lemmatize), f) f.close() # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7, anchors, \ lem = lemmatize, preparsed = (parse == "preparsed")) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) iteration = 0 old_perplexity = 1.0 * sys.maxint delta_perplexity = 1.0 * sys.maxint delta_perplexities = [old_perplexity] * 10 logfile = open(join(outdir, 'log.out'), 'w+') while iteration < iterations and sum( delta_perplexities ) / 10 > 0.001: # 0.1% change in sample perplexity iter_start = time.time() # Download some articles docset = docgen.get_random_articles(batch_size) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity if parse == "live": (wordids, wordcts) = onlineldavb.parse_doc_list(docset, \ olda._vocab, lemmatize) else: (wordids, wordcts) = docset # estimate perpexity with the current batch perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) perplexity = numpy.exp(-perwordbound) delta_perplexity = abs(old_perplexity - perplexity) / perplexity print '%d: rho_t = %f, held-out perplexity estimate = %f (%.2f%%)' % \ (iteration, olda._rhot, perplexity, delta_perplexity * 100) logfile.write( '%d: rho_t = %f, held-out perplexity estimate = %f (%.2f%%)\n' % (iteration, olda._rhot, perplexity, delta_perplexity * 100)) old_perplexity = perplexity delta_perplexities.pop(0) delta_perplexities.append(delta_perplexity) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt(join(outdir, 'lambda-%d.dat' % iteration), \ olda._lambda) numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma) if verbose_topics: print_topics(K, 7, vocab, olda._lambda, anchors) iteration += 1 logfile.close() if tmv_pickle: f = open(join(outdir, 'olda.pickle'), 'w+') cPickle.dump(olda, f) f.close() # save final iters numpy.savetxt(join(outdir, 'lambda-final.dat'), olda._lambda) numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma) # do a final pass on all documents if (final_pass): fout = open(join(outdir, "gamma-final.dat"), 'w+') if not full_doc_topics: fout.write("doc.lda.id\ttopic.id\tscore\n") i = 0 for doc in docgen: if parse == 'live': #TODO: the parsers should return same order... 
doc = doc[1] (gamma, ss) = olda.do_e_step(doc) j = 0 if not full_doc_topics: for g in gamma.tolist()[0]: if g > 0.051: fout.write("%d\t%d\t%f\n" % (i, j, g)) j += 1 i += 1 else: gf = gamma.tolist()[0] fout.write(('\t'.join(["%f"] * len(gf)) + '\n') % tuple(gf)) fout.close()
def main(): """ Test function for Online LDA using Variational Bayes """ # The number of documents to analyze each iteration batchsize = 4 # The total number of documents (or an estimate of all docs) D = 16 # The number of topics K = 3 # How many documents to look at if (len(sys.argv) < 2): num_iters = int(D / batchsize) else: num_iters = int(sys.argv[1]) # Our vocabulary #vocab = file('./dictnostops.txt').readlines() #W = len(vocab) print "num_iters: %s " % num_iters QSR_vectors = cPickle.load(open("Data/feature_space.p", "rb")) QSR_codebook = cPickle.load(open("Data/code_book.p", "rb")) codebook_len = len(QSR_codebook) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(QSR_codebook, K, D, 1. / K, 1. / K, 1., 0.7) # Run until we've seen D documents. for iteration in range(0, num_iters): print "it: %s. start: %s. end: %s" % (iteration, iteration * batchsize, (iteration + 1) * batchsize) # Download some articles #(docset, articlenames) = wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA docset = QSR_vectors[iteration * batchsize:(iteration + 1) * batchsize] print "size of docset: %s" % len(docset) wordids = [] wordcts = [] for cnt, v in enumerate(docset): print "\n cnt: ", cnt nonzeros = numpy.nonzero(v) available_features = nonzeros wordids.append(available_features) feature_counts = v[nonzeros] wordcts.append(feature_counts) print "v ", v print "avail features %s, feature_cnts: %s" % (available_features, feature_counts) print "wordids %s, wordcts: %s" % (wordids, wordcts) (gamma, bound) = olda.update_lambda(wordids, wordcts) # Compute an estimate of held-out perplexity perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 1 == 0): numpy.savetxt('Data/lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('Data/gamma-%d.dat' % iteration, gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 100 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 rho_t_vector = [] perplexity_vector = [] time_vector = [] time1_vector = [] # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 kappa = 0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., kappa) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) t1 = time.time() for iteration in tqdm(range(0, documentstoanalyze)): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda_docs(docset) # Compute an estimate of held-out perplexity t = time.time() (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) t2 = time.time() time_vector.append(t2 - t1) if len(time1_vector) == 0: time1_vector.append(t2 - t) else: time1_vector.append(time1_vector[-1] + t2 - t) rho_t_vector.append(olda._rhot) perplexity_vector.append(perwordbound) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma) numpy.savetxt('time_%.1f_%d' % (kappa, batchsize), numpy.array(time_vector)) numpy.savetxt('rho_%.1f_%d' % (kappa, batchsize), numpy.array(rho_t_vector)) numpy.savetxt('perplexity_%.1f_%d' % (kappa, batchsize), numpy.array(perplexity_vector)) numpy.savetxt('time1_%.1f_%d' % (kappa, batchsize), numpy.array(time1_vector))
def main(argv):
    doc_list = []
    argList = handleArgs(argv)

    # list the docs in pickledDocs folder
    p = "../data/pickledDocs/"
    l = listdir(p)
    fileList = [p + f for f in l]

    # for each pickled doclist, append all docs to master doclist
    for fi in fileList:
        with open(fi, 'rb') as d:
            docs = cPickle.load(d)
            for k, x in docs.iteritems():
                doc_list.append(x)
    print len(doc_list)

    # D is the total number of docs shown to the model, K is the number of topics
    goal_its = 80                # number of iterations to run LDA
    corp_size = len(doc_list)    # number of documents in the corpus
    D = corp_size * goal_its     # number of documents the model expects to see
    K = 10                       # default topic count, if none given in parameters
    saveModel = False            # whether to save the LDA model itself
    desc = ""                    # for performing non-standard runs
    version = ""                 # for having multiple models with the same parameters
    hyper_param = ""             # for testing hyperparameters

    # define the vocabulary file we will be using
    vocab = helper_funcs.read_dict("../data/dictionary.txt")  # default dict

    # initialize an instance of the OnlineLDA algorithm
    # parameters - dictionary, num topics, learning rate, beta, tau, kappa
    # if the path to an OnlineLDA pickle is passed, it re-opens that pickle
    K = int(argList[0])
    vocab = str.split(file(argList[1]).read())
    if not (argList[2] is None):
        alpha = argList[2]
    else:
        alpha = 0.1
    if not (argList[3] is None):
        beta = argList[3]
    else:
        beta = 1.
    saveModel = False

    lda = onlineldavb.OnlineLDA(vocab, K, D, alpha, beta, 1024, 0.)
    print "created LDA with parameters:\nnumwords: " + str(len(vocab)) + \
        "\n#topics: " + str(K) + "\nalpha: " + str(alpha) + "\nbeta: " + str(beta)

    paramTitle = hyper_param + str(len(vocab) / 1000) + "kwords_" + str(K) + "topics"
    folder = "../data/out/models/" + paramTitle
    if not isdir(folder):
        mkdir(folder)

    W = len(vocab)
    print "dictionary size: " + str(W)
    print paramTitle
    print folder

    #if desc.find("label") > -1:
    #    with open("../data/out/past_models/"+paramTitle+"/dictionary.txt",'wb') as f:
    #        voc = sorted(vocab.items(), key=operator.itemgetter(1))
    #        for x in voc:
    #            f.write(x[0]+"\n")

    # perform LDA on the document list for goal_its iterations, updating lambda
    for i in range(lda._updatect, goal_its):
        print doc_list
        print i
        (gamma, bound) = lda.update_lambda(doc_list)
        (wordids, wordcts) = onlineldavb.parse_doc_list(doc_list, lda._vocab)
        perwordbound = bound * len(doc_list) / (D * sum(map(sum, wordcts)))
        print np.exp(-perwordbound)

        # pickle the model and its output occasionally
        if (i + 1) == goal_its:
            if not isdir(folder):
                mkdir(folder)
            with open(folder + "/gamma.pickle", 'wb') as f:
                cp2 = cPickle.Pickler(f)
                cp2.dump(gamma)
            with open(folder + "/lambda.pickle", 'wb') as f:
                cp = cPickle.Pickler(f)
                cp.dump(lda._lambda)
            np.savetxt(folder + '/lambda.dat', lda._lambda)
            if saveModel:
                with open(folder + "/LDA.pickle", 'wb') as f:
                    cp3 = cPickle.Pickler(f)
                    cp3.dump(lda)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Add terms and topics to the DB db.init() db.add_terms(vocab) db.add_topics(K) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) # Arrays for adding batches of data to the DB doc_array = [] doc_term_array = [] for d in range(len(articlenames)): doc_array.append((articlenames[d], docset[d])) # Add a batch of docs to the DB; this is the one DB task that is not in # the separate DB write thread since later tasks depend on having doc ids. # Since writes take so long, this also balaces the two threads time-wise. doc_ids = db.add_docs(doc_array) doc_topic_array = [] for d in range(len(gamma)): doc_size = len(docset[d]) for k in range(len(gamma[d])): doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size)) db.add_doc_topics(doc_topic_array) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma) topic_terms_array =[] for topic in range(len(olda._lambda)): lambda_sum = sum(olda._lambda[topic]) for term in range(len(olda._lambda[topic])): topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum)) db.update_topic_terms(K, topic_terms_array) gc.collect() # probably not necesary, but precautionary for long runs db.print_task_update() db.increment_batch_count() # The DB thread ends only when it has both run out of tasks and it has been # signaled that it will not be recieving any more tasks db.signal_end()
# Our vocabulary
vocab = file(sys.argv[1] + '.dict').readlines()
vocab = [item.rstrip() for item in vocab]
W = len(vocab)

# Read in abstracts
pages = file(sys.argv[1] + '.corpus').readlines()
pages = [strs.rstrip() for strs in pages]
D = len(pages)
pageID = range(0, D)
nBatches = D / batchsize

# Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=128, kappa=0.7
lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 128., 0.7)

# Run
nBatches = 100
for iteration in range(0, nBatches):
    # Grab abstracts
    (docset, pagenames, pages, pageID) = grabAbstracts(pages, batchsize, pageID)
    # Give them to online LDA
    (gamma, bound) = lda.update_lambda(docset)
    # Compute an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, lda._vocab)
    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
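# Every snippet in this collection ends an iteration with the same perplexity
# bookkeeping. As far as these scripts show, the bound returned by update_lambda
# estimates the variational objective for the full corpus of D documents, so
# multiplying by len(docset)/D rescales it to the current batch, and dividing by
# the batch's token count gives a per-word bound; exp(-perwordbound) is the
# held-out perplexity estimate that gets printed. A tiny numeric illustration
# with made-up values:
import numpy

bound = -5000.0        # hypothetical value returned by update_lambda
batch_docs = 4         # len(docset)
D = 16                 # total number of documents the model expects to see
batch_tokens = 800     # sum(map(sum, wordcts)) for the batch

perwordbound = bound * batch_docs / (D * batch_tokens)
print numpy.exp(-perwordbound)   # ~4.77, the held-out perplexity estimate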