def run_experiment():
    """Run PB LDA (plain and CGS samplers) on every data/lda_N.pb file with
    10 random seeds each, saving phi samples and log-likelihood traces, then
    time the reference R implementation.

    Fixes: regex is now a raw string (``"\."`` in a plain string is an
    invalid-escape hazard), and the pre-compiled pattern's own methods are
    used instead of re-passing it through the module-level ``re`` functions.
    """
    pattern = re.compile(r"lda_([0-9]+)\.pb")
    data_dir = "data"
    # (dataset id, path) for every file matching lda_<id>.pb
    files = [
        (pattern.search(f).group(1), join(data_dir, f))
        for f in listdir(data_dir)
        if isfile(join(data_dir, f)) and pattern.match(f)
    ]
    cmd_str = "peircebayes {} -n 100 -m lda -t -s {}"
    cmd_str2 = "peircebayes {} -n 100 -m lda -t -s {} -a cgs"
    np.random.seed(1234)
    start = time.time()
    for i, f in files:
        print(i)
        # sample 10 times
        for j, seed in enumerate(np.random.choice(5000, 10, replace=False) + 1):
            call_cmd(cmd_str.format(f, seed))
            # arr_1 of the averaged samples is the topic-word matrix phi
            phi = np.load("/tmp/peircebayes/avg_samples.npz")["arr_1"]
            np.savez(join(data_dir, "phi_{}_{}".format(i, j)), **{"phi": phi})
            call_cmd("cp /tmp/peircebayes/lls.npz data/lls_{}_{}.npz".format(i, j))
            # repeat with the collapsed-Gibbs sampler, keeping its ll trace
            call_cmd(cmd_str2.format(f, seed))
            call_cmd("cp /tmp/peircebayes/lls.npz data/lls_cgs_{}_{}.npz".format(i, j))
    end = time.time()
    with open("data/time_pb", "w") as f:
        f.write(str(end - start))
    cmd_str_r = "Rscript run_lda.R"
    start = time.time()
    call_cmd(cmd_str_r)
    end = time.time()
    with open("data/time_r", "w") as f:
        f.write(str(end - start))
def do_pb(data_dir, j, seed, topics, alpha, beta, D, W):
    """Run PeirceBayes LDA once per topic count in `topics`.

    For each T, a pb_dirichlet/generate header is prepended to the base .pb
    program, peircebayes runs for 200 iterations, and the mean of the last 10
    log-likelihoods is collected. Returns one ll value per topic count.
    """
    print('Doing PB.')
    pb_file = join(data_dir, 'lda_0.pb')
    new_pb_file = join(data_dir, 'new_lda_0.pb')
    # change this if you're not me
    path_to_pb_out = '/home/rares/p/peircebayes_experiments/lda_ll_topics/data/peircebayes'
    cmd_str = 'peircebayes {} -n {} -m lda -d -t -s {} -o '+path_to_pb_out
    it = 200
    metric_pb = []
    for T in topics:
        print('\t{}'.format(T))
        # rewrite the model header for this topic count, then append the data
        with open(pb_file, 'r') as src, open(new_pb_file, 'w') as dst:
            header = '''pb_dirichlet({}, mu, {}, {}).
pb_dirichlet({}, phi, {}, {}).
generate(Doc, Token) :-
    Topic in 1..{},
    mu(Topic,Doc),
    phi(Token,Topic).
'''.format(alpha, T, D, beta, W, T, T)
            dst.write(header)
            dst.write(src.read())
        t0 = time.time()
        call_cmd(cmd_str.format(new_pb_file, it, seed))
        #time_pb.append(time.time()-start)
        lls = np.load(join(path_to_pb_out, 'lls.npz'))['lls']
        # score = average log-likelihood over the last 10 iterations
        metric_pb.append(np.average(lls[-10:]))
    return metric_pb
def do_stan(data_dir, j, seed, T, counts):
    """Run the Stan LDA script at several iteration budgets.

    A kwargs header (seed, iter, output paths) is prepended to the base
    script before each run. Returns (wall-times, unseen-data lls).
    """
    print('Doing Stan.')
    # output paths depend only on j, so they are fixed across iterations
    stan_file = join(data_dir, 'lda_stan_kwargs.py')
    out_file = join(data_dir, 'stan_ll_{}.npz'.format(j))
    phi_file = join(data_dir, 'stan_phi_{}.npz'.format(j))
    theta_file = join(data_dir, 'stan_theta_{}.npz'.format(j))
    time_stan, metric_stan = [], []
    for n_iter in [5, 10, 15, 20]:
        print('\t{}'.format(n_iter))
        with open(join(data_dir, 'lda_stan_0.py'), 'r') as src, open(
                stan_file, 'w') as dst:
            dst.write("kwargs = {{'seed': {}, 'iter':{} }}\n".format(seed, n_iter))
            dst.write("out_file = '{}'\n".format(out_file))
            dst.write("phi_file = '{}'\n".format(phi_file))
            dst.write("theta_file = '{}'\n".format(theta_file))
            dst.write(src.read())
        t0 = time.time()
        call_cmd('python3 {}'.format(stan_file))
        time_stan.append(time.time() - t0)
        mu = np.load(theta_file)['arr_0']
        phi = np.load(phi_file)['arr_0']
        metric_stan.append(ull(mu, phi, T, counts))
    return time_stan, metric_stan
def do_tm_vem(data_dir, j, seed):
    """Run topicmodels VEM (R) at several iteration budgets.

    Builds an R script from header + base file + footer, runs it, and reads
    the resulting log-likelihood file. Returns (wall-times, final lls).
    """
    print('Doing topicmodels-VEM.')
    vem_file = join(data_dir, 'vem_lda.R')
    final_vem = join(data_dir, 'vem_lda_{}.R'.format(j))
    vem_ll = join(data_dir, 'vem_ll_{}.R'.format(j))
    time_vem, metric_vem = [], []
    for n_iter in [20, 30, 40, 50]:
        print('\t{}'.format(n_iter))
        with open(vem_file, 'r') as src, open(final_vem, 'w') as dst:
            # header: iteration budget and RNG seed for the R side
            dst.write('''
iter = {}
seed = {}
'''.format(n_iter, seed))
            dst.write(src.read())
            # footer: run VEM on the corpus and dump the lls
            dst.write('''
vem_lls = lapply(list.files("{}", pattern='lda_0.txt', full.names=T), run_vem)
write.table(vem_lls, "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, vem_ll))
        t0 = time.time()
        call_cmd('Rscript {}'.format(final_vem))
        time_vem.append(time.time() - t0)
        with open(vem_ll, 'r') as src:
            lls = [float(line.strip()) for line in src]
        # keep the last reported log-likelihood
        metric_vem.append(lls[-1])
    return time_vem, metric_vem
def do_pb(data_dir, j, seed, T, K):
    """K-fold PeirceBayes run at several iteration budgets.

    For each fold k, runs PB on lda_0_<k>.pb and scores perplexity against
    that fold's held-out counts. Returns (fold-averaged times,
    fold-averaged perplexities), one entry per iteration budget.

    Fix: the counts pickle is now opened via ``with`` — the original
    ``pickle.load(open(...))`` leaked the file handle every fold.
    """
    print('Doing PB.')
    times_pb, metrics_pb = [], []
    for k in range(K):
        print('\tfold {}'.format(k+1))
        with open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r') as fin:
            counts = pickle.load(fin)
        pb_file = 'lda_0_{}.pb'.format(k)
        # change this if you're not me
        path_to_pb_out = '/home/rares/p/peircebayes_experiments/lda_p_time/data/peircebayes'
        cmd_str = 'peircebayes {} -n {} -m lda -d -t -s {} -o '+path_to_pb_out
        time_pb = []
        metric_pb = []
        for i in [10, 20, 30, 40, 200]:
            print('\t\t{}'.format(i))
            start = time.time()
            call_cmd(cmd_str.format(join(data_dir, pb_file), i, seed))
            time_pb.append(time.time()-start)
            # get mu (arr_0) and phi (arr_1) from the averaged samples
            theta = np.load(join(path_to_pb_out, 'avg_samples.npz'))
            mu = theta['arr_0']
            phi = theta['arr_1']
            # score held-out perplexity
            ll = perplexity(mu, phi, T, counts)
            metric_pb.append(ll)
        times_pb.append(time_pb)
        metrics_pb.append(metric_pb)
    return list(np.average(times_pb, axis=0)), list(np.average(metrics_pb, axis=0))
def do_pb(data_dir, j, seed, T, counts):
    """Time PeirceBayes LDA at several iteration budgets and score the
    unseen-data log-likelihood of each run. Returns (times, lls)."""
    print('Doing PB.')
    pb_file = 'lda_0.pb'
    # change this if you're not me
    path_to_pb_out = '/home/rares/p/peircebayes_experiments/lda_ull_time/data/peircebayes'
    cmd_str = 'peircebayes {} -n {} -m lda -d -t -s {} -o '+path_to_pb_out
    time_pb, metric_pb = [], []
    for n_iter in [50, 100, 150, 200]:
        print('\t{}'.format(n_iter))
        t0 = time.time()
        call_cmd(cmd_str.format(join(data_dir, pb_file), n_iter, seed))
        time_pb.append(time.time() - t0)
        # averaged samples: arr_0 is mu (doc-topic), arr_1 is phi (topic-word)
        avg = np.load(join(path_to_pb_out, 'avg_samples.npz'))
        metric_pb.append(ull(avg['arr_0'], avg['arr_1'], T, counts))
    return time_pb, metric_pb
def do_tm_gibbs(data_dir, j, seed):
    """Run topicmodels Gibbs (R) at several iteration budgets.

    Same header/base/footer script construction as the VEM driver; the score
    is the mean of the last 10 log-likelihoods. Returns (times, lls).
    """
    print('Doing topicmodels-Gibbs.')
    gibbs_file = join(data_dir, 'gibbs_lda.R')
    final_gibbs = join(data_dir, 'gibbs_lda_{}.R'.format(j))
    gibbs_ll = join(data_dir, 'gibbs_ll_{}.R'.format(j))
    time_gibbs, metric_gibbs = [], []
    for n_iter in [25, 50, 75, 100]:
        print('\t{}'.format(n_iter))
        with open(gibbs_file, 'r') as src, open(final_gibbs, 'w') as dst:
            dst.write('''
iter = {}
seed = {}
'''.format(n_iter, seed))
            dst.write(src.read())
            dst.write('''
gibbs_lls = lapply(list.files("{}", pattern='lda_0.txt', full.names=T), run_gibbs)
write.table(gibbs_lls, "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, gibbs_ll))
        t0 = time.time()
        call_cmd('Rscript {}'.format(final_gibbs))
        time_gibbs.append(time.time() - t0)
        with open(gibbs_ll, 'r') as src:
            lls = [float(line.strip()) for line in src]
        # average the tail of the chain
        metric_gibbs.append(np.average(lls[-10:]))
    return time_gibbs, metric_gibbs
def main():
    """Time PB / Prism / Stan / topicmodels on news20 and record the results.

    Passing 'seeds' as the sole CLI argument switches to the constrained
    corpus (news20comp_c.pb). Most engines are currently commented out, so
    their recorded times are ~0. NOTE(review): original formatting was lost;
    the timing tail is assumed to be shared by both branches — confirm.
    """
    use_seeds = len(sys.argv) == 2 and sys.argv[1] == 'seeds'
    data_dir = 'data'
    if use_seeds:
        cmd_str = 'peircebayes data/news20comp_c.pb -n 400 -d -t -o $PWD/data/peircebayes'
        #call_cmd(cmd_str)
        #call_cmd('cp data/peircebayes/avg_samples.npz data/avg_samples.npz')
        #call_cmd('upprism clda.psm')
    else:
        cmd_str = 'peircebayes data/news20comp.pb -n 400 -d -t -o $PWD/data/peircebayes'
    seed = 1234
    n_stan_samples = 20
    # 1 tm
    #TODO tm
    start = time.time()
    #call_cmd('Rscript data/news20_tm.R')
    tm_time = time.time() - start
    # 2 pb
    start = time.time()
    #call_cmd(cmd_str)
    #call_cmd('cp data/peircebayes/avg_samples.npz data/avg_samples.npz')
    pb_time = time.time() - start
    # 3 prism
    start = time.time()
    call_cmd('upprism data/news20_prism.psm')
    prism_time = time.time() - start
    # 4 stan: prepend the kwargs header to the base script, then (optionally) run
    stan_file = 'data/lda_stan.py'
    with open(os.path.join(data_dir, 'news20_stan.py'), 'r') as src, open(
            stan_file, 'w') as dst:
        dst.write("kwargs = {{'seed': {}, 'iter':{} }}\n".format(seed, n_stan_samples))
        dst.write(src.read())
    start = time.time()
    #call_cmd('python3 {}'.format(stan_file))
    stan_time = time.time() - start
    with open('data/times', 'w') as report:
        report.write('PB: {} seconds\n'.format(pb_time))
        report.write('Prism: {} seconds\n'.format(prism_time))
        report.write('Stan: {} seconds\n'.format(stan_time))
        report.write('Topicmodels: {} seconds\n'.format(tm_time))
def do_tm_vem(data_dir, j, seed, topics, alpha, beta, D, W, K):
    """K-fold topicmodels VEM (R) across topic counts.

    For each fold, builds and runs an R script per topic count and scores
    perplexity from the dumped theta/phi tables. Returns fold-averaged
    perplexities, one per topic count.

    Fixes: the counts pickle is opened via ``with`` (handle was leaked);
    dropped the dead ``metric_gibbs = []`` copy-paste leftover.
    """
    print('Doing topicmodels-VEM.')
    vem_file = join(data_dir, 'vem_lda.R')
    final_vem = join(data_dir, 'vem_lda_{}.R'.format(j))
    vem_theta = join(data_dir, 'vem_theta_{}.csv'.format(j))
    vem_phi = join(data_dir, 'vem_phi_{}.csv'.format(j))
    metrics = []
    for k in range(K):
        print('\tfold {}'.format(k+1))
        with open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r') as fpkl:
            counts = pickle.load(fpkl)
        metric_vem = []
        for T in topics:
            print('\t\t{}'.format(T))
            with open(vem_file, 'r') as fin, open(final_vem, 'w') as fout:
                fout.write('''
K = {}
alpha = {}
beta = {}
iter = {}
seed = {}
'''.format(T, alpha, beta, 500, seed))
                fout.write(fin.read())
                fout.write('''
vem_models = lapply(list.files("{}", pattern='lda_0_{}.txt', full.names=T), run_vem)
write.table(vem_models[[1]][[1]]@gamma, "{}", sep=" ", row.names=F, col.names=F)
write.table(exp(vem_models[[1]][[1]]@beta), "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, k, vem_theta, vem_phi))
            start = time.time()
            call_cmd('Rscript {}'.format(final_vem))
            #time_vem.append(time.time()-start)
            # read back doc-topic (mu) and topic-word (phi) matrices
            with open(vem_theta, 'r') as fin:
                mu = np.array([[float(i) for i in line.strip().split()]
                               for line in fin])
            with open(vem_phi, 'r') as fin:
                phi = np.array([[float(i) for i in line.strip().split()]
                                for line in fin])
            ll = perplexity(mu, phi, T, counts)
            metric_vem.append(ll)
        metrics.append(metric_vem)
    return list(np.average(metrics, axis=0))
def data_gen():
    """Fetch the sushi3 preference data and convert it to a PB program."""
    # model hyperparameters: K components, symmetric priors
    K = 6
    beta = 50. / float(K)
    gamma = 0.1
    fin_name = 'data/sushi3/sushi3a.5000.10.order'
    fout_name = 'data/sushi3/sushi3.pb'
    # reset the data dir and download the raw rankings
    rewrite_dir('data')
    call_cmd('sh get_data.sh')
    # parse the order file and emit the PB model
    N, data = read_data(fin_name)
    write_pb(fout_name, N, data, K, beta, gamma)
def do_tm_gibbs(data_dir, j, seed, T, K):
    """K-fold topicmodels Gibbs (R) at several iteration budgets.

    For each fold, builds and runs an R script per budget and scores
    perplexity from the dumped theta/phi tables. Returns
    (fold-averaged times, fold-averaged perplexities).

    Fix: the counts pickle is opened via ``with`` — the original
    ``pickle.load(open(...))`` leaked the file handle every fold.
    """
    print('Doing topicmodels-Gibbs.')
    gibbs_file = join(data_dir, 'gibbs_lda.R')
    final_gibbs = join(data_dir, 'gibbs_lda_{}.R'.format(j))
    gibbs_theta = join(data_dir, 'gibbs_theta_{}.csv'.format(j))
    gibbs_phi = join(data_dir, 'gibbs_phi_{}.csv'.format(j))
    times, metrics = [], []
    for k in range(K):
        print('\tfold {}'.format(k+1))
        with open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r') as fpkl:
            counts = pickle.load(fpkl)
        #for doc in counts.toarray():
        #    print list(doc)
        time_gibbs = []
        metric_gibbs = []
        for i in [10, 20, 30, 40, 200]:
            print('\t\t{}'.format(i))
            with open(gibbs_file, 'r') as fin, open(final_gibbs, 'w') as fout:
                fout.write('''
iter = {}
seed = {}
'''.format(i, seed))
                fout.write(fin.read())
                fout.write('''
gibbs_models = lapply(list.files("{}", pattern='lda_0_{}.txt', full.names=T), run_gibbs)
write.table(gibbs_models[[1]][[1]]@gamma, "{}", sep=" ", row.names=F, col.names=F)
write.table(exp(gibbs_models[[1]][[1]]@beta), "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, k, gibbs_theta, gibbs_phi))
            start = time.time()
            call_cmd('Rscript {}'.format(final_gibbs))
            time_gibbs.append(time.time()-start)
            # read back doc-topic (mu) and topic-word (phi) matrices
            with open(gibbs_theta, 'r') as fin:
                mu = np.array([[float(i) for i in line.strip().split()]
                               for line in fin])
            with open(gibbs_phi, 'r') as fin:
                phi = np.array([[float(i) for i in line.strip().split()]
                                for line in fin])
            ll = perplexity(mu, phi, T, counts)
            metric_gibbs.append(ll)
        times.append(time_gibbs)
        metrics.append(metric_gibbs)
    return list(np.average(times, axis=0)), list(np.average(metrics, axis=0))
def do_tm_gibbs(data_dir, j, seed, topics, alpha, beta, D, W, counts):
    """Run topicmodels Gibbs (R) once per topic count at 200 iterations.

    Scores each run's unseen-data log-likelihood from the dumped theta/phi
    tables. Returns one ll per topic count.
    """
    print('Doing topicmodels-Gibbs.')
    gibbs_file = join(data_dir, 'gibbs_lda.R')
    final_gibbs = join(data_dir, 'gibbs_lda_{}.R'.format(j))
    gibbs_theta = join(data_dir, 'gibbs_theta_{}.csv'.format(j))
    gibbs_phi = join(data_dir, 'gibbs_phi_{}.csv'.format(j))
    it = 200
    metric_gibbs = []
    for T in topics:
        print('\t{}'.format(T))
        with open(gibbs_file, 'r') as src, open(final_gibbs, 'w') as dst:
            # header: model hyperparameters for the R side
            dst.write('''
K = {}
alpha = {}
beta = {}
iter = {}
seed = {}
'''.format(T, alpha, beta, it, seed))
            dst.write(src.read())
            # footer: run the sampler and dump gamma (theta) and beta (phi)
            dst.write('''
gibbs_models = lapply(list.files("{}", pattern='lda_0.txt', full.names=T), run_gibbs)
write.table(gibbs_models[[1]][[1]]@gamma, "{}", sep=" ", row.names=F, col.names=F)
write.table(exp(gibbs_models[[1]][[1]]@beta), "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, gibbs_theta, gibbs_phi))
        t0 = time.time()
        call_cmd('Rscript {}'.format(final_gibbs))
        #time_gibbs.append(time.time()-start)
        with open(gibbs_theta, 'r') as src:
            mu = np.array([[float(v) for v in line.strip().split()]
                           for line in src])
        with open(gibbs_phi, 'r') as src:
            phi = np.array([[float(v) for v in line.strip().split()]
                            for line in src])
        metric_gibbs.append(ull(mu, phi, T, counts))
    return metric_gibbs
def main():
    """Concatenate the hLDA model with the arxiv observations, run PB on the
    result, keep the averaged samples, then remove the temporary program.

    Fix: the output handle for the concatenated .pb file is now closed
    deterministically via ``with`` — the original passed a bare
    ``open(..., 'w')`` to call_cmd and never closed it, so the file could be
    incompletely flushed when peircebayes read it.
    """
    cmd_str = 'peircebayes data/lda2_arxiv.pb -d -t -n 50 -b 100'
    with open('data/lda2_arxiv.pb', 'w') as fout:
        # call_cmd's second argument receives the command's stdout
        call_cmd('cat data/pb_hlda3.pb data/arxiv_obs.pb', fout)
    call_cmd(cmd_str)
    call_cmd('cp /tmp/peircebayes/avg_samples.npz data/avg_samples.npz')
    call_cmd('rm data/lda2_arxiv.pb')
def do_stan(data_dir, j, seed, topics, alpha, beta, D, W, K):
    """K-fold Stan LDA across topic counts.

    For each fold and topic count, writes a script that injects kwargs and
    the K/alpha/beta entries into the lda_dat dict, runs it, and scores
    perplexity. Returns fold-averaged perplexities per topic count.

    Fix: the counts pickle is opened via ``with`` (handle was leaked).
    """
    print('Doing Stan.')
    it = 20
    metrics = []
    for k in range(K):
        print('\tfold {}'.format(k+1))
        with open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r') as fpkl:
            counts = pickle.load(fpkl)
        metric_stan = []
        for T in topics:
            print('\t\t{}'.format(T))
            stan_file = join(data_dir, 'lda_stan_kwargs.py')
            out_file = join(data_dir, 'stan_ll_{}.npz'.format(j))
            phi_file = join(data_dir, 'stan_phi_{}.npz'.format(j))
            theta_file = join(data_dir, 'stan_theta_{}.npz'.format(j))
            with open(join(data_dir, 'lda_stan_0_{}.py'.format(k)), 'r') as fin, open(
                    stan_file, 'w') as fout:
                fout.write("kwargs = {{'seed': {}, 'iter':{} }}\n".format(seed, it))
                fout.write("out_file = '{}'\n".format(out_file))
                fout.write("phi_file = '{}'\n".format(phi_file))
                fout.write("theta_file = '{}'\n".format(theta_file))
                for line in fin:
                    if 'lda_dat = {' in line:
                        # splice topic count and symmetric priors into the
                        # Stan data dict right after its opening line
                        fout.write(line)
                        alpha_l = [alpha]*T
                        beta_l = [beta]*W
                        fout.write('''
    'K' : {},
    'alpha' : {},
    'beta' : {},
'''.format(T, alpha_l, beta_l))
                    else:
                        fout.write(line)
            start = time.time()
            call_cmd('python3 {}'.format(stan_file))
            #time_stan.append(time.time()-start)
            mu = np.load(theta_file)['arr_0']
            phi = np.load(phi_file)['arr_0']
            ll = perplexity(mu, phi, T, counts)
            metric_stan.append(ll)
        metrics.append(metric_stan)
    return list(np.average(metrics, axis=0))
def do_pb(data_dir, j, seed, topics, alpha, beta, D, W, K):
    """K-fold PeirceBayes LDA across topic counts.

    For each fold and topic count, prepends a pb_dirichlet/generate header
    to the fold's .pb program, runs PB for 200 iterations, and scores
    perplexity. Returns fold-averaged perplexities per topic count.

    Fix: the counts pickle is opened via ``with`` (handle was leaked).
    """
    print('Doing PB.')
    new_pb_file = join(data_dir, 'new_lda_0.pb')
    # change this if you're not me
    path_to_pb_out = '/home/rares/p/peircebayes_experiments/lda_p_topics/data/peircebayes'
    cmd_str = 'peircebayes {} -n {} -m lda -d -t -s {} -o '+path_to_pb_out
    it = 200
    metrics = []
    for k in range(K):
        print('\tfold {}'.format(k+1))
        with open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r') as fpkl:
            counts = pickle.load(fpkl)
        pb_file = join(data_dir, 'lda_0_{}.pb'.format(k))
        metric_pb = []
        for T in topics:
            print('\t\t{}'.format(T))
            with open(pb_file, 'r') as fin, open(new_pb_file, 'w') as fout:
                fout.write('''pb_dirichlet({}, mu, {}, {}).
pb_dirichlet({}, phi, {}, {}).
generate(Doc, Token) :-
    Topic in 1..{},
    mu(Topic,Doc),
    phi(Token,Topic).
'''.format(alpha, T, D, beta, W, T, T))
                fout.write(fin.read())
            start = time.time()
            call_cmd(cmd_str.format(new_pb_file, it, seed))
            #time_pb.append(time.time()-start)
            # get mu (arr_0) and phi (arr_1)
            theta = np.load(join(path_to_pb_out, 'avg_samples.npz'))
            mu = theta['arr_0']
            phi = theta['arr_1']
            # score held-out perplexity
            ll = perplexity(mu, phi, T, counts)
            metric_pb.append(ll)
        metrics.append(metric_pb)
    return list(np.average(metrics, axis=0))
def do_prism(data_dir, j, seed, topics, alpha, beta, D, W):
    """Run PRISM variational LDA once per topic count.

    Prepends theta/phi value declarations and appends a prism_main goal that
    records the variational free energy and the phi parameters. Returns one
    free-energy value per topic count.
    """
    print('Doing Prism.')
    prism_file = join(data_dir, 'lda_prism_0.psm')
    in_prism = join(data_dir, 'in_prism_{}'.format(j))
    out_prism = join(data_dir, 'out_prism_{}'.format(j))
    vfe_prism = join(data_dir, 'vfe_prism_{}'.format(j))
    metric_prism = []
    for T in topics:
        print('\t{}'.format(T))
        theta_str = 'values(theta(_), [1-{}], a@{}).'.format(T, alpha)
        phi_str = 'values(phi(_), [1-{}], a@{}).'.format(W, beta)
        with open(prism_file, 'r') as src, open(in_prism, 'w') as dst:
            # header: switch declarations for this topic count
            dst.write('''
{}
{}
'''.format(theta_str, phi_str))
            dst.write(src.read())
            # footer: learn, dump free energy, then save phi
            dst.write('''
prism_main :-
    random_set_seed({}),
    set_prism_flag(learn_mode,both),
    go,
    learn_statistics(free_energy, V),
    open('{}',write, Stream),
    write(Stream,V),
    close(Stream),
    save_phi.

save_phi :-
    findall(Param, get_sw(phi(_), [_,_,Param]), Params),
    open('{}',write, Stream),
    forall(member(Param, Params),
        (write(Stream,Param), nl(Stream))),
    close(Stream).
'''.format(seed, vfe_prism, out_prism))
        t0 = time.time()
        call_cmd('upprism {}'.format(in_prism))
        #time_prism.append(time.time()-start)
        with open(vfe_prism, 'r') as src:
            metric_prism.append(float(src.read().strip()))
    return metric_prism
def do_tm_vem(data_dir, j, seed, T, counts):
    """Run topicmodels VEM (R) at several iteration budgets and score the
    unseen-data log-likelihood from the dumped theta/phi tables.
    Returns (wall-times, lls)."""
    print('Doing topicmodels-VEM.')
    vem_file = join(data_dir, 'vem_lda.R')
    final_vem = join(data_dir, 'vem_lda_{}.R'.format(j))
    vem_theta = join(data_dir, 'vem_theta_{}.csv'.format(j))
    vem_phi = join(data_dir, 'vem_phi_{}.csv'.format(j))
    time_vem, metric_vem = [], []
    for n_iter in [20, 30, 40, 50]:
        print('\t{}'.format(n_iter))
        with open(vem_file, 'r') as src, open(final_vem, 'w') as dst:
            dst.write('''
iter = {}
seed = {}
'''.format(n_iter, seed))
            dst.write(src.read())
            # footer: run VEM and dump gamma (theta) and beta (phi)
            dst.write('''
vem_models = lapply(list.files("{}", pattern='lda_0.txt', full.names=T), run_vem)
write.table(vem_models[[1]][[1]]@gamma, "{}", sep=" ", row.names=F, col.names=F)
write.table(exp(vem_models[[1]][[1]]@beta), "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, vem_theta, vem_phi))
        t0 = time.time()
        call_cmd('Rscript {}'.format(final_vem))
        time_vem.append(time.time() - t0)
        with open(vem_theta, 'r') as src:
            mu = np.array([[float(v) for v in line.strip().split()]
                           for line in src])
        with open(vem_phi, 'r') as src:
            phi = np.array([[float(v) for v in line.strip().split()]
                            for line in src])
        metric_vem.append(ull(mu, phi, T, counts))
    return time_vem, metric_vem
def _ls_file_list(self):
    """Return the remote directory's file names via `ssh ... ls`,
    or None when the command fails."""
    cmd = 'ssh %s ls %s' % (self._remote_ip, self._remote_dir)
    #self._logger.debug('excute cmd:%s', cmd)
    status, output = call_cmd(cmd)
    #print status, output
    if status != 0:
        # nonzero exit: could not list the remote dir
        return None
    #self._logger.debug('success get filelist:\n%s', output)
    return output.split()
def do_stan(data_dir, j, seed, topics, alpha, beta, D, W, counts):
    """Run Stan LDA once per topic count (20 iterations each).

    Injects kwargs and the K/alpha/beta entries into the script's lda_dat
    dict, runs it, and scores unseen-data log-likelihood. Returns one ll
    per topic count.
    """
    print('Doing Stan.')
    it = 20
    # these paths depend only on j, so compute them once
    stan_file = join(data_dir, 'lda_stan_kwargs.py')
    out_file = join(data_dir, 'stan_ll_{}.npz'.format(j))
    phi_file = join(data_dir, 'stan_phi_{}.npz'.format(j))
    theta_file = join(data_dir, 'stan_theta_{}.npz'.format(j))
    metric_stan = []
    for T in topics:
        print('\t{}'.format(T))
        with open(join(data_dir, 'lda_stan_0.py'), 'r') as src, open(
                stan_file, 'w') as dst:
            dst.write("kwargs = {{'seed': {}, 'iter':{} }}\n".format(seed, it))
            dst.write("out_file = '{}'\n".format(out_file))
            dst.write("phi_file = '{}'\n".format(phi_file))
            dst.write("theta_file = '{}'\n".format(theta_file))
            for line in src:
                dst.write(line)
                if 'lda_dat = {' in line:
                    # splice topic count and symmetric priors into the Stan
                    # data dict right after its opening line
                    alpha_l = [alpha]*T
                    beta_l = [beta]*W
                    dst.write('''
    'K' : {},
    'alpha' : {},
    'beta' : {},
'''.format(T, alpha_l, beta_l))
        t0 = time.time()
        call_cmd('python3 {}'.format(stan_file))
        #time_stan.append(time.time()-start)
        mu = np.load(theta_file)['arr_0']
        phi = np.load(phi_file)['arr_0']
        metric_stan.append(ull(mu, phi, T, counts))
    return metric_stan
def do_pb(data_dir, j, seed):
    """Time PeirceBayes LDA at several iteration budgets; the score is the
    mean of the last 10 log-likelihoods. Returns (times, lls)."""
    print('Doing PB.')
    pb_file = 'lda_0.pb'
    # change this if you're not me
    path_to_pb_out = '/home/rares/p/peircebayes_experiments/lda_ll_time/data/peircebayes'
    cmd_str = 'peircebayes {} -n {} -m lda -d -t -s {} -o '+path_to_pb_out
    time_pb, metric_pb = [], []
    for n_iter in [50, 100, 150, 200]:
        print('\t{}'.format(n_iter))
        t0 = time.time()
        call_cmd(cmd_str.format(join(data_dir, pb_file), n_iter, seed))
        time_pb.append(time.time() - t0)
        lls = np.load(join(path_to_pb_out, 'lls.npz'))['lls']
        # average the tail of the log-likelihood trace
        metric_pb.append(np.average(lls[-10:]))
    return time_pb, metric_pb
def do_prism(data_dir, j, seed):
    """Run PRISM variational LDA at several max-iteration budgets.

    Appends a prism_main goal that records the variational free energy and
    the phi parameters. Returns (wall-times, free energies).
    """
    print('Doing Prism.')
    prism_file = join(data_dir, 'lda_prism_0.psm')
    in_prism = join(data_dir, 'in_prism_{}'.format(j))
    out_prism = join(data_dir, 'out_prism_{}'.format(j))
    vfe_prism = join(data_dir, 'vfe_prism_{}'.format(j))
    time_prism, metric_prism = [], []
    for n_iter in [50, 100, 150, 200]:
        print('\t{}'.format(n_iter))
        with open(prism_file, 'r') as src, open(in_prism, 'w') as dst:
            dst.write(src.read())
            # footer: learn with a capped iteration count, dump free energy,
            # then save phi
            dst.write('''
prism_main :-
    random_set_seed({}),
    set_prism_flag(learn_mode,both),
    set_prism_flag(max_iterate,{}),
    go,
    learn_statistics(free_energy, V),
    open('{}',write, Stream),
    write(Stream,V),
    close(Stream),
    save_phi.

save_phi :-
    findall(Param, get_sw(phi(_), [_,_,Param]), Params),
    open('{}',write, Stream),
    forall(member(Param, Params),
        (write(Stream,Param), nl(Stream))),
    close(Stream).
'''.format(seed, n_iter, vfe_prism, out_prism))
        t0 = time.time()
        call_cmd('upprism {}'.format(in_prism))
        time_prism.append(time.time() - t0)
        with open(vfe_prism, 'r') as src:
            metric_prism.append(float(src.read().strip()))
    return time_prism, metric_prism
def run_experiment():
    """Run PB LDA (ll mode) on every data/lda_N.pb file with 10 random seeds
    each, saving phi samples and log-likelihood traces, then time the
    reference R implementation.

    Fixes: regex is now a raw string (``'\.'`` in a plain string is an
    invalid-escape hazard), and the pre-compiled pattern's own methods are
    used instead of re-passing it through the module-level ``re`` functions.
    """
    pattern = re.compile(r'lda_([0-9]+)\.pb')
    data_dir = 'data'
    # (dataset id, path) for every file matching lda_<id>.pb
    files = [
        (pattern.search(f).group(1), join(data_dir, f))
        for f in listdir(data_dir)
        if isfile(join(data_dir, f)) and pattern.match(f)
    ]
    cmd_str = 'peircebayes {} -n 200 -m lda_ll -t -s {}'
    #cmd_str2 = 'peircebayes {} -n 200 -m lda_ll -t -s {} -a amcmc'
    np.random.seed(1234)
    start = time.time()
    for i, f in files:
        # sample 10 times
        for j, seed in enumerate(np.random.choice(5000, 10, replace=False)+1):
            call_cmd(cmd_str.format(f, seed))
            # arr_1 of the averaged samples is the topic-word matrix phi
            phi = np.load('/tmp/peircebayes/avg_samples.npz')['arr_1']
            np.savez(join(data_dir, 'phi_{}_{}'.format(i, j)), **{'phi': phi})
            call_cmd('cp /tmp/peircebayes/lls.npz data/lls_{}_{}.npz'.format(i, j))
            #call_cmd(cmd_str2.format(f, seed))
            #call_cmd('cp /tmp/peircebayes/lls.npz data/lls_amcmc_{}_{}.npz'.format(i,j))
    end = time.time()
    with open('data/time_pb', 'w') as f:
        f.write(str(end-start))
    cmd_str_r = 'Rscript run_lda.R'
    start = time.time()
    call_cmd(cmd_str_r)
    end = time.time()
    with open('data/time_r', 'w') as f:
        f.write(str(end-start))
def do_stan(data_dir, j, seed, T, K):
    """K-fold Stan LDA at several iteration budgets.

    For each fold and budget, prepends the kwargs header to the fold's base
    script, runs it, and scores perplexity. Returns
    (fold-averaged times, fold-averaged perplexities).

    Fix: the counts pickle is opened via ``with`` — the original
    ``pickle.load(open(...))`` leaked the file handle every fold.
    """
    print('Doing Stan.')
    times, metrics = [], []
    for k in range(K):
        print('\tfold {}'.format(k+1))
        with open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r') as fpkl:
            counts = pickle.load(fpkl)
        time_stan = []
        metric_stan = []
        for i in [2, 4, 6, 8, 20]:
            print('\t\t{}'.format(i))
            stan_file = join(data_dir, 'lda_stan_kwargs.py')
            out_file = join(data_dir, 'stan_ll_{}.npz'.format(j))
            phi_file = join(data_dir, 'stan_phi_{}.npz'.format(j))
            theta_file = join(data_dir, 'stan_theta_{}.npz'.format(j))
            with open(join(data_dir, 'lda_stan_0_{}.py'.format(k)), 'r'
                    ) as fin, open(stan_file, 'w') as fout:
                fout.write("kwargs = {{'seed': {}, 'iter':{} }}\n".format(seed, i))
                fout.write("out_file = '{}'\n".format(out_file))
                fout.write("phi_file = '{}'\n".format(phi_file))
                fout.write("theta_file = '{}'\n".format(theta_file))
                fout.write(fin.read())
            start = time.time()
            call_cmd('python3 {}'.format(stan_file))
            time_stan.append(time.time()-start)
            mu = np.load(theta_file)['arr_0']
            phi = np.load(phi_file)['arr_0']
            ll = perplexity(mu, phi, T, counts)
            metric_stan.append(ll)
        times.append(time_stan)
        metrics.append(metric_stan)
    return list(np.average(times, axis=0)), list(np.average(metrics, axis=0))
def run_experiment():
    """Run PB on the sushi3 ranking model with 10 random seeds, keeping each
    run's averaged samples and ll trace, and record total wall time."""
    data_dir = 'data/sushi3'
    pb_file = 'sushi3.pb'
    cmd_str = 'peircebayes {} -n 100 -t -s {}'
    np.random.seed(1234)
    t0 = time.time()
    seeds = np.random.choice(1000, 10, replace=False) + 1
    for j, seed in enumerate(seeds):
        call_cmd(cmd_str.format(join(data_dir, pb_file), seed))
        # stash this run's outputs under its seed index
        call_cmd('cp /tmp/peircebayes/avg_samples.npz data/avg_samples_{}.npz'.format(j))
        call_cmd('cp /tmp/peircebayes/lls.npz data/lls_{}.npz'.format(j))
    elapsed = time.time() - t0
    with open('data/rim_time', 'w') as f:
        f.write(str(elapsed))
def do_prism(data_dir, j, seed, topics, alpha, beta, D, W, K):
    """K-fold PRISM variational LDA across topic counts.

    For each fold and topic count, prepends theta/phi value declarations,
    appends a prism_main goal that saves the free energy plus indexed
    phi/theta parameters, runs upprism, and scores perplexity from the
    parsed parameter files. Returns fold-averaged perplexities.

    Fix: the counts pickle is opened via ``with`` (handle was leaked).
    """
    print('Doing Prism.')
    metrics = []
    for k in range(K):
        print('\tfold {}'.format(k+1))
        with open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r') as fpkl:
            counts = pickle.load(fpkl)
        prism_file = join(data_dir, 'lda_prism_0_{}.psm'.format(k))
        in_prism = join(data_dir, 'in_prism_{}'.format(j))
        phi_prism = join(data_dir, 'phi_prism_{}'.format(j))
        theta_prism = join(data_dir, 'theta_prism_{}'.format(j))
        vfe_prism = join(data_dir, 'vfe_prism_{}'.format(j))
        metric_prism = []
        for T in topics:
            print('\t\t{}'.format(T))
            theta_str = 'values(theta(_), [1-{}], a@{}).'.format(T, alpha)
            phi_str = 'values(phi(_), [1-{}], a@{}).'.format(W, beta)
            with open(prism_file, 'r') as fin, open(in_prism, 'w') as fout:
                fout.write('''
{}
{}
'''.format(theta_str, phi_str))
                fout.write(fin.read())
                # footer: learn, dump free energy, then save phi and theta
                # as '<index>|<param list>' lines
                fout.write('''
prism_main :-
    random_set_seed({}),
    set_prism_flag(learn_mode,both),
    go,
    learn_statistics(free_energy, V),
    open('{}',write, Stream),
    write(Stream,V),
    close(Stream),
    save_phi, save_theta.

save_phi :-
    findall((V,Param), get_sw(phi(V), [_,_,Param]), Params),
    open('{}',write, Stream),
    forall(member((V,Param), Params),
        (write(Stream,V), write(Stream,'|'), write(Stream,Param), nl(Stream))),
    close(Stream).

save_theta :-
    findall((V,Param), get_sw(theta(V), [_,_,Param]), Params),
    open('{}',write, Stream),
    forall(member((V,Param), Params),
        (write(Stream,V), write(Stream,'|'), write(Stream,Param), nl(Stream))),
    close(Stream).
'''.format(seed, vfe_prism, phi_prism, theta_prism))
            start = time.time()
            call_cmd('upprism {}'.format(in_prism))
            #time_prism.append(time.time()-start)
            # parse phi: lines are '<1-based index>|<python-literal list>'
            with open(phi_prism, 'r') as fin:
                phi_idx = []
                for line in fin:
                    idx, rest = line.strip().split('|')
                    idx = int(idx)-1
                    topic = np.array(ast.literal_eval(rest))
                    phi_idx.append((idx, topic))
            phi = [0 for i in range(len(phi_idx))]
            for idx, topic in phi_idx:
                phi[idx] = topic
            phi = np.array(phi)
            # parse theta the same way
            with open(theta_prism, 'r') as fin:
                theta_idx = []
                for line in fin:
                    idx, rest = line.strip().split('|')
                    idx = int(idx)-1
                    doc_mix = np.array(ast.literal_eval(rest))
                    theta_idx.append((idx, doc_mix))
            mu = [0 for i in range(len(theta_idx))]
            for idx, doc_mix in theta_idx:
                mu[idx] = doc_mix
            mu = np.array(mu)
            ll = perplexity(mu, phi, T, counts)
            metric_prism.append(ll)
        metrics.append(metric_prism)
    return list(np.average(metrics, axis=0))
def run_experiment():
    """Compare PB against two Church LDA variants over 10 random seeds,
    saving each run's outputs and writing the average wall times."""
    data_dir = 'data'
    pb_file = 'lda_0.pb'
    cmd_str = 'peircebayes {} -n 300 -m lda_ll -d -t -s {}'
    np.random.seed(1234)
    times_pb, times_church, times_church2 = [], [], []
    for j, seed in enumerate(np.random.choice(5000, 10, replace=False) + 1):
        # PeirceBayes run
        t0 = time.time()
        call_cmd(cmd_str.format(join(data_dir, pb_file), seed))
        times_pb.append(time.time() - t0)
        call_cmd('cp /tmp/peircebayes/last_sample.npz data/last_sample_{}.npz'.format(j))
        call_cmd('cp /tmp/peircebayes/lls.npz data/lls_{}.npz'.format(j))
        # Church, variant 1
        church_str = 'church -s {} data/lda_church1_0.md'.format(seed)
        t0 = time.time()
        call_cmd(church_str)
        times_church.append(time.time() - t0)
        call_cmd('mv ll_church1.csv data/ll_church1_{}.csv'.format(j))
        # Church, variant 2
        church2_str = 'church -s {} data/lda_church2_0.md'.format(seed)
        t0 = time.time()
        call_cmd(church2_str)
        times_church2.append(time.time() - t0)
        call_cmd('mv ll_church2.csv data/ll_church2_{}.csv'.format(j))
    with open('data/times', 'w') as f:
        f.write('pb: {}\n'.format(np.average(times_pb)))
        f.write('church: {}\n'.format(np.average(times_church)))
        f.write('church2: {}\n'.format(np.average(times_church2)))
def generate_seed(project, bugnumber):
    """ generates a file that contains json info to run d4j and lithium

    Fixes: uses the `project` parameter where the body referenced the
    undefined name `project_name` (NameError at runtime); dropped the dead
    `expected_msg = []` local.
    """
    initial_projects = ["Chart", "Lang", "Closure", "Math", "Mockito", "Time"]
    if project not in initial_projects:
        raise Exception("Project {} invalid. Please select one of {}".format(
            project, initial_projects))
    project_path = os.path.join(os.getcwd(), "data", project)
    if not os.path.isdir(project_path):
        print("FAILED")  # should print to stop the main script
        raise Exception("Project {} directory not found".format(project_path))
    # Solves the issue of different source paths for the same project
    if project == 'Lang' and int(bugnumber) < 36:
        source_path = get_source_path(project + '2')
    elif project == 'Math' and int(bugnumber) > 84:
        source_path = get_source_path(project + '2')
    else:
        source_path = get_source_path(project)
    # get only bugs choosen by user
    bugnumber = bugnumber.split(",")
    if not is_input_number_valid(bugnumber, project_path):
        print("FAILED")  # should print to stop the main script
        raise Exception(
            "one or more json files({}) are not found in path {}".format(
                bugnumber, project_path))
    bugnumbers = ['{}.json'.format(bug) for bug in bugnumber]
    if '0' in bugnumber:
        # 0 similar to "all" bugs
        bugnumbers = os.listdir(project_path)
    else:
        bugnumbers = [
            doc for doc in os.listdir(project_path) if doc in bugnumbers
        ]
    # for each bug
    for bug in bugnumbers:
        data = json_to_dict(os.path.join(project_path, bug))
        bug_number = bug.replace(".json", "")
        # getting the expected message (fix: was undefined `project_name`)
        expected_dir = 'oracle/' + project + '/'
        if not os.path.exists(expected_dir):
            os.makedirs(expected_dir)
        expected_msg_path = expected_dir + bug_number
        project_dir = tempfile.mkdtemp(prefix="lithium-slicer_")
        output_filepath = project_dir + '/failing_tests'
        print('output_filepath=', output_filepath)
        failing = ''
        runtest_script = "bash run_input_test.sh {PROJECTDIR} {PROJECT} {BUG}"
        cmd_str = runtest_script.format(PROJECTDIR=project_dir,
                                        PROJECT=project,
                                        BUG=bug_number + 'b')
        output = call_cmd(cmd_str)  # call shell script
        if os.path.isfile(output_filepath):
            with open(output_filepath) as out_fail:
                failing = out_fail.readlines()
        with open(expected_msg_path, "w+") as expected:
            expected.write("{}".format(''.join(failing)))
def do_prism(data_dir, j, seed, T, counts):
    """Run PRISM variational LDA at several max-iteration budgets.

    Appends a prism_main goal that saves the free energy plus indexed
    phi/theta parameters, then scores unseen-data log-likelihood from the
    parsed parameters. Returns (wall-times, lls).
    """
    print('Doing Prism.')

    def read_indexed(path):
        # each line is '<1-based index>|<python-literal list>'; rebuild the
        # matrix in index order
        pairs = []
        with open(path, 'r') as src:
            for line in src:
                idx, rest = line.strip().split('|')
                pairs.append((int(idx) - 1, np.array(ast.literal_eval(rest))))
        rows = [0 for _ in range(len(pairs))]
        for idx, row in pairs:
            rows[idx] = row
        return np.array(rows)

    prism_file = join(data_dir, 'lda_prism_0.psm')
    in_prism = join(data_dir, 'in_prism_{}'.format(j))
    phi_prism = join(data_dir, 'phi_prism_{}'.format(j))
    theta_prism = join(data_dir, 'theta_prism_{}'.format(j))
    vfe_prism = join(data_dir, 'vfe_prism_{}'.format(j))
    time_prism, metric_prism = [], []
    for n_iter in [50, 100, 150, 200]:
        print('\t{}'.format(n_iter))
        with open(prism_file, 'r') as src, open(in_prism, 'w') as dst:
            dst.write(src.read())
            dst.write('''
prism_main :-
    random_set_seed({}),
    set_prism_flag(learn_mode,both),
    set_prism_flag(max_iterate,{}),
    go,
    learn_statistics(free_energy, V),
    open('{}',write, Stream),
    write(Stream,V),
    close(Stream),
    save_phi, save_theta.

save_phi :-
    findall((V,Param), get_sw(phi(V), [_,_,Param]), Params),
    open('{}',write, Stream),
    forall(member((V,Param), Params),
        (write(Stream,V), write(Stream,'|'), write(Stream,Param), nl(Stream))),
    close(Stream).

save_theta :-
    findall((V,Param), get_sw(theta(V), [_,_,Param]), Params),
    open('{}',write, Stream),
    forall(member((V,Param), Params),
        (write(Stream,V), write(Stream,'|'), write(Stream,Param), nl(Stream))),
    close(Stream).
'''.format(seed, n_iter, vfe_prism, phi_prism, theta_prism))
        t0 = time.time()
        call_cmd('upprism {}'.format(in_prism))
        time_prism.append(time.time() - t0)
        phi = read_indexed(phi_prism)
        mu = read_indexed(theta_prism)
        metric_prism.append(ull(mu, phi, T, counts))
    return time_prism, metric_prism
def _delete_file_remote(self, filename):
    """Delete `filename` from the remote directory over ssh.
    Returns True when the command exits with status 0."""
    cmd = 'ssh search@%s rm %s/%s' % (self._remote_ip, self._remote_dir,
                                      filename)
    #self._logger.debug('excute cmd:%s', cmd)
    status, _ = call_cmd(cmd)
    return status == 0
def _scp_file(self, filename, filename_tmp):
    """Copy the remote file to the local dir under a temporary name via scp.
    Returns True when the command exits with status 0."""
    cmd = 'scp search@%s:%s/%s %s/%s' % (self._remote_ip, self._remote_dir,
                                         filename, self._local_dir,
                                         filename_tmp)
    #self._logger.debug('excute cmd:%s', cmd)
    status, _ = call_cmd(cmd)
    return status == 0