def run_experiment():
    pattern = re.compile("lda_([0-9]+)\.pb")
    data_dir = "data"
    files = [
        (re.search(pattern, f).group(1), join(data_dir, f))
        for f in listdir(data_dir)
        if isfile(join(data_dir, f)) and re.match(pattern, f)
    ]
    cmd_str = "peircebayes {} -n 100 -m lda -t -s {}"
    cmd_str2 = "peircebayes {} -n 100 -m lda -t -s {} -a cgs"
    np.random.seed(1234)
    start = time.time()
    for i, f in files:
        print i
        # sample 10 times
        for j, seed in enumerate(np.random.choice(5000, 10, replace=False) + 1):
            call_cmd(cmd_str.format(f, seed))
            phi = np.load("/tmp/peircebayes/avg_samples.npz")["arr_1"]
            np.savez(join(data_dir, "phi_{}_{}".format(i, j)), **{"phi": phi})
            call_cmd("cp /tmp/peircebayes/lls.npz data/lls_{}_{}.npz".format(i, j))
            call_cmd(cmd_str2.format(f, seed))
            call_cmd("cp /tmp/peircebayes/lls.npz data/lls_cgs_{}_{}.npz".format(i, j))
    end = time.time()
    with open("data/time_pb", "w") as f:
        f.write(str(end - start))
    cmd_str_r = "Rscript run_lda.R"
    start = time.time()
    call_cmd(cmd_str_r)
    end = time.time()
    with open("data/time_r", "w") as f:
        f.write(str(end - start))
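# Note: these snippets rely on a call_cmd helper (plus imports such as re,
# time, pickle, ast, numpy as np, and os.path's join/isfile/listdir) that are
# not shown in this listing. A minimal sketch of call_cmd, assuming it is a
# thin wrapper around the shell; some examples below pass an open file object
# as a second argument (presumably stdout), and the ssh/scp helpers near the
# end expect a (status, output) variant instead:
import subprocess

def call_cmd(cmd, outfile=None):
    # Hypothetical sketch: run a shell command, optionally redirecting stdout
    # to an already opened file object; returns the exit status.
    return subprocess.call(cmd, shell=True, stdout=outfile)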
def do_pb(data_dir, j, seed, topics, alpha, beta, D, W):
    print 'Doing PB.'
    pb_file = join(data_dir,'lda_0.pb')
    new_pb_file = join(data_dir,'new_lda_0.pb')
    # change this if you're not me
    path_to_pb_out = '/home/rares/p/peircebayes_experiments/lda_ll_topics/data/peircebayes'
    cmd_str  = 'peircebayes {} -n {} -m lda -d -t -s {} -o '+path_to_pb_out
    it = 200

    metric_pb = []

    for T in topics:
        print '\t{}'.format(T)

        with open(pb_file, 'r') as fin, open(new_pb_file, 'w') as fout:
            fout.write('''pb_dirichlet({}, mu, {}, {}).
pb_dirichlet({}, phi, {}, {}).

generate(Doc, Token) :-
    Topic in 1..{},
    mu(Topic,Doc),
    phi(Token,Topic).

'''.format(alpha, T, D, beta, W, T, T))
            fout.write(fin.read())

        start = time.time()
        call_cmd(cmd_str.format(new_pb_file, it, seed))
        #time_pb.append(time.time()-start)
        lls = np.load(join(path_to_pb_out, 'lls.npz'))['lls']
        ll = np.average(lls[-10:])
        metric_pb.append(ll)
    return metric_pb
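# For illustration, with hypothetical values alpha=1, beta=1, T=10, D=100,
# W=500, the preamble written to new_lda_0.pb would read:
#
#   pb_dirichlet(1, mu, 10, 100).
#   pb_dirichlet(1, phi, 500, 10).
#
#   generate(Doc, Token) :-
#       Topic in 1..10,
#       mu(Topic,Doc),
#       phi(Token,Topic).
#
# i.e. D document-level Dirichlet(alpha) distributions over the T topics and
# T topic-level Dirichlet(beta) distributions over the W-word vocabulary,
# prepended to the observation facts read from lda_0.pb.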
def do_stan(data_dir, j, seed, T, counts):
    print 'Doing Stan.'
    time_stan = []
    metric_stan = []
    for i in [5,10,15,20]:
        print '\t{}'.format(i)
        stan_file = join(data_dir, 'lda_stan_kwargs.py')
        out_file = join(data_dir, 'stan_ll_{}.npz'.format(j))
        phi_file = join(data_dir, 'stan_phi_{}.npz'.format(j))
        theta_file = join(data_dir, 'stan_theta_{}.npz'.format(j))
        with open(join(data_dir, 'lda_stan_0.py'), 'r') as fin, open(
                stan_file, 'w') as fout:
            fout.write("kwargs = {{'seed': {}, 'iter':{} }}\n".format(seed, i))
            fout.write("out_file = '{}'\n".format(out_file))
            fout.write("phi_file = '{}'\n".format(phi_file))
            fout.write("theta_file = '{}'\n".format(theta_file))
            fout.write(fin.read())

        start = time.time()
        call_cmd('python3 {}'.format(stan_file))
        time_stan.append(time.time()-start)

        mu = np.load(theta_file)['arr_0']
        phi = np.load(phi_file)['arr_0']
        ll = ull(mu, phi, T, counts)
        metric_stan.append(ll)
    return time_stan, metric_stan
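# ull is not shown in this listing. A minimal sketch, assuming mu is a D x T
# document-topic matrix, phi a T x W topic-word matrix, counts a D x W
# (possibly sparse) word-count matrix, and that the metric is the held-out
# log-likelihood of the counts under p(w|d) = sum_t mu[d,t] * phi[t,w]
# (T is kept in the signature only to match the call sites):
import numpy as np

def ull(mu, phi, T, counts):
    if hasattr(counts, 'toarray'):
        counts = counts.toarray()
    p = np.dot(mu, phi)                # D x W word probabilities per document
    return float(np.sum(counts * np.log(p + 1e-12)))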
def do_tm_vem(data_dir, j, seed):
    print 'Doing topicmodels-VEM.'

    vem_file = join(data_dir, 'vem_lda.R')
    final_vem = join(data_dir, 'vem_lda_{}.R'.format(j))
    vem_ll = join(data_dir, 'vem_ll_{}.R'.format(j))

    time_vem = []
    metric_vem = []

    for i in [20,30,40,50]:
        print '\t{}'.format(i)

        with open(vem_file, 'r') as fin, open(final_vem, 'w') as fout:
            fout.write('''
iter = {}
seed = {}
            '''.format(i, seed))
            fout.write(fin.read())
            fout.write('''
vem_lls = lapply(list.files("{}", pattern='lda_0.txt', full.names=T), run_vem)
write.table(vem_lls, "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, vem_ll))

        start = time.time()
        call_cmd('Rscript {}'.format(final_vem))
        time_vem.append(time.time()-start)
        with open(vem_ll, 'r') as fin:
            lls = [float(line.strip()) for line in fin]
        ll = lls[-1]
        metric_vem.append(ll)

    return time_vem, metric_vem
def do_pb(data_dir, j, seed, T, K):
    print 'Doing PB.'
    times_pb, metrics_pb = [], []
    for k in range(K):
        print '\tfold {}'.format(k+1)
        counts = pickle.load(open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r'))
        pb_file = 'lda_0_{}.pb'.format(k)
        # change this if you're not me
        path_to_pb_out = '/home/rares/p/peircebayes_experiments/lda_p_time/data/peircebayes'
        cmd_str  = 'peircebayes {} -n {} -m lda -d -t -s {} -o '+path_to_pb_out

        time_pb = []
        metric_pb = []

        for i in [10,20,30,40,200]:
            print '\t\t{}'.format(i)

            start = time.time()
            call_cmd(cmd_str.format(join(data_dir, pb_file), i, seed))
            time_pb.append(time.time()-start)

            # get mu and phi
            theta = np.load(join(path_to_pb_out, 'avg_samples.npz'))
            mu = theta['arr_0']
            phi = theta['arr_1']
            
            # compute perplexity
            ll = perplexity(mu, phi, T, counts)
            metric_pb.append(ll)
        times_pb.append(time_pb)
        metrics_pb.append(metric_pb)
    return list(np.average(times_pb, axis=0)), list(np.average(metrics_pb, axis=0))
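# perplexity is likewise not shown. A plausible sketch under the same
# assumptions as the ull sketch above: per-token perplexity of the held-out
# counts, exp(-log-likelihood / number of tokens):
import numpy as np

def perplexity(mu, phi, T, counts):
    if hasattr(counts, 'toarray'):
        counts = counts.toarray()
    p = np.dot(mu, phi)                # D x W
    ll = np.sum(counts * np.log(p + 1e-12))
    return float(np.exp(-ll / counts.sum()))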
def do_pb(data_dir, j, seed, T, counts):
    print 'Doing PB.'
    pb_file = 'lda_0.pb'
    # change this if you're not me
    path_to_pb_out = '/home/rares/p/peircebayes_experiments/lda_ull_time/data/peircebayes'
    cmd_str  = 'peircebayes {} -n {} -m lda -d -t -s {} -o '+path_to_pb_out

    time_pb = []
    metric_pb = []

    for i in [50,100,150,200]:
        print '\t{}'.format(i)

        start = time.time()
        call_cmd(cmd_str.format(join(data_dir, pb_file), i, seed))
        time_pb.append(time.time()-start)

        # get mu and phi
        theta = np.load(join(path_to_pb_out, 'avg_samples.npz'))
        mu = theta['arr_0']
        phi = theta['arr_1']
        
        # call ull
        ll = ull(mu, phi, T, counts)
        metric_pb.append(ll)
    return time_pb, metric_pb
def do_tm_gibbs(data_dir, j, seed):
    print 'Doing topicmodels-Gibbs.'

    gibbs_file = join(data_dir, 'gibbs_lda.R')
    final_gibbs = join(data_dir, 'gibbs_lda_{}.R'.format(j))
    gibbs_ll = join(data_dir, 'gibbs_ll_{}.R'.format(j))

    time_gibbs = []
    metric_gibbs = []

    for i in [25,50,75,100]:
        print '\t{}'.format(i)

        with open(gibbs_file, 'r') as fin, open(final_gibbs, 'w') as fout:
            fout.write('''
iter = {}
seed = {}
            '''.format(i, seed))
            fout.write(fin.read())
            fout.write('''
gibbs_lls = lapply(list.files("{}", pattern='lda_0.txt', full.names=T), run_gibbs)
write.table(gibbs_lls, "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, gibbs_ll))

        start = time.time()
        call_cmd('Rscript {}'.format(final_gibbs))
        time_gibbs.append(time.time()-start)
        with open(gibbs_ll, 'r') as fin:
            lls = [float(line.strip()) for line in fin]
        ll = np.average(lls[-10:])
        metric_gibbs.append(ll)

    return time_gibbs, metric_gibbs
def main():
    use_seeds = False
    if len(sys.argv)==2 and sys.argv[1] == 'seeds':
        use_seeds = True
    if use_seeds:
        data_dir = 'data'
        cmd_str = 'peircebayes data/news20comp_c.pb -n 400 -d -t -o $PWD/data/peircebayes'
        #call_cmd(cmd_str)
        #call_cmd('cp data/peircebayes/avg_samples.npz data/avg_samples.npz')
        #call_cmd('upprism clda.psm')

    else:
        data_dir = 'data'
        cmd_str = 'peircebayes data/news20comp.pb -n 400 -d -t -o $PWD/data/peircebayes'
        seed = 1234
        n_stan_samples = 20

        # 1 tm
        #TODO tm
        start = time.time()

        #call_cmd('Rscript data/news20_tm.R')

        tm_time = time.time()-start

        # 2 pb
        start = time.time()

        #call_cmd(cmd_str)
        #call_cmd('cp data/peircebayes/avg_samples.npz data/avg_samples.npz')

        pb_time = time.time()-start

        # 3 prism
        start = time.time()

        call_cmd('upprism data/news20_prism.psm')

        prism_time = time.time()-start

        # 4 stan
        stan_file = 'data/lda_stan.py'
        with open(os.path.join(data_dir, 'news20_stan.py'), 'r') as fin, open(
                    stan_file, 'w') as fout:
                fout.write("kwargs = {{'seed': {}, 'iter':{} }}\n".format(seed, n_stan_samples))
                fout.write(fin.read())
        start = time.time()

        #call_cmd('python3 {}'.format(stan_file))

        stan_time = time.time()-start
        with open('data/times', 'w') as fout:
            fout.write('PB: {} seconds\n'.format(pb_time))
            fout.write('Prism: {} seconds\n'.format(prism_time))
            fout.write('Stan: {} seconds\n'.format(stan_time))
            fout.write('Topicmodels: {} seconds\n'.format(tm_time))
def do_tm_vem(data_dir, j, seed, topics, alpha, beta, D, W, K):
    print 'Doing topicmodels-VEM.'

    vem_file = join(data_dir, 'vem_lda.R')
    final_vem = join(data_dir, 'vem_lda_{}.R'.format(j))
    vem_theta = join(data_dir, 'vem_theta_{}.csv'.format(j))
    vem_phi = join(data_dir, 'vem_phi_{}.csv'.format(j))
    
    metrics = []
    
    for k in range(K):
        print '\tfold {}'.format(k+1)
        counts = pickle.load(open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r'))
        metric_gibbs = []
    
        metric_vem = []
        
        for T in topics:
            print '\t\t{}'.format(T)

            with open(vem_file, 'r') as fin, open(final_vem, 'w') as fout:
                fout.write('''
K = {}
alpha = {}
beta = {}
iter = {}
seed = {}
'''.format(T, alpha, beta, 500, seed))
                fout.write(fin.read())
                fout.write('''
vem_models = lapply(list.files("{}", pattern='lda_0_{}.txt', full.names=T), run_vem)
write.table(vem_models[[1]][[1]]@gamma, "{}", sep=" ", row.names=F, col.names=F)
write.table(exp(vem_models[[1]][[1]]@beta), "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, k, vem_theta, vem_phi))

            start = time.time()
            call_cmd('Rscript {}'.format(final_vem))
            #time_vem.append(time.time()-start)
            
            with open(vem_theta, 'r') as fin:
                mu = []
                for line in fin:
                    mu.append([float(i) for i in line.strip().split()])
                mu = np.array(mu)
            
            with open(vem_phi, 'r') as fin:
                phi = []
                for line in fin:
                    phi.append([float(i) for i in line.strip().split()])
                phi = np.array(phi)

            ll = perplexity(mu, phi, T, counts)
            metric_vem.append(ll)
        metrics.append(metric_vem)
    return list(np.average(metrics, axis=0))
Example #10
def data_gen():
    K = 6
    beta = 50./float(K)
    gamma = 0.1
    fin_name = 'data/sushi3/sushi3a.5000.10.order'
    fout_name = 'data/sushi3/sushi3.pb'
    
    rewrite_dir('data')
    call_cmd('sh get_data.sh')
    N, data = read_data(fin_name)
    write_pb(fout_name, N, data, K, beta, gamma)
def do_tm_gibbs(data_dir, j, seed, T, K):
    print 'Doing topicmodels-Gibbs.'

    gibbs_file = join(data_dir, 'gibbs_lda.R')
    final_gibbs = join(data_dir, 'gibbs_lda_{}.R'.format(j))
    gibbs_theta = join(data_dir, 'gibbs_theta_{}.csv'.format(j))
    gibbs_phi = join(data_dir, 'gibbs_phi_{}.csv'.format(j))
    
    times, metrics = [], []
    for k in range(K):
        print '\tfold {}'.format(k+1)    
        counts = pickle.load(open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r'))
        #for doc in counts.toarray():
        #    print list(doc)
        time_gibbs = []
        metric_gibbs = []

        for i in [10,20,30,40,200]:
            print '\t\t{}'.format(i)

            with open(gibbs_file, 'r') as fin, open(final_gibbs, 'w') as fout:
                fout.write('''
iter = {}
seed = {}
                '''.format(i, seed))
                fout.write(fin.read())
                fout.write('''
gibbs_models = lapply(list.files("{}", pattern='lda_0_{}.txt', full.names=T), run_gibbs)
write.table(gibbs_models[[1]][[1]]@gamma, "{}", sep=" ", row.names=F, col.names=F)
write.table(exp(gibbs_models[[1]][[1]]@beta), "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, k, gibbs_theta, gibbs_phi))

            start = time.time()
            call_cmd('Rscript {}'.format(final_gibbs))
            time_gibbs.append(time.time()-start)
            
            with open(gibbs_theta, 'r') as fin:
                mu = []
                for line in fin:
                    mu.append([float(i) for i in line.strip().split()])
                mu = np.array(mu)
            
            with open(gibbs_phi, 'r') as fin:
                phi = []
                for line in fin:
                    phi.append([float(i) for i in line.strip().split()])
                phi = np.array(phi)
            
            ll = perplexity(mu, phi, T, counts)
            metric_gibbs.append(ll)

        times.append(time_gibbs)
        metrics.append(metric_gibbs)
    return list(np.average(times, axis=0)), list(np.average(metrics, axis=0))
def do_tm_gibbs(data_dir, j, seed,  topics, alpha, beta, D, W, counts):
    print 'Doing topicmodels-Gibbs.'

    gibbs_file = join(data_dir, 'gibbs_lda.R')
    final_gibbs = join(data_dir, 'gibbs_lda_{}.R'.format(j))
    gibbs_theta = join(data_dir, 'gibbs_theta_{}.csv'.format(j))
    gibbs_phi = join(data_dir, 'gibbs_phi_{}.csv'.format(j))
    
    it = 200

    metric_gibbs = []

    for T in topics:
        print '\t{}'.format(T)

        with open(gibbs_file, 'r') as fin, open(final_gibbs, 'w') as fout:
            fout.write('''
K = {}
alpha = {}
beta = {}
iter = {}
seed = {}
            '''.format(T, alpha, beta, it, seed))
            fout.write(fin.read())
            fout.write('''
gibbs_models = lapply(list.files("{}", pattern='lda_0.txt', full.names=T), run_gibbs)
write.table(gibbs_models[[1]][[1]]@gamma, "{}", sep=" ", row.names=F, col.names=F)
write.table(exp(gibbs_models[[1]][[1]]@beta), "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, gibbs_theta, gibbs_phi))

        start = time.time()
        call_cmd('Rscript {}'.format(final_gibbs))
        #time_gibbs.append(time.time()-start)
        
        with open(gibbs_theta, 'r') as fin:
            mu = []
            for line in fin:
                mu.append([float(i) for i in line.strip().split()])
            mu = np.array(mu)
        
        with open(gibbs_phi, 'r') as fin:
            phi = []
            for line in fin:
                phi.append([float(i) for i in line.strip().split()])
            phi = np.array(phi)
        
        ll = ull(mu, phi, T, counts)
        metric_gibbs.append(ll)

    return metric_gibbs
def main():
    cmd_str = 'peircebayes data/lda2_arxiv.pb -d -t -n 50 -b 100'
    call_cmd('cat data/pb_hlda3.pb data/arxiv_obs.pb',
             open('data/lda2_arxiv.pb', 'w'))
    call_cmd(cmd_str)
    call_cmd('cp /tmp/peircebayes/avg_samples.npz data/avg_samples.npz')
    call_cmd('rm data/lda2_arxiv.pb')
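# Note: the second argument to call_cmd above is an open file object,
# presumably used as the command's stdout, so the cat call amounts to
# `cat data/pb_hlda3.pb data/arxiv_obs.pb > data/lda2_arxiv.pb`
# (see the call_cmd sketch near the top of this listing).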
def do_stan(data_dir, j, seed, topics, alpha, beta, D, W, K):
    print 'Doing Stan.'

    it = 20
    
    metrics = []
    
    for k in range(K):
        print '\tfold {}'.format(k+1)
        counts = pickle.load(open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r'))
    
        metric_stan = []

        for T in topics:
            print '\t\t{}'.format(T)
            
            stan_file = join(data_dir, 'lda_stan_kwargs.py')
            out_file = join(data_dir, 'stan_ll_{}.npz'.format(j))
            phi_file = join(data_dir, 'stan_phi_{}.npz'.format(j))
            theta_file = join(data_dir, 'stan_theta_{}.npz'.format(j))
            with open(join(data_dir, 'lda_stan_0_{}.py'.format(k)), 'r') as fin, open(
                    stan_file, 'w') as fout:
                fout.write("kwargs = {{'seed': {}, 'iter':{} }}\n".format(seed, it))
                fout.write("out_file = '{}'\n".format(out_file))
                fout.write("phi_file = '{}'\n".format(phi_file))
                fout.write("theta_file = '{}'\n".format(theta_file))
                for line in fin:
                    if 'lda_dat = {' in line:
                        fout.write(line)
                        alpha_l = [alpha]*T
                        beta_l = [beta]*W
                        fout.write('''  'K' : {},
'alpha' : {},
'beta' : {},
'''.format(T, alpha_l, beta_l))
                    else:
                        fout.write(line)

            start = time.time()
            call_cmd('python3 {}'.format(stan_file))
            #time_stan.append(time.time()-start)

            mu = np.load(theta_file)['arr_0']
            phi = np.load(phi_file)['arr_0']
            
            ll = perplexity(mu, phi, T, counts)
            metric_stan.append(ll)
        metrics.append(metric_stan)
    return list(np.average(metrics, axis=0))
def do_pb(data_dir, j, seed, topics, alpha, beta, D, W, K):
    print 'Doing PB.'
    
    
    new_pb_file = join(data_dir,'new_lda_0.pb')
    # change this if you're not me
    path_to_pb_out = '/home/rares/p/peircebayes_experiments/lda_p_topics/data/peircebayes'
    cmd_str  = 'peircebayes {} -n {} -m lda -d -t -s {} -o '+path_to_pb_out
    it = 200
    
    metrics = []
    
    for k in range(K):
        print '\tfold {}'.format(k+1)
        counts = pickle.load(open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r'))
        pb_file = join(data_dir,'lda_0_{}.pb'.format(k))
        metric_pb = []
        
        for T in topics:
            print '\t\t{}'.format(T)

            with open(pb_file, 'r') as fin, open(new_pb_file, 'w') as fout:
                fout.write('''pb_dirichlet({}, mu, {}, {}).
pb_dirichlet({}, phi, {}, {}).

generate(Doc, Token) :-
    Topic in 1..{},
    mu(Topic,Doc),
    phi(Token,Topic).

'''.format(alpha, T, D, beta, W, T, T))
                fout.write(fin.read())

            start = time.time()
            call_cmd(cmd_str.format(new_pb_file, it, seed))
            #time_pb.append(time.time()-start)
            
            # get mu and phi
            theta = np.load(join(path_to_pb_out, 'avg_samples.npz'))
            mu = theta['arr_0']
            phi = theta['arr_1']
            
            # compute perplexity
            ll = perplexity(mu, phi, T, counts)
            metric_pb.append(ll)
        metrics.append(metric_pb)
    return list(np.average(metrics, axis=0))
def do_prism(data_dir, j, seed, topics, alpha, beta, D, W):
    print 'Doing Prism.'

    prism_file = join(data_dir, 'lda_prism_0.psm')
    in_prism = join(data_dir, 'in_prism_{}'.format(j))
    out_prism = join(data_dir, 'out_prism_{}'.format(j))
    vfe_prism = join(data_dir, 'vfe_prism_{}'.format(j))

    metric_prism = []

    for T in topics:
        print '\t{}'.format(T)

        theta_str = 'values(theta(_), [1-{}], a@{}).'.format(T,alpha)
        phi_str = 'values(phi(_), [1-{}], a@{}).'.format(W,beta)
        with open(prism_file, 'r') as fin, open(in_prism, 'w') as fout:
            fout.write('''

{}

{}
'''.format(theta_str, phi_str))
            fout.write(fin.read())
            fout.write('''
prism_main :-
    random_set_seed({}),
    set_prism_flag(learn_mode,both),
    go,
    learn_statistics(free_energy, V),
    open('{}',write, Stream), write(Stream,V), close(Stream),
    save_phi.

save_phi :-
    findall(Param, get_sw(phi(_), [_,_,Param]), Params),
    open('{}',write, Stream), forall(member(Param, Params), (write(Stream,Param), nl(Stream))), close(Stream).

'''.format(seed, vfe_prism, out_prism))

        start = time.time()
        call_cmd('upprism {}'.format(in_prism))
        #time_prism.append(time.time()-start)

        with open(vfe_prism, 'r') as fin:
            vfe = float(fin.read().strip())
        metric_prism.append(vfe)
    return metric_prism
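# For illustration, with hypothetical values T=10, W=500, alpha=1, beta=0.5,
# the header prepended to lda_prism_0.psm would read:
#
#   values(theta(_), [1-10], a@1).
#   values(phi(_), [1-500], a@0.5).
#
# i.e. each theta switch ranges over topics 1..T and each phi switch over
# words 1..W, with a@ presumably setting the symmetric Dirichlet pseudo-count;
# the generated prism_main then runs learning and writes the variational free
# energy to vfe_prism_{j}, which is the metric collected here.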
def do_tm_vem(data_dir, j, seed, T, counts):
    print 'Doing topicmodels-VEM.'

    vem_file = join(data_dir, 'vem_lda.R')
    final_vem = join(data_dir, 'vem_lda_{}.R'.format(j))
    vem_theta = join(data_dir, 'vem_theta_{}.csv'.format(j))
    vem_phi = join(data_dir, 'vem_phi_{}.csv'.format(j))

    time_vem = []
    metric_vem = []

    for i in [20,30,40,50]:
        print '\t{}'.format(i)

        with open(vem_file, 'r') as fin, open(final_vem, 'w') as fout:
            fout.write('''
iter = {}
seed = {}
            '''.format(i, seed))
            fout.write(fin.read())
            fout.write('''
vem_models = lapply(list.files("{}", pattern='lda_0.txt', full.names=T), run_vem)
write.table(vem_models[[1]][[1]]@gamma, "{}", sep=" ", row.names=F, col.names=F)
write.table(exp(vem_models[[1]][[1]]@beta), "{}", sep=" ", row.names=F, col.names=F)
'''.format(data_dir, vem_theta, vem_phi))

        start = time.time()
        call_cmd('Rscript {}'.format(final_vem))
        time_vem.append(time.time()-start)
        
        with open(vem_theta, 'r') as fin:
            mu = []
            for line in fin:
                mu.append([float(i) for i in line.strip().split()])
            mu = np.array(mu)
        
        with open(vem_phi, 'r') as fin:
            phi = []
            for line in fin:
                phi.append([float(i) for i in line.strip().split()])
            phi = np.array(phi)
        
        ll = ull(mu, phi, T, counts)
        metric_vem.append(ll)

    return time_vem, metric_vem
Example #18
def _ls_file_list(self):
    cmd = 'ssh %s ls %s' % (self._remote_ip, self._remote_dir)
    #self._logger.debug('execute cmd: %s', cmd)
    sta, result = call_cmd(cmd)
    #print sta, result
    if sta == 0:
        #self._logger.debug('successfully got file list:\n%s', result)
        return result.split()
    else:
        return None
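# The _ls_file_list / _delete_file_remote / _scp_file helpers in this listing
# expect call_cmd to return a (status, output) pair rather than a bare exit
# code. A minimal sketch of that variant, assuming commands go through the
# shell and stdout is captured:
import subprocess

def call_cmd(cmd):
    # Hypothetical sketch: run cmd, return (exit status, captured stdout).
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    out, _ = proc.communicate()
    return proc.returncode, out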
def do_stan(data_dir, j, seed, topics, alpha, beta, D, W, counts):
    print 'Doing Stan.'

    it = 20
    metric_stan = []


    for T in topics:
        print '\t{}'.format(T)
        
        stan_file = join(data_dir, 'lda_stan_kwargs.py')
        out_file = join(data_dir, 'stan_ll_{}.npz'.format(j))
        phi_file = join(data_dir, 'stan_phi_{}.npz'.format(j))
        theta_file = join(data_dir, 'stan_theta_{}.npz'.format(j))
        with open(join(data_dir, 'lda_stan_0.py'), 'r') as fin, open(
                stan_file, 'w') as fout:
            fout.write("kwargs = {{'seed': {}, 'iter':{} }}\n".format(seed, it))
            fout.write("out_file = '{}'\n".format(out_file))
            fout.write("phi_file = '{}'\n".format(phi_file))
            fout.write("theta_file = '{}'\n".format(theta_file))
            for line in fin:
                if 'lda_dat = {' in line:
                    fout.write(line)
                    alpha_l = [alpha]*T
                    beta_l = [beta]*W
                    fout.write('''  'K' : {},
    'alpha' : {},
    'beta' : {},
                    '''.format(T, alpha_l, beta_l))
                else:
                    fout.write(line)

        start = time.time()
        call_cmd('python3 {}'.format(stan_file))
        #time_stan.append(time.time()-start)

        mu = np.load(theta_file)['arr_0']
        phi = np.load(phi_file)['arr_0']
        
        ll = ull(mu, phi, T, counts)
        metric_stan.append(ll)
    return metric_stan
def do_pb(data_dir, j, seed):
    print 'Doing PB.'
    pb_file = 'lda_0.pb'
    # change this if you're not me
    path_to_pb_out = '/home/rares/p/peircebayes_experiments/lda_ll_time/data/peircebayes'
    cmd_str  = 'peircebayes {} -n {} -m lda -d -t -s {} -o '+path_to_pb_out

    time_pb = []
    metric_pb = []

    for i in [50,100,150,200]:
        print '\t{}'.format(i)

        start = time.time()
        call_cmd(cmd_str.format(join(data_dir, pb_file), i, seed))
        time_pb.append(time.time()-start)

        lls = np.load(join(path_to_pb_out, 'lls.npz'))['lls']
        ll = np.average(lls[-10:])
        metric_pb.append(ll)
    return time_pb, metric_pb
def do_prism(data_dir, j, seed):
    print 'Doing Prism.'

    prism_file = join(data_dir, 'lda_prism_0.psm')
    in_prism = join(data_dir, 'in_prism_{}'.format(j))
    out_prism = join(data_dir, 'out_prism_{}'.format(j))
    vfe_prism = join(data_dir, 'vfe_prism_{}'.format(j))

    time_prism = []
    metric_prism = []

    for i in [50,100,150,200]:
        print '\t{}'.format(i)

        with open(prism_file, 'r') as fin, open(in_prism, 'w') as fout:
            fout.write(fin.read())
            fout.write('''
prism_main :-
    random_set_seed({}),
    set_prism_flag(learn_mode,both),
    set_prism_flag(max_iterate,{}),
    go,
    learn_statistics(free_energy, V),
    open('{}',write, Stream), write(Stream,V), close(Stream),
    save_phi.

save_phi :-
    findall(Param, get_sw(phi(_), [_,_,Param]), Params),
    open('{}',write, Stream), forall(member(Param, Params), (write(Stream,Param), nl(Stream))), close(Stream).

'''.format(seed, i, vfe_prism, out_prism))

        start = time.time()
        call_cmd('upprism {}'.format(in_prism))
        time_prism.append(time.time()-start)

        with open(vfe_prism, 'r') as fin:
            vfe = float(fin.read().strip())
        metric_prism.append(vfe)
    return time_prism, metric_prism
def run_experiment():
    pattern = re.compile(r'lda_([0-9]+)\.pb')
    data_dir = 'data'
    files = [ (re.search(pattern, f).group(1), join(data_dir,f) )
        for f in listdir(data_dir)
        if isfile(join(data_dir,f)) and re.match(pattern, f)]
    cmd_str = 'peircebayes {} -n 200 -m lda_ll -t -s {}'
    #cmd_str2 = 'peircebayes {} -n 200 -m lda_ll -t -s {} -a amcmc'
    np.random.seed(1234)
    start = time.time()
    for i,f in files:
        # sample 10 times
        for j,seed in enumerate(np.random.choice(5000, 10, replace=False)+1):
            call_cmd(cmd_str.format(f, seed))
            phi = np.load('/tmp/peircebayes/avg_samples.npz')['arr_1']
            np.savez(join(data_dir, 'phi_{}_{}'.format(i,j)), **{'phi':phi})
            call_cmd('cp /tmp/peircebayes/lls.npz data/lls_{}_{}.npz'.format(i,j))
            #call_cmd(cmd_str2.format(f, seed))
            #call_cmd('cp /tmp/peircebayes/lls.npz data/lls_amcmc_{}_{}.npz'.format(i,j))
    end = time.time()
    with open('data/time_pb', 'w') as f:
        f.write(str(end-start))
    cmd_str_r = 'Rscript run_lda.R'
    start = time.time()
    call_cmd(cmd_str_r)
    end = time.time()
    with open('data/time_r', 'w') as f:
        f.write(str(end-start))
def do_stan(data_dir, j, seed, T, K):
    print 'Doing Stan.'
    
    times, metrics = [], []
    for k in range(K):
        print '\tfold {}'.format(k+1)
        counts = pickle.load(open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r'))
    
        time_stan = []
        metric_stan = []
        for i in [2,4,6,8,20]:
            print '\t\t{}'.format(i)
            stan_file = join(data_dir, 'lda_stan_kwargs.py')
            out_file = join(data_dir, 'stan_ll_{}.npz'.format(j))
            phi_file = join(data_dir, 'stan_phi_{}.npz'.format(j))
            theta_file = join(data_dir, 'stan_theta_{}.npz'.format(j))
            with open(join(data_dir, 'lda_stan_0_{}.py'.format(k)), 'r'
                ) as fin, open(stan_file, 'w') as fout:
                
                fout.write("kwargs = {{'seed': {}, 'iter':{} }}\n".format(seed, i))
                fout.write("out_file = '{}'\n".format(out_file))
                fout.write("phi_file = '{}'\n".format(phi_file))
                fout.write("theta_file = '{}'\n".format(theta_file))
                fout.write(fin.read())

            start = time.time()
            call_cmd('python3 {}'.format(stan_file))
            time_stan.append(time.time()-start)

            mu = np.load(theta_file)['arr_0']
            phi = np.load(phi_file)['arr_0']
            ll = perplexity(mu, phi, T, counts)
            metric_stan.append(ll)
        times.append(time_stan)
        metrics.append(metric_stan)
    return list(np.average(times, axis=0)), list(np.average(metrics, axis=0))
def run_experiment():
    data_dir = 'data/sushi3'
    pb_file = 'sushi3.pb'
    cmd_str  = 'peircebayes {} -n 100 -t -s {}'
    np.random.seed(1234)
    start = time.time()
    for j,seed in enumerate(np.random.choice(1000, 10, replace=False)+1):
        call_cmd(cmd_str.format(join(data_dir, pb_file), seed))
        call_cmd('cp /tmp/peircebayes/avg_samples.npz data/avg_samples_{}.npz'.format(j))
        call_cmd('cp /tmp/peircebayes/lls.npz data/lls_{}.npz'.format(j))
    end = time.time()
    with open('data/rim_time', 'w') as f:
        f.write(str(end-start))
def do_prism(data_dir, j, seed, topics, alpha, beta, D, W, K):
    print 'Doing Prism.'

    metrics = []
    
    for k in range(K):
        print '\tfold {}'.format(k+1)
        counts = pickle.load(open(join(data_dir, 'counts_{}.pkl'.format(k)), 'r'))

        prism_file = join(data_dir, 'lda_prism_0_{}.psm'.format(k))
        in_prism = join(data_dir, 'in_prism_{}'.format(j))
        phi_prism = join(data_dir, 'phi_prism_{}'.format(j))
        theta_prism = join(data_dir, 'theta_prism_{}'.format(j))
        vfe_prism = join(data_dir, 'vfe_prism_{}'.format(j))

        metric_prism = []

        for T in topics:
            print '\t\t{}'.format(T)

            theta_str = 'values(theta(_), [1-{}], a@{}).'.format(T,alpha)
            phi_str = 'values(phi(_), [1-{}], a@{}).'.format(W,beta)
            with open(prism_file, 'r') as fin, open(in_prism, 'w') as fout:
                fout.write('''

{}

{}
    '''.format(theta_str, phi_str))
                fout.write(fin.read())
                fout.write('''
prism_main :-
    random_set_seed({}),
    set_prism_flag(learn_mode,both),
    go,
    learn_statistics(free_energy, V),
    open('{}',write, Stream), write(Stream,V), close(Stream),
    save_phi,
    save_theta.

save_phi :-
    findall((V,Param), get_sw(phi(V), [_,_,Param]), Params),
    open('{}',write, Stream), forall(member((V,Param), Params), (write(Stream,V), write(Stream,'|'), write(Stream,Param), nl(Stream))), close(Stream).

save_theta :-
    findall((V,Param), get_sw(theta(V), [_,_,Param]), Params),
    open('{}',write, Stream), forall(member((V,Param), Params), (write(Stream,V), write(Stream,'|'), write(Stream,Param), nl(Stream))), close(Stream).

    '''.format(seed, vfe_prism, phi_prism, theta_prism))

            start = time.time()
            call_cmd('upprism {}'.format(in_prism))
            #time_prism.append(time.time()-start)

            with open(phi_prism, 'r') as fin:
                phi_idx = []
                for line in fin:
                    idx, rest = line.strip().split('|')
                    idx = int(idx)-1
                    topic = np.array(ast.literal_eval(rest))
                    phi_idx.append((idx, topic))
                phi = [0 for i in range(len(phi_idx))]
                for idx, topic in phi_idx:
                    phi[idx] = topic
                phi = np.array(phi)    
                
            with open(theta_prism, 'r') as fin:
                theta_idx = []
                for line in fin:
                    idx, rest = line.strip().split('|')
                    idx = int(idx)-1
                    doc_mix = np.array(ast.literal_eval(rest))
                    theta_idx.append((idx, doc_mix))
                mu = [0 for i in range(len(theta_idx))]
                for idx, doc_mix in theta_idx:
                    mu[idx] = doc_mix
                mu = np.array(mu)  
            
            ll = perplexity(mu, phi, T, counts)
            metric_prism.append(ll)
        metrics.append(metric_prism)
    return list(np.average(metrics, axis=0))
def run_experiment():
    data_dir = 'data'
    pb_file = 'lda_0.pb'
    cmd_str  = 'peircebayes {} -n 300 -m lda_ll -d -t -s {}'
    np.random.seed(1234)
    times_pb = []
    times_church = []
    times_church2 = []
    for j,seed in enumerate(np.random.choice(5000, 10, replace=False)+1):
        start = time.time()
        call_cmd(cmd_str.format(join(data_dir, pb_file), seed))
        end = time.time()
        times_pb.append(end-start)
        call_cmd('cp /tmp/peircebayes/last_sample.npz data/last_sample_{}.npz'.format(j))
        call_cmd('cp /tmp/peircebayes/lls.npz data/lls_{}.npz'.format(j))
        church_str = 'church -s {} data/lda_church1_0.md'.format(seed)
        start = time.time()
        call_cmd(church_str)
        end = time.time()
        times_church.append(end-start)
        call_cmd('mv ll_church1.csv data/ll_church1_{}.csv'.format(j))
        church2_str = 'church -s {} data/lda_church2_0.md'.format(seed)
        start = time.time()
        call_cmd(church2_str)  
        end = time.time()
        times_church2.append(end-start)
        call_cmd('mv ll_church2.csv data/ll_church2_{}.csv'.format(j))
    with open('data/times', 'w') as f:
        f.write('pb: {}\n'.format(np.average(times_pb)))
        f.write('church: {}\n'.format(np.average(times_church)))
        f.write('church2: {}\n'.format(np.average(times_church2)))
Example #27
def generate_seed(project, bugnumber):
    """ generates a file that contains json info to run d4j and lithium """
    initial_projects = ["Chart", "Lang", "Closure", "Math", "Mockito", "Time"]
    if project not in initial_projects:
        raise Exception("Project {} invalid. Please select one of {}".format(
            project, initial_projects))

    project_path = os.path.join(os.getcwd(), "data", project)

    if not os.path.isdir(project_path):
        print("FAILED")  # should print to stop the main script
        raise Exception("Project {} directory not found".format(project_path))

    # Solves the issue of different source paths for the same project
    if project == 'Lang' and int(bugnumber) < 36:
        source_path = get_source_path(project + '2')
    elif project == 'Math' and int(bugnumber) > 84:
        source_path = get_source_path(project + '2')
    else:
        source_path = get_source_path(project)

    # get only the bugs chosen by the user
    bugnumber = bugnumber.split(",")

    if not is_input_number_valid(bugnumber, project_path):
        print("FAILED")  # should print to stop the main script
        raise Exception(
            "one or more json files({}) are not found in path {}".format(
                bugnumber, project_path))

    bugnumbers = ['{}.json'.format(bug) for bug in bugnumber]

    if '0' in bugnumber:  # '0' means "all" bugs
        bugnumbers = os.listdir(project_path)
    else:
        bugnumbers = [
            doc for doc in os.listdir(project_path) if doc in bugnumbers
        ]

    # for each bug
    for bug in bugnumbers:
        data = json_to_dict(os.path.join(project_path, bug))
        bug_number = bug.replace(".json", "")

        # getting the expected message
        expected_dir = 'oracle/' + project + '/'
        if not os.path.exists(expected_dir):
            os.makedirs(expected_dir)

        expected_msg_path = expected_dir + bug_number
        project_dir = tempfile.mkdtemp(prefix="lithium-slicer_")
        output_filepath = project_dir + '/failing_tests'
        print('output_filepath=', output_filepath)
        expected_msg = []
        failing = ''

        runtest_script = "bash run_input_test.sh {PROJECTDIR} {PROJECT} {BUG}"
        cmd_str = runtest_script.format(PROJECTDIR=project_dir,
                                        PROJECT=project,
                                        BUG=bug_number + 'b')
        output = call_cmd(cmd_str)  # call shell script
        if os.path.isfile(output_filepath):
            with open(output_filepath) as out_fail:
                failing = out_fail.readlines()

        with open(expected_msg_path, "w+") as expected:
            expected.write("{}".format(''.join(failing)))
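# Example usage (hypothetical arguments): build oracle files for Lang bugs 1
# and 3, or pass "0" to process every bug JSON found under data/Lang:
#
#   generate_seed("Lang", "1,3")
#   generate_seed("Lang", "0")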
def do_prism(data_dir, j, seed, T, counts):
    print 'Doing Prism.'

    prism_file = join(data_dir, 'lda_prism_0.psm')
    in_prism = join(data_dir, 'in_prism_{}'.format(j))
    phi_prism = join(data_dir, 'phi_prism_{}'.format(j))
    theta_prism = join(data_dir, 'theta_prism_{}'.format(j))
    vfe_prism = join(data_dir, 'vfe_prism_{}'.format(j))

    time_prism = []
    metric_prism = []

    for i in [50,100,150,200]:
        print '\t{}'.format(i)

        with open(prism_file, 'r') as fin, open(in_prism, 'w') as fout:
            fout.write(fin.read())
            fout.write('''
prism_main :-
    random_set_seed({}),
    set_prism_flag(learn_mode,both),
    set_prism_flag(max_iterate,{}),
    go,
    learn_statistics(free_energy, V),
    open('{}',write, Stream), write(Stream,V), close(Stream),
    save_phi,
    save_theta.

save_phi :-
    findall((V,Param), get_sw(phi(V), [_,_,Param]), Params),
    open('{}',write, Stream), forall(member((V,Param), Params), (write(Stream,V), write(Stream,'|'), write(Stream,Param), nl(Stream))), close(Stream).

save_theta :-
    findall((V,Param), get_sw(theta(V), [_,_,Param]), Params),
    open('{}',write, Stream), forall(member((V,Param), Params), (write(Stream,V), write(Stream,'|'), write(Stream,Param), nl(Stream))), close(Stream).
'''.format(seed, i, vfe_prism, phi_prism, theta_prism))

        start = time.time()
        call_cmd('upprism {}'.format(in_prism))
        time_prism.append(time.time()-start)
        
        with open(phi_prism, 'r') as fin:
            phi_idx = []
            for line in fin:
                idx, rest = line.strip().split('|')
                idx = int(idx)-1
                topic = np.array(ast.literal_eval(rest))
                phi_idx.append((idx, topic))
            phi = [0 for i in range(len(phi_idx))]
            for idx, topic in phi_idx:
                phi[idx] = topic
            phi = np.array(phi)    
        
        with open(theta_prism, 'r') as fin:
            theta_idx = []
            for line in fin:
                idx, rest = line.strip().split('|')
                idx = int(idx)-1
                doc_mix = np.array(ast.literal_eval(rest))
                theta_idx.append((idx, doc_mix))
            mu = [0 for i in range(len(theta_idx))]
            for idx, doc_mix in theta_idx:
                mu[idx] = doc_mix
            mu = np.array(mu)
        
        ll = ull(mu, phi, T, counts)
        metric_prism.append(ll)
    return time_prism, metric_prism
Example #29
def _delete_file_remote(self, filename):
    cmd = 'ssh search@%s rm %s/%s' % (self._remote_ip, self._remote_dir, filename)
    #self._logger.debug('execute cmd: %s', cmd)
    sta, result = call_cmd(cmd)
    return sta == 0
Example #30
def _scp_file(self, filename, filename_tmp):
    cmd = 'scp search@%s:%s/%s %s/%s' % (self._remote_ip, self._remote_dir, filename, self._local_dir, filename_tmp)
    #self._logger.debug('execute cmd: %s', cmd)
    sta, result = call_cmd(cmd)
    return sta == 0