Example 1
def compute_pk_large(cfg_file, random_seed, file_str='rank3'):
    settings, paths, working_folder = cfg.get_settings(cfg_file, 'sub')
    num_batch = int(settings['num_of_batch_per_subset'])
    num_subset = int(settings['num_of_subset'])
    batch_sizes = np.array(range(num_subset)) * num_batch + num_batch
    working_folder = '../results/' + working_folder + '_rnd' + str(
        random_seed) + '/'
    multi_ker_str = ana_utils.get_top_kern_cmb(batch_sizes, 3, working_folder)
    print(multi_ker_str)
    samplesize = 1000
    prior_pg = 0.1
    for s in batch_sizes:
        lk.compute_pk(s,
                      file_str,
                      samplesize,
                      prior_pg,
                      top_n=None,
                      multi_ker_str=multi_ker_str,
                      working_folder=working_folder,
                      normalized=True,
                      scaler=2)
    data_to_use_size = settings['data_to_use']
    minibatch_size = settings['minibatch_size']
    sp.line_plot_pk(working_folder, file_str, samplesize, prior_pg,
                    batch_sizes, True, minibatch_size, data_to_use_size)
    return
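Note: this snippet assumes numpy (as np) plus the project-local modules cfg, ana_utils, lk and sp are imported elsewhere (Examples 4 and 6 import cfg as utils.parse_cfg). The batch_sizes expression yields the cumulative subset sizes num_batch, 2*num_batch, ..., num_subset*num_batch; a minimal sketch with hypothetical values:

import numpy as np

num_batch = 5    # hypothetical settings['num_of_batch_per_subset']
num_subset = 4   # hypothetical settings['num_of_subset']
batch_sizes = np.array(range(num_subset)) * num_batch + num_batch
print(batch_sizes)  # [ 5 10 15 20]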
Example 2
def plot_pk_subset(cfg_file, random_seed):
    settings, paths, working_folder = cfg.get_settings(cfg_file, 'sub')
    data_to_use_size = settings['data_to_use']
    minibatch_size = settings['minibatch_size']
    num_batch = int(settings['num_of_batch_per_subset'])
    num_subset = int(settings['num_of_subset'])
    batch_sizes = np.array(range(num_subset)) * num_batch + num_batch
    working_folder = '../results/' + working_folder + '_rnd' + str(
        random_seed) + '/'
    datastd = 1

    # multi_ker_str = ['s', 'r', 'p', 'p*r+s', 'p*r*r', 'r+r*p', 'p*r', 'p+r*p']
    multi_ker_str = [
        's', 'r', 'p', 'p*r+s', 'p*r', 'p+r*p', 'p*p*p', 's*s+s', 'p+p*r',
        'r*r+s', 's*s+r', 'p+p*p'
    ]
    file_str = 'group2'
    samplesize = 2000
    prior = 0.1
    # compute pk
    for s in batch_sizes:
        lk.compute_pk(s,
                      file_str,
                      samplesize,
                      prior,
                      top_n=None,
                      multi_ker_str=multi_ker_str,
                      working_folder=working_folder,
                      normalized=True)
    # compute rmse
    for s in batch_sizes:
        BMA.subset_BMA(working_folder, file_str, s, samplesize, prior, True)

    # save rmse and plot
    pr.gather_and_plot_rmse(working_folder,
                            batch_sizes,
                            file_str,
                            samplesize,
                            prior,
                            minibatch_size,
                            data_to_use_size,
                            datastd,
                            normalized=True)
    # plot_single_pk
    sp.line_plot_pk(working_folder, file_str, samplesize, prior, batch_sizes,
                    True, minibatch_size, data_to_use_size)
    sp.bar_plot_pk(working_folder, file_str, samplesize, prior, batch_sizes,
                   True, 'subset')
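As the inline comments note, the pipeline makes three passes over the subset sizes: lk.compute_pk estimates the kernel probabilities, BMA.subset_BMA computes the model-averaged RMSE, and pr.gather_and_plot_rmse collects and plots the results. A minimal invocation sketch, with a placeholder config path and seed:

plot_pk_subset('../config/swiss_cfg_1.json', random_seed=10)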
Example 3
def plot_rmse(cfg_file, random_seed, file_str='top10'):
    settings, paths, working_folder = cfg.get_settings(cfg_file, 'sub')
    data_to_use_size = settings['data_to_use']
    minibatch_size = settings['minibatch_size']
    num_batch = int(settings['num_of_batch_per_subset'])
    num_subset = int(settings['num_of_subset'])
    batch_sizes = np.array(range(num_subset)) * num_batch + num_batch

    # save and plot the normalized results here
    datastd = 1
    working_folder = '../results/' + working_folder + '_rnd' + str(
        random_seed) + '/'
    samplesize = 1000
    prior_pg = 0.5
    # --- prob, BMA, plot ---
    for s in batch_sizes:
        lk.compute_pk(s,
                      file_str,
                      samplesize,
                      prior_pg,
                      top_n=10,
                      multi_ker_str=None,
                      working_folder=working_folder,
                      normalized=True)

    for s in batch_sizes:
        BMA.subset_BMA(working_folder, file_str, s, samplesize, prior_pg, True)

    pr.gather_and_plot_rmse(working_folder,
                            batch_sizes,
                            file_str,
                            samplesize,
                            prior_pg,
                            minibatch_size,
                            data_to_use_size,
                            datastd,
                            normalized=True)
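Unlike Example 2, which fixes the kernel set explicitly via multi_ker_str, this variant lets compute_pk select the ten best kernels (top_n=10, multi_ker_str=None). A minimal call, again with placeholder arguments:

plot_rmse('../config/swiss_cfg_1.json', random_seed=10, file_str='top10')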
Example 4
import os
import sys

import numpy as np
import pandas as pd

import utils.parse_cfg as cfg
import utils.get_init_dict as gidc
import training_wrap_inputs as twi


gpu_id = int(sys.argv[1])
use_gpu(gpu_id)  # project-local GPU helper; its import is not part of this snippet
group = int(sys.argv[2])

start = int(sys.argv[3])
group_size = int(sys.argv[4])
ker = pd.read_csv('kernelstring3.csv')
multi_ker_str = np.array(ker).reshape(-1)[start + group * group_size:start + (group + 1) * group_size]

cfg_file = '../config/' + sys.argv[5]
settings, paths, working_folder = cfg.get_settings(cfg_file, "full")

datafile = paths['datafile']
multi_ker_init_dict = None
if paths['init_hyper_file'] is not None:
    multi_ker_init_dict = gidc.get_saved_init(multi_ker_str, paths['init_hyper_file'])

working_folder = '../results/' + working_folder + "/"
if not os.path.exists(working_folder):
    os.mkdir(working_folder)
settings['initfile'] = paths["init_hyper_file"]
print(settings)
df = pd.DataFrame(settings, index=[0])
df.to_csv(working_folder + "setting.csv")
twi.train_fulldata(datafile, settings, multi_ker_str, multi_ker_init_dict, working_folder)
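The script is driven by positional arguments; a sketch of the expected invocation (the script name and values are hypothetical):

# python train_full.py <gpu_id> <group> <start> <group_size> <cfg_name>
python train_full.py 0 2 0 8 swiss_cfg_1.json

With start=0, group=2 and group_size=8, the slice into kernelstring3.csv selects rows 16 through 23, so each group trains a disjoint block of kernel strings.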
Example 5
    return


if __name__ == "__main__":
    cfg_file = '../../config/swiss_cfg_1.json'
    file_str = 'swiss_top10'
    prior_pg = 0.5
    seed = 10

    # cfg_file = '../../config/air_cfg_1.json'
    # file_str = 'temper_top10'
    # prior_pg = 0.5
    # seed = 10

    settings, paths, wf = cfg.get_settings(cfg_file, 'full')
    elbo_file_name = '../../results/' + wf + '/' + 'fulldata_summary.csv'
    domain, evidence_all, train_time_all, rmse_all = pre_trained_res(elbo_file_name)
    datafile = '../' + paths['datafile']

    datastd = 1

    _, _, wf_sub = cfg.get_settings(cfg_file, 'sub')
    bayesian_file_name = '../../results/' + wf_sub + '_rnd' + str(seed) + '/' + 'plots_res/' + file_str + "_RMSE_comparison_" + 'ss' + str(
        1000) + '_p' + str(prior_pg) + "_normalized.csv"
    subset_size = 200
    bo_res_file = '../../results/' + wf + '/' + 'bo_subsize' + str(subset_size) + '.pkl'
    plot_rmse_cmp(elbo_file_name, bayesian_file_name, bo_res_file, datastd)
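With the literals above (file_str='swiss_top10', seed=10, prior_pg=0.5 and the hard-coded sample size 1000), and assuming wf_sub resolves to, say, 'swiss_sub' (a placeholder), bayesian_file_name expands to:

# ../../results/swiss_sub_rnd10/plots_res/swiss_top10_RMSE_comparison_ss1000_p0.5_normalized.csv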

    # ==averaging multiple runs:
    # saving_folder = '../../results/multiple_results/'
Example 6
import sys
import numpy as np
import pandas as pd

import utils.parse_cfg as cfg
import utils.get_init_dict as gidc
import training_wrap_inputs as twi

gpu_id = int(sys.argv[1])
use_gpu(gpu_id)  # project-local helper, as in Example 4
group = int(sys.argv[2])
random_seed = int(sys.argv[3])  # multiple runs

start = int(sys.argv[4])
group_size = int(sys.argv[5])
ker = pd.read_csv('kernelstring3.csv')
multi_ker_str = np.array(ker).reshape(-1)[start + group * group_size:start +
                                          (group + 1) * group_size]

cfg_file = '../config/' + sys.argv[6]
settings, paths, working_folder = cfg.get_settings(cfg_file, 'sub')

num_batch = int(settings['num_of_batch_per_subset'])
num_subset = int(settings['num_of_subset'])
batch_sizes = np.array(range(num_subset)) * num_batch + num_batch
reuse_batch_sizes = np.array(range(num_subset)) * num_batch

datafile = paths['datafile']
multi_ker_init_dict = None
if paths['init_hyper_file'] is not None:
    multi_ker_init_dict = gidc.get_saved_init(multi_ker_str,
                                              paths['init_hyper_file'])

settings['random_s'] = random_seed
settings['initfile'] = paths["init_hyper_file"]
print(settings)
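With the hypothetical values from the Example 1 sketch (num_batch = 5, num_subset = 4), batch_sizes is [5 10 15 20] and reuse_batch_sizes is [0 5 10 15]: each entry of reuse_batch_sizes marks the previous subset boundary, suggesting that each run reuses the batches accumulated so far.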
Example 7
                  'best_ker': domain[best_ker_ind],
                  'best_ker_elbo': max(evidence_all)}

    with open(result_filename, 'wb') as fout:
        pickle.dump(bo_results, fout)


if __name__ == "__main__":
    cfg_file = '../../config/swiss_cfg_1.json'
    subset_candi_size = 200000

    # cfg_file = '../../config/air_cfg_1.json'
    # subset_candi_size = 10000

    subset_size = 200
    _, paths, wf = cfg.get_settings(cfg_file, 'full')
    wf = '../../results/' + wf + "/"
    elbo_file_name = wf + 'fulldata_summary.csv'

    if not os.path.exists(elbo_file_name):
        k = gtr.get_kernel_names(wf)
        df = gtr.get_fulldata_res(wf, k)
        df.to_csv(elbo_file_name)

    domain, evidence_all, train_time_all, _ = pre_trained_res(elbo_file_name)

    datafile = '../' + paths['datafile']
    bo_res_file = wf + 'bo_subsize' + str(subset_size) + '.pkl'
    run_bo(datafile, subset_candi_size, subset_size, bo_res_file)
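run_bo presumably persists its results to bo_res_file via pickle.dump, as in the truncated function head above. A minimal sketch of reading them back (key names taken from the bo_results dict literal):

import pickle

with open(bo_res_file, 'rb') as fin:
    bo_results = pickle.load(fin)
print(bo_results['best_ker'], bo_results['best_ker_elbo'])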