import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import interpolate
from sklearn.model_selection import train_test_split

# Project-local helpers referenced below (import paths depend on the
# surrounding repository): Domain, UniformSamplingStrategy, get_model_factory,
# input_parse, disctrans, load_batches, c_d_y_split, train, test, plot,
# plot_results, get_metrics, burnin_MH, run_MH, cdm, and the theory_*
# evaluation functions.


def test_gp_dfixed():
    '''Set up uniform generation with all discrete parameters fixed to set values.'''
    n_samples = 100
    seed = 2
    np.random.seed(seed)

    domain = Domain()
    sampling_strategy = UniformSamplingStrategy()

    domain.fix_param(domain.params[1], 'tungsten')
    domain.fix_param(domain.params[2], 'SiC')
    domain.fix_param(domain.params[3], 'H2O')
    domain.fix_param(domain.params[5], 'SiC')
    domain.fix_param(domain.params[6], 'Li4SiO4')
    domain.fix_param(domain.params[7], 'Be')
    domain.fix_param(domain.params[8], 'H2O')

    df = domain.gen_data_frame(sampling_strategy, n_samples)
    df.to_csv('params/100params0000000.csv', index=False)
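
# c_d_y_split() is a project-local helper used throughout this file. The
# sketch below only documents its assumed contract -- split a batch frame into
# continuous features, discrete features, and outputs such as 'tbr' -- and is
# not the project's actual implementation.
def c_d_y_split_sketch(df, y_columns=('tbr',)):
    """Split a batch DataFrame into (continuous X, discrete d, outputs y)."""
    y = df[list(y_columns)]
    rest = df.drop(columns=list(y_columns))
    d = rest.select_dtypes(include='object')   # e.g. material name columns
    X = rest.drop(columns=d.columns)           # numeric design variables
    return X, d, y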
def main():
    '''Perform the quality-adaptive sampling (QASS) algorithm.'''
    # Parse inputs and store in relevant variables.
    args = input_parse()
    init_samples = args.init_samples
    step_samples = args.step_samples
    step_candidates = args.step_candidates
    d_params = disctrans(args.disc_fix)

    # Collect surrogate model type and theory under study.
    thismodel = get_model_factory()[args.model](cli_args=sys.argv[7:])
    thistheory = globals()["theory_" + args.theory]

    domain = Domain()

    if args.saved_init:
        # Load data as initial evaluated samples.
        df = load_batches(args.saved_init, (0, 1 + int(init_samples / 1000)))
        X_init, d, y_multiple = c_d_y_split(df.iloc[0:init_samples])
        d_params = d.values[0]
        print(d.values[0][0])
        y_init = y_multiple['tbr']

        domain.fix_param(domain.params[1], d_params[0])
        domain.fix_param(domain.params[2], d_params[1])
        domain.fix_param(domain.params[3], d_params[2])
        domain.fix_param(domain.params[5], d_params[3])
        domain.fix_param(domain.params[6], d_params[4])
        domain.fix_param(domain.params[7], d_params[5])
        domain.fix_param(domain.params[8], d_params[6])
    else:
        # Generate initial parameters.
        sampling_strategy = UniformSamplingStrategy()
        c = domain.gen_data_frame(sampling_strategy, init_samples)
        print(c.columns)

        # Evaluate initial parameters in the given theory.
        print("Evaluating initial " + str(init_samples) + " samples in "
              + args.theory + " theory.")
        output = thistheory(params=c, domain=domain, n_samples=init_samples)
        X_init, d, y_multiple = c_d_y_split(output)
        y_init = y_multiple['tbr']

    current_samples, current_tbr = X_init, y_init

    # MAIN QASS LOOP
    complete_condition = False
    iter_count = 0
    err_target = 0.0001
    max_iter_count = 70
    all_metrics = pd.DataFrame()

    current_samples = current_samples.sort_index(axis=1)
    print(f'Features in order are: {list(current_samples.columns)}')
    X_train, X_test, y_train, y_test = train_test_split(
        current_samples, current_tbr, test_size=0.5, random_state=1)

    thismodel.enable_renormalization(100)

    while not complete_condition:
        iter_count += 1
        samp_size = X_train.shape[0] * 2
        print("Iteration " + str(iter_count) + " -- Total samples: " + str(samp_size))

        # Train surrogate for theory, and plot results. On the first iteration
        # the model sees the full initial training set; afterwards it is
        # updated incrementally with only the newly acquired samples.
        if iter_count == 1:
            new_samples, new_tbr = X_train, y_train
        train(thismodel, new_samples, new_tbr)
        test(thismodel, X_test, y_test)
        plot("qassplot", thismodel, X_test, y_test)
        this_metrics = get_metrics(thismodel, X_test, y_test)
        this_metrics['numdata'] = samp_size
        print(this_metrics)

        # Calculate error data for this training iteration.
        y_train_pred = thismodel.predict(X_train.to_numpy())
        y_test_pred = thismodel.predict(X_test.to_numpy())
        train_err = abs(y_train - y_train_pred)
        test_err = abs(y_test - y_test_pred)

        # Train neural network surrogate for the error function (failed
        # approach, kept for reference):
        X_test = X_test.sort_index(axis=1)
        X_test1, X_test2, test_err1, test_err2 = train_test_split(
            X_test, test_err, test_size=0.5, random_state=1)
        #errmodel = get_model_factory()["nn"](cli_args=["--epochs=50", "--batch-size=200",
        #                                               "--arch-type=4F_512"])
        #errmodel = get_model_factory()["rbf"](cli_args=["--d0=20"])
        #scaled_X_test1, scaled_test_err1 = errmodel.scale_training_set(X_test1, test_err1)
        #scaled_X_test2, scaled_test_err2 = errmodel.scale_testing_set(X_test2, test_err2)
        #dtest1 = pd.DataFrame(scaled_X_test1, columns=X_test1.columns, index=X_test1.index)
        #dtest2 = pd.DataFrame(scaled_X_test2, columns=X_test2.columns, index=X_test2.index)
        #derr1 = pd.Series(scaled_test_err1, index=test_err1.index)
        #derr2 = pd.Series(scaled_test_err2, index=test_err2.index)
        #print(type(test_err1))
        #print(type(scaled_test_err1))
        #train(errmodel, dtest1, derr1)
        #test(errmodel, dtest2, derr2)
        #print(X_test1)
        #print(scaled_X_test1)
        #print(dtest1)
        #plot("qassplot3", errmodel, dtest2, derr2)
        #tri = Delaunay(X_test1.values, qhull_options="Qc QbB Qx Qz")
        #f = interpolate.LinearNDInterpolator(tri, test_err1.values)

        # Test surrogate (nearest-neighbour interpolator) on split error data.
        errordist_test = interpolate.NearestNDInterpolator(X_test1.values, test_err1.values)
        pred_err1 = errordist_test(X_test1.values)
        pred_err2 = errordist_test(X_test2.values)

        # Train surrogate (nearest-neighbour interpolator) for the error function.
        errordist = interpolate.NearestNDInterpolator(X_test.values, test_err.values)
        pred_err = errordist(X_test.values)

        max_err = max(test_err.values)
        print('Max error: ' + str(max_err))
        this_metrics['maxerr'] = max_err

        plot_results("qassplot2", pred_err1, test_err1)
        plt.figure()
        plot_results("qassplot3", pred_err2, test_err2)
        plt.figure()
        plt.hist(test_err.values, bins=100)
        plt.savefig("qassplot4.pdf", format="pdf")

        # Perform MCMC on the error function.
        saveinterval = 1
        nburn = 1000
        nrun = 10000
        initial_sample = X_train.iloc[0]
        #print(initial_sample.values)
        #print(errordist(initial_sample.values))
        burnin_sample, burnin_dist, burnin_acceptance = burnin_MH(
            errordist, initial_sample.values, nburn)
        saved_samples, saved_dists, run_acceptance = run_MH(
            errordist, burnin_sample, nrun, saveinterval)

        plt.figure()
        plt.hist(saved_dists, bins=100)
        plt.savefig("qassplot5.pdf", format="pdf")

        print('MCMC run finished.')
        print('Burn-In Acceptance: ' + str(burnin_acceptance))
        print('Run Acceptance: ' + str(run_acceptance))
        this_metrics['burn_acc'] = burnin_acceptance
        this_metrics['run_acc'] = run_acceptance

        # Extract candidate samples from the MCMC output and calculate the
        # mutual crowding distance of each candidate.
        cand_cdms = []
        print(saved_samples.shape)
        samplestep = int(saved_samples.shape[0] / step_candidates)
        print(samplestep)
        candidates = saved_samples[::samplestep]
        for candidate in candidates:
            cand_cdms.append(cdm(candidate, candidates))

        # Rank candidate samples by error value, and filter out crowded samples.
        new_samples = pd.DataFrame(candidates, columns=current_samples.columns)
        new_samples['error'] = saved_dists[::samplestep]
        new_samples['cdm'] = cand_cdms
        #print(new_samples)
        #print(new_samples.shape)
        new_samples = new_samples.sort_values(by='error', ascending=False)
        indexNames = new_samples[new_samples['cdm'] <= new_samples['cdm'].median()].index
        new_samples.drop(indexNames, inplace=True)
        new_samples = new_samples.drop(columns=['error', 'cdm'])
        new_samples = new_samples.head(step_samples).reset_index(drop=True)

        # Add new samples and corresponding TBR evaluations to the current sample set.
        new_output = thistheory(
            params=new_samples.join(pd.concat([d.head(1)] * step_samples, ignore_index=True)),
            domain=domain, n_samples=step_samples)
        new_samples, new_d, new_y_multiple = c_d_y_split(new_output)
        new_tbr = new_y_multiple['tbr']
        #print(new_samples)
        new_samples = new_samples.sort_index(axis=1)
        #new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
        #    new_samples, new_tbr, test_size=0.5, random_state=1)
        X_train = pd.concat([X_train, new_samples], ignore_index=True)
        #X_test = pd.concat([X_test, new_X_test], ignore_index=True)
        y_train = pd.concat([y_train, new_tbr], ignore_index=True)
        #y_test = pd.concat([y_test, new_y_test], ignore_index=True)

        # Check completion conditions and close the loop.
        if max_err < err_target or iter_count > max_iter_count:
            complete_condition = True

        all_metrics = pd.concat([all_metrics, this_metrics], ignore_index=True)

    print(all_metrics)
    all_metrics.to_csv('qassmetrics.csv')
    print('QASS finished.')
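
# burnin_MH(), run_MH() and cdm() are project-local helpers. The sketches
# below show one plausible reading of each -- random-walk Metropolis-Hastings
# with the interpolated test error acting as an unnormalized target density,
# and a mean-distance crowding metric -- under assumed signatures and an
# assumed proposal scale; they are not the project's actual implementations.
def run_MH_sketch(density, x0, n_steps, save_interval=1, step_size=0.05):
    """Random-walk MH over `density`; returns (samples, densities, acceptance)."""
    rng = np.random.default_rng()
    x = np.asarray(x0, dtype=float)
    p = float(density(x[np.newaxis, :]))
    saved_x, saved_p, accepted = [], [], 0
    for step in range(1, n_steps + 1):
        proposal = x + step_size * rng.standard_normal(x.shape)
        p_new = float(density(proposal[np.newaxis, :]))
        # Accept with probability min(1, p_new / p); guard against p == 0.
        if p <= 0 or rng.random() < p_new / p:
            x, p = proposal, p_new
            accepted += 1
        if step % save_interval == 0:
            saved_x.append(x.copy())
            saved_p.append(p)
    return np.array(saved_x), np.array(saved_p), accepted / n_steps


def cdm_sketch(candidate, candidates):
    """Mean Euclidean distance from `candidate` to its peers; larger values
    mean less crowded, consistent with the median filter above, which drops
    the low-cdm (crowded) half of the candidates."""
    dists = np.linalg.norm(np.asarray(candidates) - np.asarray(candidate), axis=1)
    return dists.sum() / max(len(dists) - 1, 1)   # exclude the zero self-distance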
# FAKE QASS baseline: identical setup, but new samples are drawn uniformly at
# random instead of via MCMC on the error surrogate. (Presumably kept in a
# separate module in the repository, since it also defines main().)
def main():
    '''Perform the FAKE quality-adaptive sampling algorithm.'''
    # Parse inputs and store in relevant variables.
    args = input_parse()
    init_samples = args.init_samples
    step_samples = args.step_samples
    step_candidates = args.step_candidates
    eval_samples = args.eval_samples
    retrain = args.retrain
    d_params = disctrans(args.disc_fix)

    # Collect surrogate model type and theory under study.
    thismodel = get_model_factory()[args.model](cli_args=sys.argv[9:])
    thistheory = globals()["theory_" + args.theory]

    domain = Domain()
    # Used both for initial generation and inside the main loop.
    sampling_strategy = UniformSamplingStrategy()

    if args.saved_init:
        # Load data as initial evaluated samples.
        df = load_batches(args.saved_init, (0, 1 + int(init_samples / 1000)))
        X_init, d, y_multiple = c_d_y_split(df.iloc[0:init_samples])
        d_params = d.values[0]
        print(d.values[0][0])
        y_init = y_multiple['tbr']

        domain.fix_param(domain.params[1], d_params[0])
        domain.fix_param(domain.params[2], d_params[1])
        domain.fix_param(domain.params[3], d_params[2])
        domain.fix_param(domain.params[5], d_params[3])
        domain.fix_param(domain.params[6], d_params[4])
        domain.fix_param(domain.params[7], d_params[5])
        domain.fix_param(domain.params[8], d_params[6])
    else:
        # Generate initial parameters.
        c = domain.gen_data_frame(sampling_strategy, init_samples)
        print(c.columns)

        # Evaluate initial parameters in the given theory.
        print("Evaluating initial " + str(init_samples) + " samples in "
              + args.theory + " theory.")
        output = thistheory(params=c, domain=domain, n_samples=init_samples)
        X_init, d, y_multiple = c_d_y_split(output)
        y_init = y_multiple['tbr']

    current_samples, current_tbr = X_init, y_init

    # MAIN QASS LOOP
    complete_condition = False
    iter_count = 0
    trigger_retrain = False
    similarity = 0
    err_target = 0.0001
    max_iter_count = 10000
    all_metrics = pd.DataFrame()

    while not complete_condition:
        iter_count += 1
        samp_size = current_samples.shape[0]
        print("Iteration " + str(iter_count) + " -- Total samples: " + str(samp_size))

        # Train surrogate for theory, and plot results.
        X_train, X_test, y_train, y_test = train_test_split(
            current_samples, current_tbr, test_size=0.5, random_state=1)  # play with this

        # Goldilocks retraining scheme.
        if iter_count > 1:
            alt_scaler = thismodel.create_scaler()
            Xy_in = thismodel.join_sets(X_train, y_train)
            alt_scaler.fit(Xy_in)
            similarity = thismodel.scaler_similarity(thismodel.scaler, alt_scaler)
            if iter_count % 10000 == 0:
                # Restart with new random weights.
                #thismodel = get_model_factory()[args.model](cli_args=sys.argv[8:])
                thismodel.scaler = alt_scaler

        train(thismodel, X_train, y_train)
        test(thismodel, X_test, y_test)
        plot("qassplot", thismodel, X_test, y_test)
        this_metrics = get_metrics(thismodel, X_test, y_test)
        this_metrics['numdata'] = samp_size
        this_metrics['similarity'] = similarity
        print(this_metrics)

        # True evaluation test on uniform random data.
        evaltest_samples = domain.gen_data_frame(sampling_strategy, eval_samples)
        eval_output = thistheory(params=evaltest_samples, domain=domain,
                                 n_samples=eval_samples)
        evaltest_samples, evaltest_d, evaltest_y_multiple = c_d_y_split(eval_output)
        evaltest_tbr = evaltest_y_multiple['tbr']
        test(thismodel, evaltest_samples, evaltest_tbr)
        plot("qassplot2", thismodel, evaltest_samples, evaltest_tbr)
        eval_metrics = get_metrics(thismodel, evaltest_samples, evaltest_tbr)
        print(eval_samples)
        this_metrics['E_MAE'] = eval_metrics['MAE']
        this_metrics['E_S'] = eval_metrics['S']
        this_metrics['E_R2'] = eval_metrics['R2']
        this_metrics['E_R2(adj)'] = eval_metrics['R2(adj)']

        # Generate uniform random new samples.
        new_samples = domain.gen_data_frame(sampling_strategy, step_samples)
        new_output = thistheory(params=new_samples, domain=domain,
                                n_samples=step_samples)
        new_samples, new_d, new_y_multiple = c_d_y_split(new_output)
        new_tbr = new_y_multiple['tbr']
        current_samples = pd.concat([current_samples, new_samples], ignore_index=True)
        current_tbr = pd.concat([current_tbr, new_tbr], ignore_index=True)

        # Check completion conditions and close the loop.
        if iter_count > max_iter_count:
            complete_condition = True

        all_metrics = pd.concat([all_metrics, this_metrics], ignore_index=True)

    print(all_metrics)
    all_metrics.to_csv('qassfakemetrics.csv')
    print('FAKE QASS finished.')
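
# Example invocation (assumed, for illustration only): input_parse() is a
# project-local parser, and the argument order below is inferred from the
# sys.argv[9:] slice forwarded to the surrogate model above -- eight
# positional values followed by model-specific flags. The script name and
# actual interface may differ.
#
#   python fake_qass.py 1000 200 500 1000 0 0000000 nn <theory> --epochs=50
#
# corresponding to: init_samples step_samples step_candidates eval_samples
# retrain disc_fix model theory, then CLI arguments for the chosen model.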