def test_gp_dfixed():
    '''
    Sets up uniform generation with all discrete parameters fixed to set values.
    '''

    n_samples = 100
    seed = 2

    np.random.seed(seed)

    domain = Domain()
    sampling_strategy = UniformSamplingStrategy()

    # Fix the seven discrete (material) parameters to set values
    domain.fix_param(domain.params[1], 'tungsten')
    domain.fix_param(domain.params[2], 'SiC')
    domain.fix_param(domain.params[3], 'H2O')
    domain.fix_param(domain.params[5], 'SiC')
    domain.fix_param(domain.params[6], 'Li4SiO4')
    domain.fix_param(domain.params[7], 'Be')
    domain.fix_param(domain.params[8], 'H2O')

    df = domain.gen_data_frame(sampling_strategy, n_samples)
    df.to_csv('params/100params0000000.csv', index=False)
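# `disctrans` (used in the examples below) is not defined in these snippets.
# A minimal sketch, assuming the disc_fix CLI argument arrives as a
# comma-separated string of the seven discrete material values (this behavior
# is an assumption, not the project's confirmed implementation):
def disctrans(disc_fix):
    """Translate a string like 'tungsten,SiC,H2O,SiC,Li4SiO4,Be,H2O'
    into a list of discrete parameter values."""
    return disc_fix.split(',') if disc_fix else []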
Example #2
def main():
    '''
    Perform quality-adaptive sampling algorithm
    '''
    
    # Parse inputs and store in relevant variables.
    args = input_parse()
    
    init_samples = args.init_samples
    step_samples = args.step_samples
    step_candidates = args.step_candidates
    d_params = disctrans(args.disc_fix)
    
    # Collect surrogate model type and theory under study.
    thismodel = get_model_factory()[args.model](cli_args=sys.argv[7:])
    thistheory = globals()["theory_" + args.theory]

    domain = Domain()

    if args.saved_init:
        # load data as initial evaluated samples
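        # load_batches is assumed to read stored samples in 1000-row batches;
        # the batch range below covers at least init_samples rows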
        df = load_batches(args.saved_init, (0, 1 + int(init_samples/1000)))
        X_init, d, y_multiple = c_d_y_split(df.iloc[0:init_samples])
        d_params = d.values[0]
        print(d_params[0])
        y_init = y_multiple['tbr']
        current_samples, current_tbr = X_init, y_init
        
    # Fix the seven discrete (material) parameters, taken from disc_fix or the
    # loaded batch
    domain.fix_param(domain.params[1], d_params[0])
    domain.fix_param(domain.params[2], d_params[1])
    domain.fix_param(domain.params[3], d_params[2])
    domain.fix_param(domain.params[5], d_params[3])
    domain.fix_param(domain.params[6], d_params[4])
    domain.fix_param(domain.params[7], d_params[5])
    domain.fix_param(domain.params[8], d_params[6])
    
    if not args.saved_init:
        # generate initial parameters
        sampling_strategy = UniformSamplingStrategy()
        c = domain.gen_data_frame(sampling_strategy, init_samples)
        print(c.columns)
        # evaluate initial parameters in given theory
        print("Evaluating initial " + str(init_samples) + " samples in " + args.theory + " theory.")
        output = thistheory(params = c, domain = domain, n_samples = init_samples)
        X_init, d, y_multiple = c_d_y_split(output)
        y_init = y_multiple['tbr']
        current_samples, current_tbr = X_init, y_init
    
    
    # MAIN QASS LOOP
    
    complete_condition = False
    iter_count = 0
    
    err_target = 0.0001
    max_iter_count = 70
    
    all_metrics = pd.DataFrame()
    
    current_samples = current_samples.sort_index(axis=1)
    
    print(f'Features in order are: {list(current_samples.columns)}')
    
    X_train, X_test, y_train, y_test = train_test_split(current_samples, current_tbr,
                                                        test_size=0.5, random_state=1)

    thismodel.enable_renormalization(100)

    while not complete_condition:
        iter_count += 1
        samp_size = X_train.shape[0] * 2
        print("Iteration " + str(iter_count) + " -- Total samples: " + str(samp_size))
        
        # Train surrogate for theory, and plot results

        if iter_count == 1:
            new_samples, new_tbr = X_train, y_train
        train(thismodel, new_samples, new_tbr)
        test(thismodel, X_test, y_test)
        
        plot("qassplot", thismodel, X_test, y_test)
        this_metrics = get_metrics(thismodel, X_test, y_test)
        this_metrics['numdata'] = samp_size
        print(this_metrics)

        # Calculate error data for this training iteration. These absolute
        # residuals define the error function that the MCMC step below samples.

        y_train_pred = thismodel.predict(X_train.to_numpy())
        y_test_pred = thismodel.predict(X_test.to_numpy())

        train_err = abs(y_train - y_train_pred)
        test_err = abs(y_test - y_test_pred)

        # Split the held-out error data so the error surrogate can itself be
        # validated on unseen points

        X_test = X_test.sort_index(axis=1)

        X_test1, X_test2, test_err1, test_err2 = train_test_split(X_test, test_err,
                                               test_size=0.5, random_state=1)

        # Training a neural network / RBF surrogate for the error function was
        # tried and failed; kept for reference:
        # errmodel = get_model_factory()["nn"](cli_args=["--epochs=50", "--batch-size=200", "--arch-type=4F_512"])
        # errmodel = get_model_factory()["rbf"](cli_args=["--d0=20"])
        # scaled_X_test1, scaled_test_err1 = errmodel.scale_training_set(X_test1, test_err1)
        # scaled_X_test2, scaled_test_err2 = errmodel.scale_testing_set(X_test2, test_err2)
        # dtest1 = pd.DataFrame(scaled_X_test1, columns=X_test1.columns, index=X_test1.index)
        # dtest2 = pd.DataFrame(scaled_X_test2, columns=X_test2.columns, index=X_test2.index)
        # derr1 = pd.Series(scaled_test_err1, index=test_err1.index)
        # derr2 = pd.Series(scaled_test_err2, index=test_err2.index)
        # train(errmodel, dtest1, derr1)
        # test(errmodel, dtest2, derr2)
        # plot("qassplot3", errmodel, dtest2, derr2)
        #
        # A Delaunay-based linear interpolator was also tried:
        # tri = Delaunay(X_test1.values, qhull_options="Qc QbB Qx Qz")
        # f = interpolate.LinearNDInterpolator(tri, test_err1.values)

        # Fit a nearest-neighbor error surrogate on one half of the split and
        # evaluate it on both halves (predictions on X_test1 are trivially exact)

        errordist_test = interpolate.NearestNDInterpolator(X_test1.values, test_err1.values)
        pred_err1 = errordist_test(X_test1.values)
        pred_err2 = errordist_test(X_test2.values)
        
        # Train surrogate (nearest neighbor interpolator) for error function
        
        errordist = interpolate.NearestNDInterpolator(X_test.values, test_err.values)
        pred_err = errordist(X_test.values)
        
        max_err = max(test_err.values)
        print('Max error: ' + str(max_err))
        this_metrics['maxerr'] = max_err
        
        plot_results("qassplot2", pred_err1, test_err1)
        plt.figure()
        plot_results("qassplot3", pred_err2, test_err2) 
        
        plt.figure()
        plt.hist(test_err.values, bins=100)
        plt.savefig("qassplot4.pdf", format="pdf")   
        
        
        
        # Perform MCMC on error function
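        # Metropolis-Hastings sampling with the interpolated error as the
        # target density concentrates candidate points where the surrogate is
        # least accurate (see the hedged MH sketch after this function).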
        
        saveinterval = 1
        nburn = 1000
        nrun = 10000
        
        initial_sample = X_train.iloc[0]
        #print(initial_sample.values)
        #print(errordist(initial_sample.values))
        burnin_sample, burnin_dist, burnin_acceptance = burnin_MH(errordist, initial_sample.values, nburn)
        saved_samples, saved_dists, run_acceptance = run_MH(errordist, burnin_sample, nrun, saveinterval)
        
        plt.figure()
        plt.hist(saved_dists, bins=100)
        plt.savefig("qassplot5.pdf", format="pdf") 
        
        print('MCMC run finished.')
        print('Burn-In Acceptance: ' + str(burnin_acceptance))
        print('Run Acceptance: ' + str(run_acceptance))
        this_metrics['burn_acc'] = burnin_acceptance
        this_metrics['run_acc'] = run_acceptance
        
                
        # Extract candidate samples from MCMC output and calculate mutual crowding distance
        
        cand_cdms = []
        print(saved_samples.shape)
        # Thin the chain to roughly step_candidates evenly spaced candidates
        samplestep = int(saved_samples.shape[0] / step_candidates)
        print(samplestep)
        candidates = saved_samples[::samplestep]

        # cdm scores how crowded each candidate is among its peers (a hedged
        # sketch of cdm appears after this function)
        for candidate in candidates:
            cand_cdms.append(cdm(candidate, candidates))

        # Rank candidate samples by error value, and filter out crowded samples
        
        new_samples = pd.DataFrame(candidates, columns = current_samples.columns)
        new_samples['error'] = saved_dists[::samplestep]
        new_samples['cdm'] = cand_cdms 
        
        #print(new_samples)
        #print(new_samples.shape)
            
        new_samples = new_samples.sort_values(by='error', ascending=False)

        # Drop the more crowded half of the candidates
        crowded_idx = new_samples[new_samples['cdm'] <= new_samples['cdm'].median()].index
        new_samples.drop(crowded_idx, inplace=True)

        new_samples = new_samples.drop(columns=['error', 'cdm'])
        new_samples = new_samples.head(step_samples).reset_index(drop=True)
        
        
        # Add new samples and corresponding TBR evaluations to current sample set
        
        new_output = thistheory(params=new_samples.join(pd.concat([d.head(1)] * step_samples,
                                                                  ignore_index=True)),
                                domain=domain, n_samples=step_samples)
        new_samples, new_d, new_y_multiple = c_d_y_split(new_output)
        new_tbr = new_y_multiple['tbr']
        
        #print(new_samples) 
        
        new_samples = new_samples.sort_index(axis=1)
        
        #new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_samples, new_tbr,test_size=0.5, random_state=1)

        X_train = pd.concat([X_train, new_samples], ignore_index=True)
        #X_test = pd.concat([X_test, new_X_test], ignore_index=True)
        y_train = pd.concat([y_train, new_tbr], ignore_index=True)
        #y_test = pd.concat([y_test, new_y_test], ignore_index=True)
    
        # Check completion conditions and close loop
    
        if max_err < err_target or iter_count > max_iter_count:
            complete_condition = True
        
        all_metrics = pd.concat([all_metrics, this_metrics], ignore_index=True)
        print(all_metrics)
        all_metrics.to_csv('qassmetrics.csv')


    print('QASS finished.')
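
# Neither `cdm` nor the MCMC helpers `burnin_MH`/`run_MH` are defined in this
# snippet. Minimal sketches follow; the mean-distance definition of cdm, the
# proposal scale, and the acceptance rule details are assumptions, not the
# project's confirmed implementations.

import numpy as np

def cdm(candidate, candidates):
    """Crowding-distance metric: mean Euclidean distance from `candidate` to
    the other candidates (larger = less crowded). Hypothetical definition."""
    diffs = np.asarray(candidates) - np.asarray(candidate)
    dists = np.linalg.norm(diffs, axis=1)
    return dists.sum() / max(len(candidates) - 1, 1)

def burnin_MH(target, x0, nburn, step=0.05):
    """Random-walk Metropolis burn-in; returns the final sample, its target
    value, and the acceptance rate."""
    x = np.asarray(x0, dtype=float)
    fx = float(target(x))
    accepted = 0
    for _ in range(nburn):
        prop = x + step * np.random.randn(x.size)
        fprop = float(target(prop))
        # Accept with probability min(1, fprop/fx); errors are non-negative
        if fx == 0 or np.random.rand() < fprop / fx:
            x, fx = prop, fprop
            accepted += 1
    return x, fx, accepted / nburn

def run_MH(target, x0, nrun, saveinterval, step=0.05):
    """Random-walk Metropolis run; saves every `saveinterval`-th state and
    returns (samples, target values, acceptance rate)."""
    x = np.asarray(x0, dtype=float)
    fx = float(target(x))
    accepted = 0
    samples, dists = [], []
    for i in range(nrun):
        prop = x + step * np.random.randn(x.size)
        fprop = float(target(prop))
        if fx == 0 or np.random.rand() < fprop / fx:
            x, fx = prop, fprop
            accepted += 1
        if i % saveinterval == 0:
            samples.append(x.copy())
            dists.append(fx)
    return np.array(samples), np.array(dists), accepted / nrun
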
Example #3
def main():
    '''
    Perform FAKE quality-adaptive sampling algorithm
    '''

    # Parse inputs and store in relevant variables.
    args = input_parse()

    init_samples = args.init_samples
    step_samples = args.step_samples
    step_candidates = args.step_candidates
    eval_samples = args.eval_samples
    retrain = args.retrain
    d_params = disctrans(args.disc_fix)

    # Collect surrogate model type and theory under study.
    thismodel = get_model_factory()[args.model](cli_args=sys.argv[9:])
    thistheory = globals()["theory_" + args.theory]

    domain = Domain()

    if args.saved_init:
        # load data as initial evaluated samples
        df = load_batches(args.saved_init, (0, 1 + int(init_samples / 1000)))
        X_init, d, y_multiple = c_d_y_split(df.iloc[0:init_samples])
        d_params = d.values[0]
        print(d_params[0])
        y_init = y_multiple['tbr']
        current_samples, current_tbr = X_init, y_init

    # Fix the seven discrete (material) parameters, taken from disc_fix or the
    # loaded batch
    domain.fix_param(domain.params[1], d_params[0])
    domain.fix_param(domain.params[2], d_params[1])
    domain.fix_param(domain.params[3], d_params[2])
    domain.fix_param(domain.params[5], d_params[3])
    domain.fix_param(domain.params[6], d_params[4])
    domain.fix_param(domain.params[7], d_params[5])
    domain.fix_param(domain.params[8], d_params[6])

    # Instantiate the sampling strategy unconditionally: it is also used for
    # evaluation and new-sample generation inside the main loop below.
    sampling_strategy = UniformSamplingStrategy()

    if not args.saved_init:
        # generate initial parameters
        c = domain.gen_data_frame(sampling_strategy, init_samples)
        print(c.columns)
        # evaluate initial parameters in given theory
        print("Evaluating initial " + str(init_samples) + " samples in " +
              args.theory + " theory.")
        output = thistheory(params=c, domain=domain, n_samples=init_samples)
        X_init, d, y_multiple = c_d_y_split(output)
        y_init = y_multiple['tbr']
        current_samples, current_tbr = X_init, y_init

    # MAIN QASS LOOP

    complete_condition = False
    iter_count = 0
    trigger_retrain = False
    similarity = 0

    err_target = 0.0001
    max_iter_count = 10000

    all_metrics = pd.DataFrame()

    while not complete_condition:
        iter_count += 1
        samp_size = current_samples.shape[0]
        print("Iteration " + str(iter_count) + " -- Total samples: " +
              str(samp_size))

        # Train surrogate for theory, and plot results

        X_train, X_test, y_train, y_test = train_test_split(
            current_samples, current_tbr, test_size=0.5,
            random_state=1)  # play with this

        # Goldilocks retraining scheme
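        # A scaler fitted on the latest training set is compared with the
        # model's current scaler; large drift in this similarity score is the
        # intended retraining trigger (hedged reading of the code below).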

        if iter_count > 1:
            alt_scaler = thismodel.create_scaler()
            Xy_in = thismodel.join_sets(X_train, y_train)
            alt_scaler.fit(Xy_in)
            similarity = thismodel.scaler_similarity(thismodel.scaler,
                                                     alt_scaler)
            if iter_count % 10000 == 0:  # restart with new random weights
                # thismodel = get_model_factory()[args.model](cli_args=sys.argv[8:])
                thismodel.scaler = alt_scaler

        train(thismodel, X_train, y_train)
        test(thismodel, X_test, y_test)

        plot("qassplot", thismodel, X_test, y_test)
        this_metrics = get_metrics(thismodel, X_test, y_test)
        this_metrics['numdata'] = samp_size
        this_metrics['similarity'] = similarity
        print(this_metrics)

        # True evaluation test on uniform random data

        evaltest_samples = domain.gen_data_frame(sampling_strategy,
                                                 eval_samples)

        eval_output = thistheory(params=evaltest_samples,
                                 domain=domain,
                                 n_samples=eval_samples)
        evaltest_samples, evaltest_d, evaltest_y_multiple = c_d_y_split(
            eval_output)
        evaltest_tbr = evaltest_y_multiple['tbr']

        test(thismodel, evaltest_samples, evaltest_tbr)
        plot("qassplot2", thismodel, evaltest_samples, evaltest_tbr)
        eval_metrics = get_metrics(thismodel, evaltest_samples, evaltest_tbr)
        print(eval_metrics)

        this_metrics['E_MAE'] = eval_metrics['MAE']
        this_metrics['E_S'] = eval_metrics['S']
        this_metrics['E_R2'] = eval_metrics['R2']
        this_metrics['E_R2(adj)'] = eval_metrics['R2(adj)']

        # Generate uniform random new samples

        new_samples = domain.gen_data_frame(sampling_strategy, step_samples)

        new_output = thistheory(params=new_samples,
                                domain=domain,
                                n_samples=step_samples)
        new_samples, new_d, new_y_multiple = c_d_y_split(new_output)
        new_tbr = new_y_multiple['tbr']

        current_samples = pd.concat([current_samples, new_samples],
                                    ignore_index=True)
        current_tbr = pd.concat([current_tbr, new_tbr], ignore_index=True)

        # Check completion conditions and close loop

        if iter_count > max_iter_count:
            complete_condition = True

        all_metrics = pd.concat([all_metrics, this_metrics], ignore_index=True)
        print(all_metrics)
        all_metrics.to_csv('qassfakemetrics.csv')

    print('FAKE QASS finished.')
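
# `c_d_y_split` (used throughout these examples) is also not defined in the
# snippets above. A minimal sketch, assuming the evaluated frame carries
# continuous feature columns, discrete (material) feature columns, and output
# columns; the object-dtype default for discrete columns and the 'tbr'-only
# output default are assumptions:
import pandas as pd

def c_d_y_split(df, d_cols=None, y_cols=('tbr',)):
    """Split an evaluated frame into continuous params X, discrete params d,
    and outputs y."""
    if d_cols is None:
        # Assume discrete (material) parameters are stored as strings
        d_cols = [c for c in df.columns if df[c].dtype == object]
    y = df[list(y_cols)]
    d = df[list(d_cols)]
    X = df.drop(columns=list(d_cols) + list(y_cols))
    return X, d, y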