Exemple #1
0
def run_skf_with_te_nofolds(inputs, plot_spline, smote_numel):
    """Fit the 'ann3' model on the whole data set and export its training error.

    This is the "no folds" variant: ``run_skf_with_training_error`` receives a
    single pseudo-fold ``[[fl, fl]]`` in which the complete feature-label
    object is used both for training and for evaluation.

    :param inputs: 6-item sequence unpacked as
        ``(shared, end, pre, filters, epochs, label_type)``. The first five are
        forwarded to ``create_hparams``; ``label_type`` goes to
        ``load_data_to_fl``.
    :param plot_spline: Not used by this function.
    :param smote_numel: When truthy, a SMOTE-augmented 10-fold split is built
        with this ``numel``; otherwise a plain 10-fold split is built. The
        resulting split is not passed on to the training call.
    """
    loader_excel = './excel/Data_loader_spline_full_onehot_R13_cut_CM3.xlsx'
    shared, end, pre, filters, epochs, label_type = inputs

    hparam_settings = dict(
        shared_layers=[30, 30], ts_layers=[5, 5], cs_layers=[5, 5],
        shared=shared, end=end, pre=pre, filters=filters, epochs=epochs,
        reg_l1=0.0005, reg_l2=0.,
        max_depth=100, num_est=1000,
        epsilon=0.0001, c=0.001,
        activation='relu', batch_size=4, verbose=0,
    )
    hparams = create_hparams(**hparam_settings)

    # NOTE(review): 'te.xlsx' carries an extension while 'skf_results' does
    # not -- confirm how create_results_directory treats the two spellings.
    write_dir = create_results_directory(
        './results/skf',
        folders=['plots', 'models', 'learning rate plots'],
        excels=['skf_results', 'te.xlsx'],
    )
    fl = load_data_to_fl(
        loader_excel,
        label_type=label_type,
        normalise_labels=False,
        norm_mask=[0, 1, 3, 4, 5],
    )

    # The k-fold split is still generated here but never consumed below; it is
    # kept because the side effects of these calls are not visible from here.
    fl_store = (
        fl.fold_smote_kf_augment(k_folds=10, shuffle=True, numel=smote_numel)
        if smote_numel
        else fl.create_kf(k_folds=10, shuffle=True)
    )

    # Single pseudo-fold: train on everything, evaluate on everything.
    run_skf_with_training_error(
        model_mode='ann3', loss_mode='ann', fl=fl, fl_store=[[fl, fl]],
        hparams=hparams,
        skf_file=f'{write_dir}/skf_results.xlsx',
        te_sheet=f'{write_dir}/te.xlsx',
        skf_sheet=None,
        k_folds=10, k_shuffle=True,
        save_model=True, save_model_name=None,
        save_model_dir=f'{write_dir}/models/',
        plot_name=f'{write_dir}/learning rate plots/plot',
    )

    # Evaluate the saved models on the loader file itself (training error).
    write_excel = create_excel_file(f'{write_dir}/training_error.xlsx')
    testset_model_results_to_excel(
        write_excel=write_excel,
        model_dir_store=[f'{write_dir}/models'],
        loader_excel=loader_excel,
        testset_excel_dir=loader_excel,
        fn=6, numel=3, chunks=10,
    )
def svm_hparam_opt(grid_fl_dir, total_run, write_excel_dir):
    """Tune the SVM ``gamma`` by Gaussian-process optimisation of the 10-fold MCC.

    For every candidate gamma an ``SVMmodel`` is trained on each fold, the
    validation predictions of all folds are pooled, and the Matthews
    correlation coefficient of the pooled predictions is maximised (its
    negative is returned to ``gp_minimize``).

    :param grid_fl_dir: Path of a pickle file holding the feature-label object.
    :param total_run: Number of objective evaluations (``n_calls``).
    :param write_excel_dir: Excel path handed to ``create_excel_file``; every
        evaluated gamma and its MCC is written there, best MCC first.
    """
    # NOTE: pickle.load executes arbitrary code -- only open trusted files.
    with open(grid_fl_dir, 'rb') as fp:
        fl = pickle.load(fp)

    run_count = 0
    dimensions = [Real(low=0.1, high=300, name='gamma')]
    default_parameters = [130]

    # The same folds are reused for every gamma so scores are comparable.
    fl_store = fl.create_kf(k_folds=10, shuffle=True)

    @use_named_args(dimensions=dimensions)
    def fitness(gamma):
        nonlocal run_count
        run_count += 1

        pooled_predictions = []
        pooled_truth = []
        for train_fl, val_fl in fl_store:
            # Train on the fold's training part ...
            model = SVMmodel(fl=train_fl, gamma=gamma)
            model.train_model(fl=train_fl)
            # ... and collect predictions on its validation part.
            pooled_predictions += model.predict(val_fl).flatten().tolist()
            pooled_truth += val_fl.labels.flatten().tolist()

        # One MCC over the complete set of validation predictions.
        mcc = matthews_corrcoef(y_true=pooled_truth,
                                y_pred=pooled_predictions)
        if not run_count % 10:  # progress message every 10th evaluation
            print(f'Run Number {run_count}')
        return -mcc

    search_result = gp_minimize(
        func=fitness,
        dimensions=dimensions,
        acq_func='EI',  # Expected Improvement.
        n_calls=total_run,
        x0=default_parameters)
    print('Best Loss = {}'.format(search_result.fun))
    print('Best Gamma = {}'.format(search_result.x[0]))

    tried_gammas = [point[0] for point in search_result.x_iters]
    achieved_mcc = (-search_result.func_vals).tolist()
    results = pd.DataFrame([tried_gammas, achieved_mcc]).T
    results.columns = ['Gamma', 'mcc']
    results = results.sort_values(by='mcc', ascending=False)

    write_excel_dir = create_excel_file(write_excel_dir)
    wb = openpyxl.load_workbook(write_excel_dir)
    last_sheet_name = wb.sheetnames[-1]
    print_df_to_excel(df=results, ws=wb[last_sheet_name])
    wb.save(write_excel_dir)
    wb.close()
Exemple #3
0
def decomp_combi(var_name, numel, subgroup_size):
    """Combine random model subgroups by inverse-variance weighting, per horizon.

    The AR, PCA and UMAP test-set forecasts of every horizon are stacked into
    one (models x time) array. ``numel`` random subgroups of ``subgroup_size``
    models are drawn once and reused for every horizon. For each horizon the
    subgroup means are averaged with weights proportional to the reciprocal of
    the within-subgroup variance, and the combined forecast, its RMSE against
    ``post.testset_AR_y`` and the actual values are written to a sheet
    ``h=<horizon>`` of ``./results/<var_name> Done/decomp_combi.xlsx``.

    :param var_name: Variable name; selects the ``./results/<var_name> Done``
        directory.
    :param numel: Number of random subgroups to draw.
    :param subgroup_size: Number of models per subgroup; must be smaller than
        the model count of every horizon.
    :raises ValueError: If there are no horizons, or ``subgroup_size`` is not
        smaller than every horizon's model count.
    """
    results_dir = './results/{} Done'.format(var_name)
    post = Postdata(results_dir=results_dir,
                    var_name=var_name,
                    calculations=False,
                    star=True)
    all_h_y_hat = [
        np.array(ar.tolist() + pca.tolist() + umap.tolist())
        for ar, pca, umap in zip(post.testset_AR_y_hat, post.testset_PCA_y_hat,
                                 post.testset_UMAP_y_hat)
    ]
    model_count = [
        single_all_y_hat.shape[0] for single_all_y_hat in all_h_y_hat
    ]
    if not model_count:
        raise ValueError(
            'No forecasts found for {}; nothing to combine.'.format(var_name))
    if any(subgroup_size >= np.array(model_count)):
        raise ValueError(
            'subgroup_size given is {} which is >= model_count value of {}.'
            ' Choose a smaller subgroup_size'.format(subgroup_size,
                                                     model_count))

    excel_dir = create_excel_file(
        './results/{} Done/decomp_combi.xlsx'.format(var_name))
    wb = openpyxl.load_workbook(excel_dir)

    # The same index sets are applied to every horizon, so they must be valid
    # for the horizon with the fewest models (the original sampled from the
    # first horizon's count, which can index out of range elsewhere).
    smallest_pool = min(model_count)
    selections = [
        random.sample(range(smallest_pool), k=subgroup_size)
        for _ in range(numel)
    ]
    all_h_p_y_hat = []
    all_h_rmse = []
    for single_all_y_hat, single_y, h_label in zip(all_h_y_hat,
                                                   post.testset_AR_y,
                                                   post.hsteps):
        # Sub-selection for this horizon: (numel, subgroup_size, time).
        sub_y_hat_store = np.array(
            [single_all_y_hat[selection, :] for selection in selections])
        sub_y_mean_hat = np.mean(sub_y_hat_store, axis=1)
        # NOTE: a subgroup whose members agree exactly has zero variance,
        # which gives an infinite weight and NaN in the combination.
        sub_y_invvar_hat = np.reciprocal(np.var(sub_y_hat_store, axis=1))
        # Normalise the weights across subgroups at every time step.
        total_weights = np.sum(sub_y_invvar_hat, axis=0)
        p_y = np.sum((1 / total_weights * sub_y_mean_hat * sub_y_invvar_hat),
                     axis=0)
        all_h_p_y_hat.append(p_y)
        all_h_rmse.append(np.sqrt(np.average(np.square(p_y - single_y))))
        wb.create_sheet('h={}'.format(h_label))
        ws = wb[wb.sheetnames[-1]]

        # Row 1: run settings; row 3: actual values; row 4: RMSE + forecast.
        ws.cell(1, 1).value = 'numel'
        ws.cell(1, 2).value = numel
        ws.cell(1, 3).value = 'subgroup_size'
        ws.cell(1, 4).value = subgroup_size
        ws.cell(2, 2).value = 'rmse'
        print_array_to_excel(array=single_y, first_cell=(3, 3), ws=ws, axis=1)
        ws.cell(3, 2).value = ''
        ws.cell(4, 2).value = all_h_rmse[-1]
        print_array_to_excel(array=p_y, first_cell=(4, 3), ws=ws, axis=1)

    wb.save(excel_dir)
def l2_tracker(write_excel_dir, final_excel_loader, last_idx_store):
    """Write the average nearest-neighbour L2 distance per active-learning round.

    For every entry ``last_idx`` of ``last_idx_store`` the first ``last_idx``
    rows of the normalised features are taken, the L2 distance from each row to
    its closest other row is computed, and these minima are averaged. One value
    per round is written to the sheet 'L2 Results'.

    Only this per-round value is produced; no distance is calculated for the
    batch of suggestions of the following round.

    :param write_excel_dir: Excel path to write the data to (passed through
        ``create_excel_file``).
    :param final_excel_loader: The excel loader file that contains the feature
        information.
    :param last_idx_store: Cumulative experiment counts, one per active
        learning round. For example, 3 rounds with 5, 10 and 3 experiments
        give ``[5, 15, 18]``. Each entry needs at least two experiments for
        the nearest-neighbour distance to be defined.
    """
    write_excel_dir = create_excel_file(write_excel_dir)
    wb = openpyxl.Workbook()
    wb.create_sheet('L2 Results')
    ws = wb[wb.sheetnames[-1]]
    # Scaler fitted on the fixed range 200..2000 and handed to the loader.
    scaler = MinMaxScaler()
    scaler.fit(np.array([[200], [2000]]))
    fl = load_data_to_fl(data_loader_excel_file=final_excel_loader,
                         normalise_labels=False,
                         scaler=scaler,
                         norm_mask=[0, 1, 3, 4, 5])
    final_features = fl.features_c_norm

    batch_l2_store = []
    for last_idx in last_idx_store:
        # All experiments available at the end of this round.
        features = np.asarray(final_features[:last_idx, :], dtype=float)

        l2_store = []
        for idx, x in enumerate(features):
            # Distance from this point to every other point of the round.
            other_features = np.delete(features, idx, axis=0)
            l2_distance = np.linalg.norm(other_features - x.reshape((1, -1)),
                                         ord=2,
                                         axis=1)
            l2_store.append(np.min(l2_distance))
        batch_l2_store.append(np.mean(l2_store))

    df = pd.DataFrame(
        data=np.concatenate((
            np.array(last_idx_store).reshape(-1, 1),
            np.array(batch_l2_store).reshape(-1, 1),
        ), axis=1),
        columns=['Expt Batches', 'Mean Min L2'],
        index=range(1, len(last_idx_store) + 1))
    print_df_to_excel(df=df, ws=ws)
    wb.save(write_excel_dir)
Exemple #5
0
def read_col_data_store(name):
    """Dump the pickled data store into ``./results/<name>_results.xlsx``.

    ``./data_store.pkl`` is expected to unpickle to an indexable object whose
    item 0 holds the column names and item 1 the table rows.

    :param name: Prefix of the Excel file created in ``./results``.
    """
    # NOTE: pickle.load executes arbitrary code -- only open trusted files.
    with open('./data_store.pkl', 'rb') as handle:
        data_store = pickle.load(handle)

    write_excel = create_excel_file(f'./results/{name}_results.xlsx')
    wb = openpyxl.load_workbook(write_excel)
    target_sheet = wb[wb.sheetnames[-1]]
    table = pd.DataFrame(data=data_store[1], columns=data_store[0])
    print_df_to_excel(df=table, ws=target_sheet)
    wb.save(write_excel)
Exemple #6
0
def compile_pm_rm_excel(excel_dir_store):
    """Merge the pm / rm tables of several result workbooks side by side.

    From every workbook the sheets after the first one are paired, in order,
    with the horizons 1, 3, 6, 12 and 24. Of each sheet, rows 1-9 of the values
    form the "pm" block and column 0 from row 11 onwards forms the "rm" block.
    Blocks of the same horizon are concatenated column-wise across workbooks
    and written to sheets ``pm_h<h>`` / ``rm_h<h>`` of
    ``./results/master_pm_rd.xlsx``.

    :param excel_dir_store: Sequence of Excel paths to merge. Each pm block is
        labelled with the two columns 'm' and 'p' per workbook.
    :raises ValueError: If ``excel_dir_store`` is empty.
    """
    horizons = [1, 3, 6, 12, 24]
    if not excel_dir_store:
        raise ValueError('excel_dir_store is empty; nothing to compile.')

    master_pm = [[] for _ in horizons]
    master_rm = [[] for _ in horizons]
    for excel_dir in excel_dir_store:
        # Open each workbook once, reuse the handle for every sheet and make
        # sure it is closed again.
        with pd.ExcelFile(excel_dir) as xls:
            for sheet, pm_store, rm_store in zip(xls.sheet_names[1:],
                                                 master_pm, master_rm):
                values = pd.read_excel(xls, sheet_name=sheet,
                                       index_col=None).values
                pm_store.append(values[1:10, :])
                rm_store.append(values[11:, 0][..., None])

    # One concatenation per horizon instead of growing the array pairwise.
    master_pm = [np.concatenate(blocks, axis=1) for blocks in master_pm]
    master_rm = [np.concatenate(blocks, axis=1) for blocks in master_rm]

    excel_dir = create_excel_file('./results/master_pm_rd.xlsx')
    wb = openpyxl.load_workbook(excel_dir)
    n_files = len(excel_dir_store)
    for horizon, pm, rm in zip(horizons, master_pm, master_rm):
        tables = (
            ('pm_h{}'.format(horizon),
             pd.DataFrame(data=pm, columns=['m', 'p'] * n_files)),
            ('rm_h{}'.format(horizon),
             pd.DataFrame(data=rm, columns=['Relative RMSE'] * n_files)),
        )
        for sheet_name, table in tables:
            wb.create_sheet(sheet_name)
            ws = wb[sheet_name]
            rows = dataframe_to_rows(table, index=False)
            for r_idx, row in enumerate(rows, 1):
                for c_idx, value in enumerate(row, 1):
                    # The header lands in row 2; row 1 is left empty.
                    ws.cell(row=r_idx + 1, column=c_idx, value=value)

    wb.save(excel_dir)
Exemple #7
0
def create_invariant_testset(testset_excel_dir, numel):
    """Append invariant copies of a test set and save it as a new workbook.

    The sheet 'Sheet' of ``testset_excel_dir`` is read, its first six value
    columns are treated as features and the remaining ones as labels, and both
    are passed to ``produce_invariant``. The generated rows are appended below
    the original rows and the result is written to
    ``<original name> Invariant <numel>.xlsx``.

    :param testset_excel_dir: Path of the test-set workbook (``.xlsx``).
    :param numel: Forwarded to ``produce_invariant``; also part of the output
        file name.
    """
    df = pd.read_excel(testset_excel_dir, index_col=0, sheet_name='Sheet')

    features, labels = produce_invariant(features=df.values[:, :6],
                                         labels=df.values[:, 6:],
                                         numel=numel)
    new_df = pd.DataFrame(data=np.concatenate((features, labels), axis=1),
                          columns=df.columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, new_df])

    write_excel = '{} Invariant {}.xlsx'.format(
        testset_excel_dir.partition('.xlsx')[0], numel)
    write_excel = create_excel_file(write_excel)
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)
    wb.save(write_excel)
Exemple #8
0
def get_final_submission_excel(excel_dir, read_excel_dir):
    """Collect the last two rows of every (rel_)rmse_sel sheet into one workbook.

    Sheets whose name contains 'rel_rmse_sel' feed the 'rel_rmse' table, and
    the remaining sheets whose name contains 'rmse_sel' feed the 'rmse' table.
    The last two rows of each matching sheet are stacked; the first column of
    the stacked table becomes its index. Each table is written to a sheet of
    the same name in ``excel_dir``.

    :param excel_dir: Path of the workbook to create (passed through
        ``create_excel_file``).
    :param read_excel_dir: Path of the combined results workbook to read, e.g.
        './results/expt1/a_Final_submission_expt1/combined_poos_results_CPIA.xlsx'.
    """
    period_labels = ['2005:1~2019:12', '2020:1~2020:6']
    data = {'rmse': [], 'rel_rmse': []}

    # Context manager so the source workbook is closed after reading.
    with pd.ExcelFile(read_excel_dir) as xls:
        for sheet in xls.sheet_names:
            # 'rmse_sel' is a substring of 'rel_rmse_sel', so test the longer
            # pattern first.
            if 'rel_rmse_sel' in sheet:
                key = 'rel_rmse'
            elif 'rmse_sel' in sheet:
                key = 'rmse'
            else:
                continue
            temp_df = pd.read_excel(xls, sheet_name=sheet).iloc[-2:, :]
            # Kept as in the original (nested list). This index is replaced
            # by the first column once the tables are stacked below.
            temp_df.index = [period_labels]
            data[key].append(temp_df)

    excel_dir = create_excel_file(excel_dir)
    wb = openpyxl.load_workbook(excel_dir)
    for k, v in data.items():
        temp_df = pd.concat(v, axis=0)
        temp_df.index = temp_df.iloc[:, 0]
        temp_df.drop(labels=temp_df.columns[0], axis=1, inplace=True)
        wb.create_sheet(k)
        ws = wb[k]
        print_df_to_excel(df=temp_df, ws=ws)

    wb.save(excel_dir)
Exemple #9
0
def run_skf_conv1(inputs, plot_spline, smote_numel):
    """Run a 10-fold cross-validation with the 'dtr' model and export the results.

    :param inputs: 6-item sequence unpacked as
        ``(shared, end, pre, filters, epochs, label_type)``. The first five are
        forwarded to ``create_hparams``; ``label_type`` goes to
        ``load_data_to_fl`` and selects the plotting routine.
    :param plot_spline: When truthy, plots are produced after training:
        predicted splines for ``label_type == 'points'``, cutoff plots for
        ``label_type == 'cutoff'``.
    :param smote_numel: When truthy, folds are SMOTE-augmented with this
        ``numel``; otherwise plain folds are used.
    :return: The results directory created for this run.
    """
    loader_excel = './excel/Data_loader_spline_full_onehot_R13_cut_CM3.xlsx'
    shared, end, pre, filters, epochs, label_type = inputs

    hparam_settings = dict(
        shared_layers=[30, 30], ts_layers=[5, 5], cs_layers=[5, 5],
        shared=shared, end=end, pre=pre, filters=filters, epochs=epochs,
        reg_l1=0.05, reg_l2=0.,
        max_depth=5, num_est=200,
        activation='relu', batch_size=16, verbose=0,
    )
    hparams = create_hparams(**hparam_settings)

    write_dir = create_results_directory(
        './results/skf',
        folders=['plots', 'models', 'learning rate plots'],
        excels=['skf_results'],
    )
    fl = load_data_to_fl(
        loader_excel,
        label_type=label_type,
        normalise_labels=True,
        norm_mask=[0, 0, 0, 1, 1, 1],
    )

    fl_store = (
        fl.fold_smote_kf_augment(k_folds=10, shuffle=True, numel=smote_numel)
        if smote_numel
        else fl.create_kf(k_folds=10, shuffle=True)
    )

    run_skf(
        model_mode='dtr', loss_mode='dtr', fl=fl, fl_store=fl_store,
        hparams=hparams,
        skf_file=f'{write_dir}/skf_results.xlsx',
        skf_sheet=None,
        k_folds=10, k_shuffle=True,
        save_model=True, save_model_name=None,
        save_model_dir=f'{write_dir}/models/',
        plot_name=f'{write_dir}/learning rate plots/plot',
    )

    # NOTE(review): the plots read sheet 'ann3' although the model trained
    # above is 'dtr' -- confirm the sheet name run_skf actually writes.
    plot_dir = f'{write_dir}/plots'
    results_excel_dir = f'{write_dir}/skf_results.xlsx'
    if plot_spline and label_type == 'points':
        plot_arcsinh_predicted_splines(
            plot_dir=plot_dir,
            results_excel_dir=results_excel_dir,
            end_excel_dir='./results/combine Round 6/end 6e.xlsx',
            sheets=['ann3'], fn=6, numel=100,
        )
    elif plot_spline and label_type == 'cutoff':
        plot_cutoff(
            plot_dir=plot_dir,
            results_excel_dir=results_excel_dir,
            sheets=['ann3'], fn=6, numel=3,
        )

    # Evaluate the saved models on the loader file itself (training error).
    write_excel = create_excel_file(f'{write_dir}/training_error.xlsx')
    testset_model_results_to_excel(
        write_excel=write_excel,
        model_dir_store=[f'{write_dir}/models'],
        loader_excel=loader_excel,
        testset_excel_dir=loader_excel,
        fn=6, numel=3, chunks=10,
    )

    return write_dir
Exemple #10
0
def create_data_loader_excel(excel_dir, results_dir):
    """Create one ``<var>_data_loader.xlsx`` workbook per forecast variable.

    The target variables are the distinct prefixes (text before the first
    '_h') of the columns of sheet 'y transformed'. For each variable a workbook
    with three sheets is written:

    * 'x'  -- every column of sheet 'transformation' except the variable,
    * 'yo' -- the variable's own column of sheet 'transformation',
    * 'y'  -- the columns of 'y transformed' belonging to the variable.

    :param excel_dir: Path of the source workbook containing the sheets
        'y transformed' and 'transformation'.
    :param results_dir: Directory in which the data-loader workbooks are
        created.
    """
    ymain_df = pd.read_excel(excel_dir, sheet_name='y transformed', index_col=0)
    xmain_df = pd.read_excel(excel_dir, sheet_name='transformation', index_col=0)

    # Variable each y column belongs to, e.g. 'CPI_h3' -> 'CPI'.
    y_column_vars = [str(col).partition('_h')[0] for col in ymain_df.columns]
    # Unique names in order of first appearance (deterministic, unlike set()).
    var_names = list(dict.fromkeys(y_column_vars))

    for var_name in var_names:
        excel_name = create_excel_file('{}/{}_data_loader.xlsx'.format(results_dir, var_name))
        wb = openpyxl.load_workbook(excel_name)
        wb.create_sheet('x')
        wb.create_sheet('yo')
        wb.create_sheet('y')
        print_df_to_excel(df=xmain_df.loc[:, xmain_df.columns != var_name], ws=wb['x'])
        print_df_to_excel(df=xmain_df.loc[:, [var_name]], ws=wb['yo'])
        # Exact prefix match: a substring search would also pick up columns of
        # a different variable whose name merely contains var_name.
        mask = [idx for idx, col_var in enumerate(y_column_vars) if col_var == var_name]
        print_df_to_excel(df=ymain_df.iloc[:, mask], ws=wb['y'])
        wb.save(excel_name)
Exemple #11
0
def eval_combination_on_testset(av_excel, y_dat, combination_dat):
    """Average selected model predictions and compare them with the test labels.

    The predictions of the selected models are averaged, the mean relative
    error against ``y`` is computed, and labels, averaged predictions, model
    names (when available) and the relative error are written to
    ``./results/eval_combi.xlsx``.

    :param av_excel: Path of a workbook whose sheet 'av' holds the model names
        in its first column and a selection flag in its last column (models
        flagged ``1`` are used). When falsy, all models are averaged and no
        names are written.
    :param y_dat: Path of a pickle file with the test labels; three label
        columns are expected.
    :param combination_dat: Path of a pickle file with a sequence of items
        whose element ``[1]`` is that model's prediction array.
    """
    # NOTE: pickle.load executes arbitrary code -- only open trusted files.
    with open(y_dat, "rb") as f:
        y = pickle.load(f)
    with open(combination_dat, "rb") as f:
        p_y_store = pickle.load(f)
        p_y_store = np.array([x[1] for x in p_y_store])

    if av_excel:
        av = pd.read_excel(av_excel, sheet_name='av', index_col=None)
        selected_mask = [
            idx for idx, value in enumerate(av.iloc[:, -1].values)
            if value == 1
        ]
        selected_names = av.iloc[:, 0].values[selected_mask]
    else:
        # selected_mask holds integer positions, so "all models" is
        # 0..n-1; a list of ones would pick model 1 over and over.
        selected_mask = list(range(len(p_y_store)))
        selected_names = None

    p_y_selected_mean = np.mean(p_y_store[selected_mask, :, :], axis=0)
    re = np.mean(np.abs(y - p_y_selected_mean) / y)

    data = np.concatenate((y, p_y_selected_mean), axis=1)
    df = pd.DataFrame(
        data=data,
        columns=['cut=10', 'cut=100', 'End', 'P_cut=10', 'P_cut=100', 'P_End'])

    wb = openpyxl.Workbook()
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)

    wb.create_sheet('Models')
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Names'
    # Names exist only when a selection workbook was given; checked
    # explicitly instead of swallowing every exception.
    if selected_names is not None:
        print_array_to_excel(array=selected_names,
                             first_cell=(2, 1),
                             ws=ws,
                             axis=0)
    ws.cell(1, 2).value = 'RE'
    ws.cell(1, 3).value = re
    excel_dir = create_excel_file('./results/eval_combi.xlsx')
    wb.save(excel_dir)
Exemple #12
0
def inverse_design(targets, loss_func, bounds, int_idx, init_guess,
                   model_directory_store, svm_directory, loader_file,
                   write_dir, opt_mode):
    """Search for input features whose predicted labels match ``targets``.

    Every candidate proposed by the optimiser is scored as follows:

    1. If the first two features sum to more than 1 they are reflected to
       ``(1 - y, 1 - x)``.
    2. The SVM ensemble is queried on the first two features; a negative
       distance yields the penalty score ``-10e5 * distance``.
    3. Otherwise the last feature (0, 1 or 2) is one-hot encoded, the features
       are scaled, the model ensemble predicts the labels and the score is
       ``loss_func(targets, prediction_mean)``.

    All evaluated candidates are written, sorted by score, to
    ``<write_dir>/inverse_design_<opt_mode>_<targets>.xlsx``.

    :param targets: Desired label values; passed to ``loss_func`` and written
        to the first row of the output sheet.
    :param loss_func: Callable ``loss_func(targets, prediction_mean)``
        returning the value to minimise.
    :param bounds: Sequence of ``(low, high)`` pairs, one per search dimension.
        The 'forest' and 'dummy' modes use the first three pairs.
    :param int_idx: Currently not used; the PSO-GA call hard-codes ``[3]``.
    :param init_guess: Initial guess forwarded to ``pso_ga``.
    :param model_directory_store: Iterable of directories, each loaded with
        ``load_model_ensemble``.
    :param svm_directory: Directory loaded with ``load_svm_ensemble``.
    :param loader_file: Data-loader Excel file providing the feature scaling.
    :param write_dir: Directory in which the result workbook is created.
    :param opt_mode: 'psoga', 'forest' or 'dummy'.
    :raises ValueError: If ``opt_mode`` is not one of the supported modes, or
        a candidate's last feature is not 0, 1 or 2.
    """
    supported_modes = ('psoga', 'forest', 'dummy')
    if opt_mode not in supported_modes:
        raise ValueError('opt_mode must be one of {}, got {!r}'.format(
            supported_modes, opt_mode))

    model_store = []
    for model_directory in model_directory_store:
        model_store.extend(load_model_ensemble(model_directory))
    svm_store = load_svm_ensemble(svm_directory)
    fl = load_data_to_fl(loader_file,
                         norm_mask=[0, 1, 3, 4, 5],
                         normalise_labels=False,
                         label_type='cutoff')

    onehot_vectors = {
        0: np.array([1, 0, 0]),
        1: np.array([0, 1, 0]),
        2: np.array([0, 0, 1]),
    }
    data_store = []

    def evaluate_candidate(features):
        """Score one candidate, record it in data_store and return the score."""
        x = features[0]
        y = features[1]
        if x + y > 1:
            # Reflect the first two features so that their sum is below 1.
            features[0:2] = np.array([-y + 1, -x + 1])

        # SVM check
        _p_class, distance = svm_ensemble_prediction(svm_store, features[0:2])

        if distance.item() < 0:
            # Negative distance: mse becomes a large negative number and the
            # returned score a large positive penalty that grows with the
            # distance from the SVM boundary.
            mse = 10e5 * distance.item()
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        elif features[0] + features[1] > 1:
            # Sum of the first two features needs to be at most 1.
            mse = 10e5 * (1 - (features[0] + features[1]))
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        else:
            onehot = features[-1].item()
            onehot_vector = onehot_vectors.get(onehot)
            if onehot_vector is None:
                raise ValueError(
                    'Last feature must be 0, 1 or 2, got {!r}'.format(onehot))
            features_in = np.concatenate((features[:-1], onehot_vector))
            features_input_norm = fl.apply_scaling(features_in)
            prediction_mean, prediction_std = model_ensemble_prediction(
                model_store, features_input_norm)
            mse = -loss_func(targets, prediction_mean)
            disagreement = np.mean(prediction_std)
            prediction_mean = prediction_mean.tolist()
            prediction_std = prediction_std.tolist()

        data = list(features) + [-mse, disagreement
                                 ] + prediction_mean + prediction_std
        data_store.append(data)
        return -mse

    if opt_mode == 'psoga':

        def fitness(params):
            # pso_ga receives the score as a 1-tuple.
            return (evaluate_candidate(np.array(params)), )

        pmin = [x[0] for x in bounds]
        pmax = [x[1] for x in bounds]

        smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
        smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]

        pso_params = {
            'c1': 1.5,
            'c2': 1.5,
            'wmin': 0.4,
            'wmax': 0.9,
            'ga_iter_min': 2,
            'ga_iter_max': 10,
            'iter_gamma': 10,
            'ga_num_min': 5,
            'ga_num_max': 20,
            'num_beta': 15,
            'tourn_size': 3,
            'cxpd': 0.9,
            'mutpd': 0.05,
            'indpd': 0.5,
            'eta': 0.5,
            'pso_iter': 10,
            'swarm_size': 300
        }

        # NOTE(review): the int_idx argument of inverse_design is ignored
        # here in favour of the hard-coded [3] -- confirm which is intended.
        pso_ga(func=fitness,
               pmin=pmin,
               pmax=pmax,
               smin=smin,
               smax=smax,
               int_idx=[3],
               params=pso_params,
               ga=True,
               initial_guess=init_guess)

    else:  # 'forest' or 'dummy'
        space = [
            Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
            Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
            Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
            Categorical(categories=[0, 1, 2], name='Dimension')
        ]

        iter_count = 0
        start = time.time()

        @use_named_args(space)
        def fitness(**params):
            nonlocal iter_count, start
            iter_count += 1
            score = evaluate_candidate(np.array(list(params.values())))
            if iter_count % 10 == 0:
                end = time.time()
                print(
                    'Current Iteration {}. Time taken for past 10 evals: {}. '.
                    format(iter_count, end - start))
                start = time.time()
            return score  # Non-negative for valid candidates; minimised towards 0.

        if opt_mode == 'forest':
            forest_minimize(
                func=fitness,
                dimensions=space,
                acq_func='EI',  # Expected Improvement.
                n_calls=1000,
                verbose=False)
        else:
            dummy_minimize(func=fitness,
                           dimensions=space,
                           n_calls=5000,
                           verbose=False)

    # One Pmean_/Pstd_ column per label, matching the fl.labels_dim entries
    # stored for every candidate.
    label_numbers = range(1, fl.labels_dim + 1)
    p_mean_name = ['Pmean_{}'.format(i) for i in label_numbers]
    p_std_name = ['Pstd_{}'.format(i) for i in label_numbers]

    columns = (list(fl.features_c_names[:-2]) + ['mse', 'Disagreement'] +
               p_mean_name + p_std_name)

    iter_df = pd.DataFrame(data=data_store, columns=columns)

    iter_df = iter_df.sort_values(by=['mse'], ascending=True)

    excel_dir = create_excel_file('{}/inverse_design_{}_{}.xlsx'.format(
        write_dir, opt_mode, targets))
    wb = openpyxl.load_workbook(excel_dir)
    # Taking the last sheet ensures a newly added worksheet is the one used.
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Target'
    print_array_to_excel(array=targets, first_cell=(1, 2), axis=1, ws=ws)
    print_df_to_excel(df=iter_df, ws=ws, start_row=3)

    wb.save(excel_dir)
    wb.close()
Exemple #13
0
    def combination(self):
        """

        :param type: Either 'AIC_t' or 'BIC_t' for AWA and BWA respectively
        :return:
        """
        aic_bic_store = [self.AR_AIC_BIC, self.PCA_AIC_BIC, self.UMAP_AIC_BIC]
        pls_store = [self.AR_PLS, self.PCA_PLS, self.UMAP_PLS]
        testset_y_store = [
            self.testset_AR_y, self.testset_PCA_y, self.testset_UMAP_y
        ]
        testset_y_hat_store = [
            self.testset_AR_y_hat, self.testset_PCA_y_hat,
            self.testset_UMAP_y_hat
        ]
        self.testset_AR_AWA_y_hat = []
        self.testset_AR_BWA_y_hat = []
        self.testset_AR_AVG_y_hat = []
        self.testset_AR_GR_y_hat = []
        self.testset_PCA_AWA_y_hat = []
        self.testset_PCA_BWA_y_hat = []
        self.testset_PCA_AVG_y_hat = []
        self.testset_PCA_GR_y_hat = []
        self.testset_UMAP_AWA_y_hat = []
        self.testset_UMAP_BWA_y_hat = []
        self.testset_UMAP_AVG_y_hat = []
        self.testset_UMAP_GR_y_hat = []
        self.testset_PU_AVG_y_hat = []
        self.testset_PU_GR_y_hat = []

        for skip_idx, (aic_bic_all_h, pls_all_h, testset_y, testset_y_hat, awa_y_hat, bwa_y_hat, avg_y_hat, gr_y_hat) \
                in enumerate(zip(aic_bic_store, pls_store, testset_y_store, testset_y_hat_store,
                                 [self.testset_AR_AWA_y_hat, self.testset_PCA_AWA_y_hat, self.testset_UMAP_AWA_y_hat],
                                 [self.testset_AR_BWA_y_hat, self.testset_PCA_BWA_y_hat, self.testset_UMAP_BWA_y_hat],
                                 [self.testset_AR_AVG_y_hat, self.testset_PCA_AVG_y_hat, self.testset_UMAP_AVG_y_hat],
                                 [self.testset_AR_GR_y_hat, self.testset_PCA_GR_y_hat, self.testset_UMAP_GR_y_hat])):
            i = 0
            for idx, (ic, pls, y, y_hat, rm) in enumerate(
                    zip(aic_bic_all_h, pls_all_h, testset_y, testset_y_hat,
                        self.rm_store)):
                # Simple average AVG
                t_idx = 3 + 8 * skip_idx
                y_combi_hat = np.mean(y_hat, axis=0)
                avg_y_hat.append(y_combi_hat)
                rmse_combi = math.sqrt(np.mean(np.array(y - y_combi_hat)**2))
                rm[t_idx] = round(rmse_combi / self.benchmark_rmse[idx], 4)
                if np.all(self.benchmarky[i] != y_combi_hat):
                    dm_r = dm_test(y,
                                   self.benchmarky[i],
                                   y_combi_hat,
                                   h=self.hsteps[i],
                                   crit="MSE")
                    pvalue = dm_r[1]
                    if pvalue <= 0.05 and self.star:
                        rm[t_idx] = '{}*'.format(round(rm[t_idx], 4))

                # AWA
                type = 'AIC_t'
                t_idx = 4 + 8 * skip_idx
                ic_values = ic[type].values
                min_ic = np.min(ic_values)
                ic_values += -min_ic
                weights = np.exp(-ic_values / 2)
                weights = weights / np.sum(weights)
                y_combi_hat = np.sum(y_hat * weights[:, None], axis=0)
                awa_y_hat.append(y_combi_hat)
                rmse_combi = math.sqrt(np.mean(np.array(y - y_combi_hat)**2))
                rm[t_idx] = round(rmse_combi / self.benchmark_rmse[idx], 4)
                if np.all(self.benchmarky[i] != y_combi_hat):
                    dm_r = dm_test(y,
                                   self.benchmarky[i],
                                   y_combi_hat,
                                   h=self.hsteps[i],
                                   crit="MSE")
                    pvalue = dm_r[1]
                    if pvalue <= 0.05 and self.star:
                        rm[t_idx] = '{}*'.format(round(rm[t_idx], 4))

                # BWA
                type = 'BIC_t'
                t_idx = 5 + 8 * skip_idx
                ic_values = ic[type].values
                min_ic = np.min(ic_values)
                ic_values += -min_ic
                weights = np.exp(-ic_values / 2)
                weights = weights / np.sum(weights)
                y_combi_hat = np.sum(y_hat * weights[:, None], axis=0)
                bwa_y_hat.append(y_combi_hat)
                rmse_combi = math.sqrt(np.mean(np.array(y - y_combi_hat)**2))
                rm[t_idx] = round(rmse_combi / self.benchmark_rmse[idx], 4)
                if np.all(self.benchmarky[i] != y_combi_hat):
                    dm_r = dm_test(y,
                                   self.benchmarky[i],
                                   y_combi_hat,
                                   h=self.hsteps[i],
                                   crit="MSE")
                    pvalue = dm_r[1]
                    if pvalue <= 0.05 and self.star:
                        rm[t_idx] = '{}*'.format(round(rm[t_idx], 4))

                # GR
                t_idx = 6 + 8 * skip_idx
                y_pls = np.array(pls.columns.tolist()[5:])
                y_hat_pls = pls.iloc[:, 5:].values
                #m = np.shape(y_hat_pls)[0] + 1  # number of models + 1 constant term
                m = np.shape(y_hat_pls)[
                    0]  # number of models + 1 constant term
                n = np.shape(y_hat_pls)[1]  # number of timesteps
                beta = cp.Variable(shape=(m, 1))

                # pc_1 = np.ones((1, m - 1)) @ beta[1:, 0] == 1
                pc_1 = np.ones((1, m)) @ beta == 1
                pc_2 = beta >= 0
                constraints = [pc_1, pc_2]

                # X = np.concatenate((np.ones((n, 1)), y_hat_pls.T), axis=1)
                X = y_hat_pls.T
                z = np.ones((1, n)) @ (y_pls[:, None] - X @ beta)**2
                objective = cp.Minimize(z)
                prob = cp.Problem(objective, constraints)

                prob.solve(solver='GUROBI')
                beta_hat = beta.value
                '''
                print('Skip_idx: {} idx: {} sum beta: {:.3E} min beta: {:.3E} max beta: {:.3E}'.format(skip_idx, idx,
                                                                                                       np.sum(beta_hat),
                                                                                                       np.min(beta_hat),
                                                                                                       np.max(beta_hat)))
                
                print('Skip_idx: {} idx: {} sum beta: {:.3E} min beta: {:.3E} max beta: {:.3E}'.format(skip_idx, idx,
                                                                                                       np.sum(beta_hat[
                                                                                                              1:]),
                                                                                                       np.min(beta_hat[
                                                                                                              1:]),
                                                                                                       np.max(beta_hat[
                                                                                                              1:])))
                '''
                y_combi_hat = np.sum(y_hat * beta_hat[:, 0][:, None], axis=0)
                gr_y_hat.append(y_combi_hat)
                rmse_combi = math.sqrt(np.mean(np.array(y - y_combi_hat)**2))
                rm[t_idx] = round(rmse_combi / self.benchmark_rmse[idx], 4)
                if np.all(self.benchmarky[i] != y_combi_hat):
                    dm_r = dm_test(y,
                                   self.benchmarky[i],
                                   y_combi_hat,
                                   h=self.hsteps[i],
                                   crit="MSE")
                    pvalue = dm_r[1]
                    if pvalue <= 0.05 and self.star:
                        rm[t_idx] = '{}*'.format(round(rm[t_idx], 4))

                # GR with intercept
                t_idx = 7 + 8 * skip_idx
                y_pls = np.array(pls.columns.tolist()[5:])
                y_hat_pls = pls.iloc[:, 5:].values
                m = np.shape(
                    y_hat_pls)[0] + 1  # number of models + 1 constant term
                #m = np.shape(y_hat_pls)[0]  # number of models + 1 constant term
                n = np.shape(y_hat_pls)[1]  # number of timesteps
                beta = cp.Variable(shape=(m, 1))

                pc_1 = np.ones((1, m - 1)) @ beta[1:, 0] == 1
                # pc_1 = np.ones((1, m)) @ beta == 1
                pc_2 = beta >= 0
                constraints = [pc_1, pc_2]

                X = np.concatenate((np.ones((n, 1)), y_hat_pls.T), axis=1)
                # X = y_hat_pls.T
                z = np.ones((1, n)) @ (y_pls[:, None] - X @ beta)**2
                objective = cp.Minimize(z)
                prob = cp.Problem(objective, constraints)

                prob.solve(solver='GUROBI')
                beta_hat = beta.value
                '''
                print('Skip_idx: {} idx: {} sum beta: {:.3E} min beta: {:.3E} max beta: {:.3E}'.format(skip_idx, idx,
                                                                                                       np.sum(beta_hat),
                                                                                                       np.min(beta_hat),
                                                                                                       np.max(beta_hat)))

                print('Skip_idx: {} idx: {} sum beta: {:.3E} min beta: {:.3E} max beta: {:.3E}'.format(skip_idx, idx,
                                                                                                       np.sum(beta_hat[
                                                                                                              1:]),
                                                                                                       np.min(beta_hat[
                                                                                                              1:]),
                                                                                                       np.max(beta_hat[
                                                                                                              1:])))
                '''
                y_combi_hat = np.sum(y_hat * beta_hat[1:, 0][:, None] +
                                     beta_hat[0, 0],
                                     axis=0)
                gr_y_hat.append(y_combi_hat)
                rmse_combi = math.sqrt(np.mean(np.array(y - y_combi_hat)**2))
                rm[t_idx] = round(rmse_combi / self.benchmark_rmse[idx], 4)
                if np.all(self.benchmarky[i] != y_combi_hat):
                    dm_r = dm_test(y,
                                   self.benchmarky[i],
                                   y_combi_hat,
                                   h=self.hsteps[i],
                                   crit="MSE")
                    pvalue = dm_r[1]
                    if pvalue <= 0.05 and self.star:
                        rm[t_idx] = '{}*'.format(round(rm[t_idx], 4))

                i = i + 1
        i = 0
        # PCA+UMAP
        for idx, (pca_pls, umap_pls, y, pca_y_hat, umap_y_hat,
                  rm) in enumerate(
                      zip(self.PCA_PLS, self.UMAP_PLS, self.testset_PCA_y,
                          self.testset_PCA_y_hat, self.testset_UMAP_y_hat,
                          self.rm_store)):
            # AVG
            y_pls = np.array(pca_pls.columns.tolist()[5:])
            pca_y_hat_pls = pca_pls.iloc[:, 5:].values
            umap_y_hat_pls = umap_pls.iloc[:, 5:].values

            y_combi_hat = np.mean(np.concatenate((pca_y_hat, umap_y_hat),
                                                 axis=0),
                                  axis=0)
            self.testset_PU_AVG_y_hat.append(y_combi_hat)
            rmse_combi = math.sqrt(np.mean(np.array(y - y_combi_hat)**2))
            rm[24] = round(rmse_combi / self.benchmark_rmse[idx], 4)
            if np.all(self.benchmarky[i] != y_combi_hat):
                dm_r = dm_test(y,
                               self.benchmarky[i],
                               y_combi_hat,
                               h=self.hsteps[i],
                               crit="MSE")
                pvalue = dm_r[1]
                if pvalue <= 0.05 and self.star:
                    rm[24] = '{}*'.format(round(rm[24], 4))

            # GR
            y_hat_pls = np.concatenate((pca_y_hat_pls, umap_y_hat_pls), axis=0)
            #m = np.shape(y_hat_pls)[0] + 1  # number of models + 1 constant term
            m = np.shape(y_hat_pls)[0]
            n = np.shape(y_hat_pls)[1]  # number of timesteps
            beta = cp.Variable(shape=(m, 1))

            #pc_1 = np.ones((1, m - 1)) @ beta[1:, 0] == 1
            pc_1 = np.ones((1, m)) @ beta == 1
            pc_2 = beta >= 0
            constraints = [pc_1, pc_2]

            #X = np.concatenate((np.ones((n, 1)), y_hat_pls.T), axis=1)
            X = y_hat_pls.T

            z = np.ones((1, n)) @ (y_pls[:, None] - X @ beta)**2
            objective = cp.Minimize(z)
            prob = cp.Problem(objective, constraints)

            prob.solve(solver='GUROBI')
            beta_hat = beta.value
            '''
            print('idx: {} sum beta: {:.3E} min beta: {:.3E} max beta: {:.3E}'.format( idx,
                                                                                                   np.sum(beta_hat),
                                                                                                   np.min(beta_hat),
                                                                                                   np.max(beta_hat)))
            
            print('idx: {} sum beta: {:.3E} min beta: {:.3E} max beta: {:.3E}'.format(idx,
                                                                                      np.sum(beta_hat[1:]),
                                                                                      np.min(beta_hat[1:]),
                                                                                      np.max(
                                                                                          beta_hat[1:])))
            '''
            y_hat = np.concatenate((pca_y_hat, umap_y_hat), axis=0)
            y_combi_hat = np.sum(y_hat * beta_hat[:, 0][:, None], axis=0)
            self.testset_PU_GR_y_hat.append(y_combi_hat)
            rmse_combi = math.sqrt(np.mean(np.array(y - y_combi_hat)**2))
            rm[25] = round(rmse_combi / self.benchmark_rmse[idx], 4)
            if np.all(self.benchmarky[i] != y_combi_hat):
                dm_r = dm_test(y,
                               self.benchmarky[i],
                               y_combi_hat,
                               h=self.hsteps[i],
                               crit="MSE")
                pvalue = dm_r[1]
                if pvalue <= 0.05 and self.star:
                    rm[25] = '{}*'.format(round(rm[25], 4))

            # GR with intercept
            y_hat_pls = np.concatenate((pca_y_hat_pls, umap_y_hat_pls), axis=0)
            m = np.shape(
                y_hat_pls)[0] + 1  # number of models + 1 constant term
            # m = np.shape(y_hat_pls)[0]
            n = np.shape(y_hat_pls)[1]  # number of timesteps
            beta = cp.Variable(shape=(m, 1))

            pc_1 = np.ones((1, m - 1)) @ beta[1:, 0] == 1
            #pc_1 = np.ones((1, m)) @ beta == 1
            pc_2 = beta >= 0
            constraints = [pc_1, pc_2]

            X = np.concatenate((np.ones((n, 1)), y_hat_pls.T), axis=1)
            #X = y_hat_pls.T

            z = np.ones((1, n)) @ (y_pls[:, None] - X @ beta)**2
            objective = cp.Minimize(z)
            prob = cp.Problem(objective, constraints)

            prob.solve(solver='GUROBI')
            beta_hat = beta.value
            '''
            print('idx: {} sum beta: {:.3E} min beta: {:.3E} max beta: {:.3E}'.format(idx,
                                                                                      np.sum(beta_hat),
                                                                                      np.min(beta_hat),
                                                                                      np.max(beta_hat)))
            
            print('idx: {} sum beta: {:.3E} min beta: {:.3E} max beta: {:.3E}'.format(idx,
                                                                                      np.sum(beta_hat[1:]),
                                                                                      np.min(beta_hat[1:]),
                                                                                      np.max(
                                                                                          beta_hat[1:])))
            '''
            y_hat = np.concatenate((pca_y_hat, umap_y_hat), axis=0)
            y_combi_hat = np.sum(y_hat * beta_hat[1:, 0][:, None] +
                                 beta_hat[0, 0],
                                 axis=0)
            self.testset_PU_GR_y_hat.append(y_combi_hat)
            rmse_combi = math.sqrt(np.mean(np.array(y - y_combi_hat)**2))
            rm[26] = round(rmse_combi / self.benchmark_rmse[idx], 4)
            if np.all(self.benchmarky[i] != y_combi_hat):
                dm_r = dm_test(y,
                               self.benchmarky[i],
                               y_combi_hat,
                               h=self.hsteps[i],
                               crit="MSE")
                pvalue = dm_r[1]
                if pvalue <= 0.05 and self.star:
                    rm[26] = '{}*'.format(round(rm[26], 4))

            i = i + 1

        i = 0

        # decomp_combi
        def run_decomp_combi(subgroup_size, numel, rm_idx):
            """Score a random-subgroup, inverse-variance weighted combination.

            For every horizon the AR, PCA and UMAP test-set predictions are
            stacked into one array. ``numel`` random subgroups of
            ``subgroup_size`` rows are drawn once and reused for all horizons.
            Each subgroup contributes its mean prediction, weighted by the
            reciprocal of its variance, and the resulting combined forecast's
            RMSE relative to ``self.benchmark_rmse`` is stored (rounded to
            4 decimals) in slot ``rm_idx`` of the matching ``self.rm_store``
            entry.

            :param subgroup_size: number of rows sampled per subgroup
            :param numel: number of subgroups to draw
            :param rm_idx: index of the result slot that is overwritten
            """
            # One stacked array per horizon: AR rows, then PCA rows, then UMAP rows.
            # assumes each stacked array is (models, timesteps) -- TODO confirm
            stacked_per_h = []
            for ar_hat, pca_hat, umap_hat in zip(self.testset_AR_y_hat,
                                                 self.testset_PCA_y_hat,
                                                 self.testset_UMAP_y_hat):
                stacked_per_h.append(
                    np.array(ar_hat.tolist() + pca_hat.tolist() +
                             umap_hat.tolist()))
            rows_per_h = [stacked.shape[0] for stacked in stacked_per_h]

            # Row indices are sampled against the first horizon's row count
            # only; the same selections are applied to every horizon.
            selections = []
            for _ in range(numel):
                candidate_rows = list(range(rows_per_h[0]))
                selections.append(
                    random.sample(candidate_rows, k=subgroup_size))

            horizons = zip(stacked_per_h, self.testset_AR_y, self.hsteps,
                           self.rm_store)
            for h_pos, (stacked, y_true, _h_label, rm) in enumerate(horizons):
                # Gather the chosen rows of this horizon for every subgroup.
                picked = np.array([stacked[rows, :] for rows in selections])
                group_mean = np.mean(picked, axis=1)
                group_inv_var = np.reciprocal(np.var(picked, axis=1))
                weight_total = np.sum(group_inv_var, axis=0)
                # Normalised inverse-variance weighted sum over the subgroups.
                combined = np.sum(
                    1 / weight_total * group_mean * group_inv_var, axis=0)
                rmse = np.sqrt(np.average(np.square(combined - y_true)))
                rm[rm_idx] = round(rmse / self.benchmark_rmse[h_pos], 4)

        subgroup_size = 20
        numel = 50
        rm_idx = 27
        run_decomp_combi(subgroup_size=subgroup_size,
                         numel=numel,
                         rm_idx=rm_idx)

        subgroup_size = 20
        numel = 500
        rm_idx = 28
        run_decomp_combi(subgroup_size=subgroup_size,
                         numel=numel,
                         rm_idx=rm_idx)

        subgroup_size = 20
        numel = 5000
        rm_idx = 29
        run_decomp_combi(subgroup_size=subgroup_size,
                         numel=numel,
                         rm_idx=rm_idx)

        subgroup_size = 10
        numel = 50
        rm_idx = 30
        run_decomp_combi(subgroup_size=subgroup_size,
                         numel=numel,
                         rm_idx=rm_idx)

        subgroup_size = 10
        numel = 500
        rm_idx = 31
        run_decomp_combi(subgroup_size=subgroup_size,
                         numel=numel,
                         rm_idx=rm_idx)

        subgroup_size = 10
        numel = 5000
        rm_idx = 32
        run_decomp_combi(subgroup_size=subgroup_size,
                         numel=numel,
                         rm_idx=rm_idx)

        # Printing to excel
        excel_dir = create_excel_file('{}/pm_rm_results.xlsx'.format(
            self.results_dir))
        wb = openpyxl.load_workbook(excel_dir)
        for idx in range(len(self.pm_store)):
            wb.create_sheet('h = {}'.format([1, 3, 6, 12, 24][idx]))
        sheet_names = wb.sheetnames

        for sheet, pm, rm in zip(sheet_names[1:], self.pm_store,
                                 self.rm_store):
            ws = wb[sheet]

            pm_df = pd.DataFrame(data=pm, columns=['m', 'p'])
            rows = dataframe_to_rows(pm_df, index=False)
            for r_idx, row in enumerate(rows, 1):
                for c_idx, value in enumerate(row, 1):
                    ws.cell(row=r_idx + 1, column=c_idx, value=value)

            skip = len(pm_df.index) + 1
            rm_df = pd.DataFrame(rm, columns=['Relative RMSE'])
            rows = dataframe_to_rows(rm_df, index=False)
            for r_idx, row in enumerate(rows, 1):
                for c_idx, value in enumerate(row, 1):
                    ws.cell(row=r_idx + 1 + skip, column=c_idx, value=value)

        wb.save(excel_dir)

        pass
def pso_ga(func, pmin, pmax, smin, smax, int_idx, params, ga, type):
    """Minimise ``func`` with a particle swarm, optionally refined by a GA.

    Each outer iteration evaluates the swarm, refreshes the personal and
    global bests and moves every particle using a linearly annealed inertia
    weight. When ``ga`` is truthy, a random subset of the swarm (joined by the
    swarm hall of fame once it holds members) is additionally evolved with
    tournament selection, two-point crossover and Gaussian mutation; GA
    individuals that beat the swarm members they were drawn against replace
    them. The best position and per-iteration statistics are written to
    ``./results/pso_ga_{type}_results.xlsx``.

    :param func: objective called with a particle; its return value is
        assigned to ``fitness.values`` and the first entry is minimised
    :param pmin: lower position bound per dimension
    :param pmax: upper position bound per dimension
    :param smin: lower speed bound per dimension
    :param smax: upper speed bound per dimension; also used as the sigma of
        the Gaussian mutation
    :param int_idx: index, or list of indices, forwarded to the particle
        generator as ``int_idx``; a single int is wrapped in a list
    :param params: dict with keys ``c1``, ``c2``, ``wmin``, ``wmax``,
        ``ga_iter_min``, ``ga_iter_max``, ``iter_gamma``, ``ga_num_min``,
        ``ga_num_max``, ``num_beta``, ``tourn_size``, ``cxpd``, ``mutpd``,
        ``indpd``, ``pso_iter`` and ``swarm_size`` (``eta`` is not used by
        the Gaussian mutation and is ignored if present)
    :param ga: run the GA refinement stage when truthy
    :param type: label inserted into the name of the results workbook
    :return: tuple ``(pop, logbook, best)``
    """
    c1 = params['c1']
    c2 = params['c2']
    wmin = params['wmin']
    wmax = params['wmax']
    ga_iter_min = params['ga_iter_min']
    ga_iter_max = params['ga_iter_max']
    iter_gamma = params['iter_gamma']
    ga_num_min = params['ga_num_min']
    ga_num_max = params['ga_num_max']
    num_beta = params['num_beta']
    tourn_size = params['tourn_size']
    cxpb = params['cxpd']
    mutpb = params['mutpd']
    indpd = params['indpd']
    pso_iter = params['pso_iter']
    swarm_size = params['swarm_size']

    # int_idx must be a list. If a single number is given, convert to list.
    if isinstance(int_idx, int):
        int_idx = [int_idx]

    # Minimization of a single scalar value
    creator.create("FitnessMin", base.Fitness, weights=(-1.0, ))
    creator.create("Particle",
                   list,
                   fitness=creator.FitnessMin,
                   speed=list,
                   smin=None,
                   smax=None,
                   best=None,
                   int_idx=None)

    toolbox = base.Toolbox()
    toolbox.register("particle",
                     generate_part,
                     dim=len(pmin),
                     pmin=pmin,
                     pmax=pmax,
                     smin=smin,
                     smax=smax,
                     int_idx=int_idx)
    toolbox.register("population", tools.initRepeat, list, toolbox.particle)
    toolbox.register("update", updateParticle, c1=c1, c2=c2)
    toolbox.register("evaluate", func)

    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate",
                     ga_hybrid_gaussianmutate,
                     low=pmin,
                     up=pmax,
                     indpb=indpd,
                     sigma=smax)

    pop = toolbox.population(n=swarm_size)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    logbook = tools.Logbook()
    logbook.header = ["gen", "evals"] + stats.fields

    def raw_score(ind):
        # Compare raw objective values: DEAP's own fitness comparison uses the
        # weighted value, which is sign-flipped for a minimisation fitness.
        return ind.fitness.values[0]

    best = None
    pso_hof_num = max(1, round(ga_num_min * 0.2))
    pso_hof = tools.HallOfFame(pso_hof_num)

    for g in range(pso_iter):
        # Number of GA evaluations in this iteration; stays 0 when ga is off.
        ga_eval = 0

        # PSO segment first
        for part in pop:
            part.fitness.values = toolbox.evaluate(part)
            if not part.best or raw_score(part.best) > raw_score(part):
                part.best = creator.Particle(part)
                part.best.fitness.values = part.fitness.values
            if not best or raw_score(best) > raw_score(part):
                best = creator.Particle(part)
                best.fitness.values = part.fitness.values

        # Linear annealing for inertia velocity coefficient (the w weights)
        inertia = wmax - (wmax - wmin) * g / pso_iter
        for part in pop:
            toolbox.update(part, best=best, w=inertia)

        if ga:
            # GA segment. Both sizes start at their *_min value and grow
            # towards *_max as g approaches pso_iter.
            # NOTE(review): iter_gamma shapes the GA population size and
            # num_beta the GA generation count, which reads swapped relative
            # to their names -- confirm before changing.
            progress = g / pso_iter
            ga_pop = round(ga_num_min +
                           progress**iter_gamma * (ga_num_max - ga_num_min))
            ga_gen = round(ga_iter_min +
                           progress**num_beta * (ga_iter_max - ga_iter_min))
            if len(pso_hof) == 0:
                elites = []
            else:
                # Leave room for the swarm hall of fame in the GA population.
                ga_pop -= pso_hof_num
                elites = pso_hof.items
            ga_mask = [1] * ga_pop + [0] * (swarm_size - ga_pop)
            random.shuffle(ga_mask)
            population = [x for x, mask in zip(pop, ga_mask) if mask == 1
                          ] + elites

            halloffame = tools.HallOfFame(ga_pop)
            halloffame.update(population)

            # Begin the generational process
            for _ in range(ga_gen):
                # Own tournament selection on the raw objective value (the
                # built-in selector compares weighted fitness objects).
                offspring = [
                    min(selRandom(population, tourn_size), key=raw_score)
                    for _ in range(ga_pop)
                ]

                # Vary the pool of individuals
                offspring = varAnd(offspring, toolbox, cxpb, mutpb)

                # Evaluate the individuals with an invalid fitness
                invalid_ind = [
                    ind for ind in offspring if not ind.fitness.valid
                ]
                ga_eval += len(invalid_ind)
                fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
                for ind, fit in zip(invalid_ind, fitnesses):
                    ind.fitness.values = fit

                # Update the hall of fame with the generated individuals
                halloffame.update(offspring)

                # Replace the current population by the offspring
                population[:] = offspring

            if raw_score(best) > raw_score(halloffame[0]):
                best = creator.Particle(halloffame[0])
                best.fitness.values = halloffame[0].fitness.values

            # Write GA results back: the k-th selected swarm member is
            # compared against the k-th best GA individual.
            counter = 0
            for idx, mask in enumerate(ga_mask):
                if mask == 1:
                    try:
                        candidate = halloffame[counter]
                        if raw_score(pop[idx]) > raw_score(candidate):
                            pop[idx] = candidate
                            # The personal best of the replacement is the
                            # replacement itself, not a stale swarm particle.
                            pop[idx].best = creator.Particle(candidate)
                            pop[idx].best.fitness.values = \
                                candidate.fitness.values
                        counter += 1
                    except IndexError:
                        # Hall of fame holds fewer members than the mask.
                        break

        pso_hof.update(pop)

        # Gather all the fitnesses in one list and print the stats
        logbook.record(gen=g,
                       evals=len(pop) + ga_eval,
                       **stats.compile(pop))
        print(logbook.stream)

    print(best.fitness.values)
    print(best)

    # Printing to excel
    write_excel = create_excel_file(
        './results/pso_ga_{}_results.xlsx'.format(type))
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]

    ws.cell(1, 1).value = 'Optimal Decision Values'
    print_array_to_excel([
        'inlettemp', 'catalystweight', 'residencetime', 'reactorP',
        'methanolCOratio'
    ], (2, 1),
                         ws=ws,
                         axis=1)
    print_array_to_excel(best, (3, 1), ws=ws, axis=1)

    # One row per logbook field: label in column 1, values from column 2 on.
    for row, field in enumerate(('gen', 'avg', 'std', 'min', 'max'), start=5):
        ws.cell(row, 1).value = field
        print_array_to_excel(logbook.select(field), (row, 2), ws=ws, axis=1)

    wb.save(write_excel)

    return pop, logbook, best
def inverse_design(targets, loss_func, bounds, init_guess, model_directory_store, svm_directory, loader_file, write_dir,
                   opt_mode, opt_params):
    '''
    Run inverse design experiment. Given a set of trained models and target labels, this optimizer determines a list of
    suitable candidate experimental conditions to achieve those target labels. Every evaluated candidate is recorded
    and the full list, sorted by ascending score, is written to an excel file in write_dir.
    :param targets: Targets for the labels
    :param loss_func: Loss function, called as loss_func(targets, prediction_mean); lower scores are better
    :param bounds: Bounds on the feature search space, one (low, high) pair per feature
    :param init_guess: Initial guess for features. Set as None if nothing. Only forwarded in 'psoga' mode.
    :param model_directory_store: list of directories which contain the models used for inverse design
    :param svm_directory: directory that contains the SVM classifier to determine if a composition is feasible or not
    :param loader_file: data loader excel file for the final round used to train the model. Is used to get the scaler
    for scaling the features
    :param write_dir: directory to write the excel results into
    :param opt_mode: to determine what type of optimizer to use for the inverse design
    1) psoga: Particle swarm, genetic algorithm hybrid optimizer
    2) forest: Forest optimizer from skopt package
    3) gp: Gaussian process optimizer from skopt package
    4) dummy: Random search from skopt package
    :param opt_params: parameters for the optimizer. The skopt modes read 'total_run' and, for 'forest' and 'gp',
    the optional 'random_run'.
    :raises ValueError: if opt_mode is not one of the modes above, or if the dimension feature is not 0, 1 or 2
    '''
    if opt_mode not in ('psoga', 'forest', 'gp', 'dummy'):
        # Previously an unknown mode ran no optimizer and wrote an empty results sheet.
        raise ValueError("Unknown opt_mode {!r}. Expected 'psoga', 'forest', 'gp' or 'dummy'.".format(opt_mode))

    model_store = []
    for model_directory in model_directory_store:
        model_store.extend(load_model_ensemble(model_directory))
    svm_store = load_svm_ensemble(svm_directory)
    fl = load_data_to_fl(loader_file, norm_mask=[0, 1, 3, 4, 5], normalise_labels=False)
    data_store = []

    # Dimension code -> one-hot columns appended to the continuous features.
    onehot_lookup = {0: [1, 0, 0], 1: [0, 1, 0], 2: [0, 0, 1]}

    def calculate_score_from_features(features):
        # From features, calculate the score and other results
        x = features[0]
        y = features[1]
        # Ensure that composition sums to 1 by reflecting points across the plane y=1-x from top right to bottom left
        if x + y > 1:
            u = -y + 1
            v = -x + 1
            features[0:2] = np.array([u, v])
        p_class, distance = svm_ensemble_prediction(svm_store, features[0:2])  # SVM Check
        if distance.item() < 0:
            # A negative distance is treated as an infeasible composition. The score becomes a positive penalty that
            # grows with the distance from the hyperplane, so a minimizing optimizer is pushed away from such points.
            # No model prediction is made; -1 placeholders are recorded instead.
            score = -10e5 * distance.item()
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        elif features[0] + features[1] > 1:
            # Safety net in case the composition still sums to more than 1 after the reflection above: positive
            # penalty proportional to the excess, with -1 placeholders for the predictions.
            score = -10e5 * (1 - (features[0] + features[1]))
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        else:
            features_c = features[:-1]
            onehot = features[-1].item()
            onehot_columns = onehot_lookup.get(onehot)
            if onehot_columns is None:
                raise ValueError('Dimension feature must be 0, 1 or 2, got {!r}.'.format(onehot))
            features_in = np.concatenate((features_c, np.array(onehot_columns)))
            features_input_norm = fl.apply_scaling(features_in)
            prediction_mean, prediction_std = model_ensemble_prediction(model_store, features_input_norm)
            score = loss_func(targets, prediction_mean)
            disagreement = np.mean(prediction_std)
            prediction_mean = prediction_mean.tolist()
            prediction_std = prediction_std.tolist()
        return score, disagreement, prediction_mean, prediction_std

    if opt_mode == 'psoga':
        def fitness(params):
            features = np.array(params)
            score, disagreement, prediction_mean, prediction_std = calculate_score_from_features(features)
            data = list(features) + [score, disagreement] + prediction_mean + prediction_std
            data_store.append(data)
            return (score,)
        # pso_ga parameters
        pmin = [x[0] for x in bounds]
        pmax = [x[1] for x in bounds]
        smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
        smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]
        # run pso_ga
        # NOTE(review): this call passes initial_guess and no type argument -- confirm that it matches the signature
        # of the pso_ga that is in scope for this module.
        pso_ga(func=fitness, pmin=pmin, pmax=pmax,
               smin=smin, smax=smax,
               int_idx=[3], params=opt_params, ga=True, initial_guess=init_guess)
    else:
        # skopt parameters
        space = [Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
                 Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
                 Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
                 Categorical(categories=[0, 1, 2], name='Dimension')]
        iter_count = 0
        start = time.time()

        @use_named_args(space)
        def fitness(**params):
            nonlocal iter_count, start
            iter_count += 1
            features = np.array(list(params.values()))
            score, disagreement, prediction_mean, prediction_std = calculate_score_from_features(features)
            data = list(features) + [score, disagreement] + prediction_mean + prediction_std
            data_store.append(data)
            if iter_count % 10 == 0:
                end = time.time()
                print('Current Iteration {}. Time taken for past 10 evals: {}. '.format(iter_count, end-start))
                start = time.time()
            return score

        # Run skopt optimizer. Each mode is dispatched to its own minimizer; before, the check for 'gp' sat inside a
        # branch only reachable for 'forest'/'dummy', so 'forest' silently fell back to random search.
        surrogate_kwargs = {'acq_func': 'EI'}  # Expected Improvement.
        if 'random_run' in opt_params:
            surrogate_kwargs['n_random_starts'] = opt_params['random_run']
        if opt_mode == 'gp':
            gp_minimize(func=fitness,
                        dimensions=space,
                        n_calls=opt_params['total_run'],
                        verbose=False,
                        **surrogate_kwargs)
        elif opt_mode == 'forest':
            # Imported locally: only this branch needs it and the module's import block is not visible here.
            from skopt import forest_minimize
            forest_minimize(func=fitness,
                            dimensions=space,
                            n_calls=opt_params['total_run'],
                            verbose=False,
                            **surrogate_kwargs)
        else:
            dummy_minimize(func=fitness,
                           dimensions=space,
                           n_calls=opt_params['total_run'],
                           verbose=False)

    # Preparing results dataframe
    p_mean_name = ['Pmean_' + str(x) for x in range(1, 4)]
    p_std_name = ['Pstd_' + str(x) for x in range(1, 4)]
    # The last three feature names are replaced by the single 'dim' column.
    columns = fl.features_c_names[:-3].tolist()+['dim','score', 'disagreement']+p_mean_name+p_std_name
    iter_df = pd.DataFrame(data=data_store,
                           columns=columns)
    iter_df = iter_df.sort_values(by=['score'], ascending=True)
    # Print results to excel
    excel_dir = create_excel_file('{}/inverse_design_{}_{}.xlsx'.format(write_dir, opt_mode, targets))
    wb = openpyxl.load_workbook(excel_dir)
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Target'
    print_array_to_excel(array=targets, first_cell=(1, 2), axis=1, ws=ws)
    print_df_to_excel(df=iter_df, ws=ws, start_row=3)
    wb.save(excel_dir)
    wb.close()
Exemple #16
0
def run_testing():
    """Run a simulation study comparing OLS, Lasso and component-wise L2 boosting.

    For each of ``simulation_runs`` repetitions a fresh train and test set is
    drawn from a linear model in which only the first three of ``n_total``
    regressors matter (intercept 1, slopes 5, 2 and 1, Gaussian noise with
    standard deviation 2). On every repetition the following are fitted on the
    training data and scored on the test data:

    * plain OLS,
    * an L1-regularised fit whose penalty ``alpha`` is tuned with
      ``gp_minimize`` on a log10 scale in [-10, 1], using 10-fold
      cross-validated MSE as the objective,
    * four boosting configurations (with and without dropout, learning rates
      0.1 and 0.3).

    Per-repetition results are averaged row-wise over all repetitions and
    written to ``test_comparision.xlsx`` in the results directory. Diagnostic
    plots are only produced for the first repetition (``idx == 0``).

    Each boosting configuration is fitted exactly once per repetition. Fitting
    a configuration twice under the same name only overwrites the earlier
    result and plot, so duplicates are pure wasted compute.
    """
    plt.rcParams["font.family"] = "Times New Roman"
    results_dir = create_results_directory('./results/simulation')
    n_total = 10
    t_train = 20
    t_test = 100
    simulation_runs = 20
    df_store = []

    # (result name, boosting class, hyperparameters); the order here fixes the
    # column order of the boosting results in the output sheet.
    boosting_configs = [
        ('cwd01_50', ComponentwiseL2BoostDropout,
         {'m_max': 500, 'learning_rate': 0.1, 'ic_mode': 'aic', 'dropout': 0.5}),
        ('cwd03_50', ComponentwiseL2BoostDropout,
         {'m_max': 500, 'learning_rate': 0.3, 'ic_mode': 'aic', 'dropout': 0.5}),
        ('cw01', ComponentwiseL2Boost,
         {'m_max': 2000, 'learning_rate': 0.1, 'ic_mode': 'aic'}),
        ('cw03', ComponentwiseL2Boost,
         {'m_max': 2000, 'learning_rate': 0.3, 'ic_mode': 'aic'}),
    ]

    def func(z):
        # True data generating process: only columns 0, 1, 2 enter the mean.
        noise = np.random.normal(0, 2, (z.shape[0], 1))
        return 1 + 5 * z[:, [0]] + 2 * z[:, [1]] + z[:, [2]] + noise

    def plot(cw, name):
        # Test MSE after each boosting iteration, obtained from the cumulative
        # sum of the stored per-iteration coefficient updates.
        beta_path = np.cumsum(np.array(cw.bhat_new_store.toarray()), axis=0)
        mse_path = np.mean(
            (sm.add_constant(z_test) @ beta_path.T - y_test)**2, axis=0)

        # Full path (the first 5 iterations are skipped in the plot).
        plt.plot(mse_path[5:])
        plt.xlabel('m iterations')
        plt.ylabel('Test MSE')
        plt.axvline(cw.m_star, linestyle='--')
        plt.savefig(f'{results_dir}/{name}.png')
        plt.close()

        # Zoomed view ending 25 iterations after the selected m_star.
        final = min(cw.m_star + 25, cw.bhat_new_store.shape[0])
        plt.plot(mse_path[5:final])
        plt.xlabel('m iterations')
        plt.ylabel('Test MSE')
        plt.axvline(cw.m_star, linestyle='--')
        plt.savefig(f'{results_dir}/{name}_zoomed.png')
        plt.close()

    def cw_run(cw, hparams, store, idx, name):
        # Fit one boosting model on the current (z, y) and record its test
        # performance as a list of (column name, value) pairs.
        model = cw(z_matrix=z, y_vec=y, hparams=hparams, r=None)
        if idx == 0:
            model.fit(plot_name=f'{results_dir}/{name}')
        else:
            model.fit()
        yhat = model.predict(exog=sm.add_constant(z_test))
        ssr = sum((y_test - yhat)**2)
        store.append([(f'{name} MSE', ssr / t_test),
                      (f'{name} m_star', model.m_star),
                      (f'{name} params', model.params),
                      (f'{name} i frac', model.i_star_frac)])
        if idx == 0:
            plot(model, name)

    for idx in range(simulation_runs):
        z = np.random.normal(0, 1, (t_train, n_total))
        y = func(z)
        z_test = np.random.normal(0, 1, (t_test, n_total))
        y_test = func(z_test)

        ols = sm.OLS(endog=y, exog=sm.add_constant(z)).fit()
        yhat_ols = ols.predict(sm.add_constant(z_test))[..., None]
        ssr_ols = sum((y_test - yhat_ols)**2)

        # lasso 10CV: search over log10(alpha)
        space = [Real(low=-10, high=1, name='alpha')]

        @use_named_args(space)
        def fitness(**params):
            # cross_val_score returns negative MSE; negate so that the
            # optimizer minimises the mean cross-validated MSE.
            return -np.mean(
                cross_val_score(SMwrapper(sm.OLS, 10**params['alpha']),
                                sm.add_constant(z),
                                y,
                                cv=10,
                                scoring='neg_mean_squared_error'))

        results = gp_minimize(
            func=fitness,
            dimensions=space,
            acq_func='EI',  # Expected Improvement.
            n_calls=20,
            verbose=False)

        alpha = results.x[0]  # in lg10
        lasso = sm.OLS(endog=y, exog=sm.add_constant(z)).fit_regularized(
            L1_wt=1, alpha=10**alpha)
        yhat_lasso = lasso.predict(sm.add_constant(z_test))[..., None]
        ssr_lasso = sum((y_test - yhat_lasso)**2)

        results_store = {
            'n_total': n_total,
            'T_train': t_train,
            'T_test': t_test,
            'Simulation Runs': simulation_runs,
            'OLS MSE': ssr_ols / t_test,
            'Lasso MSE': ssr_lasso / t_test,
            'lasso_alpha': 10**alpha,
            'predictor': np.arange(n_total + 1),
            'True params': [1, 5, 2, 1] + [0] * (n_total - 3),
            'ols params': ols.params,
            'Lasso params': lasso.params,
        }

        store = []
        for name, booster, hparams in boosting_configs:
            # Pass a copy so a model can never alter the shared config table.
            cw_run(cw=booster,
                   hparams=dict(hparams),
                   store=store,
                   idx=idx,
                   name=name)

        # Transpose so that all MSE entries come first, then m_star, etc.
        for item in zip(*store):
            results_store.update(item)

        df_store.append(
            pd.DataFrame({k: pd.Series(v)
                          for k, v in results_store.items()}))

    # Average every cell over the simulation runs (rows are aligned by index).
    df = pd.concat(objs=df_store).groupby(level=0).mean()
    excel_name = f'{results_dir}/test_comparision.xlsx'
    excel_name = create_excel_file(excel_name)
    wb = openpyxl.load_workbook(excel_name)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)
    wb.save(excel_name)
    wb.close()
def _reflect_composition(features):
    """Fold the leading composition pair back into the region where it sums to <= 1.

    If the first two entries of ``features`` sum to more than 1, they are
    replaced in place by ``(1 - second, 1 - first)``, i.e. mirrored across the
    line ``first + second = 1``.

    :param features: 1D numpy array whose first two entries are the composition
    :return: the same array object (mutated if a reflection was applied)
    """
    x = features[0]
    y = features[1]
    if x + y > 1:
        features[0:2] = np.array([-y + 1, -x + 1])
    return features


def _expand_dimension_onehot(features):
    """Replace the trailing category index with a 3-wide one-hot encoding.

    :param features: 1D numpy array whose last entry is the category 0, 1 or 2
    :return: a new array with the last entry replaced by three one-hot columns.
        If the last entry is none of 0, 1, 2 the input is returned unchanged.
    """
    onehot = features[-1].item()
    if onehot not in (0, 1, 2):
        return features
    encoding = np.array([int(onehot == k) for k in range(3)])
    return np.concatenate((features[:-1], encoding))


def acquisition_opt(bounds,
                    svm_directory,
                    loader_file,
                    normalise_labels,
                    write_dir,
                    opt_mode,
                    opt_params,
                    batch_runs=1,
                    ignore_distance=False,
                    norm_mask=None):
    '''
    To perform batch-wise active learning for each round.

    For every batch an optimizer searches the feature space for the point with
    the highest acquisition score (ensemble disagreement, optionally multiplied
    by the minimum L2 distance to the known examples). All evaluated points are
    written to a ``Batch_<n>`` sheet of ``<write_dir>/<opt_mode>_acq.xlsx``, and
    the best point is appended to ``fl.features_c_norm`` before the next batch.

    :param bounds: Features search space, one (low, high) pair per feature
    :param svm_directory: Directory that contains the SVM models
    :param loader_file: fl excel data loader
    :param normalise_labels: for fl
    :param write_dir: Directory to write excel to and also where the model directory is in
    :param opt_mode: Choose the type of optimizer: 'gp', 'dummy', 'forest' or 'psoga'
    :param opt_params: Parameters for optimizer ('total_run' and 'random_run' are read for skopt modes)
    :param batch_runs: Number of batches of experiments to run
    :param ignore_distance: When calculating acquisition score, whether to consider L2 distance or not
    :param norm_mask: for fl
    :raises TypeError: if opt_mode is not one of the supported optimizers
    '''
    # Load models from latest round
    model_store = load_model_ensemble(f'{write_dir}/models')
    svm_store = load_svm_ensemble(svm_directory)
    # Load latest round of fl class
    fl = load_data_to_fl(loader_file,
                         norm_mask=norm_mask,
                         normalise_labels=normalise_labels)
    excel_file = create_excel_file(f'{write_dir}/{opt_mode}_acq.xlsx')
    wb = openpyxl.Workbook()

    def calculate_score_from_features(features):
        # NOTE: the reflection mutates `features` in place on purpose, so the
        # caller records the point that was actually scored.
        _reflect_composition(features)

        # SVM Check
        p_class, distance = svm_ensemble_prediction(svm_store, features[0:2])
        penalty = None
        if distance.item() < 0:
            # Distance should be negative value when SVM assigns class 0. Hence a_score will be negative.
            # The more negative the a_score is, the further the composition is from the hyperplane,
            # hence, the less likely the optimizer will select examples with class 0.
            penalty = 10e5 * distance.item()
        elif features[0] + features[1] > 1:
            # Sum of composition cannot be greater than 1.
            # Defensive guard: the reflection above should already prevent this.
            penalty = 10e5 * (1 - (features[0] + features[1]))
        if penalty is not None:
            # Rejected point: -1 placeholders for every other reported quantity.
            return (penalty, -1, -1,
                    [-1] * fl.labels_dim, [-1] * fl.labels_dim)

        features_input_norm = fl.apply_scaling(
            _expand_dimension_onehot(features))
        prediction_mean, prediction_std = model_ensemble_prediction(
            model_store, features_input_norm)
        prediction_mean = prediction_mean.tolist()
        prediction_std = prediction_std.tolist()
        # Greedy Sampling
        # Get L2 distance of sampled example to all existing example in fl class object
        # Note: L2 distance is calculated using the normalised features so that all feature have the same weight
        l2_distance = np.linalg.norm(x=fl.features_c_norm -
                                     features_input_norm.reshape((1, -1)),
                                     ord=2,
                                     axis=1)
        l2_distance = np.min(l2_distance)  # Take the minimum L2 dist.
        # Overall Acquisition Score. Higher score if l2 distance is larger and uncertainty (std) is larger.
        disagreement = np.sum(prediction_std)
        if ignore_distance:
            a_score = disagreement
        else:
            a_score = l2_distance * disagreement
        return a_score, l2_distance, disagreement, prediction_mean, prediction_std

    for batch in range(batch_runs):
        instance_start = time.time()
        iter_count = 0
        data_store = []
        if opt_mode in ['gp', 'dummy', 'forest']:
            # skopt parameters setup
            space = [
                Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
                Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
                Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
                Categorical(categories=[0, 1, 2], name='Dimension')
            ]

            @use_named_args(space)
            def fitness(**params):
                nonlocal iter_count
                iter_count += 1
                features = np.array([x for x in params.values()])
                a_score, l2_distance, disagreement, prediction_mean, prediction_std = calculate_score_from_features(
                    features)
                # Storing intermediate results into list to print into excel later
                data = list(features) + [a_score, disagreement, l2_distance
                                         ] + prediction_mean + prediction_std
                data_store.append(data)
                if iter_count % 50 == 0:
                    print(
                        f'Current Iteration: {iter_count} out of {opt_params["total_run"]} for batch {batch + 1}.'
                    )
                return -a_score  # -ve to maximise the a_score

            if opt_mode == 'gp':
                search_result = gp_minimize(
                    func=fitness,
                    dimensions=space,
                    acq_func='EI',  # Expected Improvement.
                    n_calls=opt_params['total_run'],
                    n_random_starts=opt_params['random_run'],
                    verbose=False)
            elif opt_mode == 'dummy':
                search_result = dummy_minimize(func=fitness,
                                               dimensions=space,
                                               n_calls=opt_params['total_run'],
                                               verbose=False)
            elif opt_mode == 'forest':
                search_result = forest_minimize(
                    func=fitness,
                    dimensions=space,
                    acq_func='EI',  # Expected Improvement.
                    n_calls=opt_params['total_run'],
                    n_random_starts=opt_params['random_run'],
                    verbose=False)
            best_x = search_result.x
        elif opt_mode == 'psoga':
            # psoga parameters setup
            pmin = [x[0] for x in bounds]
            pmax = [x[1] for x in bounds]
            smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
            smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]

            def fitness(params):
                features = np.array(params)
                a_score, l2_distance, disagreement, prediction_mean, prediction_std = calculate_score_from_features(
                    features)
                data = list(features) + [a_score, disagreement, l2_distance
                                         ] + prediction_mean + prediction_std
                data_store.append(data)
                return (-a_score, )

            _, _, best_x = pso_ga(func=fitness,
                                  pmin=pmin,
                                  pmax=pmax,
                                  smin=smin,
                                  smax=smax,
                                  int_idx=[3],
                                  params=opt_params,
                                  ga=True,
                                  initial_guess=None)
        else:
            raise TypeError(f'Invalid opt_mode {opt_mode}')

        # Prepare results dataframe
        p_mean_name = [f'Pmean_{i}' for i in range(1, 4)]
        p_std_name = [f'Pstd_{i}' for i in range(1, 4)]
        columns = fl.features_c_names[:-3].tolist() + [
            'dim', 'A_score', 'disagreement', 'L2'
        ] + p_mean_name + p_std_name
        iter_df = pd.DataFrame(data=data_store, columns=columns)
        iter_df = iter_df.sort_values(by=['A_score'], ascending=False)
        # Creating new worksheet.
        sheet_name = 'Batch_{}'.format(batch + 1)
        wb.create_sheet(title=sheet_name)
        ws = wb[sheet_name]
        print_df_to_excel(df=iter_df, ws=ws)

        # If more than one batch, prepare fl for next batch. The only difference is that the previous best trial
        # point with the highest a_score will be added to fl.features_c_norm such that the L2 greedy distance will
        # account for the fact that the previous batch would had contained the best example already.
        #
        # The optimizer reports the raw point it proposed, whereas the score (and the row written to excel) belongs
        # to the reflected composition. Apply the same reflection here so the point appended is the one that was
        # actually evaluated.
        features = _expand_dimension_onehot(
            _reflect_composition(np.array(best_x)))
        fl.features_c_norm = np.concatenate(
            (fl.features_c_norm, fl.apply_scaling(features)), axis=0)

        instance_end = time.time()
        print('Batch {} completed. Time taken: {}'.format(
            batch + 1, instance_end - instance_start))
        # Save after every batch so completed batches survive a later failure.
        wb.save(excel_file)
Exemple #18
0
import numpy as np
import pandas as pd
import openpyxl, pickle, os
from own_package.others import print_df_to_excel, create_excel_file
from own_package.smote.smote_code import produce_smote, create_invariant_testset
from own_package.features_labels_setup import load_data_to_fl
from own_package.data_store_analysis import get_best_trial_from_rounds, get_best_trial_from_rounds_custom_metric
#from own_package.hparam_opt import read_hparam_data


def selector(case, **kwargs):
    """Run the data-preparation task selected by ``case``.

    Only ``case == 1`` is handled here: the round-13 data loader is read,
    4000 SMOTE examples are generated from its features and labels, and the
    result is written to ``./results/smote_data.xlsx``. Any other ``case``
    does nothing. ``kwargs`` is accepted but not used by this case.
    """
    if case != 1:
        return

    excel_dir = create_excel_file('./results/smote_data.xlsx')
    loader_path = './excel/Data_loader_spline_full_onehot_R{}_cut_CM3.xlsx'.format(
        13, 13)
    fl = load_data_to_fl(data_loader_excel_file=loader_path,
                         normalise_labels=True,
                         label_type='cutoff',
                         norm_mask=[0, 1, 3, 4, 5])
    smote_features, smote_labels = produce_smote(features=fl.features_c,
                                                 labels=fl.labels,
                                                 numel=4000)

    # One row per synthetic example: feature columns followed by label columns.
    column_names = fl.features_c_names.tolist() + fl.labels_names.tolist()
    smote_df = pd.DataFrame(
        data=np.concatenate((smote_features, smote_labels), axis=1),
        columns=column_names)

    wb = openpyxl.Workbook()
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=smote_df, ws=ws)
    wb.save(excel_dir)
def run_classification(grid_fl_dir, write_dir, gamma):
    """Cross-validate an SVM classifier with 10 folds and write the results to excel.

    One ``SVMmodel`` is trained per fold and pickled to
    ``<write_dir>/models/svm_<fold>.pkl``. The validation predictions of all
    folds are pooled to compute the Matthews correlation coefficient, and both
    the per-example predictions and the (mcc, gamma) summary are written to
    ``<write_dir>/classifier_results.xlsx``.

    :param grid_fl_dir: Path to the pickled fl object to load
    :param write_dir: Directory to write the models and the results excel to
    :param gamma: gamma value passed to SVMmodel
    """
    k_folds = 10
    # Load grid fl
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file. Only point grid_fl_dir at files from a trusted source.
    with open(grid_fl_dir, 'rb') as handle:
        fl = pickle.load(handle)
    # Create folds for cross validation
    fl_store = fl.create_kf(k_folds=k_folds, shuffle=True)
    # Run k model instance to perform skf
    # Results dataframe has the columns: ['idx', 'fold', 'CNT', 'PVA', 'Label', 'Prediction']
    # For each fold, append the fold information to the following lists:
    val_idx = []
    folds = []
    val_features = []  # one feature array per fold, concatenated after the loop
    val_labels = []
    predicted_labels_store = []
    # fl_store is a list where each item is a tuple containing the train and val fl
    for fold, fl_tuple in enumerate(fl_store):
        instance_start = time.time()
        # ss_fl is training fl, i_ss_fl is validation fl
        ss_fl, i_ss_fl = fl_tuple
        # Train model
        model = SVMmodel(fl=ss_fl, gamma=gamma)
        model.train_model(fl=ss_fl)
        # Evaluation
        predicted_labels = model.predict(i_ss_fl)
        # Saving model
        save_model_name = write_dir + '/models/svm_' + str(fold + 1) + '.pkl'
        print('Saving instance {} model in {}'.format(fold + 1,
                                                      save_model_name))
        with open(save_model_name, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
        # Preparing data to put into new_df that consists of all the validation dataset and its predicted labels
        val_idx.extend(i_ss_fl.idx)
        # Make a col that contains the fold number for each example
        folds.extend([fold] * i_ss_fl.count)
        val_features.append(i_ss_fl.features)
        val_labels.extend(i_ss_fl.labels)
        predicted_labels_store.extend(predicted_labels)
        # Printing one instance summary.
        instance_end = time.time()
        print(
            '\nFor k-fold run {} out of {}. Each fold has {} examples. Time taken for '
            'instance = {}\n'
            '####################################################################################################'
            .format(fold + 1, k_folds, i_ss_fl.count,
                    instance_end - instance_start))

    # Calculating metrics based on complete validation prediction
    mcc = matthews_corrcoef(y_true=val_labels, y_pred=predicted_labels_store)

    # Creating dataframe to print into excel later.
    results_array = np.concatenate(
        (
            np.array(folds)[:, None],  # Convert 1d list to col. vector
            np.concatenate(val_features, axis=0),
            np.array(val_labels)[:, None],
            np.array(predicted_labels_store)[:, None]),
        axis=1)
    headers = ['folds', 'CNT', 'PVA', 'Labels', 'Prediction']
    # val_idx is the original position of the example in the data_loader
    results_df = pd.DataFrame(data=results_array,
                              columns=headers,
                              index=val_idx)
    # Create excel file and print results to excel
    excel_file = create_excel_file(f'{write_dir}/classifier_results.xlsx')
    print(f'Writing into {excel_file}')
    wb = openpyxl.Workbook()
    # Create results sheet
    wb.create_sheet('results')
    ws = wb['results']
    # Print results df
    print_df_to_excel(df=results_df, ws=ws)
    # Writing hyperparameter information at the side
    start_col = len(results_df.columns) + 3
    summary_headers = ['mcc', 'gamma']
    summary_values = [mcc, gamma]
    print_array_to_excel(np.array(summary_headers), (1, start_col + 1),
                         ws,
                         axis=1)
    print_array_to_excel(np.array(summary_values), (2, start_col + 1),
                         ws,
                         axis=1)
    wb.save(excel_file)
    wb.close()
Exemple #20
0
        read_hparam_data(data_store=data_store, write_dir=write_dir, ett_names=ett_names, print_s_df=False,
                         trainset_ett_idx=-4)
        pass
    elif case == 3:
        # Name checking for data_store files in various folders
        dir_store = ['./results/hparams_opt round 13 ann NDA HE',
                     './results/hparams_opt round 13 ann Invariant HE',

                     './results/hparams_opt round 13 dtr invariant 10',
                     './results/hparams_opt round 13 DTR',
                     ]

        data_store = []
        for dir in dir_store:
            for filename in os.listdir(dir):
                if filename.endswith(".pkl"):
                    with open('{}/{}'.format(dir, filename), 'rb') as handle:
                        data = pickle.load(handle)
                        data_store.append([dir, data[0][0][0][0]])
                    break
        excel_dir = create_excel_file('./results/read_data_store_names.xlsx')
        wb = openpyxl.load_workbook(excel_dir)
        ws = wb[wb.sheetnames[-1]]
        df = pd.DataFrame(data_store)
        print_df_to_excel(df=df, ws=ws)
        wb.save(excel_dir)

# Script entry point: the active call at the bottom runs whenever this module
# is executed or imported. Earlier invocations are kept below, commented out.
#for i in [13,]:
#    selector(case=2, write_dir='./results/hparams_opt round {} DTR_weak_I50b_round_{}'.format(i, i))
#selector(case=3, write_dir='./results/test')
selector(case=2, write_dir='./results/hparams_opt round 1 conv1_round_1')