Example #1
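These snippets come from one codebase and omit their imports. A minimal preamble matching the names used below is sketched here; the project-specific helpers (print_df_to_excel, create_excel_file, load_data_to_fl, load_testset_to_fl, load_model_ensemble, load_svm_ensemble, pso_ga, and friends) live in the project's own modules and are not reproduced.

import itertools
import math
import os
import pickle
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
from sklearn.metrics import matthews_corrcoef, pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from skopt import dummy_minimize, forest_minimize, gp_minimize
from skopt.space import Categorical, Real
from skopt.utils import use_named_args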
    def get_best_df(dir, name, wb):
        # nested helper (the enclosing function is shown in Example #23); fn and numel come from its scope
        hparam_df = pd.read_excel('{}/hparam_results.xlsx'.format(dir),
                                  index_col=None)
        mse = hparam_df.iloc[:, -1].values
        min_idx = int(hparam_df.iloc[np.argmin(mse), 0])

        xls = pd.ExcelFile('{}/skf_results.xlsx'.format(dir))
        skf_df = pd.read_excel(xls,
                               sheet_name='{}_{}_0'.format(name, min_idx),
                               index_col=0)

        df1 = skf_df.iloc[:, :fn + 1 + 2 * numel].sort_index()
        y_store = df1.iloc[:, fn + 1:fn + 1 + numel].values
        p_y = df1.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel].values
        rc = np.mean(np.abs(y_store - p_y) / y_store)
        mse = np.mean((y_store - p_y)**2)

        df2 = skf_df.iloc[:, fn + 1 + 2 * numel:].reset_index(drop=True)
        best_name = '{}_{}'.format(name, min_idx)
        df2.iloc[0, 2] = best_name
        skf_df = pd.concat([df1, df2], axis=1, sort=False)

        sheet_names = wb.sheetnames
        if name in sheet_names:
            ws = wb[name]
        else:
            wb.create_sheet(name)
            ws = wb[name]

        print_df_to_excel(df=skf_df, ws=ws, index=True, header=True)

        return [best_name, mse, rc]
Example #2
def svm_hparam_opt(grid_fl_dir, total_run, write_excel_dir):
    with open(grid_fl_dir, 'rb') as fp:
        fl = pickle.load(fp)

    run_count = 0
    gamma = Real(low=0.1, high=300, name='gamma')
    dimensions = [gamma]
    default_parameters = [130]

    fl_store = fl.create_kf(k_folds=10, shuffle=True)

    @use_named_args(dimensions=dimensions)
    def fitness(gamma):
        nonlocal run_count, fl_store
        run_count += 1
        # Run k model instances to perform stratified k-fold (skf) cross-validation
        predicted_labels_store = []
        val_labels = []
        for fold, fl_tuple in enumerate(fl_store):
            ss_fl, i_ss_fl = fl_tuple  # ss_fl is the training fl, i_ss_fl is the validation fl

            # Training
            model = SVMmodel(fl=ss_fl, gamma=gamma)
            model.train_model(fl=ss_fl)

            # Evaluation
            predicted_labels = model.predict(i_ss_fl).flatten().tolist()
            predicted_labels_store.extend(predicted_labels)
            val_labels.extend(i_ss_fl.labels.flatten().tolist())

        # Calculating metrics based on complete validation prediction
        mcc = matthews_corrcoef(y_true=val_labels,
                                y_pred=predicted_labels_store)
        if run_count % 10 == 0:  # Print every 10 iterations
            print(f'Run Number {run_count}')
        return -mcc

    search_result = gp_minimize(
        func=fitness,
        dimensions=dimensions,
        acq_func='EI',  # Expected Improvement.
        n_calls=total_run,
        x0=default_parameters)
    print('Best Loss = {}'.format(search_result.fun))
    print('Best Gamma = {}'.format(search_result.x[0]))
    x = [x[0] for x in search_result.x_iters]
    results = pd.DataFrame([x] + [(-search_result.func_vals).tolist()]).T
    results.columns = ['Gamma', 'mcc']
    results = results.sort_values(by='mcc', ascending=False)

    write_excel_dir = create_excel_file(write_excel_dir)
    wb = openpyxl.load_workbook(write_excel_dir)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=results, ws=ws)
    wb.save(write_excel_dir)
    wb.close()
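A hypothetical invocation with placeholder paths (grid_fl_dir must point to a pickled object exposing create_kf, as loaded above):

svm_hparam_opt(grid_fl_dir='./data/grid_fl.pkl',
               total_run=50,
               write_excel_dir='./results/svm_hparam.xlsx')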
Example #3
def l2_tracker(write_excel_dir, final_excel_loader, last_idx_store):
    '''
    Calculates the average min(L2 distance) over all the data points.
    The avg min L2 is calculated for each active learning round, as indicated by last_idx_store.
    :param write_excel_dir: Excel directory to write the data to
    :param final_excel_loader: The excel loader file that contains the feature information
    :param last_idx_store: A list indicating which experiment number is the last experiment for each batch of
    active learning. For example, with 3 active learning rounds of 5, 10, and 3 experiments per round,
    last_idx_store will be [5, 15, 18].

    Saves a new excel file which contains the L2 information:
    1) The avg min L2 for each batch of active learning round
    2) The avg min L2 distance for the batch of suggestions for the next active learning round.
    Since the last round has no additional suggestions, no value is calculated for it.
    '''
    write_excel_dir = create_excel_file(write_excel_dir)
    wb = openpyxl.Workbook()
    wb.create_sheet('L2 Results')
    ws = wb[wb.sheetnames[-1]]
    scaler = MinMaxScaler()
    scaler.fit(np.array([[200], [2000]]))
    fl = load_data_to_fl(data_loader_excel_file=final_excel_loader,
                         normalise_labels=False,
                         scaler=scaler,
                         norm_mask=[0, 1, 3, 4, 5])
    final_features = fl.features_c_norm
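    # consecutive differences give the number of new suggestions per round, e.g. [5, 15, 18] -> [10, 3, 0]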
    suggestions_store = [
        y2 - y1 for y2, y1 in zip(last_idx_store[1:], last_idx_store[:-1])
    ] + [0]
    batch_l2_store = []
    batch_l2_suggestions_store = []
    for last_idx, suggestions_numel in zip(last_idx_store, suggestions_store):
        features = final_features[:last_idx, :].tolist()

        l2_store = []
        for idx, x in enumerate(features):
            other_features = np.array(features[:idx] + features[idx + 1:])
            l2_distance = np.linalg.norm(x=other_features -
                                         np.array(x).reshape((1, -1)),
                                         ord=2,
                                         axis=1)
            l2_store.append(np.min(l2_distance))
        batch_l2_store.append(np.mean(l2_store))

    df = pd.DataFrame(data=np.concatenate((
        np.array(last_idx_store).reshape(-1, 1),
        np.array(batch_l2_store).reshape(-1, 1),
    ),
                                          axis=1),
                      columns=['Expt Batches', 'Mean Min L2'],
                      index=range(1,
                                  len(last_idx_store) + 1))
    print_df_to_excel(df=df, ws=ws)
    wb.save(write_excel_dir)
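As a sanity check on the metric described in the docstring, a self-contained sketch of the avg min(L2) computation for a single batch, using toy data in place of the project's loader:

import numpy as np

# three normalised experiments in feature space
features = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])
min_l2 = []
for i, x in enumerate(features):
    others = np.delete(features, i, axis=0)  # every point except x
    min_l2.append(np.linalg.norm(others - x, axis=1).min())  # nearest-neighbour distance
print(np.mean(min_l2))  # avg min L2 for the batch: (1 + 1 + 2) / 3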
Example #4
def read_col_data_store(name):
    with open('./data_store.pkl', 'rb') as handle:
        data_store = pickle.load(handle)

    write_excel = create_excel_file('./results/{}_results.xlsx'.format(name))
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=pd.DataFrame(data=data_store[1],
                                      columns=data_store[0]),
                      ws=ws)
    wb.save(write_excel)
Example #5
    def print_results(name, y, p_y, mse, re):
        # nested helper: wb is the workbook from the enclosing function's scope
        nonlocal wb
        wb.create_sheet(name)
        ws = wb[name]
        df = pd.DataFrame(np.concatenate((y, p_y), axis=1),
                          columns=['y1', 'y2', 'y3', 'P_y1', 'P_y2', 'P_y3'])
        print_df_to_excel(df=df, ws=ws)
        start_col = len(df.columns) + 3
        ws.cell(1, start_col).value = 'MSE'
        ws.cell(2, start_col).value = 'HE'
        ws.cell(1, start_col + 1).value = mse
        ws.cell(2, start_col + 1).value = re
Example #6
def l2_tracker(write_excel, final_excel_loader, last_idx_store):
    wb = openpyxl.load_workbook(write_excel)
    wb.create_sheet('L2 Results')
    ws = wb[wb.sheetnames[-1]]
    fl = load_data_to_fl(data_loader_excel_file=final_excel_loader,
                         normalise_labels=True,
                         label_type='cutoff',
                         norm_mask=[0, 1, 3, 4, 5])
    final_features = fl.features_c_norm

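    # consecutive differences give the number of new suggestions per round, e.g. [5, 15, 18] -> [10, 3, 0]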
    suggestions_store = [
        y2 - y1 for y2, y1 in zip(last_idx_store[1:], last_idx_store[:-1])
    ] + [0]

    batch_l2_store = []
    batch_l2_suggestions_store = []
    for last_idx, suggestions_numel in zip(last_idx_store, suggestions_store):
        features = final_features[:last_idx, :].tolist()

        l2_store = []
        for idx, x in enumerate(features):
            other_features = np.array(features[:idx] + features[idx + 1:])
            l2_distance = np.linalg.norm(x=other_features -
                                         np.array(x).reshape((1, -1)),
                                         ord=2,
                                         axis=1)
            l2_store.append(np.min(l2_distance))
        batch_l2_store.append(np.mean(l2_store))

        if suggestions_numel == 0:
            batch_l2_suggestions_store.append(np.nan)  # np.nan; the np.NaN alias was removed in NumPy 2.0
        else:
            l2_suggestions_store = []
            suggestions_features = final_features[last_idx:last_idx +
                                                  suggestions_numel].tolist()
            for sf in suggestions_features:
                l2_distance = np.linalg.norm(x=features - np.array(sf).reshape(
                    (1, -1)),
                                             ord=2,
                                             axis=1)
                l2_suggestions_store.append(np.min(l2_distance))
            batch_l2_suggestions_store.append(np.mean(l2_suggestions_store))

    df = pd.DataFrame(
        data=np.concatenate((np.array(last_idx_store).reshape(
            -1, 1), np.array(batch_l2_store).reshape(
                -1, 1), np.array(batch_l2_suggestions_store).reshape(-1, 1)),
                            axis=1),
        columns=['Expt Batches', 'Mean Min L2', 'Suggestions Mean Min L2'],
        index=range(1,
                    len(last_idx_store) + 1))
    print_df_to_excel(df=df, ws=ws)
    wb.save(write_excel)
Example #7
def create_invariant_testset(testset_excel_dir, numel):
    df = pd.read_excel(testset_excel_dir, index_col=0, sheet_name='Sheet')

    features, labels = produce_invariant(features=df.values[:, :6], labels=df.values[:, 6:], numel=numel)
    new_data = np.concatenate((features, labels), axis=1)
    columns = df.columns
    new_df = pd.DataFrame(data=new_data, columns=columns)
    df = pd.concat([df, new_df])  # DataFrame.append was removed in pandas 2.0

    write_excel = '{} Invariant {}.xlsx'.format(testset_excel_dir.partition('.xlsx')[0], numel)
    write_excel = create_excel_file(write_excel)
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)
    wb.save(write_excel)
Example #8
def get_final_submission_excel(excel_dir, read_excel_dir):
    xls = pd.ExcelFile(
        read_excel_dir
    )  # './results/expt1/a_Final_submission_expt1/combined_poos_results_CPIA.xlsx')
    data = {'rmse': [], 'rel_rmse': []}
    for sheet in xls.sheet_names:
        if 'rel_rmse_sel' in sheet:
            temp_df = pd.read_excel(xls, sheet_name=sheet).iloc[-2:, :]
            temp_df.index = [['2005:1~2019:12', '2020:1~2020:6']]
            data['rel_rmse'].append(temp_df)
        elif 'rmse_sel' in sheet:
            temp_df = pd.read_excel(xls, sheet_name=sheet).iloc[-2:, :]
            temp_df.index = [['2005:1~2019:12', '2020:1~2020:6']]
            data['rmse'].append(temp_df)

    excel_dir = create_excel_file(excel_dir)
    wb = openpyxl.load_workbook(excel_dir)
    for k, v in data.items():
        temp_df = pd.concat(v, axis=0)
        temp_df.index = temp_df.iloc[:, 0]
        temp_df.drop(labels=temp_df.columns[0], axis=1, inplace=True)
        wb.create_sheet(k)
        ws = wb[k]
        print_df_to_excel(df=temp_df, ws=ws)
    '''
    columns = ['Horizons', 'RW', 'AR', 'PCA']+ [f'{y}-{x}' for y in ['XGBA(rh)', 'XGBA(rfcv)'] for x in ['oracle', 'rw', 'hparam','ll', 'll*ln', 'rw_ll*ln']] + ['RF(rh)', 'RF(rfcv)']
    df = pd.read_excel('./results/expt1/a_Final_submission_expt1/final_table_IND.xlsx', sheet_name='rmse')
    df.columns = columns
    df['h'] = [x for x in [1,3,6,12,24] for _ in range(2)]
    df.iloc[:,1:-1] = df.iloc[:,1:-1].div([1,6,1,4,1,2,1,1.5,1,1], axis=0)
    df = df.melt(id_vars=['h', 'Horizons'], var_name='Model', value_name='RMSE')
    df = df[df['Model'].isin(['RW', 'AR', 'PCA', 'XGBA(rh)-rw', 'XGBA(rh)-hparam', 'RF(rh)'])]
    df = df.replace(['XGBA(rh)-rw', 'XGBA(rh)-hparam', 'RF(rh)'], ['XR', 'XH', 'RF'])
    sns.catplot(x="Model", y="RMSE",
                hue="Horizons", col="h",
                data=df, kind="bar", height=2.5, aspect=1.5, sharey=False, legend=False)
    # plt.subplots_adjust(wspace=0)
    plt.legend(bbox_to_anchor=(1.15, 1))
    g = plt.gcf()
    for ax1, (_, subdata), divby in zip(g.axes, df.groupby('h'), [6,4,2,1.5,1]):
        ax2=ax1.twinx()
        ax2.set_ylim(ax1.get_ylim())
        ax2.set_yticklabels(np.round(ax1.get_yticks() *divby, 1))
    
    plt.show()
    '''

    wb.save(excel_dir)
Example #9
def get_best_trial_from_rounds(dir_store, excel_subname, sort_col,
                               results_excel_dir):
    top_trials_store = []
    for dir in dir_store:
        for filename in os.listdir(dir):
            if excel_subname in filename:
                print('Importing data from {} in dir {}'.format(filename, dir))
                df = pd.read_excel('{}/{}'.format(dir, filename),
                                   index_col=None)
                df.sort_values(by=sort_col, ascending=True, inplace=True)
                top_trials_store.append(df.iloc[0, :].values)

    wb = openpyxl.Workbook()
    ws = wb[wb.sheetnames[-1]]
    df = pd.DataFrame(data=top_trials_store, columns=df.columns)
    print_df_to_excel(df=df, ws=ws)
    wb.save(results_excel_dir)
Example #10
def combine_rmse_results(results_dir):
    df_store = []
    df_best_store = []
    for excel in os.listdir(results_dir):
        if 'testset_' in excel:
            print(f'Loading excel from {results_dir}/{excel}')
            df_store.append(read_excel_to_df(f'{results_dir}/{excel}'))
            df_best = []
            for df in df_store[-1]:
                df.insert(0, 'model', excel.split('_')[2])
                if '_AR_' in excel:
                    df_best.append(df.iloc[[3, df['Val RMSE'].argmin()], :])
                elif '_PCA_' in excel:
                    df_best.append(df.iloc[[df['Val RMSE'].argmin()], :])
                else:
                    df_best.append(df)
            df_best_store.append(df_best)
    # transpose nested list
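    # e.g. [[a1, a2], [b1, b2]] -> [[a1, b1], [a2, b2]]: regroup the per-file lists by horizon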
    df_store = list(map(list, zip(*df_store)))
    combined_df_store = []
    for df_h in df_store:
        combined_df_store.append(pd.concat(df_h).sort_values(by='Val RMSE'))

    wb = openpyxl.Workbook()
    for h, df in zip([1, 3, 6, 12, 24], combined_df_store):
        wb.create_sheet(f'h_{h}')
        ws = wb[f'h_{h}']
        print_df_to_excel(df=df, ws=ws)

    wb.save(f'{results_dir}/summary.xlsx')

    # transpose nested list
    df_store = list(map(list, zip(*df_best_store)))
    combined_df_store = []
    for df_h in df_store:
        combined_df_store.append(pd.concat(df_h).sort_values(by='Val RMSE'))

    wb = openpyxl.Workbook()
    for h, df in zip([1, 3, 6, 12, 24], combined_df_store):
        wb.create_sheet(f'h_{h}')
        ws = wb[f'h_{h}']
        print_df_to_excel(df=df, ws=ws)

    wb.save(f'{results_dir}/best summary.xlsx')
Example #11
def eval_combination_on_testset(av_excel, y_dat, combination_dat):
    with open(y_dat, "rb") as f:
        y = pickle.load(f)
    with open(combination_dat, "rb") as f:
        p_y_store = pickle.load(f)
        p_y_store = np.array([x[1] for x in p_y_store])
    if av_excel:
        av = pd.read_excel(av_excel, sheet_name='av', index_col=None)
        selected_mask = [
            idx for idx, value in enumerate(av.iloc[:, -1].values)
            if value == 1
        ]
    else:
        # use all models; [1] * len(...) would repeatedly index model 1 rather than select every model
        selected_mask = list(range(len(p_y_store)))

    p_y_selected_mean = np.mean(p_y_store[selected_mask, :, :], axis=0)
    re = np.mean(np.abs(y - p_y_selected_mean) / y)

    data = np.concatenate((y, p_y_selected_mean), axis=1)
    df = pd.DataFrame(
        data=data,
        columns=['cut=10', 'cut=100', 'End', 'P_cut=10', 'P_cut=100', 'P_End'])

    wb = openpyxl.Workbook()
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)

    wb.create_sheet('Models')
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Names'
    try:
        print_array_to_excel(array=av.iloc[:, 0].values[selected_mask],
                             first_cell=(2, 1),
                             ws=ws,
                             axis=0)
    except NameError:
        # av is undefined when no av_excel was given, so there are no model names to print
        pass
    ws.cell(1, 2).value = 'RE'
    ws.cell(1, 3).value = re
    excel_dir = create_excel_file('./results/eval_combi.xlsx')
    wb.save(excel_dir)
Example #12
def combine_data_store(dir_store):

    top_df = []
    name_store = []
    for dir in dir_store:
        data_store = []
        for filename in os.listdir(f'{dir}/data_store'):
            if filename.endswith(".pkl"):
                with open('{}/data_store/{}'.format(dir, filename),
                          'rb') as handle:
                    data_store.extend(pickle.load(handle))
        ett_store = np.array([data['val']['mse'] for data in data_store])

        top_idx = np.argsort(ett_store)[:3]
        name_store.extend(
            [data_store[idx]['info']['model_name'] for idx in top_idx])
        top_df.extend(
            [data_store[idx]['ett30_pointsend']['df'] for idx in top_idx])
    print(name_store)
    wb = openpyxl.Workbook()
    for idx, df in enumerate(top_df):
        wb.create_sheet(f's{idx}')
        ws = wb[f's{idx}']
        print_df_to_excel(df, ws)
    wb.create_sheet('Avg')
    ws = wb['Avg']
    df = pd.concat(top_df).reset_index().groupby('index').mean()
    print_df_to_excel(df, ws)
    p_y = df.iloc[:, -19:].values
    y = df.iloc[:, -39:-20].values
    print(np.mean((p_y - y)**2), np.mean(np.abs(p_y - y).T / y[:, -1]))
    ws = wb['Sheet']
    ws.cell(1, 1, 'mse')
    ws.cell(1, 2, 'mre')
    ws.cell(2, 1, np.mean((p_y - y)**2))
    ws.cell(2, 2, np.mean(np.abs(p_y - y).T / y[:, -1]))
    wb.save('./results/top3.xlsx')
    wb.close()
Example #13
def combine_best_summary_and_xgbs(best_summary_dir, xgbs_dir, results_dir):
    xgbs_df = pd.read_excel(xgbs_dir)
    bs_df_store = read_excel_to_df(best_summary_dir)
    combined_df_store = []
    columns = xgbs_df.columns
    h_rows = list(np.where(
        xgbs_df[columns[0]].notnull())[0]) + [len(xgbs_df[columns[0]])]
    for h_start, h_end, bs_df in zip(h_rows[:-1], h_rows[1:], bs_df_store):
        count = h_end - h_start
        df = bs_df.reindex(bs_df.index.tolist() +
                           list(range(90, 90 + count))).reset_index(drop=True)
        model_names = [f'{x}s' for x in xgbs_df.iloc[h_start:h_end, 1].values]
        df.iloc[-count:, 0] = model_names
        df.iloc[-count:, 3] = xgbs_df.iloc[h_start:h_end, 2].values
        combined_df_store.append(df.sort_values('Val RMSE'))

    wb = openpyxl.Workbook()
    for h, df in zip([1, 3, 6, 12, 24], combined_df_store):
        wb.create_sheet(f'h_{h}')
        ws = wb[f'h_{h}']
        print_df_to_excel(df=df, ws=ws)

    wb.save(f'{results_dir}/best summary + xgbs.xlsx')
Example #14
def read_hparam_opt_data_store(write_dir):
    # Load all the saved data_store.pkl into data_store list
    data_store = []
    for filename in os.listdir(f'{write_dir}/data_store'):
        if filename.endswith(".pkl"):
            with open('{}/{}'.format(f'{write_dir}/data_store', filename),
                      'rb') as handle:
                data_store.extend(pickle.load(handle))

    wb_store = {
        k: openpyxl.Workbook()
        for k in data_store[0] if (k not in ['info'])
    }
    summary_df = []
    for run, data in enumerate(data_store):
        run_summary = data['info']['opt']
        for k, wb in wb_store.items():
            wb.create_sheet(f'run {run + 1}')
            ws = wb[f'run {run + 1}']
            print_df_to_excel(df=data[k]['df'], ws=ws)
            print_df_to_excel(df=data['info']['hparams'],
                              ws=ws,
                              start_col=len(data[k]['df'].columns) + 3)
            run_summary = {
                **run_summary,
                **{
                    'model_name': data['info']['model_name'],
                    f'{k}_mse': data[k]['mse'],
                    f'{k}_mre': data[k]['mre']
                }
            }
        summary_df.append(pd.Series(run_summary))
    # Print summary df
    summary_df = pd.concat(summary_df, axis=1).T
    wb = openpyxl.Workbook()
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=summary_df, ws=ws)
    wb.save(f'{write_dir}/summary.xlsx')
    wb.close()
    # Print all the other df
    for k, wb in wb_store.items():
        wb.save(f'{write_dir}/{k}.xlsx')
        wb.close()
Example #15
def create_data_loader_excel(excel_dir, results_dir):
    ymain_df = pd.read_excel(excel_dir, sheet_name='y transformed', index_col=0)
    xmain_df = pd.read_excel(excel_dir, 'transformation', index_col=0)

    # Find unique var name for forecasting
    var_names = list(set([item.partition('_h')[0] for item in ymain_df.columns]))

    for var_name in var_names:
        excel_name = create_excel_file('{}/{}_data_loader.xlsx'.format(results_dir, var_name))
        wb = openpyxl.load_workbook(excel_name)
        wb.create_sheet('x')
        wb.create_sheet('yo')
        wb.create_sheet('y')
        print_df_to_excel(df=xmain_df.loc[:, xmain_df.columns != var_name], ws=wb['x'])
        print_df_to_excel(df=xmain_df.loc[:, [var_name]], ws=wb['yo'])
        mask = np.flatnonzero(np.core.defchararray.find(ymain_df.columns.values.astype(str), var_name) != -1)
        print_df_to_excel(df=ymain_df.iloc[:, mask], ws=wb['y'])
        wb.save(excel_name)

Example #16
def l2_points_opt(numel,
                  write_dir,
                  svm_directory,
                  seed_number_of_expt,
                  total_expt,
                  l2_opt=True):
    write_dir = create_results_directory(results_directory=write_dir,
                                         excels=['l2_acq'])
    svm_store = load_svm_ensemble(svm_directory)
    base = [x / (numel * 2 - 1) for x in list(range(numel * 2))]

    # Create set of possible compositions
    compositions = [[x, y] if x + y <= 1 else [-x + 1, -y + 1]
                    for x, y in list(itertools.product(base[::2], base[1::2]))]
    distance_store = []
    # Check feasibility for those compositions
    for model in svm_store:
        distance_store.append(model.model.decision_function(compositions))
    distance = np.mean(np.array(distance_store), axis=0)
    valid_compositions = [
        x for x, dist in zip(compositions, distance) if dist >= 0
    ]
    print('Number of compositions = {}. % valid = {}%'.format(
        len(valid_compositions),
        len(valid_compositions) / len(compositions) * 100))
    # Permute feasible compositions with different thickness possibilities scaled from 0 to 1
    number_valid_compositions = round(math.sqrt(len(valid_compositions)))
    compositions_thickness = list(
        itertools.product(valid_compositions, [
            x / (number_valid_compositions - 1)
            for x in list(range(number_valid_compositions))
        ]))
    print('Number of permutations = {}'.format(len(compositions_thickness) * 3))
    # Permute the above with 0D, 1D, and 2D
    all_permutations = np.array([
        x[0] + [x[1]] + y for x in compositions_thickness
        for y in [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
    ])

    if l2_opt:
        expt_idx = np.random.randint(0, len(all_permutations),
                                     seed_number_of_expt)
        expt_store = all_permutations[expt_idx, :]

        for i in range(total_expt - seed_number_of_expt):
            start = time.time()
            d = pairwise_distances(expt_store,
                                   all_permutations,
                                   metric='euclidean')
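            # farthest-point step: pick the candidate whose nearest selected experiment is furthest away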
            next_expt = np.argmax(np.min(d, axis=0))
            expt_store = np.concatenate(
                (expt_store, all_permutations[next_expt, None, :]), axis=0)
            end = time.time()
            print('{} out of {} completed. Time taken = {}.'.format(
                i + 1, total_expt - seed_number_of_expt, end - start))
    else:
        expt_idx = np.random.randint(0, len(all_permutations), total_expt)
        expt_store = all_permutations[expt_idx, :]

    expt_store[:, 2] = expt_store[:, 2] * 1800 + 200

    write_excel = '{}/l2_acq.xlsx'.format(write_dir)
    wb = openpyxl.load_workbook(write_excel)
    wb.create_sheet('l2_acq')
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Valid Combinations'
    ws.cell(1, 2).value = len(all_permutations)
    ws.cell(1, 3).value = 'Seed Expt'
    ws.cell(1, 4).value = seed_number_of_expt
    df = pd.DataFrame(data=expt_store,
                      columns=['CNT', 'PVA', 'Thickness', '0D', '1D', '2D'],
                      index=list(range(1, total_expt + 1)))
    print_df_to_excel(df=df, ws=ws, start_row=2)

    wb.save(write_excel)
Example #17
def acquisition_opt_pso_ga(
    bounds,
    write_dir,
    svm_directory,
    loader_file,
    normalise_labels,
    pso_params,
    batch_runs=1,
    initial_guess=None,
    norm_mask=None,
):
    """
    bounds = [[5, 200, ],
              [0, 200, ],
              [5, 200, ],
              [0, 200, ],
              [10, 2000],
              [0, 0.3]]
    :param model_mode:
    :param loader_file:
    :param total_run:
    :param instance_per_run:
    :param hparam_file:
    :return:
    """

    print('Writing into {}/acq.xlsx'.format(write_dir))
    wb = openpyxl.Workbook()

    model_store = load_model_ensemble('{}/models'.format(write_dir))
    svm_store = load_svm_ensemble(svm_directory)

    fl = load_data_to_fl(loader_file,
                         norm_mask=norm_mask,
                         normalise_labels=normalise_labels,
                         label_type='cutoff')
    # CNT, PVA, Thickness, Dimension
    pmin = [x[0] for x in bounds]
    pmax = [x[1] for x in bounds]

    smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
    smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]

    for batch, init_guess in zip(list(range(batch_runs)), initial_guess):
        instance_start = time.time()
        data_store = []

        def fitness(params):
            nonlocal data_store
            # start = time.time()
            features = np.array(params)
            x = features[0]
            y = features[1]
            if x + y > 1:
                u = -y + 1
                v = -x + 1
                features[0:2] = np.array([u, v])

            # SVM Check
            p_class, distance = svm_ensemble_prediction(
                svm_store, features[0:2])

            if distance.item() < 0:
                # Distance should be negative value when SVM assigns class 0. Hence a_score will be negative.
                # The more negative the a_score is, the further the composition is from the hyperplane,
                # hence, the less likely the optimizer will select examples with class 0.
                a_score = 10e5 * distance.item()
                prediction_mean = [-1] * fl.labels_dim
                prediction_std = [-1] * fl.labels_dim
                l2_dist = -1
                disagreement = -1
            elif features[0] + features[1] > 1:
                # Distance should be negative value when SVM assigns class 0. Hence a_score will be negative.
                # The more negative the a_score is, the further the composition is from the hyperplane,
                # hence, the less likely the optimizer will select examples with class 0.
                a_score = 10e5 * (1 - (features[0] + features[1]))
                prediction_mean = [-1] * fl.labels_dim
                prediction_std = [-1] * fl.labels_dim
                l2_dist = -1
                disagreement = -1
            else:
                features_c = features[:-1]
                onehot = features[-1].item()
                if onehot == 0:
                    features_in = np.concatenate(
                        (features_c, np.array([1, 0, 0])))
                elif onehot == 1:
                    features_in = np.concatenate(
                        (features_c, np.array([0, 1, 0])))
                elif onehot == 2:
                    features_in = np.concatenate(
                        (features_c, np.array([0, 0, 1])))

                features_input_norm = fl.apply_scaling(features_in)
                prediction_mean, prediction_std = model_ensemble_prediction(
                    model_store, features_input_norm)
                # Greedy Sampling
                # Get L2 distance of sampled example to all existing example in fl class object
                # Note: L2 distance is calculated using the normalised features so that all feature have the same weight
                l2_distance = np.linalg.norm(x=fl.features_c_norm -
                                             features_input_norm.reshape(
                                                 (1, -1)),
                                             ord=2,
                                             axis=1)
                l2_distance = np.min(l2_distance)  # Take the minimum L2 dist.

                # Overall Acquisition Score. Higher score if l2 distance is larger and uncertainty (std) is larger.
                disagreement = np.sum(prediction_std)
                a_score = l2_distance * disagreement

                # Storing intermediate results into list to print into excel later
                l2_dist = l2_distance
                disagreement = disagreement
                prediction_mean = prediction_mean.flatten().tolist()
                prediction_std = prediction_std.flatten().tolist()
            data = list(features) + [a_score, disagreement, l2_dist
                                     ] + prediction_mean + prediction_std
            data_store.append(data)
            # end = time.time()
            # print(end-start)
            return (-a_score, )

        _, _, best = pso_ga(func=fitness,
                            pmin=pmin,
                            pmax=pmax,
                            smin=smin,
                            smax=smax,
                            int_idx=[3],
                            params=pso_params,
                            ga=True,
                            initial_guess=init_guess)

        p_mean_name = np.array(
            ['Pmean_' + str(x) for x in list(map(str, np.arange(1, 4)))])
        p_std_name = np.array(
            ['Pstd_' + str(x) for x in list(map(str, np.arange(1, 4)))])

        columns = np.concatenate(
            (np.array(fl.features_c_names[:-2]), np.array(['A_score']),
             np.array(['Disagreement']), np.array(['L2']), p_mean_name,
             p_std_name))

        iter_df = pd.DataFrame(data=data_store, columns=columns)

        iter_df = iter_df.sort_values(by=['A_score'], ascending=False)

        # Create a new worksheet for this batch's acquisition results
        wb.create_sheet(title='Batch_{}'.format(batch + 1))
        ws = wb['Batch_{}'.format(batch + 1)]
        print_df_to_excel(df=iter_df, ws=ws)

        # If batch_runs > 1, next batch will be calculated. The only difference is that the previous best trial point
        # with the highest a_score will be added to fl.features_c_norm such that the L2 greedy distance will
        # account for the fact that the previous batch would had contained the best example already.
        # features = np.array(list(best))
        features = np.array(init_guess[0])
        features_c = features[:-1]
        onehot = features[-1].item()
        if onehot == 0:
            features = np.concatenate((features_c, np.array([1, 0, 0])))
        elif onehot == 1:
            features = np.concatenate((features_c, np.array([0, 1, 0])))
        elif onehot == 2:
            features = np.concatenate((features_c, np.array([0, 0, 1])))

        fl.features_c_norm = np.concatenate(
            (fl.features_c_norm, fl.apply_scaling(features)), axis=0)

        instance_end = time.time()
        print('Batch {} completed. Time taken: {}'.format(
            batch + 1, instance_end - instance_start))
        wb.save('{}/acq.xlsx'.format(write_dir))
        wb.close()
        wb = openpyxl.load_workbook('{}/acq.xlsx'.format(write_dir))
Example #18
def inverse_design(targets, loss_func, bounds, int_idx, init_guess,
                   model_directory_store, svm_directory, loader_file,
                   write_dir, opt_mode):
    model_store = []
    for model_directory in model_directory_store:
        model_store.extend(load_model_ensemble(model_directory))
    svm_store = load_svm_ensemble(svm_directory)
    fl = load_data_to_fl(loader_file,
                         norm_mask=[0, 1, 3, 4, 5],
                         normalise_labels=False,
                         label_type='cutoff')

    data_store = []
    if opt_mode == 'psoga':

        def fitness(params):
            nonlocal data_store
            features = np.array(params)
            x = features[0]
            y = features[1]
            if x + y > 1:
                u = -y + 1
                v = -x + 1
                features[0:2] = np.array([u, v])

            # SVM Check
            p_class, distance = svm_ensemble_prediction(
                svm_store, features[0:2])

            if distance.item() < 0:
                # Distance should be negative value when SVM assigns class 0. Hence a_score will be negative.
                # The more negative the a_score is, the further the composition is from the hyperplane,
                # hence, the less likely the optimizer will select examples with class 0.
                mse = 10e5 * distance.item()
                prediction_mean = [-1] * fl.labels_dim
                prediction_std = [-1] * fl.labels_dim
                disagreement = -1
            elif features[0] + features[1] > 1:
                # Distance should be negative value when SVM assigns class 0. Hence a_score will be negative.
                # The more negative the a_score is, the further the composition is from the hyperplane,
                # hence, the less likely the optimizer will select examples with class 0.
                mse = 10e5 * (1 - (features[0] + features[1]))
                prediction_mean = [-1] * fl.labels_dim
                prediction_std = [-1] * fl.labels_dim
                disagreement = -1
            else:
                features_c = features[:-1]
                onehot = features[-1].item()
                if onehot == 0:
                    features_in = np.concatenate(
                        (features_c, np.array([1, 0, 0])))
                elif onehot == 1:
                    features_in = np.concatenate(
                        (features_c, np.array([0, 1, 0])))
                elif onehot == 2:
                    features_in = np.concatenate(
                        (features_c, np.array([0, 0, 1])))
                features_input_norm = fl.apply_scaling(features_in)
                prediction_mean, prediction_std = model_ensemble_prediction(
                    model_store, features_input_norm)
                mse = -loss_func(targets, prediction_mean)
                disagreement = np.mean(prediction_std)
                prediction_mean = prediction_mean.tolist()
                prediction_std = prediction_std.tolist()

            data = list(features) + [-mse, disagreement
                                     ] + prediction_mean + prediction_std
            data_store.append(data)
            return (-mse, )

        pmin = [x[0] for x in bounds]
        pmax = [x[1] for x in bounds]

        smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
        smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]

        pso_params = {
            'c1': 1.5,
            'c2': 1.5,
            'wmin': 0.4,
            'wmax': 0.9,
            'ga_iter_min': 2,
            'ga_iter_max': 10,
            'iter_gamma': 10,
            'ga_num_min': 5,
            'ga_num_max': 20,
            'num_beta': 15,
            'tourn_size': 3,
            'cxpd': 0.9,
            'mutpd': 0.05,
            'indpd': 0.5,
            'eta': 0.5,
            'pso_iter': 10,
            'swarm_size': 300
        }

        pso_ga(func=fitness,
               pmin=pmin,
               pmax=pmax,
               smin=smin,
               smax=smax,
               int_idx=[3],
               params=pso_params,
               ga=True,
               initial_guess=init_guess)

    elif opt_mode == 'forest' or opt_mode == 'dummy':
        space = [
            Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
            Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
            Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
            Categorical(categories=[0, 1, 2], name='Dimension')
        ]

        iter_count = 0
        start = time.time()
        end = 0

        @use_named_args(space)
        def fitness(**params):
            nonlocal data_store, iter_count, start, end
            iter_count += 1
            features = np.array([x for x in params.values()])
            x = features[0]
            y = features[1]
            if x + y > 1:
                u = -y + 1
                v = -x + 1
                features[0:2] = np.array([u, v])
            # SVM Check
            p_class, distance = svm_ensemble_prediction(
                svm_store, features[0:2])
            if distance.item() < 0:
                # Distance should be negative value when SVM assigns class 0. Hence a_score will be negative.
                # The more negative the a_score is, the further the composition is from the hyperplane,
                # hence, the less likely the optimizer will select examples with class 0.
                mse = 10e5 * distance.item()
                prediction_mean = [-1] * fl.labels_dim
                prediction_std = [-1] * fl.labels_dim
                disagreement = -1
            elif features[0] + features[1] > 1:
                # Sum of composition needs to be less than 1
                mse = 10e5 * (1 - (features[0] + features[1]))
                prediction_mean = [-1] * fl.labels_dim
                prediction_std = [-1] * fl.labels_dim
                disagreement = -1
            else:
                features_c = features[:-1]
                onehot = features[-1].item()
                if onehot == 0:
                    features_in = np.concatenate(
                        (features_c, np.array([1, 0, 0])))
                elif onehot == 1:
                    features_in = np.concatenate(
                        (features_c, np.array([0, 1, 0])))
                elif onehot == 2:
                    features_in = np.concatenate(
                        (features_c, np.array([0, 0, 1])))
                features_input_norm = fl.apply_scaling(features_in)
                prediction_mean, prediction_std = model_ensemble_prediction(
                    model_store, features_input_norm)
                mse = -loss_func(targets,
                                 prediction_mean)  # Some negative number
                disagreement = np.mean(prediction_std)
                prediction_mean = prediction_mean.tolist()
                prediction_std = prediction_std.tolist()

            data = list(features) + [-mse, disagreement
                                     ] + prediction_mean + prediction_std
            data_store.append(data)
            if iter_count % 10 == 0:
                end = time.time()
                print(
                    'Current Iteration {}. Time taken for past 10 evals: {}. '.
                    format(iter_count, end - start))
                start = time.time()
            return -mse  # Make negative become positive, and minimizing score towards 0.

        if opt_mode == 'forest':
            forest_minimize(
                func=fitness,
                dimensions=space,
                acq_func='EI',  # Expected Improvement.
                n_calls=1000,
                verbose=False)
        else:
            dummy_minimize(func=fitness,
                           dimensions=space,
                           n_calls=5000,
                           verbose=False)

    p_mean_name = np.array(
        ['Pmean_' + str(x) for x in list(map(str, np.arange(1, 4)))])
    p_std_name = np.array(
        ['Pstd_' + str(x) for x in list(map(str, np.arange(1, 4)))])

    columns = np.concatenate(
        (np.array(fl.features_c_names[:-2]), np.array(['mse']),
         np.array(['Disagreement']), p_mean_name, p_std_name))

    iter_df = pd.DataFrame(data=data_store, columns=columns)

    iter_df = iter_df.sort_values(by=['mse'], ascending=True)

    excel_dir = create_excel_file('{}/inverse_design_{}_{}.xlsx'.format(
        write_dir, opt_mode, targets))
    wb = openpyxl.load_workbook(excel_dir)
    ws = wb[wb.sheetnames[-1]]  # use the last (newest) worksheet
    ws.cell(1, 1).value = 'Target'
    print_array_to_excel(array=targets, first_cell=(1, 2), axis=1, ws=ws)
    print_df_to_excel(df=iter_df, ws=ws, start_row=3)

    wb.save(excel_dir)
    wb.close()
Example #19
def cutoff_combine_excel_results_with_excel(results_excel_dir, plot_dir, fn,
                                            numel, plot_mode):
    cutoff = [10, 100]
    xls = pd.ExcelFile(results_excel_dir)

    p_y_store = []
    sheets = xls.sheet_names
    for sheet in sheets:
        if sheet == 'Sheet':
            pass
        else:
            df = pd.read_excel(xls, sheet_name=sheet, index_col=0)
            df = df.sort_index()

            p_y = df.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel].values.tolist()
            p_y_store.append(p_y)

    y_store = df.iloc[:, fn + 1:fn + 1 + numel].values
    p_y_store_mean = np.mean(np.array(p_y_store), axis=0)

    combine_mse = np.mean((y_store - p_y_store_mean)**2)
    p_y_store.append(p_y_store_mean.tolist())

    rc = np.mean(np.abs(y_store - p_y_store_mean) / y_store)

    se = (y_store - p_y_store_mean)**2
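    # running mean of the squared errors over the first idx + 1 experiments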
    cumulative_mse = []
    for idx in range(np.shape(se)[0]):
        cumulative_mse.append(np.mean(se[0:idx + 1, :]))

    sheets.append('Combined')

    if plot_mode:
        for idx, [x, p_x_store] in enumerate(
                zip(y_store.tolist(),
                    np.swapaxes(np.array(p_y_store), 0, 1).tolist())):
            plt.plot([0, x[0], x[1], x[2]], [
                0, 0, 10 * (x[1] - x[0]), cutoff[0] *
                (x[1] - x[0]) + cutoff[1] * (x[2] - x[1])
            ],
                     c='r',
                     label='Actual Spline Fit')
            for idx1, p_x in enumerate(p_x_store):
                if idx1 == 3:
                    plt.plot([0, p_x[0], p_x[1], p_x[2]], [
                        0, 0, 10 * (p_x[1] - p_x[0]), cutoff[0] *
                        (p_x[1] - p_x[0]) + cutoff[1] * (p_x[2] - p_x[1])
                    ],
                             label=sheets[idx1])
            plt.legend(loc='upper left')
            plt.title('Expt. ' + str(idx + 1))
            plt.savefig('{}/Expt_{}.png'.format(plot_dir, idx + 1),
                        bbox_inches='tight')
            plt.close()

    df.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel] = np.array(p_y_store[-1])
    df = df.iloc[:, :fn + 1 + 2 * numel]
    df['Cumulative MSE'] = cumulative_mse

    wb = openpyxl.load_workbook(results_excel_dir)
    wb.create_sheet('Results')
    names = wb.sheetnames
    ws = wb[names[-1]]
    print_df_to_excel(df=df, ws=ws, index=True, header=True)

    col = fn + 1 + 1 + 2 * numel + 3
    ws.cell(1, col).value = 'Combined'
    ws.cell(2, col + 0).value = 'mse'
    ws.cell(2, col + 1).value = combine_mse
    ws.cell(3, col + 0).value = 'RC'
    ws.cell(3, col + 1).value = rc
    wb.save(results_excel_dir)
Example #20
def testset_prediction_results(write_excel, model_dir_store,
                               excel_loader_dir_store, testset_excel_dir,
                               rounds, fn, numel):
    wb = openpyxl.load_workbook(write_excel)
    results_col = fn + 1 + 1 + 2 * numel + 3
    mse_store = []
    mre_store = []
    mare_store = []
    testset_fl = load_data_to_fl(testset_excel_dir,
                                 normalise_labels=True,
                                 label_type='cutoff',
                                 norm_mask=[0, 1, 3, 4, 5])
    column_headers = testset_fl.labels_names
    for idx, (model_dir, loader_excel, round) in enumerate(
            zip(model_dir_store, excel_loader_dir_store, rounds)):
        wb.create_sheet('Round {}'.format(round))
        ws = wb[wb.sheetnames[-1]]
        fl = load_data_to_fl(data_loader_excel_file=loader_excel,
                             norm_mask=[0, 1, 3, 4, 5],
                             normalise_labels=True,
                             label_type='cutoff')
        model_store = load_model_ensemble(model_dir)
        # Must use the round's fl to scale, not the testset scaler as it might be different
        p_y, _ = model_ensemble_prediction(
            model_store, fl.apply_scaling(testset_fl.features_c))

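        # enforce monotonic cutoff predictions: p_y1 <= p_y2 <= p_y3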
        for row, p_label in enumerate(p_y.tolist()):
            if p_label[1] > p_label[2]:
                p_y[row, 1] = p_y[row, 2]
            if p_label[0] > p_y[row, 1]:
                p_y[row, 0] = p_y[row, 1]

        se_store = (testset_fl.labels - p_y)**2
        re_store = np.abs(testset_fl.labels - p_y) / testset_fl.labels
        are_store = np.arctan(re_store)

        df = pd.DataFrame(data=np.concatenate(
            (testset_fl.labels, p_y, se_store, re_store, are_store), axis=1),
                          index=list(range(1, 1 + testset_fl.count)),
                          columns=list(column_headers) +
                          ['P_{}'.format(col) for col in column_headers] +
                          ['SE_{}'.format(col) for col in column_headers] +
                          ['RE_{}'.format(col) for col in column_headers] +
                          ['ARE_{}'.format(col) for col in column_headers])
        print_df_to_excel(df=df, ws=ws)

        col = fn + 1 + 1 + 2 * numel + 3
        mse_store.append(np.mean(se_store))
        mre_store.append(np.mean(re_store))
        mare_store.append(np.mean(are_store))
        ws.cell(1, col).value = 'MSE'
        ws.cell(1, col + 1).value = mse_store[-1]
        ws.cell(2, col).value = 'MRE'
        ws.cell(2, col + 1).value = mre_store[-1]
        ws.cell(3, col).value = 'ARE'
        ws.cell(3, col + 1).value = mare_store[-1]

    wb.create_sheet('Final_results')
    ws = wb[wb.sheetnames[-1]]
    df = pd.DataFrame(data=np.array([mse_store, mre_store, mare_store]),
                      index=['mse', 're', 'are'],
                      columns=rounds)
    print_df_to_excel(df=df, ws=ws)
    wb.save(write_excel)
Example #21
def training_curve_comparision(train_excel, val_excel, write_dir, hparams):
    fl = load_data_to_fl(data_loader_excel_file=train_excel,
                         normalise_labels=False,
                         label_type='cutoff',
                         norm_mask=[0, 1, 3, 4, 5])
    val_fl = load_testset_to_fl(val_excel,
                                scaler=fl.scaler,
                                norm_mask=[0, 1, 3, 4, 5])

    data = defaultdict(list)

    def run_model(loss, pre, epoch):
        sess = tf.compat.v1.Session()
        K.set_session(sess)
        hparams['loss'] = loss
        hparams['pre'] = pre
        hparams['epochs'] = epoch
        model = Kmodel(fl=fl, mode='ann3', hparams=hparams)
        _, history = model.train_model(fl,
                                       val_fl,
                                       plot_name='{}/{}_{}_{}.png'.format(
                                           write_dir, loss, pre, epoch))

        data['train_error'].append(history.history['loss'])
        data['test_error'].append(history.history['val_loss'])
        data['names'].append('{}_{}_{}'.format(loss, pre, epoch))

        # These cleanup calls are needed, otherwise memory will run out
        del model
        K.clear_session()
        sess.close()
        gc.collect()

    # run_model(loss='mse', pre=100, epoch=500)
    # run_model(loss='haitao', pre=100, epoch=500)
    # run_model(loss='mse', pre=500, epoch=500)
    # run_model(loss='haitao', pre=500, epoch=500)
    #
    # run_model(loss='mse', pre=100, epoch=1000)
    # run_model(loss='haitao', pre=100, epoch=1000)
    # run_model(loss='mse', pre=500, epoch=1000)
    # run_model(loss='haitao', pre=500, epoch=1000)
    #
    # run_model(loss='mse', pre=100, epoch=5000)
    # run_model(loss='haitao', pre=100, epoch=5000)
    run_model(loss='mse', pre=500, epoch=5000)
    run_model(loss='haitao', pre=500, epoch=5000)
    # run_model(loss='mse', pre=100, epoch=500)
    # run_model(loss='mse', pre=100, epoch=500)
    # run_model(loss='mse', pre=100, epoch=500)
    # run_model(loss='mse', pre=100, epoch=500)
    # run_model(loss='mse', pre=100, epoch=500)
    # run_model(loss='mse', pre=100, epoch=500)

    wb = openpyxl.Workbook()
    wb.create_sheet('125train_loss')
    wb.create_sheet('30test_loss')

    train_df = pd.DataFrame(data=list(
        itertools.zip_longest(*data['train_error'], fillvalue="")),
                            columns=data['names'])
    test_df = pd.DataFrame(data=list(
        itertools.zip_longest(*data['test_error'], fillvalue="")),
                           columns=data['names'])

    ws = wb['125train_loss']
    print_df_to_excel(df=train_df, ws=ws)

    ws = wb['30test_loss']
    print_df_to_excel(df=test_df, ws=ws)

    wb.save('{}/training_curve_results.xlsx'.format(write_dir))
Example #22
def testset_model_results_to_excel(write_excel, model_dir_store, loader_excel,
                                   testset_excel_dir, fn, numel, chunks):
    wb = openpyxl.load_workbook(write_excel)
    mse_store = []
    mre_store = []
    mare_store = []
    testset_fl = load_data_to_fl(testset_excel_dir,
                                 normalise_labels=True,
                                 label_type='cutoff',
                                 norm_mask=[0, 1, 3, 4, 5])
    column_headers = testset_fl.labels_names

    fl = load_data_to_fl(data_loader_excel_file=loader_excel,
                         norm_mask=[0, 1, 3, 4, 5],
                         normalise_labels=True,
                         label_type='cutoff')

    model_name_store = []
    for model_dir in model_dir_store:
        for idx, file in enumerate(os.listdir(model_dir)):
            filename = os.fsdecode(file)
            model_name_store.append(model_dir + '/' + filename)
        print('Loading the following models from {}. Total models = {}'.format(
            model_dir, len(model_name_store)))
    model_chunks = [
        model_name_store[x:x + chunks]
        for x in range(0, len(model_name_store), chunks)
    ]
    testset_features_c_norm = fl.apply_scaling(testset_fl.features_c)
    model_idx = 1
    for single_model_chunk in model_chunks:
        model_store = load_model_chunks(single_model_chunk)
        for idx, model in enumerate(model_store):
            wb.create_sheet('{}'.format(model_idx))
            model_idx += 1
            ws = wb[wb.sheetnames[-1]]
            if model:
                # Must use the round's fl to scale, not the testset scaler as it might be different
                p_y = model.predict(testset_features_c_norm)
                print(np.std(p_y, axis=0))

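                # enforce monotonic cutoff predictions: p_y1 <= p_y2 <= p_y3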
                for row, p_label in enumerate(p_y.tolist()):
                    if p_label[1] > p_label[2]:
                        p_y[row, 1] = p_y[row, 2]
                    if p_label[0] > p_y[row, 1]:
                        p_y[row, 0] = p_y[row, 1]

                se_store = (testset_fl.labels - p_y)**2
                re_store = np.abs(testset_fl.labels - p_y) / testset_fl.labels
                are_store = np.arctan(re_store)

                df = pd.DataFrame(
                    data=np.concatenate((testset_fl.labels, p_y, se_store,
                                         re_store, are_store),
                                        axis=1),
                    index=list(range(1, 1 + testset_fl.count)),
                    columns=list(column_headers) +
                    ['P_{}'.format(col) for col in column_headers] +
                    ['SE_{}'.format(col) for col in column_headers] +
                    ['RE_{}'.format(col) for col in column_headers] +
                    ['ARE_{}'.format(col) for col in column_headers])
                print_df_to_excel(df=df, ws=ws)

                col = fn + 1 + 1 + 2 * numel + 3
                mse_store.append(np.mean(se_store))
                mre_store.append(np.mean(re_store))
                mare_store.append(np.mean(are_store))
                ws.cell(1, col).value = 'MSE'
                ws.cell(1, col + 1).value = mse_store[-1]
                ws.cell(2, col).value = 'MRE'
                ws.cell(2, col + 1).value = mre_store[-1]
                ws.cell(3, col).value = 'ARE'
                ws.cell(3, col + 1).value = mare_store[-1]
            else:
                ws.cell(1, 1).value = 'EOF error'
                ws.cell(2, 1).value = model_name_store[model_idx - 2]  # idx is chunk-local; model_idx - 2 is this model's index in model_name_store
                mse_store.append(np.nan)
                mre_store.append(np.nan)
                mare_store.append(np.nan)

    ws = wb[wb.sheetnames[0]]
    df = pd.DataFrame(data=np.array([mse_store, mre_store, mare_store]).T,
                      columns=['mse', 're', 'are'],
                      index=range(1, 1 + len(mse_store)))
    df.insert(0, 'Name', model_name_store)
    print_df_to_excel(df=df, ws=ws)
    wb.save(write_excel)
Example #23
def cutoff_combine_excel_results(dir_store, results_excel_dir, plot_dir,
                                 sheets, fn, numel, plot_mode):
    def get_best_df(dir, name, wb):
        hparam_df = pd.read_excel('{}/hparam_results.xlsx'.format(dir),
                                  index_col=None)
        mse = hparam_df.iloc[:, -1].values
        min_idx = int(hparam_df.iloc[np.argmin(mse), 0])

        xls = pd.ExcelFile('{}/skf_results.xlsx'.format(dir))
        skf_df = pd.read_excel(xls,
                               sheet_name='{}_{}_0'.format(name, min_idx),
                               index_col=0)

        df1 = skf_df.iloc[:, :fn + 1 + 2 * numel].sort_index()
        y_store = df1.iloc[:, fn + 1:fn + 1 + numel].values
        p_y = df1.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel].values
        rc = np.mean(np.abs(y_store - p_y) / y_store)
        mse = np.mean((y_store - p_y)**2)

        df2 = skf_df.iloc[:, fn + 1 + 2 * numel:].reset_index(drop=True)
        best_name = '{}_{}'.format(name, min_idx)
        df2.iloc[0, 2] = best_name
        skf_df = pd.concat([df1, df2], axis=1, sort=False)

        sheet_names = wb.sheetnames
        if name in sheet_names:
            ws = wb[name]
        else:
            wb.create_sheet(name)
            ws = wb[name]

        print_df_to_excel(df=skf_df, ws=ws, index=True, header=True)

        return [best_name, mse, rc]

    while os.path.isfile(results_excel_dir):
        expand = 1
        while True:
            expand += 1
            new_file_name = results_excel_dir.split('.xlsx')[0] + ' - ' + str(
                expand) + '.xlsx'
            if os.path.isfile(new_file_name):
                continue
            else:
                results_excel_dir = new_file_name
                break

    best_store = []
    wb = openpyxl.Workbook()
    for dir, sheet in zip(dir_store, sheets):
        best_store.append(get_best_df(dir, sheet, wb))
    wb.save(results_excel_dir)

    cutoff = [10, 100]
    xls = pd.ExcelFile(results_excel_dir)

    p_y_store = []
    for sheet in sheets:
        df = pd.read_excel(xls, sheet_name=sheet, index_col=0)
        df = df.sort_index()

        p_y = df.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel].values.tolist()
        p_y_store.append(p_y)

    y_store = df.iloc[:, fn + 1:fn + 1 + numel].values
    p_y_store_mean = np.mean(np.array(p_y_store), axis=0)

    combine_mse = np.mean((y_store - p_y_store_mean)**2)
    p_y_store.append(p_y_store_mean.tolist())

    rc = np.mean(np.abs(y_store - p_y_store_mean) / y_store)

    se = (y_store - p_y_store_mean)**2
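    # running mean of the squared errors over the first idx + 1 experiments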
    cumulative_mse = []
    for idx in range(np.shape(se)[0]):
        cumulative_mse.append(np.mean(se[0:idx + 1, :]))

    sheets.append('Combined')

    if plot_mode:
        for idx, [x, p_x_store] in enumerate(
                zip(y_store.tolist(),
                    np.swapaxes(np.array(p_y_store), 0, 1).tolist())):
            plt.plot([0, x[0], x[1], x[2]], [
                0, 0, 10 * (x[1] - x[0]), cutoff[0] *
                (x[1] - x[0]) + cutoff[1] * (x[2] - x[1])
            ],
                     c='r',
                     label='Actual Spline Fit')
            for idx1, p_x in enumerate(p_x_store):
                # Only the ensemble-averaged prediction is plotted (index 3 is
                # the 'Combined' entry when three model sheets are given).
                if idx1 == 3:
                    plt.plot([0, p_x[0], p_x[1], p_x[2]], [
                        0, 0, 10 * (p_x[1] - p_x[0]), cutoff[0] *
                        (p_x[1] - p_x[0]) + cutoff[1] * (p_x[2] - p_x[1])
                    ],
                             label=sheets[idx1])
            plt.legend(loc='upper left')
            plt.title('Expt. ' + str(idx + 1))
            plt.savefig('{}/Expt_{}.png'.format(plot_dir, idx + 1),
                        bbox_inches='tight')
            plt.close()

    df.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel] = np.array(p_y_store[-1])
    df = df.iloc[:, :fn + 1 + 2 * numel]
    df['Cumulative MSE'] = cumulative_mse

    wb = openpyxl.load_workbook(results_excel_dir)
    wb.create_sheet('Results')
    names = wb.sheetnames
    ws = wb[names[-1]]
    print_df_to_excel(df=df, ws=ws, index=True, header=True)

    best_store = np.array(best_store).T.tolist()
    best_store[0].append('Combined')
    best_store[1].append(combine_mse)
    best_store[2].append(rc)

    col = fn + 1 + 1 + 2 * numel + 3
    ws.cell(1, col).value = 'models'
    print_array_to_excel(best_store[0], (1, col + 1), ws, axis=1)
    ws.cell(2, col).value = 'mse'
    print_array_to_excel([[float(x) for x in y] for y in best_store[1:]],
                         (2, col + 1),
                         ws,
                         axis=2)
    ws.cell(3, col).value = 'RC'
    wb.save(results_excel_dir)
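A minimal invocation sketch for cutoff_combine_excel_results; every path, sheet name, and the fn/numel values below are placeholder assumptions rather than values from the original project:

# Hypothetical call: picks the best skf run from each of three model
# directories, averages their predictions, and writes the combined workbook.
cutoff_combine_excel_results(
    dir_store=['./results/ann1', './results/ann2', './results/ann3'],
    results_excel_dir='./results/combined_results.xlsx',
    plot_dir='./results/plots',
    sheets=['ann1', 'ann2', 'ann3'],
    fn=7,         # assumed number of feature columns in the skf sheets
    numel=3,      # assumed number of label columns (the cutoff values)
    plot_mode=False)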
Example #24
def eval_models(model_directory_store, results_dir):
    model_store = []
    for model_directory in model_directory_store:
        model_store.extend(load_model_ensemble(model_directory))

    test_excel_dir = './excel/ett_30testset_cut.xlsx'
    ett_names = [
        'I01-1', 'I01-2', 'I01-3', 'I05-1', 'I05-2', 'I05-3', 'I10-1', 'I10-2',
        'I10-3', 'I30-1', 'I30-2', 'I30-3', 'I50-1', 'I50-2', 'I50-3',
        '125Test', '125Test I01', '125Test I05', '125Test I10'
    ]
    ett_store = [
        './excel/ett_30testset_cut Invariant 1.xlsx',
        './excel/ett_30testset_cut Invariant 1 - 2.xlsx',
        './excel/ett_30testset_cut Invariant 1 - 3.xlsx',
        './excel/ett_30testset_cut Invariant 5.xlsx',
        './excel/ett_30testset_cut Invariant 5 - 2.xlsx',
        './excel/ett_30testset_cut Invariant 5 - 3.xlsx',
        './excel/ett_30testset_cut Invariant 10.xlsx',
        './excel/ett_30testset_cut Invariant 10 - 2.xlsx',
        './excel/ett_30testset_cut Invariant 10 - 3.xlsx',
        './excel/ett_30testset_cut Invariant 30.xlsx',
        './excel/ett_30testset_cut Invariant 30 - 2.xlsx',
        './excel/ett_30testset_cut Invariant 30 - 3.xlsx',
        './excel/ett_30testset_cut Invariant 50.xlsx',
        './excel/ett_30testset_cut Invariant 50 - 2.xlsx',
        './excel/ett_30testset_cut Invariant 50 - 3.xlsx',
        './excel/ett_125trainset_cut.xlsx',
        './excel/ett_125trainset_cut Invariant 1.xlsx',
        './excel/ett_125trainset_cut Invariant 5.xlsx',
        './excel/ett_125trainset_cut Invariant 10.xlsx'
    ]

    fl = load_data_to_fl(
        './excel/Data_loader_spline_full_onehot_R13_cut_CM3.xlsx',
        label_type='cutoff',
        normalise_labels=False,
        norm_mask=[0, 1, 3, 4, 5])
    test_fl = load_testset_to_fl(test_excel_dir,
                                 scaler=fl.scaler,
                                 norm_mask=[0, 1, 3, 4, 5])
    ett_fl_store = [
        load_testset_to_fl(x, scaler=fl.scaler, norm_mask=[0, 1, 3, 4, 5])
        for x in ett_store
    ]

    ytt = test_fl.labels
    yett_store = [ett_fl.labels for ett_fl in ett_fl_store]

    stt_p_y_store = []
    stt_df_store = []
    stt_mse_store = []
    stt_mre_store = []

    sett_p_y_store = []
    sett_df_store = []
    sett_mse_store = []
    sett_mre_store = []

    for model in model_store:
        stt_p_y, stt_df, stt_mse, stt_mre = eval_model_on_fl(model,
                                                             test_fl,
                                                             return_df=True)
        stt_p_y_store.append(stt_p_y)
        stt_df_store.append(stt_df)
        stt_mse_store.append(stt_mse)
        stt_mre_store.append(stt_mre)

        p_y_store = []
        df_store = []
        mse_store = []
        mre_store = []

        for ett_fl in ett_fl_store:
            p_y, df, mse, mre = eval_model_on_fl(model, ett_fl, return_df=True)
            p_y_store.append(p_y)
            df_store.append(df)
            mse_store.append(mse)
            mre_store.append(mre)

        sett_p_y_store.append(p_y_store)
        sett_df_store.append(df_store)
        sett_mse_store.append(mse_store)
        sett_mre_store.append(mre_store)

    p_ytt_selected_mean = np.mean(np.array(stt_p_y_store), axis=0)
    p_yett_store_selected_mean = [
        np.mean(np.array(p_yett), axis=0)
        for p_yett in [list(x) for x in zip(*sett_p_y_store)]
    ]

    def get_mse_re(y, p_y):
        # MSE, plus relative error normalised by the last label column.
        return np.mean((y - p_y)**2), np.mean(np.abs(y - p_y).T / y[:, -1].T)

    mse_tt, re_tt = get_mse_re(ytt, p_ytt_selected_mean)
    mse_re_ett_store = [
        get_mse_re(yett, p_yett)
        for yett, p_yett in zip(yett_store, p_yett_store_selected_mean)
    ]

    var_ett = []
    # Invariant count for each external test set (0 means no perturbed copies).
    idx_store = [
        1, 1, 1, 5, 5, 5, 10, 10, 10, 30, 30, 30, 50, 50, 50, 0, 1, 5, 10
    ]

    for idx, (invariant,
              p_y) in enumerate(zip(idx_store, p_yett_store_selected_mean)):
        if invariant == 0:
            var_ett.append(0)
        else:
            # The first 15 sets are built from the 30-example test set; the
            # remainder come from the 125-example training set.
            if idx < 15:
                base_numel = 30
            else:
                base_numel = 125
            # Mean std. dev. across each base example and its perturbed copies.
            var_ett.append(
                np.mean([
                    np.std(np.concatenate(
                        (p_y[i:i + 1, :],
                         p_y[base_numel + invariant * i:base_numel +
                             invariant * i + invariant, :]),
                        axis=0),
                           axis=0) for i in range(base_numel)
                ]))

    # Printing to excel
    excel_name = results_dir + '/results.xlsx'
    wb = openpyxl.Workbook()
    wb.create_sheet('main')

    def print_results(name, y, p_y, mse, re):
        nonlocal wb
        wb.create_sheet(name)
        ws = wb[name]
        df = pd.DataFrame(np.concatenate((y, p_y), axis=1),
                          columns=['y1', 'y2', 'y3', 'P_y1', 'P_y2', 'P_y3'])
        print_df_to_excel(df=df, ws=ws)
        start_col = len(df.columns) + 3
        ws.cell(1, start_col).value = 'MSE'
        ws.cell(2, start_col).value = 'HE'
        ws.cell(1, start_col + 1).value = mse
        ws.cell(2, start_col + 1).value = re

    print_results('Test', ytt, p_ytt_selected_mean, mse_tt, re_tt)
    [
        print_results(name, yett_store[idx], p_yett_store_selected_mean[idx],
                      mse_re[0], mse_re[1]) for name, idx, mse_re in zip(
                          ett_names, range(len(yett_store)), mse_re_ett_store)
    ]

    df = pd.DataFrame(data=[[mse_tt] + [x[0] for x in mse_re_ett_store],
                            [re_tt] + [x[1] for x in mse_re_ett_store],
                            [0] + var_ett],
                      columns=['Test'] + ett_names,
                      index=['MSE', 'HE', 'Var'])

    print_df_to_excel(df=df, ws=wb['main'], start_row=5)
    wb.save(excel_name)
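A usage sketch for eval_models; the model directories are hypothetical, and note that the test-set Excel paths are hard-coded inside the function itself:

# Hypothetical call: pools the ensembles saved in both directories and
# writes results.xlsx into results_dir.
eval_models(
    model_directory_store=['./models/run1', './models/run2'],
    results_dir='./results')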
Example #25
def get_best_trial_from_rounds_custom_metric(dir_store,
                                             excel_subname,
                                             metric_cols,
                                             weightage,
                                             results_excel_dir,
                                             top_models=1):
    '''
    :param dir_store: list of directory pairs, one per round, where data_store.pkl and the processed Excel results are kept
    :param excel_subname: name of the Excel file to base the model selection on.
    Either overall_summary (for 10-fold aggregated results) or solo_summary (for individual fold results);
    the current body hard-codes overall_summary.xlsx
    :param metric_cols: metric(s) to base the comparison on. Can be a list of metrics; if so, the weightage list must be given
    :param weightage: list of weights to multiply the metrics by
    :param results_excel_dir: name of the Excel file to print the results into
    :param top_models: number of top models to average over for each round
    :return:
    '''
    top_trials_store = []
    test_p_y_store = []

    for round, dir_pair in enumerate(dir_store):
        top_idx_pair = []
        data_store_pairs = []
        for pair_idx, dir in enumerate(dir_pair):
            for filename in os.listdir(dir):
                if filename == 'overall_summary.xlsx':
                    print('Importing data from {} in dir {}'.format(
                        filename, dir))
                    df = pd.read_excel('{}/{}'.format(dir, filename),
                                       index_col=None)
                    column_names = df.columns.tolist()
                    metric_store = np.array(
                        [df[metric].values for metric in metric_cols])
                    new_metric = np.sum(metric_store.T * np.array(weightage),
                                        axis=1)
                    if top_models == 1:
                        idx = np.argmin(new_metric)
                        top_trials_store.append(
                            df.iloc[idx, :].values.tolist() +
                            [new_metric[idx]])
                    else:
                        data_store = []
                        for filename in os.listdir(dir):
                            if filename.endswith(".pkl"):
                                with open('{}/{}'.format(dir, filename),
                                          'rb') as handle:
                                    data_store.extend(pickle.load(handle))
                        name_store = [
                            x[0][0][0].rpartition('_')[0] for x in data_store
                        ]
                        df['new'] = new_metric
                        df.sort_values(by='new', inplace=True)
                        top_rows = list(df.iloc[:top_models]['name'])
                        top_model_idx_store = []
                        if round == 11:
                            top_model_idx_store.extend([0, 1, 2])
                        else:
                            pass
                            # for row in top_rows:
                            #     idx = '_{}_1'.format(row.split('_')[-1])
                            #     top_model_idx_store.extend(
                            #         [i for i, x in enumerate(name_store)
                            #          if x.find(idx) > -1])
                        top_model_idx_store = [
                            idx for row in top_rows
                            for idx, s in enumerate(name_store) if row == s
                        ]
                        top_idx_pair.append(top_model_idx_store)
                        data_store_pairs.append(data_store)

        [combined_row, un_se, un_re, t_se, t_re, var_ett,
         test_p_y] = combine_model(data_store=data_store_pairs,
                                   idx_store=top_idx_pair,
                                   return_test_p_y=True)
        test_p_y_store.append(test_p_y)
        combined_df = pd.DataFrame(np.array(combined_row)[None, :],
                                   columns=column_names)
        metric_store = np.array(
            [float(combined_df[metric].values) for metric in metric_cols])
        top_trials_store.append(
            combined_row + [np.sum(metric_store.T * np.array(weightage))] +
            var_ett + un_se.flatten().tolist() + un_re.flatten().tolist() +
            t_se.flatten().tolist() + t_re.flatten().tolist())

    wb = openpyxl.Workbook()
    ws = wb[wb.sheetnames[-1]]
    ett_names = [
        'I01-1', 'I01-2', 'I01-3', 'I05-1', 'I05-2', 'I05-3', 'I10-1', 'I10-2',
        'I10-3', 'I30-1', 'I30-2', 'I30-3', 'I50-1', 'I50-2', 'I50-3',
        '125Test', '125Test I01', '125Test I05', '125Test I10'
    ]
    df = pd.DataFrame(data=top_trials_store,
                      columns=column_names + ['New Metric'] +
                      ['std {}'.format(x) for x in ett_names] + [
                          'UN SE{}_{}'.format(y + 1, x + 1) for x in range(125)
                          for y in range(3)
                      ] + [
                          'UN RE{}_{}'.format(y + 1, x + 1) for x in range(125)
                          for y in range(3)
                      ] + [
                          'T SE{}_{}'.format(y + 1, x + 1) for x in range(30)
                          for y in range(3)
                      ] + [
                          'T RE{}_{}'.format(y + 1, x + 1) for x in range(30)
                          for y in range(3)
                      ])
    print_df_to_excel(df=df, ws=ws)
    for idx, p_y in enumerate(test_p_y_store):
        wb.create_sheet(str(idx))
        ws = wb[str(idx)]
        print_df_to_excel(pd.DataFrame(p_y), ws=ws)

    wb.save(results_excel_dir)
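A hedged usage sketch; the directory pairs, metric column names, and weights are illustrative assumptions (metric_cols must match headers actually present in overall_summary.xlsx):

get_best_trial_from_rounds_custom_metric(
    dir_store=[['./rounds/r1_a', './rounds/r1_b'],   # one directory pair per round
               ['./rounds/r2_a', './rounds/r2_b']],
    excel_subname='overall_summary',
    metric_cols=['val_mse', 'test_mse'],             # assumed header names
    weightage=[0.5, 0.5],
    results_excel_dir='./results/best_trials.xlsx',
    top_models=3)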
Example #26
def final_prediction_results(write_excel, model_dir_store,
                             combined_excel_store, rounds,
                             excel_loader_dir_store, fn, numel):
    wb = openpyxl.load_workbook(write_excel)
    results_col = fn + 1 + 1 + 2 * numel + 3
    mse_store = []
    mre_store = []
    mare_store = []
    final_excel_loader_dir = excel_loader_dir_store[-1]
    final_features = pd.read_excel(final_excel_loader_dir,
                                   sheet_name='features',
                                   index_col=0).sort_index().values
    final_df = pd.read_excel(final_excel_loader_dir,
                             sheet_name='cutoff',
                             index_col=0).sort_index()
    for idx, (model_dir, combined_excel, loader_excel, round) in enumerate(
            zip(model_dir_store, combined_excel_store, excel_loader_dir_store,
                rounds)):
        wb.create_sheet('Round {}'.format(round))
        ws = wb[wb.sheetnames[-1]]

        combined_df = pd.read_excel(combined_excel,
                                    sheet_name='Results',
                                    index_col=0).sort_index()
        numel_expt = combined_df.shape[0]
        total_expt = final_df.shape[0]

        y_store = combined_df.iloc[:, fn + 1:fn + 1 + numel].values
        p_y_store = combined_df.iloc[:,
                                     fn + 1 + numel:fn + 1 + 2 * numel].values

        if total_expt > numel_expt:
            model_store = load_model_ensemble(model_dir)
            fl = load_data_to_fl(data_loader_excel_file=loader_excel,
                                 norm_mask=[0, 1, 3, 4, 5],
                                 normalise_labels=True,
                                 label_type='cutoff')

            p_y, _ = model_ensemble_prediction(
                model_store, fl.apply_scaling(final_features[numel_expt:, :]))

            y_store = np.concatenate(
                (y_store, final_df.values[numel_expt:, :]), axis=0)
            p_y_store = np.concatenate((p_y_store, p_y), axis=0)

        se_store = (y_store - p_y_store)**2
        re_store = np.abs(y_store - p_y_store) / y_store
        are_store = np.arctan(re_store)

        column_headers = final_df.columns.values.tolist()

        df = pd.DataFrame(data=np.concatenate(
            (y_store, p_y_store, se_store, re_store, are_store), axis=1),
                          index=list(final_df.index),
                          columns=column_headers +
                          ['P_{}'.format(col) for col in column_headers] +
                          ['SE_{}'.format(col) for col in column_headers] +
                          ['RE_{}'.format(col) for col in column_headers] +
                          ['ARE_{}'.format(col) for col in column_headers])
        print_df_to_excel(df=df, ws=ws)

        col = results_col
        mse_store.append(np.mean(se_store))
        mre_store.append(np.mean(re_store))
        mare_store.append(np.mean(are_store))
        ws.cell(1, col).value = 'MSE'
        ws.cell(1, col + 1).value = mse_store[-1]
        ws.cell(2, col).value = 'MRE'
        ws.cell(2, col + 1).value = mre_store[-1]
        ws.cell(3, col).value = 'ARE'
        ws.cell(3, col + 1).value = mare_store[-1]

    wb.create_sheet('Final_results')
    ws = wb[wb.sheetnames[-1]]
    df = pd.DataFrame(data=np.array([mse_store, mre_store, mare_store]),
                      index=['mse', 're', 'are'],
                      columns=rounds)
    print_df_to_excel(df=df, ws=ws)
    wb.save(write_excel)
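An invocation sketch for final_prediction_results, assuming one model directory, combined-results workbook, and data loader per active-learning round; all paths and the fn/numel values are placeholders. Note that write_excel is opened with load_workbook, so the file must already exist:

final_prediction_results(
    write_excel='./results/final_predictions.xlsx',
    model_dir_store=['./models/r1', './models/r2'],
    combined_excel_store=['./results/combined_r1.xlsx',
                          './results/combined_r2.xlsx'],
    rounds=[1, 2],
    excel_loader_dir_store=['./excel/loader_r1.xlsx',
                            './excel/loader_r2.xlsx'],
    fn=7, numel=3)  # assumed feature/label column counts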
Example #27
def mse_tracker(excel_store, write_excel, rounds, headers, fn, numel):
    # If the output file already exists, append ' - 2', ' - 3', ... until a
    # free filename is found so that earlier results are never overwritten.
    if os.path.isfile(write_excel):
        expand = 1
        while True:
            expand += 1
            new_file_name = '{} - {}.xlsx'.format(
                write_excel.split('.xlsx')[0], expand)
            if not os.path.isfile(new_file_name):
                write_excel = new_file_name
                break
    print('Writing into ' + write_excel + '\n')

    mse_store = []
    rc_store = []
    se_store = []
    re_store = []
    are_store = []
    last_expt_store = []
    for excel in excel_store:
        df = pd.read_excel(excel, sheet_name='Results', index_col=0)
        # Hard-coded column offset: columns 16 onwards of the 'Results' sheet
        # hold the per-model mse/RC summary block written beside the table.
        mse_store.append(df.iloc[0, 16:].values.tolist())
        rc_store.append(df.iloc[1, 16:].values.tolist())
        y_store = df.iloc[:, fn + 1:fn + 1 + numel].values
        p_y = df.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel].values
        se = np.square(p_y - y_store)
        se_store.append(se)
        re = np.abs(p_y - y_store) / y_store
        are = np.arctan(re)
        re_store.append(re)
        are_store.append(are)
        last_expt_store.append(np.shape(se)[0])

    wb = openpyxl.Workbook()
    wb.create_sheet('MSE Results')
    ws = wb[wb.sheetnames[-1]]
    mse_df = pd.DataFrame(data=mse_store, index=rounds, columns=headers)
    print_df_to_excel(df=mse_df, ws=ws)

    wb.create_sheet('RE Results')
    ws = wb[wb.sheetnames[-1]]
    rc_df = pd.DataFrame(data=rc_store, index=rounds, columns=headers)
    print_df_to_excel(df=rc_df, ws=ws)

    wb.create_sheet('ARE Results')
    ws = wb[wb.sheetnames[-1]]
    are_df = pd.DataFrame(data=[np.mean(x) for x in are_store],
                          index=rounds,
                          columns=['Combined'])
    print_df_to_excel(df=are_df, ws=ws)

    wb.create_sheet('Batch MSE Results')
    batch_store = []
    re_batch_store = []
    are_batch_store = []
    last_expt_store0 = [0] + last_expt_store[:-1]
    for idx, (last_expt_idx0, last_expt_idx) in enumerate(
            zip(last_expt_store0, last_expt_store)):
        batch = []
        re_batch = []
        are_batch = []
        for se, re, are in zip(se_store[idx:], re_store[idx:],
                               are_store[idx:]):
            batch.append(np.mean(se[last_expt_idx0:last_expt_idx, :]))
            re_batch.append(np.mean(re[last_expt_idx0:last_expt_idx, :]))
            are_batch.append(np.mean(are[last_expt_idx0:last_expt_idx, :]))
        batch_store.append(batch)
        re_batch_store.append(re_batch)
        are_batch_store.append(are_batch)
    batch_store = [[''] * idx + batch for idx, batch in enumerate(batch_store)]
    ws = wb[wb.sheetnames[-1]]
    df = pd.DataFrame(data=batch_store, index=last_expt_store, columns=rounds)
    print_df_to_excel(df=df, ws=ws)

    wb.create_sheet('Batch RE Results')
    re_batch_store = [[''] * idx + re_batch
                      for idx, re_batch in enumerate(re_batch_store)]
    ws = wb[wb.sheetnames[-1]]
    df = pd.DataFrame(data=re_batch_store,
                      index=last_expt_store,
                      columns=rounds)
    print_df_to_excel(df=df, ws=ws)

    wb.create_sheet('Batch ARE Results')
    are_batch_store = [[''] * idx + are_batch
                       for idx, are_batch in enumerate(are_batch_store)]
    ws = wb[wb.sheetnames[-1]]
    df = pd.DataFrame(data=are_batch_store,
                      index=last_expt_store,
                      columns=rounds)
    print_df_to_excel(df=df, ws=ws)

    wb.save(write_excel)
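A usage sketch for mse_tracker; file names, round labels, and model headers are assumptions, and headers must have one entry per summary column in each 'Results' sheet:

mse_tracker(
    excel_store=['./results/combined_r1.xlsx', './results/combined_r2.xlsx'],
    write_excel='./results/mse_tracker.xlsx',
    rounds=[1, 2],
    headers=['ann1', 'ann2', 'ann3', 'Combined'],  # assumed model names
    fn=7, numel=3)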
Example #28
        count_non2 = (ret['stationary case'] != 2).sum()
        count_trend = (ret['ywmk trend'] != 'no trend').sum()
        ret['type'] = data_type_store

        ret = pd.concat((pd.DataFrame.from_dict({'fail count':{'stationary case':count_non2, 'ywmk trend': count_trend}}, orient='index'), ret), axis=0)
        ret = ret[['index', 'type', 'suggested type', 'adf p_value', 'kpss p_value', 'stationary case', 'dfgls p_value',
                   'ywmk p_value', 'ywmk slope', 'ywmk trend']]
        return ret

    # Label each period as 'year:month' (e.g. '2019:12') to match the period
    # lookups below.
    time_stamps = [f'{x}:{y}' for x, y in
                   zip(pd.DatetimeIndex(time_stamps).year, pd.DatetimeIndex(time_stamps).month)]

    # Stationary tests for transformed X
    wb.create_sheet('tests')
    ws = wb['tests']
    print_df_to_excel(df=summary_test(df.copy(), data_type_store=data_type_store), ws=ws)
    df['sasdate'] = time_stamps
    for var in df.columns.values[1:]:
        plt.close()
        df.plot(y=var, x='sasdate')
        plt.savefig(f'{results_dir}/plot_{var}.png')

    # Stationary tests for transformed X BUT exclude COVID ==> Last period 2019:12
    wb.create_sheet('tests 2019-12')
    ws = wb['tests 2019-12']
    print_df_to_excel(df=summary_test(df.iloc[:-6, :].copy(), data_type_store=data_type_store), ws=ws)

    idx = np.where(np.array(time_stamps) == '2015:12')[0][0]
    wb.create_sheet('tests 2015-12')
    ws = wb['tests 2015-12']
    print_df_to_excel(df=summary_test(df.iloc[:idx+1, :].copy(), data_type_store=data_type_store), ws=ws)
def run_classification(grid_fl_dir, write_dir, gamma):
    # Load grid fl
    with open(grid_fl_dir, 'rb') as handle:
        fl = pickle.load(handle)
    # Create 10 fold for cross validation
    fl_store = fl.create_kf(k_folds=10, shuffle=True)
    # Run k model instance to perform skf
    # Results dataframe has the columns: ['idx', 'fold', 'CNT', 'PVA', 'Label', 'Prediction']
    # For each fold, append the fold information to the following lists:
    val_idx = []
    folds = []
    val_features = []
    val_labels = []
    predicted_labels_store = []
    # fl_store is a 10 item list where each item is a tuple containing the train and val fl
    for fold, fl_tuple in enumerate(fl_store):
        instance_start = time.time()
        (ss_fl,
         i_ss_fl) = fl_tuple  # ss_fl is training fl, i_ss_fl is validation fl
        # Train model
        model = SVMmodel(fl=ss_fl, gamma=gamma)
        model.train_model(fl=ss_fl)
        # Evaluation
        predicted_labels = model.predict(i_ss_fl)
        # Saving model
        save_model_name = write_dir + '/models/svm_' + str(fold + 1) + '.pkl'
        print('Saving instance {} model in {}'.format(fold + 1,
                                                      save_model_name))
        with open(save_model_name, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
        # Preparing data to put into new_df that consists of all the validation dataset and its predicted labels
        val_idx.extend(i_ss_fl.idx)
        folds.extend(
            [fold] * i_ss_fl.count
        )  # Make a col that contains the fold number for each example
        if len(val_features):
            val_features = np.concatenate((val_features, i_ss_fl.features),
                                          axis=0)
        else:
            val_features = i_ss_fl.features
        val_labels.extend(i_ss_fl.labels)
        predicted_labels_store.extend(predicted_labels)
        # Printing one instance summary.
        instance_end = time.time()
        print(
            '\nFor k-fold run {} out of {}. Each fold has {} examples. Time taken for '
            'instance = {}\n'
            '####################################################################################################'
            .format(fold + 1, 10, i_ss_fl.count,
                    instance_end - instance_start))

    # Calculating metrics based on complete validation prediction
    mcc = matthews_corrcoef(y_true=val_labels, y_pred=predicted_labels_store)

    # Creating dataframe to print into excel later.
    results_df = np.concatenate(
        (
            np.array(folds)[:, None],  # Convert 1d list to col. vector
            val_features,
            np.array(val_labels)[:, None],
            np.array(predicted_labels_store)[:, None]),
        axis=1)
    headers = ['folds'] + \
              ['CNT', 'PVA'] + \
              ['Labels'] + \
              ['Prediction']
    # val_idx is the original position of the example in the data_loader
    results_df = pd.DataFrame(data=results_df, columns=headers, index=val_idx)
    # Create excel file and print results to excel
    excel_file = create_excel_file(f'{write_dir}/classifier_results.xlsx')
    print('Writing into ' + excel_file)
    wb = openpyxl.Workbook()
    # Create results sheet
    wb.create_sheet('results')
    ws = wb['results']
    # Print results df
    print_df_to_excel(df=results_df, ws=ws)
    # Writing hyperparameter information at the side
    start_col = len(results_df.columns) + 3
    headers = ['mcc', 'gamma']
    values = [mcc, gamma]
    print_array_to_excel(np.array(headers), (1, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values), (2, start_col + 1), ws, axis=1)
    wb.save(excel_file)
    wb.close()
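A minimal call sketch for run_classification; the pickle path and gamma value are illustrative, and write_dir must already contain a models/ subfolder since one SVM per fold is saved there:

run_classification(grid_fl_dir='./grid/grid_fl.pkl',  # hypothetical path
                   write_dir='./results/svm_classifier',
                   gamma=130)  # illustrative kernel width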
Example #30
        excel_dir = create_excel_file('./results/smote_data.xlsx')
        fl = load_data_to_fl(
            data_loader_excel_file=
            './excel/Data_loader_spline_full_onehot_R13_cut_CM3.xlsx',
            normalise_labels=True,
            label_type='cutoff',
            norm_mask=[0, 1, 3, 4, 5])
        f, l = produce_smote(features=fl.features_c,
                             labels=fl.labels,
                             numel=4000)

        wb = openpyxl.Workbook()
        ws = wb[wb.sheetnames[-1]]
        print_df_to_excel(df=pd.DataFrame(
            data=np.concatenate((f, l), axis=1),
            columns=fl.features_c_names.tolist() + fl.labels_names.tolist()),
                          ws=ws)
        wb.save(excel_dir)
        pass
    elif case == 2:
        testset_excel_dir = './excel/ett_125trainset_gf20.xlsx'
        create_invariant_testset(testset_excel_dir=testset_excel_dir, numel=1)
        create_invariant_testset(testset_excel_dir=testset_excel_dir, numel=1)
        create_invariant_testset(testset_excel_dir=testset_excel_dir, numel=1)
        create_invariant_testset(testset_excel_dir=testset_excel_dir, numel=5)
        create_invariant_testset(testset_excel_dir=testset_excel_dir, numel=5)
        create_invariant_testset(testset_excel_dir=testset_excel_dir, numel=5)
        create_invariant_testset(testset_excel_dir=testset_excel_dir, numel=10)
        create_invariant_testset(testset_excel_dir=testset_excel_dir, numel=10)
        create_invariant_testset(testset_excel_dir=testset_excel_dir, numel=10)
        create_invariant_testset(testset_excel_dir=testset_excel_dir, numel=30)