def selector(case):
    """Run one of the hard-coded SVM / SVR / ANN experiments.

    :param case: Experiment to run.
        1 - SVM classification on the full grid data (gamma=130).
        2 - SVR run on the R4 loader with normalised labels.
        3 - ANN run (through ``run_svr``) on the R4 loader, labels not normalised.
        4 - End-point hyperparameter optimisation for the ANN on 10 shuffled folds.
        Any other value does nothing.
    :return: None
    """
    r4_loader = './excel/Data_loader_spline_full_onehot_R4.xlsx'
    mask = [0, 0, 0, 1, 1, 1]

    if case == 1:
        out_dir = create_results_directory(
            results_directory='./results/svm_results with proba')
        run_classification(read_dir='./results/grid full',
                           write_dir=out_dir,
                           gamma=130)
        return

    if case == 2:
        fl = load_data_to_fl(r4_loader, normalise_labels=True, norm_mask=mask)
        out_dir = create_results_directory(
            results_directory='./results/svr_results',
            excels=['svr_results.xlsx'])
        run_svr(fl=fl,
                write_dir=out_dir,
                excel_dir=f'{out_dir}/svr_results.xlsx',
                model_selector='svr',
                gamma=0.2694100909858187)
        return

    if case == 3:
        fl = load_data_to_fl(r4_loader, normalise_labels=False, norm_mask=mask)
        ann_hparams = create_hparams(shared_layers=[30, 30],
                                     epochs=700,
                                     reg_l1=0.05,
                                     reg_l2=0.05,
                                     activation='relu',
                                     batch_size=16,
                                     verbose=0)
        out_dir = create_results_directory(
            results_directory='./results/svr_results',
            excels=['svr_results.xlsx'])
        run_svr(fl=fl,
                write_dir=out_dir,
                excel_dir=f'{out_dir}/svr_results.xlsx',
                model_selector='ann',
                hparams=ann_hparams)
        return

    if case == 4:
        fl = load_data_to_fl(
            './excel/Data_loader_spline_full_onehot_R6_arcsinh.xlsx',
            normalise_labels=False,
            norm_mask=mask)
        folds = fl.create_kf(k_folds=10, shuffle=True)
        out_dir = create_results_directory(
            results_directory='./results/end_hparams_results',
            excels=['svr_results.xlsx', 'hparam_results.xlsx'])
        ann_end_hparam_opt(folds,
                           150,
                           model_selector='ann',
                           write_dir=out_dir,
                           excel_dir=f'{out_dir}/svr_results.xlsx',
                           hparams_excel_dir=f'{out_dir}/hparam_results.xlsx')
def features_pearson_analysis(data_excel, results_directory):
    """Save joint plots (with Pearson statistic) of each feature vs. working range.

    Reads the 'features' and 'cutoff' sheets of ``data_excel``, computes the
    working range as the difference between the last two 'cutoff' columns, and
    inserts it just before the last three feature columns. The rows are then
    split into three groups according to which of those last three columns
    equals 1, and for every group one joint plot per feature is written to
    ``<write_dir>/<feature>_dim_<group index>.png``.

    :param data_excel: Path to the excel workbook holding the 'features' and
        'cutoff' sheets (first column is used as the index).
    :param results_directory: Directory name handed to
        ``create_results_directory``; plots are saved in the directory it returns.
    :return: None
    """
    write_dir = create_results_directory(results_directory)
    # NOTE(review): relies on private/legacy matplotlib font-manager internals
    # and on seaborn's ``stat_func`` argument - confirm the pinned versions.
    try:
        del mpl.font_manager.weight_dict['roman']
        mpl.font_manager._rebuild()
    except KeyError:
        pass
    sns.set(style='ticks')
    mpl.rc('font', family='Times New Roman')

    features = pd.read_excel(data_excel, index_col=0, sheet_name='features')
    cutoffs = pd.read_excel(data_excel, index_col=0, sheet_name='cutoff')
    working_range = cutoffs.iloc[:, -1].values - cutoffs.iloc[:, -2].values
    features.insert(loc=features.shape[-1] - 3,
                    column='Working Range',
                    value=working_range)

    # One frame per indicator column (the last three columns). Explicit copies
    # so that adding a column below does not write onto a slice of `features`.
    groups = [
        features[features.iloc[:, -offset] == 1].iloc[:, :-3].copy()
        for offset in (3, 2, 1)
    ]

    x_store = ['CNT Mass Percentage', 'PVA Mass Percentage', 'Thickness nm',
               'Mxene Mass Percentage']
    mypal = sns.hls_palette(4, l=.3, s=.8)
    for dimension, group in enumerate(groups):
        # Remaining mass fraction after the first two feature columns.
        group['Mxene Mass Percentage'] = 1 - group.iloc[:, 0] - group.iloc[:, 1]
        for x, color in zip(x_store, mypal):
            plt.close()
            sns.jointplot(x=x,
                          y='Working Range',
                          data=group,
                          alpha=0.3,
                          color=color,
                          stat_func=stat.pearsonr)
            plt.savefig('{}/{}_dim_{}.png'.format(write_dir, x, dimension),
                        bbox_inches='tight')
    # Release the last figure as well; the loop only closes the previous one.
    plt.close()
def lvl2_xgb_randomsearch(rawdf, results_dir, pp_choice, param_dir, passthrough,
                          final_pp_choice=None):
    """Randomised search over an XGB final estimator of a stacking regressor.

    Level-1 models (rf, et, xgb) are configured with the best parameters found
    in a previous random search (loaded from ``param_dir``), each wrapped in a
    pipeline with the preprocessing chosen by ``pp_choice``. The search results
    (``cv_results_``) are pickled to ``<results_dir>/results_store.pkl`` under
    the key ``'lvl2_xgb'``.

    :param rawdf: Training frame; last column is the target, the rest features.
    :param results_dir: Directory name passed to ``create_results_directory``.
    :param pp_choice: Preprocessing choice passed to ``pp_selector`` for level 1.
    :param param_dir: Path to a pickle mapping model name -> ``cv_results_``-like
        data containing 'mean_test_score' and 'params'.
    :param passthrough: Whether the final estimator also receives the original
        features (then it is a pipeline with its own preprocessing).
    :param final_pp_choice: Preprocessing choice for the final estimator when
        ``passthrough`` is true.
    :return: None
    """
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }
    # NOTE: pickle.load executes arbitrary code - only load trusted files.
    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {
        k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False)
        for k, v in model_results.items()
    }
    # Positional .iloc[0] picks the best-scoring row. sort_values keeps the
    # original index labels, so label-based .loc[0] would return the first
    # sampled candidate instead of the best one.
    model_object = {
        k: model_object[k].set_params(**{
            kk.split('__')[1]: vv
            for kk, vv in v['params'].iloc[0].items()
        })
        for k, v in model_results.items()
    }
    preprocess_pipeline = pp_selector(pp_choice)
    lvl1_pipeline = [
        (model_name,
         Pipeline([('preprocess', preprocess_pipeline),
                   (model_name, model_object[model_name])]))
        for model_name in model_store
    ]

    if passthrough:
        final_est = Pipeline([
            ('final_preprocess',
             final_est_pipeline(feature_names=x_train.columns.tolist(),
                                preprocess_pipeline=pp_selector(final_pp_choice),
                                no_of_lvl1=len(lvl1_pipeline))),
            ('debugger', DebuggerTransformer(info='final')),
            ('final_est', XGBRegressor())
        ])
        # XGB sits inside the 'final_est' step of the final pipeline.
        prefix = 'final_estimator__final_est__'
    else:
        final_est = XGBRegressor()
        # XGB is the final estimator itself, so there is no pipeline step name.
        prefix = 'final_estimator__'

    final_estimator_params = {
        f'{prefix}n_estimators': scipy.stats.randint(150, 1000),
        f'{prefix}learning_rate': scipy.stats.uniform(0.01, 0.59),
        f'{prefix}subsample': scipy.stats.uniform(0.3, 0.6),
        f'{prefix}max_depth': scipy.stats.randint(1, 16),
        f'{prefix}colsample_bytree': scipy.stats.uniform(0.5, 0.4),
        f'{prefix}min_child_weight': [1, 2, 3, 4],
        f'{prefix}gamma': scipy.stats.expon(scale=0.05),
    }

    est = StackingRegressor(estimators=lvl1_pipeline,
                            final_estimator=final_est,
                            passthrough=passthrough)
    est = RandomizedSearchCV(est,
                             param_distributions=final_estimator_params,
                             cv=5,
                             n_iter=100,
                             scoring=make_scorer(rmsle, greater_is_better=False),
                             verbose=1,
                             n_jobs=-1)
    est.fit(x_train, y_train)
    score = {'lvl2_xgb': est.cv_results_}
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
def run_skf_with_te_nofolds(inputs, plot_spline, smote_numel):
    """Train 'ann3' on the whole R13 data set (no folds) and export test-set results.

    The model is trained and evaluated on the same feature-label object
    (``fl_store=[[fl, fl]]``), after which the saved models are evaluated by
    ``testset_model_results_to_excel`` into ``training_error.xlsx``.

    :param inputs: 6-item sequence ``(shared, end, pre, filters, epochs,
        label_type)``.
    :param plot_spline: Not used by this function.
    :param smote_numel: If truthy, SMOTE-augmented folds are generated with this
        ``numel``; otherwise plain 10-fold splits are generated. The folds are
        not passed on to training.
    :return: None
    """
    shared, end, pre, filters, epochs, label_type = inputs
    loader_excel = './excel/Data_loader_spline_full_onehot_R13_cut_CM3.xlsx'

    hparams = create_hparams(shared_layers=[30, 30],
                             ts_layers=[5, 5],
                             cs_layers=[5, 5],
                             shared=shared,
                             end=end,
                             pre=pre,
                             filters=filters,
                             epochs=epochs,
                             reg_l1=0.0005,
                             reg_l2=0.,
                             max_depth=100,
                             num_est=1000,
                             epsilon=0.0001,
                             c=0.001,
                             activation='relu',
                             batch_size=4,
                             verbose=0)
    out_dir = create_results_directory(
        './results/skf',
        folders=['plots', 'models', 'learning rate plots'],
        excels=['skf_results', 'te.xlsx'])
    fl = load_data_to_fl(loader_excel,
                         label_type=label_type,
                         normalise_labels=False,
                         norm_mask=[0, 1, 3, 4, 5])

    # The fold split is still generated although its result is not consumed;
    # kept because the call may have side effects - TODO confirm.
    if smote_numel:
        fl.fold_smote_kf_augment(k_folds=10, shuffle=True, numel=smote_numel)
    else:
        fl.create_kf(k_folds=10, shuffle=True)

    run_skf_with_training_error(model_mode='ann3',
                                loss_mode='ann',
                                fl=fl,
                                fl_store=[[fl, fl]],
                                hparams=hparams,
                                skf_file=f'{out_dir}/skf_results.xlsx',
                                te_sheet=f'{out_dir}/te.xlsx',
                                skf_sheet=None,
                                k_folds=10,
                                k_shuffle=True,
                                save_model=True,
                                save_model_name=None,
                                save_model_dir=f'{out_dir}/models/',
                                plot_name=f'{out_dir}/learning rate plots/plot')

    training_error_wb = create_excel_file(f'{out_dir}/training_error.xlsx')
    testset_model_results_to_excel(write_excel=training_error_wb,
                                   model_dir_store=[f'{out_dir}/models'],
                                   loader_excel=loader_excel,
                                   testset_excel_dir=loader_excel,
                                   fn=6,
                                   numel=3,
                                   chunks=10)
def plot_forecasts(save_dir_store, results_dir, model_names, est_store, h_store):
    """Plot forecast errors and rolling RMSE of several models for each horizon.

    For every horizon ``h`` the pickled results of each model are loaded, the
    model's ``*_ehat`` column is collected into one frame, and five PNGs are
    written to the results directory: all errors, a subset whose column names
    contain 'ar', 'pca' or 'rw_rh', the 12-period rolling RMSE, the rolling
    RMSE without the rw/ar/pca benchmarks, and the latter as a normalised
    stacked area plot.

    :param save_dir_store: One sequence of pickle paths per horizon, aligned
        with ``model_names``. Each pickle must hold a dict with key 'data_df'.
    :param results_dir: Directory name passed to ``create_results_directory``.
    :param model_names: Model identifiers ('rw', 'ar', 'pca', 'rf', or names
        containing 'xgba').
    :param est_store: Estimation labels aligned with ``model_names``; used in
        the column names of the xgba and rf entries.
    :param h_store: Forecast horizons, aligned with ``save_dir_store``.
    :return: None
    """
    results_dir = create_results_directory(results_dir)
    benchmarks = ['rw', 'ar', 'pca']
    for h, sds in zip(h_store, save_dir_store):
        for idx, (model_name, est, save_dir) in enumerate(
                zip(model_names, est_store, sds)):
            # NOTE: pickle.load executes arbitrary code - only load trusted files.
            with open(save_dir, 'rb') as handle:
                data_df = pickle.load(handle)['data_df']
            if idx == 0:
                # The first model's frame provides the realised values.
                df = data_df[[f'y_{h}']]
            if model_name in benchmarks:
                df = pd.concat((df, data_df[[f'{model_name}_ehat']]), axis=1)
            elif 'xgba' in model_name:
                df = pd.concat(
                    (df, data_df[['rw_ehat']].rename(
                        columns={'rw_ehat': f'{model_name}_rw_{est}_ehat'})),
                    axis=1)
            elif model_name == 'rf':
                df = pd.concat(
                    (df, data_df[['rf_ehat']].rename(
                        columns={'rf_ehat': f'rf_{est}_ehat'})),
                    axis=1)

        ax = df[[x for x in df.columns if '_ehat' in x]].plot(lw=0.5)
        # set_ylabel actually draws the label; assigning ``ax.ylabel`` only
        # created an unused attribute.
        ax.set_ylabel('ehat')
        plt.savefig(f'{results_dir}/{h}_ehat_all.png', bbox_inches='tight')
        plt.close()

        ax = df[[
            x for x in df.columns
            if any(y in x for y in ['ar', 'pca', 'rw_rh'])
        ]].plot(lw=0.5)
        ax.set_ylabel('ehat')
        plt.savefig(f'{results_dir}/{h}_ehat_arpcaxgbarwrh.png',
                    bbox_inches='tight')
        plt.close()

        # Keep only the error columns and strip the '_ehat' suffix from names.
        df = df[[x for x in df.columns if '_ehat' in x]]
        df.columns = [x.partition('_ehat')[0] for x in df.columns]
        df_temp = df.pow(2).rolling(12, min_periods=1).apply(
            lambda x: np.sqrt(x.mean()))
        df_temp.plot(lw=0.5)
        plt.savefig(f'{results_dir}/{h}_rolling_rmse.png', bbox_inches='tight')
        plt.close()

        non_benchmark = df_temp.drop(benchmarks, axis=1)
        non_benchmark.plot(lw=0.5)
        plt.savefig(f'{results_dir}/{h}_xgb_rolling_rmse.png',
                    bbox_inches='tight')
        plt.close()

        # Share of each model in the row-wise sum of rolling RMSEs.
        non_benchmark.divide(non_benchmark.sum(axis=1),
                             axis=0).plot.area(lw=0.5)
        plt.savefig(f'{results_dir}/{h}_xgb_rolling_rmse_stackedplot.png',
                    bbox_inches='tight')
        plt.close()
def selector(run, **kwargs):
    """Dispatch one of the acquisition / optimisation experiments.

    :param run: Experiment id.
        1   - ``acquisition_opt``; needs ``write_dir``.
        1.1 - ``acquisition_opt_pso_ga``; needs ``write_dir``, ``round``,
              ``batch`` and ``initial_guess``.
        2   - plot acquisition splines for a fixed results directory.
        3   - variance-error experiment for 'conv1'.
        4   - ``l2_points_opt``; needs ``numel``, ``svm_store``,
              ``seed_number_expt``, ``total_expt`` and ``write_dir``.
        Any other value does nothing.
    :param kwargs: Run-specific keyword arguments listed above.
    :return: None

    Runs 1 and 1.1 read the module-level name ``bounds``.
    """
    if run == 1:
        out_dir = kwargs['write_dir']
        acquisition_opt(
            bounds=bounds,
            model_directory=f'{out_dir}/models',
            svm_directory='./results/svm gamma130/models',
            loader_file='./excel/Data_loader_spline_full_onehot_R1_cut_CM3.xlsx',
            total_run=2000,
            random_run=1700,
            batch_runs=1,
            normalise_labels=False,
            norm_mask=[0, 1, 3, 4, 5],
            acquisition_file=f'{out_dir}/acq.xlsx')
        return

    if run == 1.1:
        out_dir = kwargs['write_dir']
        round_no = kwargs['round']
        batch = kwargs['batch']
        initial_guess = kwargs['initial_guess']
        pso_ga_params = {
            'c1': 1.5, 'c2': 1.5, 'wmin': 0.4, 'wmax': 0.9,
            'ga_iter_min': 2, 'ga_iter_max': 10, 'iter_gamma': 10,
            'ga_num_min': 5, 'ga_num_max': 20, 'num_beta': 15,
            'tourn_size': 3, 'cxpd': 0.9, 'mutpd': 0.05, 'indpd': 0.5,
            'eta': 0.5, 'pso_iter': 10, 'swarm_size': 300,
        }
        acquisition_opt_pso_ga(
            bounds=bounds,
            write_dir=out_dir,
            svm_directory='./results/svm gamma130/models',
            loader_file=f'./excel/Data_loader_spline_full_onehot_R{round_no}_cut_CM3.xlsx',
            batch_runs=batch,
            pso_params=pso_ga_params,
            initial_guess=initial_guess,
            normalise_labels=False,
            norm_mask=[0, 1, 3, 4, 5])
        return

    if run == 2:
        plot_acq_splines(write_dir='./results/actual/conv1 run2', fn=6)
        return

    if run == 3:
        hparams = create_hparams(shared_layers=[50, 50],
                                 ts_layers=[5, 5],
                                 cs_layers=[5, 5],
                                 epochs=5000,
                                 reg_l1=0.05,
                                 reg_l2=0.,
                                 activation='relu',
                                 batch_size=16,
                                 verbose=0)
        out_dir = create_results_directory(
            './results/acq',
            folders=['plots', 'models', 'learning rate plots'],
            excels=['acq_exp'])
        variance_error_experiement('conv1',
                                   'ann',
                                   norm_mask=None,
                                   labels_norm=False,
                                   loader_file='./excel/Data_loader_spline.xlsx',
                                   model_dir=f'{out_dir}/models/',
                                   hparams=hparams,
                                   results_excel=f'{out_dir}/acq_exp.xlsx')
        return

    if run == 4:
        numel = kwargs['numel']
        svm_store = kwargs['svm_store']
        seed_number_expt = kwargs['seed_number_expt']
        total_expt = kwargs['total_expt']
        out_dir = kwargs['write_dir']
        l2_points_opt(numel=numel,
                      write_dir=out_dir,
                      svm_directory=svm_store,
                      l2_opt=False,
                      seed_number_of_expt=seed_number_expt,
                      total_expt=total_expt)
def run_preprocess(select):
    """Run one of the preprocessing routines on a hard-coded excel file.

    :param select: 1 - polynomial cutoff preprocessing of the raw data;
        2 - spline preprocessing of the round-2 raw data;
        3 - grid data preprocessing. Any other value does nothing.
    :return: None
    """
    if select == 1:
        out_dir = create_results_directory('./results/preprocess_poly',
                                           excels=['results'])
        read_excel_data('./excel/Raw_Data_caa_090219.xlsx',
                        write_excel_file=f'{out_dir}/results.xlsx',
                        normalise_r=False,
                        mode='multipoly_cutoff',
                        plot_directory=f'{out_dir}/plots',
                        poly=2)
        return

    if select == 2:
        out_dir = create_results_directory('./results/preprocess')
        read_excel_data_to_spline(
            read_excel_file='./excel/Raw_Data_Round2_removed_outlier_a.xlsx',
            write_dir=out_dir,
            discrete_points=20,
            spline_selector=1)
        return

    if select == 3:
        out_dir = create_results_directory('./results/grid')
        read_grid_data(read_excel_file='./excel/grid.xlsx', write_dir=out_dir)
def selector(case):
    """Run the SVM demo steps on the grid data.

    :param case: 1 - hyperparameter search for the SVM (20 runs, results
        written to an excel file); 2 - train and save SVM classifiers for
        gamma=130. Any other value does nothing.
    :return: None
    """
    grid_data = './demo/grid/grid_data'

    if case == 1:
        # Hyperparameter optimisation to find the best gamma.
        svm_hparam_opt(grid_fl_dir=grid_data,
                       total_run=20,
                       write_excel_dir='./results/svm_hparam_opt.xlsx')
        return

    if case == 2:
        # Fit the classifier at one fixed gamma and keep the trained models.
        out_dir = create_results_directory(
            './results/svm_classifier/gamma130', folders=['models'])
        run_classification(grid_fl_dir=grid_data,
                           write_dir=out_dir,
                           gamma=130)
def run_preprocess(select):
    """Choose which preprocessing routine to run.

    Only the input/output file locations are fixed here; the actual work is
    done by the called preprocessing functions.

    :param select: 1 - cutoff preprocessing of the round-13 raw data;
        2 - grid data preprocessing; 3 - L2 tracking on the rounded round-13
        loader. Any other value does nothing.
    :return: None
    """
    if select == 1:
        out_dir = create_results_directory(
            './results/preprocessing/preprocess_round13')
        read_excel_data_to_cutoff(
            read_excel_file='./excel/Raw_Data_Round13.xlsx',
            write_dir=out_dir)
        return

    if select == 2:
        out_dir = create_results_directory('./results/preprocessing/grid')
        read_grid_data(read_excel_file='./excel/grid.xlsx', write_dir=out_dir)
        return

    if select == 3:
        last_indices = [
            11, 16, 21, 29, 37, 45, 69, 77, 85, 93, 101, 109, 117, 125
        ]
        l2_tracker(
            write_excel_dir='./results/preprocessing/l2_information_rounded.xlsx',
            final_excel_loader='./excel/Data_loader_Round13_rounded.xlsx',
            last_idx_store=last_indices)
def lvl2_ridgecv(rawdf, results_dir, pp_choice, param_dir, passthrough,
                 final_pp_choice=None, ):
    """Cross-validate a stacking regressor with a RidgeCV final estimator.

    Level-1 models (rf, et, xgb) are configured with the best parameters found
    in a previous random search (loaded from ``param_dir``); each gets its own
    preprocessing pipeline. The 5-fold ``cross_validate`` output is pickled to
    ``<results_dir>/results_store.pkl``.

    :param rawdf: Training frame; last column is the target, the rest features.
    :param results_dir: Directory name passed to ``create_results_directory``.
    :param pp_choice: Iterable of preprocessing choices, paired in order with
        the models 'rf', 'et', 'xgb'.
    :param param_dir: Path to a pickle mapping model name -> ``cv_results_``-like
        data containing 'mean_test_score' and 'params'.
    :param passthrough: Whether the final estimator also receives the original
        features (then it is a pipeline with its own preprocessing).
    :param final_pp_choice: Preprocessing choice for the final estimator when
        ``passthrough`` is true.
    :return: None
    """
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }
    # NOTE: pickle.load executes arbitrary code - only load trusted files.
    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {
        k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False)
        for k, v in model_results.items()
    }
    # Positional .iloc[0] picks the best-scoring row. sort_values keeps the
    # original index labels, so label-based .loc[0] would return the first
    # sampled candidate instead of the best one.
    model_object = {
        k: model_object[k].set_params(**{
            kk.split('__')[1]: vv
            for kk, vv in v['params'].iloc[0].items()
        })
        for k, v in model_results.items()
    }
    lvl1_pipeline = [
        (model_name,
         Pipeline([('preprocess', pp_selector(pipeline_idx)),
                   ('debugger', DebuggerTransformer(info='lvl1')),
                   (model_name, model_object[model_name])]))
        for model_name, pipeline_idx in zip(model_store, pp_choice)
    ]

    if passthrough:
        final_est = Pipeline([
            ('final_preprocess',
             final_est_pipeline(feature_names=x_train.columns.tolist(),
                                preprocess_pipeline=pp_selector(final_pp_choice),
                                no_of_lvl1=len(lvl1_pipeline))),
            ('debugger', DebuggerTransformer(info='final')),
            ('final_est', RidgeCV())
        ])
    else:
        final_est = RidgeCV()

    est = StackingRegressor(estimators=lvl1_pipeline,
                            final_estimator=final_est,
                            passthrough=passthrough)
    score = cross_validate(est,
                           x_train,
                           y_train,
                           cv=5,
                           return_train_score=True,
                           scoring=make_scorer(rmsle, greater_is_better=False))
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
def run_skf_conv1(inputs, plot_spline, smote_numel):
    """Run 10-fold cross validation with the 'dtr' model on the R13 data set.

    After training, optional plots are produced from the 'ann3' sheet of the
    results workbook, and the saved models are evaluated by
    ``testset_model_results_to_excel`` into ``training_error.xlsx``.

    :param inputs: 6-item sequence ``(shared, end, pre, filters, epochs,
        label_type)``.
    :param plot_spline: If truthy, plot predicted splines (``label_type ==
        'points'``) or cutoffs (``label_type == 'cutoff'``).
    :param smote_numel: If truthy, folds are SMOTE-augmented with this
        ``numel``; otherwise plain shuffled 10-fold splits are used.
    :return: The results directory created for this run.
    """
    shared, end, pre, filters, epochs, label_type = inputs
    loader_excel = './excel/Data_loader_spline_full_onehot_R13_cut_CM3.xlsx'

    hparams = create_hparams(shared_layers=[30, 30],
                             ts_layers=[5, 5],
                             cs_layers=[5, 5],
                             shared=shared,
                             end=end,
                             pre=pre,
                             filters=filters,
                             epochs=epochs,
                             reg_l1=0.05,
                             reg_l2=0.,
                             max_depth=5,
                             num_est=200,
                             activation='relu',
                             batch_size=16,
                             verbose=0)
    out_dir = create_results_directory(
        './results/skf',
        folders=['plots', 'models', 'learning rate plots'],
        excels=['skf_results'])
    fl = load_data_to_fl(loader_excel,
                         label_type=label_type,
                         normalise_labels=True,
                         norm_mask=[0, 0, 0, 1, 1, 1])

    folds = (fl.fold_smote_kf_augment(k_folds=10, shuffle=True,
                                      numel=smote_numel)
             if smote_numel else fl.create_kf(k_folds=10, shuffle=True))

    results_xlsx = f'{out_dir}/skf_results.xlsx'
    run_skf(model_mode='dtr',
            loss_mode='dtr',
            fl=fl,
            fl_store=folds,
            hparams=hparams,
            skf_file=results_xlsx,
            skf_sheet=None,
            k_folds=10,
            k_shuffle=True,
            save_model=True,
            save_model_name=None,
            save_model_dir=f'{out_dir}/models/',
            plot_name=f'{out_dir}/learning rate plots/plot')

    if plot_spline and label_type == 'points':
        plot_arcsinh_predicted_splines(
            plot_dir=f'{out_dir}/plots',
            results_excel_dir=results_xlsx,
            end_excel_dir='./results/combine Round 6/end 6e.xlsx',
            sheets=['ann3'],
            fn=6,
            numel=100)
    elif plot_spline and label_type == 'cutoff':
        plot_cutoff(plot_dir=f'{out_dir}/plots',
                    results_excel_dir=results_xlsx,
                    sheets=['ann3'],
                    fn=6,
                    numel=3)

    training_error_wb = create_excel_file(f'{out_dir}/training_error.xlsx')
    testset_model_results_to_excel(write_excel=training_error_wb,
                                   model_dir_store=[f'{out_dir}/models'],
                                   loader_excel=loader_excel,
                                   testset_excel_dir=loader_excel,
                                   fn=6,
                                   numel=3,
                                   chunks=10)
    return out_dir
data_store = prepare_grand_data_store([ './results/hparams_opt round 13 DTR', './results/hparams_opt round 13 dtr_deep_I10_round_13', './results/hparams_opt round 13 dtr_deep_I50_round_13', './results/hparams_opt round 13 dtr_deep_I100_round_13', './results/hparams_opt round 13 ann Invariant HE', './results/hparams_opt round 13 ann NDA HE', ]) hparams = { 'init': [0.95, 0.05], 'n_gen': 800, 'n_pop': 5000, 'eval_func': 'eval2' } results_dir = create_results_directory(write_dir) ga_train_val_eval_on_test(results_dir=results_dir, data_store=data_store, hparams=hparams) elif case == 4: write_dir = kwargs['write_dir'] file_name = '{}/data_store.pkl'.format(write_dir) # Load data (deserialize) with open(file_name, 'rb') as handle: data_store = pickle.load(handle) read_hparam_data(data_store=data_store, write_dir=write_dir) pass selector(case=2, write_dir='./results/ga combination deep,N,S,I10,50,100_N,S,I10')
levels, model=id['model'], nber_excel_dir='./excel/NBER_062020.xlsx', est_dates=est_dates, first_est_date=first_est_date, combinations=[ ['rw', 'll*ln'], ['rw', 'llt*ln'], ['rw', 'll*ln', 'llt*ln'], ]) elif case == 4: # Run poos experiment for ar4 or pca var_name = kwargs['var_name'] excel_dir = kwargs['excel_dir'] results_dir = create_results_directory( './results/expt1/{}'.format(var_name)) output = read_excel_dataloader(excel_dir=excel_dir) fl_master = Fl_master(x=output[0], features_names=output[1], yo=output[2], labels_names=output[3], y=output[4], y_names=output[5], time_stamp=output[6]) fl = Fl_ar(val_split=None, x=None, yo=None, y=None, time_stamp=None, time_idx=None, features_names=fl_master.features_names,
def lvl1_randomsearch(rawdf, testdf, results_dir, pp_choice, lt_choice=None):
    '''
    Randomised hyperparameter search for the level-1 models (rf, et, xgb).

    For each model a 100-iteration, 5-fold RandomizedSearchCV is fitted on the
    training data, test-set predictions of the refitted best model are written
    to ``<results_dir>/<model>_<last dir component>_predictions.csv``, and the
    ``cv_results_`` of all models are pickled to
    ``<results_dir>/results_store.pkl``.

    :param rawdf: Training frame; last column is the target, the rest features.
    :param testdf: Test features; must contain an 'Id' column.
    :param results_dir: Directory name passed to create_results_directory.
    :param pp_choice: preprocessing choice
    :param lt_choice: label transformation choice. None is no transformation
        (scored with rmsle). 1 trains on log(y) and exponentiates predictions.
        2 does the same and adds the correction returned by ``get_corr`` before
        exponentiating.
    :return: None
    :raises ValueError: if lt_choice is not one of None, 1, 2.
    '''
    # Fail early with a clear message (and before creating any directory):
    # an unsupported choice would otherwise leave the scorer undefined.
    if lt_choice is not None and lt_choice != 1 and lt_choice != 2:
        raise ValueError(
            f'lt_choice must be None, 1 or 2, got {lt_choice!r}')

    results_dir = create_results_directory(results_dir)
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    x_test = testdf
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }
    model_param = {
        'xgb': {'xgb__n_estimators': scipy.stats.randint(150, 1000),
                'xgb__learning_rate': scipy.stats.uniform(0.01, 0.59),
                'xgb__subsample': scipy.stats.uniform(0.3, 0.6),
                'xgb__max_depth': scipy.stats.randint(1, 16),
                'xgb__colsample_bytree': scipy.stats.uniform(0.5, 0.4),
                'xgb__min_child_weight': [1, 2, 3, 4],
                'xgb__gamma': scipy.stats.expon(scale=0.05),
                },
        'rf': {"rf__max_depth": [None],
               "rf__max_features": scipy.stats.randint(1, 11),
               "rf__min_samples_split": scipy.stats.randint(2, 11),
               "rf__min_samples_leaf": scipy.stats.randint(1, 11),
               "rf__bootstrap": [False],
               "rf__n_estimators": scipy.stats.randint(10, 300),
               },
        'et': {"et__max_depth": [None],
               "et__max_features": scipy.stats.randint(1, 11),
               "et__min_samples_split": scipy.stats.randint(2, 11),
               "et__min_samples_leaf": scipy.stats.randint(1, 11),
               "et__bootstrap": [False],
               "et__n_estimators": scipy.stats.randint(10, 300),
               }
    }
    results_store = {}
    preprocess_pipeline = pp_selector(pp_choice, rawdf)

    if lt_choice is None:
        scorer = make_scorer(rmsle, greater_is_better=False)
    else:
        # Train on log(y); RMSE on the log target is used as the score.
        y_train = np.log(y_train)
        scorer = 'neg_root_mean_squared_error'

    for model_name in model_store:
        model = Pipeline([
            ('preprocess', preprocess_pipeline),
            (model_name, model_object[model_name])
        ])
        clf = RandomizedSearchCV(model,
                                 param_distributions=model_param[model_name],
                                 cv=5,
                                 n_iter=100,
                                 scoring=scorer,
                                 verbose=1,
                                 n_jobs=-1,
                                 refit=True)
        clf.fit(x_train, y_train)
        results_store[model_name] = clf.cv_results_

        if lt_choice is None:
            pred_y_test = clf.predict(x_test)
        elif lt_choice == 1:
            pred_y_test = np.exp(clf.predict(x_test))
        else:
            pred_logy_test = clf.predict(x_test)
            pred_y_test = np.exp(pred_logy_test + get_corr(
                np.exp(y_train),
                clf.predict(x_train),
                error_func=rmsle,
                options={'gtol': 1e-04}))

        sub = pd.DataFrame()
        sub['Id'] = x_test['Id']
        sub['SalePrice'] = pred_y_test
        sub.to_csv(
            f'{results_dir}/{model_name}_{results_dir.split("/")[-1]}_predictions.csv',
            index=False)

        # Rewritten after every model so finished searches survive a later
        # failure; the final file contains all models.
        with open(f'{results_dir}/results_store.pkl', 'wb') as f:
            pickle.dump(results_store, f)
def lvl2_xgb_vsrandomsearch(rawdf, results_dir, pp_choice, param_dir, passthrough,
                            final_pp_choice=None):
    """Randomised search for an XGB level-2 model on StackingTransformer features.

    Level-1 models (rf, et, xgb) are configured with the best parameters found
    in a previous random search (loaded from ``param_dir``) and stacked with a
    ``StackingTransformer`` on the preprocessed training data. An XGB final
    estimator is then tuned on the stacked features (optionally concatenated
    with the raw features). The search ``cv_results_`` are pickled to
    ``<results_dir>/results_store.pkl`` under the key ``'lvl2ptvs_xgb'``.

    :param rawdf: Training frame; last column is the target, the rest features.
    :param results_dir: Directory name passed to ``create_results_directory``.
    :param pp_choice: Preprocessing choice passed to ``pp_selector`` for level 1.
    :param param_dir: Path to a pickle mapping model name -> ``cv_results_``-like
        data containing 'mean_test_score' and 'params'.
    :param passthrough: If true, raw feature values are appended to the stacked
        features and the final estimator is a pipeline with its own
        preprocessing.
    :param final_pp_choice: Preprocessing choice for the final estimator when
        ``passthrough`` is true.
    :return: None
    """
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }
    # NOTE: pickle.load executes arbitrary code - only load trusted files.
    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {
        k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False)
        for k, v in model_results.items()
    }
    # Positional .iloc[0] picks the best-scoring row. sort_values keeps the
    # original index labels, so label-based .loc[0] would return the first
    # sampled candidate instead of the best one.
    model_object = {
        k: model_object[k].set_params(**{
            kk.split('__')[1]: vv
            for kk, vv in v['params'].iloc[0].items()
        })
        for k, v in model_results.items()
    }
    preprocess_pipeline = pp_selector(pp_choice)
    lvl1_pipeline = [(model_name, model_object[model_name])
                     for model_name in model_store]
    stack = StackingTransformer(estimators=lvl1_pipeline,  # base estimators
                                regression=True,  # regression task
                                variant='A',
                                metric=rmsle,  # metric: callable
                                n_folds=5,  # number of folds
                                shuffle=True,  # shuffle the data
                                random_state=0,  # ensure reproducibility
                                verbose=0)
    # Preprocess once and reuse the same array for fit and transform instead
    # of fitting the preprocessing pipeline twice on identical data.
    x_train_pp = preprocess_pipeline.fit_transform(x_train)
    stack.fit(x_train_pp, y_train)
    s_train = stack.transform(x_train_pp)

    if passthrough:
        final_est = Pipeline([
            ('final_preprocess',
             final_est_pipeline(feature_names=x_train.columns.tolist(),
                                preprocess_pipeline=pp_selector(final_pp_choice),
                                no_of_lvl1=len(lvl1_pipeline))),
            ('final_est', XGBRegressor())
        ])
        est_name = 'final_est__'
        train = np.concatenate((s_train, x_train.values), axis=1)
    else:
        final_est = XGBRegressor()
        est_name = ''
        train = s_train

    final_estimator_params = {
        f'{est_name}n_estimators': scipy.stats.randint(150, 1000),
        f'{est_name}learning_rate': scipy.stats.uniform(0.01, 0.59),
        f'{est_name}subsample': scipy.stats.uniform(0.3, 0.6),
        f'{est_name}max_depth': scipy.stats.randint(1, 16),
        f'{est_name}colsample_bytree': scipy.stats.uniform(0.5, 0.4),
        f'{est_name}min_child_weight': [1, 2, 3, 4],
        f'{est_name}gamma': scipy.stats.expon(scale=0.05),
    }
    est = RandomizedSearchCV(final_est,
                             param_distributions=final_estimator_params,
                             cv=5,
                             n_iter=100,
                             scoring=make_scorer(rmsle, greater_is_better=False),
                             verbose=1,
                             n_jobs=-1)
    est.fit(train, y_train)
    score = {'lvl2ptvs_xgb': est.cv_results_}
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
def ga_opt(load_dir_store, hparams):
    """Select an ensemble of models with a genetic algorithm and report it.

    Each individual is a 0/1 list with one entry per loaded model. Its fitness
    (minimised) is ``(re_train + 2 * re_val) / 3`` where ``re_*`` is the mean
    relative error of the averaged predictions of the selected models. After
    the run, convergence plots are saved and the best allocation vector, the
    per-dataset averaged predictions and a loss summary are written to
    ``ga_results.xlsx`` in a new results directory.

    :param load_dir_store: Directories passed to ``prepare_grand_data_store``;
        each yields a dict with an 'info' entry (containing 'model_name') and
        one entry per dataset (at least 'train' and 'val') holding a 'df'
        whose last three columns are predictions and the three before them the
        true labels.
    :param hparams: Dict with 'init' (probabilities for gene values 0 and 1),
        'n_pop' (population size) and 'n_gen' (number of generations).
    :return: None
    """
    # Load all the saved data_store.pkl into data_store list
    data_store = prepare_grand_data_store(load_dir_store)
    yt = data_store[0]['train']['df'].iloc[:, -6:-3].values
    p_yt_store = np.array(
        [data['train']['df'].iloc[:, -3:].values for data in data_store])
    yv = data_store[0]['val']['df'].iloc[:, -6:-3].values
    p_yv_store = np.array(
        [data['val']['df'].iloc[:, -3:].values for data in data_store])

    def evaluate(individual):
        # Individual is a list of 0 or 1; entry j == 1 means model j is
        # included in the ensemble.
        # NOTE(review): an all-zero individual averages over an empty
        # selection, which numpy evaluates to NaN - confirm that is acceptable.
        selected_mask = [
            idx for idx, value in enumerate(individual) if value == 1
        ]
        # Mean relative error of the averaged predictions of selected models
        re_t = mean_relative_error(
            yt, np.mean(p_yt_store[selected_mask, :, :], axis=0))
        re_v = mean_relative_error(
            yv, np.mean(p_yv_store[selected_mask, :, :], axis=0))
        re = (re_t + 2 * re_v) / 3
        return (re, )

    # The negative weight makes this a minimisation despite the class name.
    creator.create("FitnessMax", base.Fitness, weights=(-1, ))
    creator.create("Individual", list, fitness=creator.FitnessMax)
    toolbox = base.Toolbox()
    toolbox.register("attr_bool",
                     np.random.choice,
                     np.arange(0, 2),
                     p=hparams['init'])
    toolbox.register("individual",
                     tools.initRepeat,
                     creator.Individual,
                     toolbox.attr_bool,
                     n=len(data_store))
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", evaluate)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.2)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # Logging
    stats = tools.Statistics(key=lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)
    pop = toolbox.population(n=hparams['n_pop'])
    hof = tools.HallOfFame(1)

    # Run the GA algorithm
    pop, logbook = algorithms.eaSimple(toolbox=toolbox,
                                       population=pop,
                                       cxpb=0.5,
                                       mutpb=0.2,
                                       ngen=hparams['n_gen'],
                                       halloffame=hof,
                                       stats=stats,
                                       verbose=True)

    # Create the ga results dir
    results_dir = create_results_directory('./results/ga/ga_opt',
                                           folders=['plots'],
                                           excels=['ga_results'])

    # Plotting
    gen = logbook.select("gen")
    fit_min = [x.item() for x in logbook.select("min")]
    fit_avg = [x.item() for x in logbook.select("avg")]
    fit_max = [x.item() for x in logbook.select("max")]

    fig, ax1 = plt.subplots()
    ax1.plot(gen, fit_min, label="Min MRE")
    ax1.plot(gen, fit_avg, label="Avg MRE")
    ax1.plot(gen, fit_max, label="Max MRE")
    plt.legend()
    ax1.set_xlabel("Generation")
    ax1.set_ylabel("Relative Error")
    plt.savefig('{}/plots/GA_opt_MRE_all.png'.format(results_dir),
                bbox_inches="tight")
    plt.close(fig)  # release the figure once it is on disk

    fig, ax1 = plt.subplots()
    ax1.plot(gen, fit_min, label="Min MRE")
    plt.legend()
    ax1.set_xlabel("Generation")
    # NOTE(review): this label differs from the MRE label of the plot above;
    # possibly a copy-paste leftover - confirm the intended text.
    ax1.set_ylabel("Total Generation Cost")
    plt.savefig('{}/plots/GA_opt_min_only.png'.format(results_dir),
                bbox_inches="tight")
    plt.close(fig)

    # Store final results
    av = hof[-1]  # av stands for allocation vector
    results_dict = defaultdict(list)
    data_names = [k for k in data_store[0].keys() if k not in ['info']]
    for data, indicator in zip(data_store, av):
        if indicator == 1:  # Means include the model
            for k in data_names:
                results_dict[k].append(data[k]['df'].iloc[:, -3:].values)

    # Create excel workbook to print GA results to
    wb = openpyxl.Workbook()

    # Print allocation vector to excel
    wb.create_sheet('av')
    ws = wb['av']
    model_names = [data['info']['model_name'] for data in data_store]
    print_df_to_excel(df=pd.DataFrame([av, model_names],
                                      index=['av', 'model_names']).T,
                      ws=ws)

    summary_df = {}
    # Print the prediction for each dataset to excel
    for k, v in results_dict.items():
        y = data_store[0][k]['df'].iloc[:, -6:-3].values
        v = np.array(v)
        p_y = np.mean(v, axis=0)
        mse = mean_squared_error(y, p_y)
        mre = mean_relative_error(y, p_y)
        # Spread between the selected models, averaged over all entries.
        var = np.mean(np.var(v, axis=0))
        summary_df[k] = {'mse': mse, 'mre': mre, 'var': var}
        df = pd.DataFrame(np.hstack((y, p_y)),
                          columns=[f'y{i + 1}' for i in range(3)] +
                          [f'P_y{i + 1}' for i in range(3)])
        wb.create_sheet(k)
        ws = wb[k]
        print_df_to_excel(df=df, ws=ws)
        print_df_to_excel(df=pd.DataFrame.from_dict({
            'mse': [mse],
            'mre': [mre]
        }),
                          ws=ws,
                          start_col=10)

    # Print summary of losses for different dataset in the summary worksheet
    summary_df = pd.DataFrame.from_dict(summary_df)

    def move_column_inplace(df, col, pos):
        col = df.pop(col)
        df.insert(pos, col.name, col)

    move_column_inplace(summary_df, 'train', 0)
    move_column_inplace(summary_df, 'val', 1)
    ws = wb['Sheet']
    print_df_to_excel(df=summary_df, ws=ws, start_row=5)
    print_df_to_excel(df=pd.DataFrame(hparams), ws=ws)

    # Save and close excel workbook
    wb.save(f'{results_dir}/ga_results.xlsx')
    wb.close()
first_est_date=first_est_date, combinations=[ ['rw', 'll*ln'], ['rw', 'llt*ln'], ['rw', 'll*ln', 'llt*ln'], ]) elif case == 3.1: # Combine multiple different xgb runs by averaging them. Uses the post processed of poos_h{}.pkl. h_store = [1, 3, 6, 12, 24] h_idx_store = [0, 1, 2, 3, 4] poos_post_dir_store = [ './results/poos_rolling/poos_IND_xgbar', './results/poos_rolling/poos_IND_xgba_rs17' ] results_dir = create_results_directory( './results/poos/poos_IND_xgba_rcombined') with open(f'{results_dir}/dir_stores.txt', "w") as text_file: text_file.write(str(poos_post_dir_store)) for h, h_idx in zip(h_store, h_idx_store): poos_analysis_combining_xgb( h=h, results_dir=results_dir, poos_post_dir_store=poos_post_dir_store) elif case == 3.2: # Plot information about m iteration errors for xgb. Uses the post processed of poos_h{}.pkl. h_store = [1, 3, 6, 12, 24] h_idx_store = [0, 1, 2, 3, 4] results_dir = './results/poos/poos_IND_xgba_rh_s42' for h, h_idx in zip(h_store, h_idx_store): poos_xgb_plotting_m(h=h, results_dir=results_dir,
label_type='cutoff', norm_mask=[0, 1, 3, 4, 5]) if smote_numel: fl_store = fl.fold_smote_kf_augment(numel=smote_numel, k_folds=20, shuffle=True) elif smote_excel: fl_store = fl.smote_kf_augment(smote_excel=smote_excel, k_folds=20, shuffle=True) else: fl_store = fl.create_kf(k_folds=10, shuffle=True) hparam_opt(model_mode='dtr', loss_mode='dtr', fl_in=fl, fl_store_in=fl_store, norm_mask=[0, 1, 3, 4, 5], scoring=scoring, total_run=120, instance_per_run=1, write_dir=write_dir, save_model=save_model, save_model_dir=write_dir + '/models/', plot_dir=None) ''' write_dir = create_results_directory( './results/hparams_opt round {} conv1'.format(round), folders=['plots', 'models', 'learning rate plots'], excels=['skf_results', 'hparam_results']) fl = load_data_to_fl(loader_excel, normalise_labels=True, label_type='gf20', norm_mask=None) #[0, 1, 3, 4, 5]) if smote_numel: fl_store = fl.fold_smote_kf_augment(numel=smote_numel, k_folds=10, shuffle=True) elif smote_excel: fl_store = fl.smote_kf_augment(smote_excel=smote_excel, k_folds=10, shuffle=True) else: fl_store = fl.create_kf(k_folds=10, shuffle=True)
def test(selector, number=None):
    """Run one of several ad-hoc evaluation / post-processing routines.

    :param selector: Number choosing the routine: 1-9, 6.1 or 6.2. Any other
        value does nothing.
    :param number: Round identifier substituted into the result paths used by
        selectors 6 and 6.1. Unused by the other routines.
    :return: None. Every routine works through files, plots or printing.
    """
    # Path templates shared by several routines.
    combination_excel = './results/combination {}/combination CM R{}.xlsx'
    combination_models = './results/combination {}/models'
    loader_excel = './excel/Data_loader_spline_full_onehot_R{}_cut_CM3.xlsx'

    def save_scatter(xs, ys, colours, png_path):
        # One coloured scatter map with a colour bar, written to png_path.
        plt.scatter(xs, ys, c=colours)
        plt.colorbar()
        plt.savefig(png_path, bbox_inches='tight')
        plt.close()

    if selector == 1:
        # Map the stored SVM ensemble over a 100 x 100 grid of the unit square.
        write_dir = './results/svm gamma130 with proba'
        svm_store = load_svm_ensemble('{}/models'.format(write_dir))
        x, y = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
        composition = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)),
                                     axis=1)
        prediction, distance, probability = svm_ensemble_prediction(
            svm_store, composition, probability=True)
        ensemble_maps = (
            (distance, './results/svm gamma130 with proba/distance map.png'),
            (prediction,
             './results/svm gamma130 with proba/prediction map.png'),
            (probability,
             './results/svm gamma130 with proba/probability map.png'),
        )
        for colours, png_path in ensemble_maps:
            save_scatter(composition[:, 0], composition[:, 1], colours,
                         png_path)

        # Ground-truth labels of the pickled grid data, for comparison.
        with open('results/grid full/grid_data', 'rb') as handle:
            fl = pickle.load(handle)
        save_scatter(fl.features[:, 0], fl.features[:, 1], fl.labels,
                     './results/svm gamma130 with proba/actual map.png')

        # Dump the grid predictions to an excel sheet.
        wb = openpyxl.Workbook()
        wb.create_sheet('data')
        x_name = 'CNT'
        y_name = 'PVA'
        grid_table = pd.DataFrame(
            np.array([
                composition[:, 0], composition[:, 1], prediction, distance,
                probability
            ]).T,
            columns=[x_name, y_name, 'prediction', 'distance', 'probability'])
        print_df_to_excel(df=grid_table, ws=wb['data'])
        wb.save('{}/svm prediction distance prob.xlsx'.format(write_dir))

        # Repeat the distance / prediction maps with one freshly trained SVM.
        model = SVMmodel(fl=fl, gamma=130)
        model.train_model(fl=fl)
        prediction, distance = svm_ensemble_prediction([model], composition)
        save_scatter(composition[:, 0], composition[:, 1], distance,
                     '{}/distance map2.png'.format(write_dir))
        save_scatter(composition[:, 0], composition[:, 1], prediction,
                     '{}/prediction map2.png'.format(write_dir))
    elif selector == 2:
        with open('results/grid full/grid_data', 'rb') as handle:
            fl = pickle.load(handle)
        grid_hparam_opt(fl, 300)
    elif selector == 3:
        # Query the ensemble for a single composition.
        composition = np.array([0.175763935003216, 0.195036471863385])
        svm_store = load_svm_ensemble('./results/svm gamma130/models')
        prediction, distance = svm_ensemble_prediction(svm_store, composition)
        print('prediction: {}\ndistance: {}'.format(prediction, distance))
    elif selector == 4:
        write_dir = './results/skf3'
        plot_arcsinh_predicted_splines(
            plot_dir='{}/plots'.format(write_dir),
            results_excel_dir='{}/skf_results.xlsx'.format(write_dir),
            end_excel_dir='./results/combine Round 6/end 6.xlsx',
            transformation='arcsinh',
            sheets=['ann3'],
            fn=6,
            numel=99)
    elif selector == 5:
        sheet_names = [
            'ann3_115_0', 'ann3_190_0 sqrt', 'conv1_40_0', 'conv1_158_0 sqrt'
        ]
        combine_excel_results(
            results_excel_dir=
            './results/Optimal Combinations/testset_combi/combinations.xlsx',
            end_excel_dir='./results/combine Round 6/end 6.xlsx',
            plot_dir='./results/combine Round 6/plots',
            sheets=sheet_names,
            fn=6)
    elif selector == 6:
        model_dirs = [
            './results/hparams_opt Round {} SVR'.format(number),
            './results/hparams_opt Round {} DTR'.format(number),
            './results/hparams_opt Round {} ANN3'.format(number)
        ]
        cutoff_combine_excel_results(
            dir_store=model_dirs,
            sheets=['svr', 'dtr', 'ann3'],
            results_excel_dir=combination_excel.format(number, number),
            plot_dir='./results/combination {}/plots'.format(number),
            plot_mode=False,
            fn=6,
            numel=3)
    elif selector == 6.1:
        model_dirs = [
            './results/hparams_opt Round {} DTR'.format(number),
            './results/hparams_opt Round {} ANN3 - 2'.format(number)
        ]
        cutoff_combine_excel_results(
            dir_store=model_dirs,
            sheets=['dtr', 'ann3'],
            results_excel_dir=combination_excel.format(number, number),
            plot_dir='./results/combination {}/plots'.format(number),
            plot_mode=False,
            fn=6,
            numel=3)
    elif selector == 6.2:
        cutoff_combine_excel_results_with_excel(
            results_excel_dir=
            './results/combination_13s_R13_predictions/testset_prediction.xlsx',
            plot_dir='./results/combination_13s_R13_predictions/plots',
            plot_mode=False,
            fn=-1,
            numel=3)
    elif selector == 7:
        model_store = load_model_ensemble('./results/skf13/models')
        query = np.array([[0.5, 0.5, 0.5, 0, 1, 0]])
        mean, std = model_ensemble_prediction(model_store, query)
        print(mean, std)
    elif selector == 8:
        all_rounds = [1, 2, 3, 4, 5, 6, '6e', 7, 8, 9, 10, 11, 12, 13]
        mse_tracker(
            excel_store=[combination_excel.format(r, r) for r in all_rounds],
            write_excel='./MSE tracker.xlsx',
            rounds=list(all_rounds),
            headers=['SVR', 'DTR', 'ANN3', 'Combined'],
            fn=6,
            numel=3)
    elif selector == 9:
        all_rounds = [1, 2, 3, 4, 5, 6, '6e', 7, 8, 9, 10, 11, 12, 13]
        write_dir = create_results_directory(
            results_directory='./results/final_prediction',
            excels=['final_prediction'])
        final_prediction_results(
            write_excel='{}/final_prediction.xlsx'.format(write_dir),
            model_dir_store=[combination_models.format(r) for r in all_rounds],
            combined_excel_store=[
                combination_excel.format(r, r) for r in all_rounds
            ],
            excel_loader_dir_store=[
                loader_excel.format(r, r) for r in all_rounds
            ],
            rounds=list(all_rounds),
            fn=6,
            numel=3)
def l2_points_opt(numel,
                  write_dir,
                  svm_directory,
                  seed_number_of_expt,
                  total_expt,
                  l2_opt=True):
    """Select experiments from the SVM-feasible design space and save them.

    A grid of two-component compositions is built and filtered by the mean
    decision function of the SVM ensemble (kept when the mean is >= 0). The
    feasible compositions are crossed with evenly spaced thickness levels in
    [0, 1] and with the three one-hot dimension codes. Experiments are then
    picked either greedily (each new point maximises its minimum Euclidean
    distance to the points already chosen) or uniformly at random.

    :param numel: Grid resolution; ``numel ** 2`` candidate compositions are
        generated.
    :param write_dir: Results directory passed to create_results_directory.
    :param svm_directory: Directory from which the SVM ensemble is loaded.
    :param seed_number_of_expt: Number of randomly drawn starting experiments
        when ``l2_opt`` is True. Ignored otherwise.
    :param total_expt: Total number of experiments to output.
    :param l2_opt: True for the greedy max-min selection, False for purely
        random sampling.
    :raises ValueError: If more seed experiments than total experiments are
        requested for the greedy selection, or if no composition is feasible.
    """
    # Fail before any directory is created or model loaded: the result table
    # below is indexed 1..total_expt, so extra seed rows could never fit.
    if l2_opt and seed_number_of_expt > total_expt:
        raise ValueError(
            'seed_number_of_expt ({}) cannot exceed total_expt ({})'.format(
                seed_number_of_expt, total_expt))

    write_dir = create_results_directory(results_directory=write_dir,
                                         excels=['l2_acq'])
    svm_store = load_svm_ensemble(svm_directory)

    # Create set of possible compositions. Points with x + y > 1 are mirrored
    # back so that every composition satisfies x + y <= 1.
    base = [x / (numel * 2 - 1) for x in range(numel * 2)]
    compositions = [[x, y] if x + y <= 1 else [-x + 1, -y + 1]
                    for x, y in itertools.product(base[::2], base[1::2])]

    # Check feasibility for those compositions
    distance_store = [
        model.model.decision_function(compositions) for model in svm_store
    ]
    distance = np.mean(np.array(distance_store), axis=0)
    valid_compositions = [
        x for x, dist in zip(compositions, distance) if dist >= 0
    ]
    print('Number of compositions = {}. % valid = {}%'.format(
        len(valid_compositions),
        len(valid_compositions) / len(compositions) * 100))
    if not valid_compositions:
        raise ValueError('No composition is classified as feasible by the '
                         'SVM ensemble, nothing to sample from.')

    # Permute feasible compositions with different thickness possibilities
    # scaled from 0 to 1
    number_valid_compositions = round(math.sqrt(len(valid_compositions)))
    if number_valid_compositions > 1:
        thickness_levels = [
            x / (number_valid_compositions - 1)
            for x in range(number_valid_compositions)
        ]
    else:
        # A single level cannot be spread over [0, 1]; the formula above
        # would divide by zero, so fall back to the lower bound.
        thickness_levels = [0.0]
    compositions_thickness = list(
        itertools.product(valid_compositions, thickness_levels))
    print('Number of permutations = {}'.format(
        len(compositions_thickness) * 3))

    # Permute the above with 0D, 1D, and 2D
    all_permutations = np.array([
        x[0] + [x[1]] + y for x in compositions_thickness
        for y in [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
    ])

    if l2_opt:
        expt_idx = np.random.randint(0, len(all_permutations),
                                     seed_number_of_expt)
        expt_store = all_permutations[expt_idx, :]
        for i in range(total_expt - seed_number_of_expt):
            start = time.time()
            d = pairwise_distances(expt_store,
                                   all_permutations,
                                   metric='euclidean')
            # Candidate whose nearest already-selected experiment is furthest.
            next_expt = np.argmax(np.min(d, axis=0))
            expt_store = np.concatenate(
                (expt_store, all_permutations[next_expt, None, :]), axis=0)
            end = time.time()
            print('{} out of {} completed. Time taken = {}.'.format(
                i + 1, total_expt - seed_number_of_expt, end - start))
    else:
        expt_idx = np.random.randint(0, len(all_permutations), total_expt)
        expt_store = all_permutations[expt_idx, :]

    # Rescale the thickness column from [0, 1] to [200, 2000].
    expt_store[:, 2] = expt_store[:, 2] * 1800 + 200

    write_excel = '{}/l2_acq.xlsx'.format(write_dir)
    wb = openpyxl.load_workbook(write_excel)
    wb.create_sheet('l2_acq')
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Valid Combinations'
    ws.cell(1, 2).value = len(all_permutations)
    ws.cell(1, 3).value = 'Seed Expt'
    ws.cell(1, 4).value = seed_number_of_expt
    df = pd.DataFrame(data=expt_store,
                      columns=['CNT', 'PVA', 'Thickness', '0D', '1D', '2D'],
                      index=list(range(1, total_expt + 1)))
    print_df_to_excel(df=df, ws=ws, start_row=2)
    wb.save(write_excel)
from own_package.analysis import l2_tracker, testset_prediction_results, testset_model_results_to_excel, \ testset_optimal_combination, save_testset_prediction, eval_combination_on_testset, save_valset_prediction,\ features_correlation_analysis, training_curve_comparision from own_package.others import create_results_directory #from own_package.models.models import create_hparams def selector(case, **kwargs): if case == 1: round_number = kwargs['round_number'] write_dir = create_results_directory('./results/l2_tracker', excels=['l2_results']) l2_tracker( write_excel='{}/l2_results.xlsx'.format(write_dir), final_excel_loader= './excel/Data_loader_spline_full_onehot_R{}_cut_CM3.xlsx'.format( round_number), last_idx_store=[ 11, 16, 21, 29, 37, 45, 69, 77, 85, 93, 101, 109, 117, 125 ]) elif case == 2: write_dir = create_results_directory('./results/testset_prediction', excels=['testset_prediction']) testset_prediction_results( write_excel='{}/testset_prediction.xlsx'.format(write_dir), model_dir_store=[ './results/combination {}/models'.format(1), './results/combination {}/models'.format(2), './results/combination {}/models'.format(3), './results/combination {}/models'.format(4), './results/combination {}/models'.format(5),
def run_train_test(model_mode,
                   hparams,
                   window_size,
                   loader_file,
                   results_directory=None,
                   seed=42,
                   save_model=False,
                   save_model_name=None):
    '''
    Train an LSTM model on a single train/test split, evaluate it on the
    held-out part, and write plots plus an excel summary of the run.

    :param model_mode: Model variant. Passed to LSTMmodel and used in the
    default results directory and saved model file name.
    :param hparams: hparams dict containing hyperparameters information
    :param window_size: Passed to load_data_to_fl when loading loader_file
    :param loader_file: data_loader excel file location
    :param results_directory: Directory to write results to. If None or
    empty, defaults to './results/<model_mode>'.
    :param seed: Seed passed to the train/test split
    :param save_model: Whether to save the trained model as an .h5 file
    :param save_model_name: Optional prefix of the saved model file. Only
    used when it is a str.
    :return: mse of the model on the held-out split
    '''
    import os  # local import: only needed to prepare the output folders

    if not results_directory:
        results_directory = './results/{}'.format(model_mode)
    results_directory = create_results_directory(results_directory)
    # The plots (and optionally the model) are written into sub-folders that
    # are not requested from create_results_directory above, so make sure
    # they exist before anything tries to save into them.
    os.makedirs(os.path.join(results_directory, 'plots', 'validation_plots'),
                exist_ok=True)
    if save_model:
        os.makedirs(os.path.join(results_directory, 'models'), exist_ok=True)

    fl = load_data_to_fl(loader_file, window_size=window_size)

    # Run train test
    sess = tf.Session()
    K.set_session(sess)
    instance_start = time.time()
    # ss_fl is training fl, i_ss_fl is validation fl
    (ss_fl, i_ss_fl) = fl.create_train_test_split(seed=seed)

    # Set up model
    model = LSTMmodel(ss_fl, model_mode, hparams)
    # Train model and save the training loss vs epoch plot
    model.train_model(
        ss_fl,
        save_mode=False,
        plot_name='{}/plots/training_loss.png'.format(results_directory))

    # Evaluation
    predicted_labels, mse = model.eval(i_ss_fl)

    # Saving model
    if save_model:
        # Set save_model_name
        if isinstance(save_model_name, str):
            save_model_name1 = save_model_name + '_' + model_mode
        else:
            save_model_name1 = model_mode
        # Save model
        dirc = results_directory + '/models/' + save_model_name1 + '.h5'
        print('Saving model in {}'.format(dirc))
        model.model.save(dirc)

    # Need to put the next 3 lines if not memory will run out
    del model
    K.clear_session()
    gc.collect()

    # Printing one instance summary.
    instance_end = time.time()
    print(
        'Model is {}. Time take for instance = {}\n'
        'Post-training results: \nmse = {},\n'
        '####################################################################################################'
        .format(model_mode, instance_end - instance_start, mse))

    # Plotting the time series plot for prediction and actual test labels
    for k in range(i_ss_fl.count):
        plt.plot(np.squeeze(i_ss_fl.labels[k, :, 0]), c='g', label='Actual')
        plt.plot(np.squeeze(predicted_labels[k, :]), c='r', label='Predicted')
        plt.legend(loc='upper left')
        plt.title('Test Example ' + str(k + 1))
        plt.ylabel('Demand')
        plt.xlabel('Hours of the day')
        plt.savefig(results_directory +
                    '/plots/validation_plots/Test Example ' + str(k + 1) +
                    '.png',
                    bbox_inches='tight')
        plt.close()

    # Printing results to excel
    # Creating excel
    excel_name = results_directory + '/results.xlsx'
    wb = openpyxl.Workbook()
    wb.save(excel_name)
    sheetname = wb.sheetnames[-1]
    ws = wb[sheetname]

    # Writing the mse header and value
    start_row = 1
    start_col = 1
    headers = ['mse']
    values = [mse]
    print_array_to_excel(np.array(headers), (start_row, start_col + 1),
                         ws,
                         axis=1)
    print_array_to_excel(np.array(values), (1 + start_row, start_col + 1),
                         ws,
                         axis=1)
    start_col += 2

    # Writing hparams dataframe next to the mse cells.
    # NOTE(review): assigning writer.book / writer.sheets and calling
    # writer.save() is the legacy pandas + openpyxl pattern; kept unchanged
    # to match the pandas version this project runs on.
    pd_writer = pd.ExcelWriter(excel_name, engine='openpyxl')
    pd_writer.book = wb
    pd_writer.sheets = {sheet.title: sheet for sheet in wb.worksheets}
    # One column per hyperparameter; pd.Series pads values of unequal length.
    hparams_df = pd.DataFrame({k: pd.Series(v) for k, v in hparams.items()})
    hparams_df.to_excel(pd_writer, sheetname, startrow=0, startcol=start_col)

    # Saving and closing
    pd_writer.save()
    pd_writer.close()
    wb.close()

    return mse
def run_testing():
    """Simulation study comparing OLS, lasso and component-wise L2 boosting.

    Each of the 20 simulation runs draws 20 training and 100 test rows of 10
    standard-normal predictors. Labels follow
    ``y = 1 + 5*z0 + 2*z1 + z2 + noise`` with normal noise of scale 2. Every
    run fits OLS, a lasso whose penalty is tuned by gp_minimize on 10-fold CV
    MSE, and several boosting configurations. Per-run results are averaged
    across runs and written to ``test_comparision.xlsx`` in the results
    directory. Diagnostic plots are saved for the first run only.
    """
    plt.rcParams["font.family"] = "Times New Roman"
    results_dir = create_results_directory('./results/simulation')
    n_total = 10
    t_train = 20
    t_test = 100
    simulation_runs = 20
    df_store = []

    # (estimator class, hyperparameters, result name) in execution order.
    # NOTE(review): the last two entries repeat the first two, so their
    # results overwrite the earlier ones under the same keys - confirm
    # whether other settings were intended.
    boosting_runs = [
        (ComponentwiseL2BoostDropout, {
            'm_max': 500,
            'learning_rate': 0.1,
            'ic_mode': 'aic',
            'dropout': 0.5
        }, 'cwd01_50'),
        (ComponentwiseL2BoostDropout, {
            'm_max': 500,
            'learning_rate': 0.3,
            'ic_mode': 'aic',
            'dropout': 0.5
        }, 'cwd03_50'),
        (ComponentwiseL2Boost, {
            'm_max': 2000,
            'learning_rate': 0.1,
            'ic_mode': 'aic'
        }, 'cw01'),
        (ComponentwiseL2Boost, {
            'm_max': 2000,
            'learning_rate': 0.3,
            'ic_mode': 'aic'
        }, 'cw03'),
        (ComponentwiseL2BoostDropout, {
            'm_max': 500,
            'learning_rate': 0.1,
            'ic_mode': 'aic',
            'dropout': 0.5
        }, 'cwd01_50'),
        (ComponentwiseL2BoostDropout, {
            'm_max': 500,
            'learning_rate': 0.3,
            'ic_mode': 'aic',
            'dropout': 0.5
        }, 'cwd03_50'),
    ]

    def func(z):
        # Data generating process: only the first three predictors matter.
        noise = np.random.normal(0, 2, (z.shape[0], 1))
        return 1 + 5 * z[:, [0]] + 2 * z[:, [1]] + z[:, [2]] + noise

    def plot(cw, name):
        # Test MSE after each boosting iteration, from the cumulative sum of
        # the stored per-iteration coefficient updates.
        test_mse_path = np.mean(
            (sm.add_constant(z_test) @ np.cumsum(
                np.array(cw.bhat_new_store.toarray()), axis=0).T - y_test)**2,
            axis=0)
        final = min(cw.m_star + 25, cw.bhat_new_store.shape[0])
        # Full path first, then a version cut shortly after m_star.
        for segment, suffix in ((test_mse_path[5:], ''),
                                (test_mse_path[5:final], '_zoomed')):
            plt.plot(segment)
            plt.xlabel('m iterations')
            plt.ylabel('Test MSE')
            plt.axvline(cw.m_star, linestyle='--')
            plt.savefig(f'{results_dir}/{name}{suffix}.png')
            plt.close()

    def cw_run(cw, hparams, store, idx, name):
        first_run = idx == 0
        cw = cw(z_matrix=z, y_vec=y, hparams=hparams, r=None)
        if first_run:
            cw.fit(plot_name=f'{results_dir}/{name}')
        else:
            cw.fit()
        yhat = cw.predict(exog=sm.add_constant(z_test))
        ssr = sum((y_test - yhat)**2)
        store.append([(f'{name} MSE', ssr / t_test),
                      (f'{name} m_star', cw.m_star),
                      (f'{name} params', cw.params),
                      (f'{name} i frac', cw.i_star_frac)])
        if first_run:
            plot(cw, name)

    for idx in range(simulation_runs):
        z = np.random.normal(0, 1, (t_train, n_total))
        y = func(z)
        z_test = np.random.normal(0, 1, (t_test, n_total))
        y_test = func(z_test)

        # Ordinary least squares baseline
        ols = sm.OLS(endog=y, exog=sm.add_constant(z)).fit()
        yhat_ols = ols.predict(sm.add_constant(z_test))[..., None]
        ssr_ols = sum((y_test - yhat_ols)**2)

        # lasso 10CV: search the penalty on a log10 scale
        space = [Real(low=-10, high=1, name='alpha')]

        @use_named_args(space)
        def fitness(**params):
            cv_scores = cross_val_score(SMwrapper(sm.OLS,
                                                  10**params['alpha']),
                                        sm.add_constant(z),
                                        y,
                                        cv=10,
                                        scoring='neg_mean_squared_error')
            return -np.mean(cv_scores)

        results = gp_minimize(
            func=fitness,
            dimensions=space,
            acq_func='EI',  # Expected Improvement.
            n_calls=20,
            verbose=False)
        alpha = results.x[0]  # in lg10
        lasso = sm.OLS(endog=y, exog=sm.add_constant(z)).fit_regularized(
            L1_wt=1, alpha=10**alpha)
        yhat_lasso = lasso.predict(sm.add_constant(z_test))[..., None]
        ssr_lasso = sum((y_test - yhat_lasso)**2)

        results_store = {
            'n_total': n_total,
            'T_train': t_train,
            'T_test': t_test,
            'Simulation Runs': simulation_runs,
            'OLS MSE': ssr_ols / t_test,
            'Lasso MSE': ssr_lasso / t_test,
            'lasso_alpha': 10**alpha,
            'predictor': np.arange(n_total + 1),
            'True params': [1, 5, 2, 1] + [0] * (n_total - 3),
            'ols params': ols.params,
            'Lasso params': lasso.params,
        }

        # Boosting variants; each call receives its own hparams dict.
        store = []
        for estimator, run_hparams, run_name in boosting_runs:
            cw_run(cw=estimator,
                   hparams=dict(run_hparams),
                   store=store,
                   idx=idx,
                   name=run_name)

        # Regroup by metric (all MSEs, then all m_star, ...) before merging.
        for item in zip(*store):
            results_store.update(item)
        df_store.append(
            pd.DataFrame({k: pd.Series(v)
                          for k, v in results_store.items()}))

    # Average every cell over the simulation runs.
    df = pd.concat(objs=df_store).groupby(level=0).mean()
    excel_name = f'{results_dir}/test_comparision.xlsx'
    excel_name = create_excel_file(excel_name)
    wb = openpyxl.load_workbook(excel_name)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)
    wb.save(excel_name)
other_names = ['ett30'] other_dir = ['./excel/ett30.xlsx'] # Load main training data fl = load_data_to_fl(fl_dir, normalise_labels=False, norm_mask=[0, 1, 3, 4, 5]) fl_store = fl.create_kf(k_folds=k_folds, shuffle=True) # Load other data to evaluate the model on. e.g. the separate test set other_fl_dict = { k: load_testset_to_fl(v, norm_mask=[0, 1, 3, 4, 5], scaler=fl.scaler) for k, v in zip(other_names, other_dir) } write_dir = create_results_directory('./results/kf/kf_results', folders=['models', 'plots'], excels=['kf_results']) write_excel = f'{write_dir}/kf_results.xlsx' run_kf(model_mode=model_mode, fl=fl, fl_store=fl_store, hparams=hparams, scoring='mse', other_fl_dict=other_fl_dict, write_excel_dir=write_excel, save_model_name=f'{write_dir}/models/{model_mode}', plot_name=f'{write_dir}/plots/lr') selector(1)
def _save_surface_plot(rm, lstat, z, path):
    """Draw z over the (rm, lstat) mesh as a 3D surface and save it to path."""
    fig = plt.figure()
    # add_subplot is used instead of fig.gca(projection=...), whose keyword
    # arguments are no longer accepted by recent Matplotlib releases.
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(rm, lstat, z, cmap=plt.cm.BuGn, linewidth=0.2, vmin=-50)
    ax.view_init(30, 135)
    plt.savefig(path)
    plt.close()


def selector(case):
    """Run a paper-figure experiment chosen by ``case``.

    Case 1 fits a depth-2 decision tree, a linear regression, a kernel
    regression and an XGBoost model of price on the 'rm' and 'lstat' columns
    of the Boston housing data, and saves a scatter plot, the tree diagram
    and one prediction surface per model. Other values do nothing.

    :param case: Experiment number.
    """
    if case == 1:
        results_dir = create_results_directory('./results/paper/dtr_vs_xgb')
        # NOTE: load_boston was removed in scikit-learn 1.2; this experiment
        # needs an older scikit-learn.
        x, y = load_boston(return_X_y=True)
        x = pd.DataFrame(x,
                         columns=[
                             'crime', 'zn', 'indus', 'chas', 'nox', 'rm',
                             'age', 'dis', 'rad', 'tax', 'ptratio', 'blacks',
                             'lstat'
                         ])
        x = x[['rm', 'lstat']]
        df_all = x.copy()
        df_all['price'] = y

        # Plot 3D scatter
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(df_all['rm'], df_all['lstat'], df_all['price'])
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/scatter.png')
        plt.close()

        # Decision tree and its diagram
        dtr = DecisionTreeRegressor(max_depth=2)
        dtr.fit(x, y)
        plot_tree(dtr, impurity=False)
        plt.savefig(f'{results_dir}/dtr_visual.png')
        plt.close()

        # 100 x 100 evaluation mesh spanning the observed feature ranges
        x_min = x.min(axis=0)
        x_max = x.max(axis=0)
        rm_linspace = np.linspace(x_min['rm'], x_max['rm'], 100)
        lstat_linspace = np.linspace(x_min['lstat'], x_max['lstat'], 100)
        rm, lstat = np.meshgrid(rm_linspace, lstat_linspace)
        # np.stack needs a real sequence; a map() iterator is rejected.
        points = np.stack([np.ravel(rm), np.ravel(lstat)], axis=1)

        z = dtr.predict(points).reshape(rm.shape)
        _save_surface_plot(rm, lstat, z, f'{results_dir}/dtr_prediction.png')

        # Linear regression
        lr = LinearRegression().fit(x, y)
        z = lr.predict(points).reshape(rm.shape)
        _save_surface_plot(rm, lstat, z, f'{results_dir}/lr_prediction.png')

        # Kernel regression
        kr = KernelReg(exog=x, endog=y, var_type='cc')
        z = kr.fit(points)[0].reshape(rm.shape)
        _save_surface_plot(rm, lstat, z, f'{results_dir}/kr_prediction.png')

        # XGB
        hparams = {
            'seed': 42,
            'booster': 'gbtree',
            'learning_rate': 0.1,
            'objective': 'reg:squarederror',
            'verbosity': 0,
            'subsample': 1,
            'max_depth': 2,
            'colsample_bytree': 0.5,
        }
        dtrain = xgb.DMatrix(x.values, label=y)
        model = xgb.train(hparams,
                          dtrain=dtrain,
                          num_boost_round=100,
                          verbose_eval=False)
        z_xgb = model.predict(xgb.DMatrix(points)).reshape(rm.shape)
        _save_surface_plot(rm, lstat, z_xgb,
                           f'{results_dir}/xgb_prediction.png')
def __init__(self,
             results_dir,
             feature_mode,
             func_mode,
             numel,
             res,
             plot_mode=False):
    """Generate features and labels, write them to excel, optionally plot.

    :param results_dir: Results directory passed to create_results_directory,
        which is asked for a 'plots' folder and an 'expt' excel file.
    :param feature_mode: 1, 2 or 3. Selects self.feature_1 / feature_2 /
        feature_3 to build the feature array.
    :param func_mode: 1, 2 or 3. Selects self.func_1 / func_2 / func_3, which
        is called once per feature row to produce that row's label.
    :param numel: Number of experiments. Used to number the rows 1..numel.
    :param res: Number of label points. Used to number the label columns
        1..res.
    :param plot_mode: If True, self.plot is called once per experiment.
    :raises KeyError: If feature_mode or func_mode is not 1, 2 or 3.
    """
    self.write_dir = create_results_directory(results_directory=results_dir,
                                              folders=['plots'],
                                              excels=['expt'])
    self.numel = numel
    self.res = res

    # Feature generation: every mode shares the same four column headers.
    for mode, builder_name in ((1, 'feature_1'), (2, 'feature_2'),
                               (3, 'feature_3')):
        if feature_mode == mode:
            self.features = getattr(self, builder_name)()
            feature_headers = np.array(['a', 'b', 'c', 'e'])
            break
    else:
        raise KeyError(
            'feature_mode {} selected does not exist'.format(feature_mode))

    # Label function selection
    for mode, func_name in ((1, 'func_1'), (2, 'func_2'), (3, 'func_3')):
        if func_mode == mode:
            func = getattr(self, func_name)
            break
    else:
        raise KeyError(
            'func_mode {} selected does not exist'.format(func_mode))

    # Generating labels
    feature_rows = self.features.tolist()
    self.labels = [func(*row) for row in feature_rows]

    # Writing to excel
    pd_writer = pd.ExcelWriter(self.write_dir + '/expt.xlsx',
                               engine='openpyxl')
    # Index to label Exp 1, 2, 3, ...
    exp_number = np.array(range(self.numel)) + 1
    y_number = np.array(range(self.res)) + 1
    # Each label contributes its first entry (flattened) followed by its
    # second entry. NOTE(review): presumably a scalar plus the curve values.
    flat_labels = []
    for label in self.labels:
        leading = np.array(label[0]).reshape(-1)
        flat_labels.append(np.concatenate((leading, label[1])))
    summary = np.concatenate((self.features, np.array(flat_labels)), axis=1)
    column_names = np.concatenate(
        (feature_headers, np.array('e').reshape(-1), y_number))
    df_write = pd.DataFrame(summary, index=exp_number, columns=column_names)
    df_write.to_excel(pd_writer)
    pd_writer.save()
    pd_writer.close()

    # Plotting
    if plot_mode:
        for idx, feature_row in enumerate(feature_rows):
            label = self.labels[idx]
            self.plot(*(label + (idx, ) + tuple(feature_row)))
def type_transformations(excel_dir, results_dir, y_selection, h_steps): df = pd.read_excel(excel_dir, sheet_name='Master') names = df.columns.values.tolist() data = df.values data_type_store = np.copy(data[0, 1:]) time_stamps = np.copy(data[3:, 0]) data = np.copy(data[1:, 1:]).astype(np.float) x_store = [] for _, (type, x) in enumerate(zip(data_type_store.tolist(), data.T.tolist())): if type == 1: x_store.append(x) elif type == 2: x_transformed = np.array(x)[1:] - np.array(x)[:-1] x_transformed = [np.nan] + x_transformed.tolist() x_store.append(x_transformed) elif type == 4: x_transformed = np.log(np.array(x)).tolist() x_store.append(x_transformed) elif type == 5: x_transformed = np.log(np.array(x)[1:]) - np.log(np.array(x)[:-1]) x_transformed = [np.nan] + x_transformed.tolist() x_store.append(x_transformed) elif type == 6: x_transformed = np.log(np.array(x)[2:]) - 2 * np.log(np.array(x)[1:-1]) + np.log(np.array(x)[:-2]) x_transformed = [np.nan, np.nan] + x_transformed.tolist() x_store.append(x_transformed) elif type == 7: x_transformed = np.array(x)[2:] / np.array(x)[1:-1] - np.array(x)[1:-1] / np.array(x)[:-2] x_transformed = [np.nan, np.nan] + x_transformed.tolist() x_store.append(x_transformed) else: pass x_store = np.array(x_store).T temp_names = names[1:] selection_idx = [i for i in range(len(temp_names)) if temp_names[i] in y_selection] y_transformed_names = [] y_store = [] for idx, selection in enumerate(selection_idx): yo = data[:, selection] type = data_type_store[selection] for h in h_steps: y_transformed_names.append('{}_h{}'.format(temp_names[selection], h)) if type == 5: y_transformed = 1200 / h * np.log(yo[h:] / yo[:-h]) y_transformed = [np.nan] * h + y_transformed.tolist() y_store.append(y_transformed) elif type == 6: y_transformed = 1200 / h * np.log(yo[h + 1:] / yo[1:-h]) - 1200 * np.log(yo[1:-h] / yo[:-h - 1]) y_transformed = [np.nan] * (h + 1) + y_transformed.tolist() y_store.append(y_transformed) else: raise KeyError('Label type is not 5 or 6') 
y_store = (np.array(y_store).T)[2:, :] x_store[:, selection_idx] = x_store[:, selection_idx] * 1200 x_store = x_store[2:, :] # _, ic, v = iterated_em(all_x=x_store.copy(), pca_p=9, max_iter=1e4, tol=0.1) pc = SMPCA(data=x_store.copy(), ncomp=9, missing='fill-em') x_store = pc._adjusted_data results_dir = create_results_directory(results_dir) wb = openpyxl.Workbook() wb.create_sheet('transformation') sheet_name = wb.sheetnames[-1] ws = wb[sheet_name] df = pd.DataFrame(data=np.concatenate((time_stamps[..., None], x_store), axis=1), columns=names) for r in dataframe_to_rows(df, index=False, header=True): ws.append(r) wb.create_sheet('y transformed') sheet_name = wb.sheetnames[-1] ws = wb[sheet_name] ydf = pd.DataFrame(data=np.concatenate((time_stamps[..., None], y_store), axis=1), columns=['Time Stamps'] + y_transformed_names) for r in dataframe_to_rows(ydf, index=False, header=True): ws.append(r) def summary_test(df, data_type_store): results_dict = collections.defaultdict(dict) suggested_type_store = [] for var, type_ in zip(df.columns.values[1:], data_type_store): ts = df[var].values.astype(float) # ADF test. Null: time series has a unit root adf_p = adfuller(x=ts.copy())[1] # KPSS test. Null: time series is stationary around a constant kpss_p = kpss(x=ts.copy())[1] results_dict[var]['adf p_value'] = adf_p results_dict[var]['kpss p_value'] = kpss_p ''' Case 1: Both tests conclude that the series is not stationary - The series is not stationary Case 2: Both tests conclude that the series is stationary - The series is stationary Case 3: KPSS indicates stationarity and ADF indicates non-stationarity - The series is trend stationary. Trend needs to be removed to make series strict stationary. The detrended series is checked for stationarity. Case 4: KPSS indicates non-stationarity and ADF indicates stationarity - The series is difference stationary. Differencing is to be used to make series stationary. The differenced series is checked for stationarity. 
''' if adf_p >= 0.05 and kpss_p <= 0.05: case = 1 suggested_type_store.append(type_+1) # Try differencing elif adf_p <= 0.05 and kpss_p >= 0.05: case = 2 suggested_type_store.append(type_) elif adf_p>=0.05 and kpss_p>=0.05: case = 3 suggested_type_store.append('BAD THERE IS TREND')
def selector(case):
    """Run one of the acquisition / hyperparameter-round plotting routines.

    :param case: Routine number.
        1: read the excel acquisition data for skf9.
        2: plot all UMAPs from the skf9 acquisition data.
        3: collect the ANN and DTR ``overall_summary.xlsx`` of every round.
        4: plot the hyperparameter metrics per round.
        5: plot the un-hparam rounds from the summary excel.
        6: plot the variance for the round 13 GA combinations.
        Any other value does nothing.
    """
    if case == 1:
        read_excel_acquisition_data(write_dir='./results/skf9',
                                    excel_file='./results/skf9/acq7.xlsx')
    elif case == 2:
        plot_all_umap(read_dir='./results/skf9/acq_fl_data')
    elif case == 3:
        write_dir = create_results_directory('./Plots/rounds')
        rounds = [1, 2, 3, 4, 5, 6, '6e', 7, 8, 9, 10, 11, 12, 13]
        # The ANN results of the first two rounds live in differently named
        # folders; every other round uses plain 'ann'.
        ann_folder_suffix = {1: 'ANN - 2', 2: 'ann - 2'}
        summary_path = './results/hparams_opt round {} {}/overall_summary.xlsx'
        # Built from `rounds` so that entry i of excel_store always belongs
        # to rounds[i]. The hand-written list had round '6e' before round 6
        # while `rounds` has 6 before '6e'.
        # NOTE(review): assumes read_hparam_rounds pairs the two lists by
        # position - confirm.
        excel_store = [[
            summary_path.format(r, ann_folder_suffix.get(r, 'ann')),
            summary_path.format(r, 'DTR')
        ] for r in rounds]
        read_hparam_rounds(write_dir=write_dir,
                           excel_store=excel_store,
                           rounds=rounds)
    elif case == 4:
        plot_hparam_rounds(write_dir='./Plots/rounds - 7',
                           metrics=[
                               'Train MSE',
                               'Train MRE',
                               'Test MSE',
                               'Test MRE',
                               'Val MSE',
                               'Val MRE',
                               'un125Train MSE',
                               'un125Train MRE',
                           ])
    elif case == 5:
        plot_un_hparam_rounds(write_dir='./Plots',
                              excel_dir='./results/new_summary - 30.xlsx')
    elif case == 6:
        plot_var(excel_dir='./Round 13 GA Combination Summary.xlsx',
                 combi_names=['Round 13', 'NDA', 'NDA+I', 'NDA+S'])
from own_package.features_labels import read_excel_data, read_excel_dataloader, Fl_master, Fl_pca, Fl_ar, \ Fl_cw, Fl_xgb, hparam_selection import numpy as np import pandas as pd import pickle def selector(case, **kwargs): if case == 1: # Run poos experiment var_name = kwargs['var_name'] excel_dir = kwargs['excel_dir'] results_dir = create_results_directory('./results/exptg/{}'.format(var_name)) output = read_excel_dataloader(excel_dir=excel_dir) fl_master = Fl_master(x=output[0], features_names=output[1], yo=output[2], labels_names=output[3], y=output[4], y_names=output[5], time_stamp=output[6]) fl_xgb = Fl_xgb(val_split=None, x=None, yo=None, y=None, time_stamp=None, time_idx=None, features_names=fl_master.features_names, labels_names=fl_master.labels_names, y_names=fl_master.y_names) first_est_date = '1970:1' model_mode = 'xgb_with_hparam' if model_mode == 'xgb' or model_mode == 'xgb_with_hparam': default_hparams = {'seed': 42,
def run_skf_with_te(inputs_store, loader_excel, smote_numel, mode, name, learningrate=0.001, eval_model_dir=None):
    """Train (or evaluate saved) models over 10 folds and collect train / val / test-set results.

    For every entry of ``inputs_store`` the training data is reloaded from
    ``loader_excel``, the test sets are loaded with the training scaler, the
    folds are created, and either a new model is trained
    (run_skf_train_val_test_error) or a saved model is evaluated
    (run_eval_model_on_train_val_test_error). The accumulated results are
    pickled to '<write_dir>/data_store.pkl' after each entry and finally passed
    to read_hparam_data.

    :param inputs_store: Iterable of ``(pre, epochs)`` pairs. ``pre`` is passed to
        create_hparams as both ``pre`` and ``max_depth``; ``epochs`` as both
        ``epochs`` and ``num_est``. Ignored (replaced by the output of
        load_model_ensemble) when ``eval_model_dir`` is given.
    :param loader_excel: Path handed to load_data_to_fl for the training data.
    :param smote_numel: If truthy, folds come from fl.fold_smote_kf_augment with
        ``numel=smote_numel``; otherwise from fl.create_kf.
    :param mode: 'ann' or 'dtr'. Only used when training (no ``eval_model_dir``).
    :param name: Results are written under './results/<name>'.
    :param learningrate: Passed to create_hparams as ``learning_rate``.
    :param eval_model_dir: If truthy, models are loaded from this directory with
        load_model_ensemble and evaluated instead of trained.
    :raises ValueError: If training is requested with a ``mode`` other than
        'ann' or 'dtr'.
    :return: None.
    """
    # mode -> (model_mode, loss_mode) given to run_skf_train_val_test_error.
    mode_lookup = {'ann': ('ann3', 'ann'), 'dtr': ('dtr', 'dtr')}
    # Fail fast, before any directory is created or Excel file is read. Previously
    # an unknown mode surfaced only inside the loop as an UnboundLocalError.
    if not eval_model_dir and mode not in mode_lookup:
        raise ValueError("mode must be 'ann' or 'dtr' when eval_model_dir is not given, got {!r}".format(mode))

    write_dir = create_results_directory('./results/{}'.format(name),
                                         folders=['plots', 'models', 'learning rate plots'],
                                         excels=['skf_results', 'te.xlsx'])
    data_store = []
    loss = 'mse'
    if eval_model_dir:
        inputs_store = load_model_ensemble(eval_model_dir)

    # Constant paths and labels: defined once, outside the loop, so they also exist
    # when inputs_store is empty (ett_names is needed after the loop).
    test_excel_dir = './excel/ett_30testset_cut.xlsx'
    ett_store = ['./excel/ett_30testset_cut Invariant 1.xlsx',
                 './excel/ett_30testset_cut Invariant 1 - 2.xlsx',
                 './excel/ett_30testset_cut Invariant 1 - 3.xlsx',
                 './excel/ett_30testset_cut Invariant 5.xlsx',
                 './excel/ett_30testset_cut Invariant 5 - 2.xlsx',
                 './excel/ett_30testset_cut Invariant 5 - 3.xlsx',
                 './excel/ett_30testset_cut Invariant 10.xlsx',
                 './excel/ett_30testset_cut Invariant 10 - 2.xlsx',
                 './excel/ett_30testset_cut Invariant 10 - 3.xlsx',
                 './excel/ett_30testset_cut Invariant 30.xlsx',
                 './excel/ett_30testset_cut Invariant 30 - 2.xlsx',
                 './excel/ett_30testset_cut Invariant 30 - 3.xlsx',
                 './excel/ett_30testset_cut Invariant 50.xlsx',
                 './excel/ett_30testset_cut Invariant 50 - 2.xlsx',
                 './excel/ett_30testset_cut Invariant 50 - 3.xlsx',
                 './excel/ett_125trainset_cut.xlsx',
                 './excel/ett_125trainset_cut Invariant 1.xlsx',
                 './excel/ett_125trainset_cut Invariant 5.xlsx',
                 './excel/ett_125trainset_cut Invariant 10.xlsx']
    # Short display names, one per entry of ett_store (same order).
    ett_names = ['I01-1', 'I01-2', 'I01-3',
                 'I05-1', 'I05-2', 'I05-3',
                 'I10-1', 'I10-2', 'I10-3',
                 'I30-1', 'I30-2', 'I30-3',
                 'I50-1', 'I50-2', 'I50-3',
                 '125Test', '125Test I01', '125Test I05', '125Test I10']

    for inputs in inputs_store:
        # Data is reloaded for every entry, as before, so each run starts from
        # freshly constructed objects.
        fl = load_data_to_fl(loader_excel, label_type='cutoff', normalise_labels=False,
                             norm_mask=[0, 1, 3, 4, 5])
        test_fl = load_testset_to_fl(test_excel_dir, scaler=fl.scaler, norm_mask=[0, 1, 3, 4, 5])
        ett_fl_store = [load_testset_to_fl(x, scaler=fl.scaler, norm_mask=[0, 1, 3, 4, 5]) for x in ett_store]

        if smote_numel:
            fl_store = fl.fold_smote_kf_augment(k_folds=10, shuffle=True, numel=smote_numel)
        else:
            fl_store = fl.create_kf(k_folds=10, shuffle=True)

        if eval_model_dir:
            val_score, train_score, data = run_eval_model_on_train_val_test_error(
                fl=fl, fl_store=fl_store, test_fl=test_fl, ett_fl_store=ett_fl_store,
                model_name='hparams_opt_makeup', model=inputs)
            # Placeholder in the slot where training runs record [pre, epochs].
            data.append([1, 1])
        else:
            pre, epochs = inputs
            hparams = create_hparams(shared_layers=[30, 30], ts_layers=[5, 5], cs_layers=[5, 5],
                                     learning_rate=learningrate,
                                     shared=0, end=0, pre=pre, filters=0, epochs=epochs,
                                     reg_l1=0.0005, reg_l2=0, loss=loss,
                                     max_depth=pre, num_est=epochs,
                                     epsilon=0.0001, c=0.001,
                                     activation='relu', batch_size=16, verbose=0)
            model_mode, loss_mode = mode_lookup[mode]
            val_score, train_score, data = run_skf_train_val_test_error(
                model_mode=model_mode, loss_mode=loss_mode,
                fl=fl, fl_store=fl_store, test_fl=test_fl, ett_fl_store=ett_fl_store,
                model_name='{}_{}_{}_{}'.format(write_dir, model_mode, pre, epochs),
                hparams=hparams, k_folds=10, scoring='mse',
                save_model_name='/{}_{}_{}'.format(mode, pre, epochs),
                save_model=True,
                save_model_dir=write_dir + '/models',
                plot_name='{}/{}'.format(write_dir, str(inputs)))
            data.append([pre, epochs])

        data_store.append(data)
        # Re-written after every entry so partial results survive a later failure.
        with open('{}/data_store.pkl'.format(write_dir), "wb") as file:
            pickle.dump(data_store, file)

    read_hparam_data(data_store=data_store, write_dir=write_dir, ett_names=ett_names, print_s_df=False,
                     trainset_ett_idx=-4)