def run_gp_optim(company: str, target_column: str, split_perc: float, imputation: str, featureset: str):
    """
    Run GPR offline optimization loop
    :param company: prefix for data in case company data is also used
    :param target_column: target column to use
    :param split_perc: share of train data
    :param imputation: imputation method
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, company=company, target_column=target_column,
                                                    split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, company=company, target_column=target_column)
    # prepare parameter grid
    kernels = []
    base_kernels = [ConstantKernel(constant_value=1000, constant_value_bounds=(1e-5, 1e5)),
                    Matern(length_scale=1.0, length_scale_bounds=(1e-5, 1e5)),
                    ExpSineSquared(length_scale=1.0, periodicity=seasonal_periods, length_scale_bounds=(1e-5, 1e5),
                                   periodicity_bounds=(int(seasonal_periods * 0.8), int(seasonal_periods * 1.2))),
                    RBF(length_scale=1.0, length_scale_bounds=(1e-5, 1e5)),
                    RationalQuadratic(length_scale=1.0, alpha=1.0, length_scale_bounds=(1e-5, 1e5),
                                      alpha_bounds=(1e-5, 1e5)),
                    WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-5, 1e5))]
    TrainHelper.extend_kernel_combinations(kernels=kernels, base_kernels=base_kernels)
    param_grid = {'dataset': [datasets[0]],
                  'imputation': [imputation],
                  'featureset': [featureset],
                  'dim_reduction': ['None', 'pca'],
                  'kernel': kernels,
                  'alpha': [1e-5, 1e-3, 1e-1, 1, 1e1, 1e3],
                  'n_restarts_optimizer': [0, 5, 10],
                  'standardize': [False, True],
                  'norm_y': [False, True],
                  'osa': [False]
                  }
    # random sample from parameter grid
    sample_share = 0.1
    params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=sample_share)
    doc_results = None
    best_rmse = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        kernel = params_lst[i]['kernel']
        alpha = params_lst[i]['alpha']
        n_restarts_optimizer = params_lst[i]['n_restarts_optimizer']
        stand = params_lst[i]['standardize']
        norm_y = params_lst[i]['norm_y']
        one_step_ahead = params_lst[i]['osa']
        # dim_reduction can only be done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(dataset=dataset, config=config,
                                                                   init_train_len=init_train_len,
                                                                   test_len=test_len, split_perc=split_perc,
                                                                   imputation=imputation,
                                                                   target_column=target_column,
                                                                   dimensionality_reduction=dim_reduction,
                                                                   featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsGaussianProcessRegression.GaussianProcessRegression(
                    target_column=target_column, seasonal_periods=seasonal_periods, kernel=kernel, alpha=alpha,
                    n_restarts_optimizer=n_restarts_optimizer, one_step_ahead=one_step_ahead, standardize=stand,
                    normalize_y=norm_y)
                cross_val_dict = model.train(train=train, cross_val_call=True)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name, 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'kernel': kernel, 'alpha': alpha, 'n_restarts_optimizer': n_restarts_optimizer,
                           'standardize': stand, 'normalize_y': norm_y, 'one_step_ahead': one_step_ahead,
                           'optimized_kernel': model.model.kernel_}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse = TrainHelper.print_best_vals(evaluation_dict=evaluation_dict, best_rmse=best_rmse, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure', 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'kernel': kernel, 'alpha': alpha, 'n_restarts_optimizer': n_restarts_optimizer,
                           'standardize': stand, 'normalize_y': norm_y, 'one_step_ahead': one_step_ahead,
                           'optimized_kernel': 'failed'}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc=company + '-gp-sklearn_raw', target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 featuresets=param_grid['featureset'], imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
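# Usage sketch (illustration only, not taken from the original code): a single sklearn-GPR optimization
# run could be launched like this. 'CompanyA' and 'SoldUnits' are purely hypothetical names; the remaining
# values reuse options that appear in the grids in this file.
#
#     run_gp_optim(company='CompanyA', target_column='SoldUnits', split_perc=0.8,
#                  imputation='mean', featureset='full')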
def run_ann_optim(target_column: str, split_perc: float, imputation: str, featureset: str):
    """
    Run whole ANN optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)
    # prepare parameter grid
    param_grid = {'dataset': datasets,
                  'imputation': [imputation],
                  'featureset': [featureset],
                  'dim_reduction': ['None', 'pca'],
                  'dropout_rate': [0.0, 0.5],
                  'batch_size': [4, 8, 16, 32],
                  'learning_rate': [1e-4, 1e-3, 1e-2, 1e-1],
                  'min_val_loss_improvement': [100, 1000],
                  'max_epochs_wo_improvement': [20, 50, 100],
                  'n_hidden': [10, 20, 50, 100],
                  'num_hidden_layer': [1, 2, 3],
                  'osa': [True]
                  }
    # random samples from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.1)
    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        dropout_rate = params_lst[i]['dropout_rate']
        batch_size = params_lst[i]['batch_size']
        learning_rate = params_lst[i]['learning_rate']
        min_val_loss_improvement = params_lst[i]['min_val_loss_improvement']
        max_epochs_wo_improvement = params_lst[i]['max_epochs_wo_improvement']
        one_step_ahead = params_lst[i]['osa']
        n_hidden = params_lst[i]['n_hidden']
        num_hidden_layer = params_lst[i]['num_hidden_layer']
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(dataset=dataset, config=config,
                                                                   init_train_len=init_train_len,
                                                                   test_len=test_len, split_perc=split_perc,
                                                                   imputation=imputation,
                                                                   target_column=target_column,
                                                                   dimensionality_reduction=dim_reduction,
                                                                   featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsANN.AnnRegression(target_column=target_column, seasonal_periods=seasonal_periods,
                                                one_step_ahead=one_step_ahead, n_feature=train.shape[1] - 1,
                                                n_hidden=n_hidden, num_hidden_layer=num_hidden_layer,
                                                dropout_rate=dropout_rate, batch_size=batch_size,
                                                learning_rate=learning_rate,
                                                min_val_loss_improvement=min_val_loss_improvement,
                                                max_epochs_wo_improvement=max_epochs_wo_improvement)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name, 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'algo': model.name, 'dropout_rate': dropout_rate, 'batch_size': batch_size,
                           'learning_rate': learning_rate, 'min_val_loss_improvement': min_val_loss_improvement,
                           'max_epochs_wo_improvement': max_epochs_wo_improvement, 'n_hidden': n_hidden,
                           'num_hidden_layer': num_hidden_layer, 'one_step_ahead': one_step_ahead}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(evaluation_dict=evaluation_dict,
                                                                           best_rmse=best_rmse, best_mape=best_mape,
                                                                           best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure', 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'algo': model.name, 'dropout_rate': dropout_rate, 'batch_size': batch_size,
                           'learning_rate': learning_rate, 'min_val_loss_improvement': min_val_loss_improvement,
                           'max_epochs_wo_improvement': max_epochs_wo_improvement, 'n_hidden': n_hidden,
                           'num_hidden_layer': num_hidden_layer, 'one_step_ahead': one_step_ahead}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='ANN', target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 featuresets=param_grid['featureset'], imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
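# Note (editor's interpretation, not from the original code): min_val_loss_improvement and
# max_epochs_wo_improvement presumably parameterize early stopping inside ModelsANN.AnnRegression,
# i.e. training is assumed to stop once the validation loss has not improved by at least
# min_val_loss_improvement for max_epochs_wo_improvement consecutive epochs. With the sampled values
# 1000 and 50, for example, a run would stop after 50 epochs without an improvement of at least 1000.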
def run_gp_optim(target_column: str, split_perc: float, imputation: str, featureset: str):
    """
    Run whole GPR optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)
    # prepare parameter grid
    kernels = []
    base_kernels = [SquaredExponential(), Matern52(), White(), RationalQuadratic(), Polynomial()]
    # iterate over a snapshot so the Periodic kernels appended here are not revisited by the loop itself
    for kern in list(base_kernels):
        if isinstance(kern, IsotropicStationary):
            base_kernels.append(Periodic(kern, period=seasonal_periods))
    TrainHelper.extend_kernel_combinations(kernels=kernels, base_kernels=base_kernels)
    param_grid = {'dataset': datasets,
                  'imputation': [imputation],
                  'featureset': [featureset],
                  'dim_reduction': ['None', 'pca'],
                  'kernel': kernels,
                  'mean_function': [None, gpflow.mean_functions.Constant()],
                  'noise_variance': [0.01, 1, 10, 100],
                  'optimizer': [gpflow.optimizers.Scipy()],
                  'standardize_x': [False, True],
                  'standardize_y': [False, True],
                  'osa': [True]
                  }
    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.2)
    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        # deepcopy to prevent impact of previous optimizations
        kernel = gpflow.utilities.deepcopy(params_lst[i]['kernel'])
        mean_fct = gpflow.utilities.deepcopy(params_lst[i]['mean_function'])
        noise_var = params_lst[i]['noise_variance']
        optimizer = gpflow.utilities.deepcopy(params_lst[i]['optimizer'])
        stand_x = params_lst[i]['standardize_x']
        stand_y = params_lst[i]['standardize_y']
        one_step_ahead = params_lst[i]['osa']
        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(dataset=dataset, config=config,
                                                                   init_train_len=init_train_len,
                                                                   test_len=test_len, split_perc=split_perc,
                                                                   imputation=imputation,
                                                                   target_column=target_column,
                                                                   dimensionality_reduction=dim_reduction,
                                                                   featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset
        kernel_string, mean_fct_string, optimizer_string = get_docresults_strings(kernel=kernel,
                                                                                  mean_function=mean_fct,
                                                                                  optimizer=optimizer)
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsGPR.GaussianProcessRegressionGPFlow(target_column=target_column,
                                                                  seasonal_periods=seasonal_periods, kernel=kernel,
                                                                  mean_function=mean_fct, noise_variance=noise_var,
                                                                  optimizer=optimizer, standardize_x=stand_x,
                                                                  standardize_y=stand_y,
                                                                  one_step_ahead=one_step_ahead)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name, 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'kernel': kernel_string, 'mean_function': mean_fct_string, 'noise_variance': noise_var,
                           'optimizer': optimizer_string, 'standardize_x': stand_x, 'standardize_y': stand_y,
                           'one_step_ahead': one_step_ahead, 'optim_mod_params': model.model.parameters}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(evaluation_dict=evaluation_dict,
                                                                           best_rmse=best_rmse, best_mape=best_mape,
                                                                           best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            # print(exc)
            params_dict = {'dataset': 'Failure', 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'kernel': kernel_string, 'mean_function': mean_fct_string, 'noise_variance': noise_var,
                           'optimizer': optimizer_string, 'standardize_x': stand_x, 'standardize_y': stand_y,
                           'one_step_ahead': one_step_ahead, 'optim_mod_params': 'failed'}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='gpr', target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 featuresets=param_grid['featureset'], imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
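# Note (illustration, not part of the original code): gpflow kernel and optimizer objects are stateful,
# so reusing one instance across configurations would carry trained hyperparameters from run to run.
# The gpflow.utilities.deepcopy calls in run_gp_optim above therefore hand each configuration a fresh
# copy, e.g.:
#
#     kernel_template = gpflow.kernels.SquaredExponential()
#     kernel_run_1 = gpflow.utilities.deepcopy(kernel_template)  # gets trained in run 1
#     kernel_run_2 = gpflow.utilities.deepcopy(kernel_template)  # still starts at the initial values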
def run_regressions_optim(target_column: str, split_perc: float, algo: str):
    """
    Run whole multiple linear regression optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param algo: algo to use for optimization (['lasso', 'ridge', 'elasticnet', 'bayesridge', 'ard'])
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    multiple_nans_raw_set = config[target_column].getboolean('multiple_nans_raw_set')
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)
    # prepare parameter grid
    # parameters relevant for all algos
    param_grid = {'dataset': datasets,
                  'imputation': ['mean', 'iterative', 'knn'],
                  'featureset': ['full', 'cal', 'stat', 'none'],
                  'dim_reduction': ['None', 'pca'],
                  'normalize': [False, True],
                  'osa': [True]
                  }
    # parameters relevant for lasso, ridge and elasticnet
    if algo in ['lasso', 'ridge', 'elasticnet']:
        param_grid['alpha'] = [10**x for x in range(-5, 5)]
        if algo == 'elasticnet':
            param_grid['l1_ratio'] = np.arange(0.1, 1, 0.1)
        # random sample from parameter grid: all combis for lasso, ridge, elasticnet
        params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=1)
    # parameters relevant for bayesian ridge and ard regression
    else:
        param_grid['alpha_1'] = [10**x for x in range(-6, 1)]
        param_grid['alpha_2'] = [10**x for x in range(-6, -4)]
        param_grid['lambda_1'] = [10**x for x in range(-6, 1)]
        param_grid['lambda_2'] = [10**x for x in range(-6, 1)]
        # random sample from parameter grid: 0.2 share for bayesridge
        params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.2)
        if algo == 'ard':
            param_grid['threshold_lambda'] = [10**x for x in range(2, 6)]
            # random sample from parameter grid: 0.2 share for ard
            params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.2)
    # remove non-relevant featureset imputation combis
    if not multiple_nans_raw_set:
        params_lst_small = params_lst.copy()
        for param_set in params_lst:
            feat = param_set['featureset']
            imp = param_set['imputation']
            if (feat == 'cal' or feat == 'none') and (imp == 'iterative' or imp == 'knn'):
                params_lst_small.remove(param_set)
        params_lst = params_lst_small
    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        normalize = params_lst[i]['normalize']
        one_step_ahead = params_lst[i]['osa']
        l1_ratio = params_lst[i]['l1_ratio'] if 'l1_ratio' in params_lst[i] else None
        alpha = params_lst[i]['alpha'] if 'alpha' in params_lst[i] else None
        alpha_1 = params_lst[i]['alpha_1'] if 'alpha_1' in params_lst[i] else None
        alpha_2 = params_lst[i]['alpha_2'] if 'alpha_2' in params_lst[i] else None
        lambda_1 = params_lst[i]['lambda_1'] if 'lambda_1' in params_lst[i] else None
        lambda_2 = params_lst[i]['lambda_2'] if 'lambda_2' in params_lst[i] else None
        threshold_lambda = params_lst[i]['threshold_lambda'] if 'threshold_lambda' in params_lst[i] else None
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(dataset=dataset, config=config,
                                                                   init_train_len=init_train_len,
                                                                   test_len=test_len, split_perc=split_perc,
                                                                   imputation=imputation,
                                                                   target_column=target_column,
                                                                   dimensionality_reduction=dim_reduction,
                                                                   featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsMLR.MultipleLinearRegression(model_to_use=algo, target_column=target_column,
                                                           seasonal_periods=seasonal_periods,
                                                           one_step_ahead=one_step_ahead, normalize=normalize,
                                                           l1_ratio=l1_ratio, alpha=alpha, alpha_1=alpha_1,
                                                           alpha_2=alpha_2, lambda_1=lambda_1, lambda_2=lambda_2,
                                                           threshold_lambda=threshold_lambda)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name, 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'algo': model.name, 'normalize': normalize, 'alpha': alpha, 'l1_ratio': l1_ratio,
                           'alpha_1': alpha_1, 'alpha_2': alpha_2, 'lambda_1': lambda_1, 'lambda_2': lambda_2,
                           'threshold_lambda': threshold_lambda, 'one_step_ahead': one_step_ahead,
                           'fitted_coef': model.model.coef_, 'fitted_intercept': model.model.intercept_}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(evaluation_dict=evaluation_dict,
                                                                           best_rmse=best_rmse, best_mape=best_mape,
                                                                           best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure', 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'algo': model.name, 'normalize': normalize, 'alpha': alpha, 'l1_ratio': l1_ratio,
                           'alpha_1': alpha_1, 'alpha_2': alpha_2, 'lambda_1': lambda_1, 'lambda_2': lambda_2,
                           'threshold_lambda': threshold_lambda, 'one_step_ahead': one_step_ahead,
                           'fitted_coef': 'failed', 'fitted_intercept': 'failed'}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc=algo, target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 featuresets=param_grid['featureset'], imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
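# Usage sketch (illustration only): the same loop serves all five regression variants via the algo
# argument; 'SoldUnits' is a hypothetical target column name.
#
#     for algo in ['lasso', 'ridge', 'elasticnet', 'bayesridge', 'ard']:
#         run_regressions_optim(target_column='SoldUnits', split_perc=0.8, algo=algo)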
def run_xgb_optim(target_column: str, split_perc: float, imputation: str, featureset: str):
    """
    Run whole XGB optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)
    # prepare parameter grid
    param_grid = {'dataset': datasets,
                  'imputation': [imputation],
                  'featureset': [featureset],
                  'dim_reduction': ['None', 'pca'],
                  'learning_rate': [0.05, 0.1, 0.3],
                  'max_depth': [3, 5, 10],
                  'subsample': [0.3, 0.7, 1],
                  'n_estimators': [10, 100, 1000],
                  'gamma': [0, 1, 10],
                  'alpha': [0, 0.1, 1, 10],
                  'reg_lambda': [0, 0.1, 1, 10],
                  'osa': [True]
                  }
    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.2)
    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        learning_rate = params_lst[i]['learning_rate']
        max_depth = params_lst[i]['max_depth']
        subsample = params_lst[i]['subsample']
        n_estimators = params_lst[i]['n_estimators']
        gamma = params_lst[i]['gamma']
        alpha = params_lst[i]['alpha']
        reg_lambda = params_lst[i]['reg_lambda']
        one_step_ahead = params_lst[i]['osa']
        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(dataset=dataset, config=config,
                                                                   init_train_len=init_train_len,
                                                                   test_len=test_len, split_perc=split_perc,
                                                                   imputation=imputation,
                                                                   target_column=target_column,
                                                                   dimensionality_reduction=dim_reduction,
                                                                   featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelXGBoost.XGBoostRegression(target_column=target_column,
                                                       seasonal_periods=seasonal_periods,
                                                       learning_rate=learning_rate, max_depth=max_depth,
                                                       subsample=subsample, n_estimators=n_estimators, gamma=gamma,
                                                       alpha=alpha, reg_lambda=reg_lambda,
                                                       one_step_ahead=one_step_ahead)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name, 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'learning_rate': learning_rate, 'max_depth': max_depth, 'subsample': subsample,
                           'n_estimators': n_estimators, 'gamma': gamma, 'alpha': alpha, 'lambda': reg_lambda,
                           'one_step_ahead': one_step_ahead}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(evaluation_dict=evaluation_dict,
                                                                           best_rmse=best_rmse, best_mape=best_mape,
                                                                           best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure', 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'learning_rate': learning_rate, 'max_depth': max_depth, 'subsample': subsample,
                           'n_estimators': n_estimators, 'gamma': gamma, 'alpha': alpha, 'lambda': reg_lambda,
                           'one_step_ahead': one_step_ahead}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='xgb', target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 featuresets=param_grid['featureset'], imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
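# Note (added for clarity): in XGBoost terms, gamma is the minimum loss reduction required to make a
# further split, alpha is the L1 regularization weight and reg_lambda the L2 regularization weight,
# so the grid above varies tree complexity (max_depth, n_estimators, subsample) and regularization
# strength independently.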
i = 1
for target_column in result_file_dict.keys():
    print('++++++ Processing Dataset ' + str(i) + '/' + str(len(result_file_dict.keys())) + ' ++++++')
    i += 1
    # set standard values
    split_perc = 0.8
    company = 'General'
    doc_results = None
    result_file_str = result_file_dict[target_column]
    # read config file
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, company=company, target_column=target_column,
                                                    split_perc=split_perc)
    # set const hazard and scale window based on seasonal periods
    const_hazard = const_hazard_factor * seasonal_periods if const_hazard_user == 9999 else const_hazard_user
    scale_window = max(scale_window_minimum, int(scale_window_factor * seasonal_periods))
    max_samples = max_samples_factor * seasonal_periods if max_samples_factor is not None else max_samples_user
    # read and clean result file
    result_file = pd.read_csv(optim_results_dir + result_file_str, sep=';', decimal=',', index_col=False)
    result_file.drop('Unnamed: 0', axis=1, inplace=True)
    result_file.replace(to_replace='NaN', value=np.nan, inplace=True)
    result_file.drop(result_file.index[result_file['shuf_cv_rmse_std'].isna()], inplace=True)
    result_file.dropna(subset=[el for el in result_file.columns if 'cv' in el], inplace=True)
    result_file.drop(result_file.index[result_file['shuf_cv_rmse_std'] == 0], inplace=True)
    sort_col = 'shuf_cv_rmse_mean'
    sorted_results = result_file.sort_values(sort_col)
    top_config = sorted_results.head(1).iloc[0]
    dict_top_config = TrainHelper.read_config_info(top_config, seasonal_periods)
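# Worked example (assumed numbers, purely for illustration): with weekly data and seasonal_periods = 52,
# const_hazard_factor = 2, const_hazard_user = 9999 (the sentinel meaning "derive from the factor"),
# scale_window_minimum = 2, scale_window_factor = 0.1 and max_samples_factor = 3, the settings above
# resolve to const_hazard = 104, scale_window = max(2, int(5.2)) = 5 and max_samples = 156.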
def run_sarimax_optim(target_column: str, split_perc: float, imputation: str, featureset: str, univariate: bool):
    """
    Run whole (S)ARIMA(X) optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    :param univariate: whether to also run the univariate SARIMA version (without exogenous features)
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)
    # prepare parameter grid
    param_grid = {'dataset': datasets,
                  'imputation': [imputation],
                  'featureset': [featureset],
                  'dim_reduction': ['None', 'pca'],
                  'p': [0, 1, 2, 3],
                  'd': [0, 1],
                  'q': [0, 1, 2, 3],
                  'P': [0, 1, 2, 3],
                  'D': [0, 1],
                  'Q': [0, 1, 2, 3],
                  'osa': [True],
                  'transf': [False, 'log', 'pw'],
                  'exog': [True],
                  'wi': [True]
                  }
    if univariate:
        param_grid['exog'] = [False, True]
    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.2)
    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        p = params_lst[i]['p']
        d = params_lst[i]['d']
        q = params_lst[i]['q']
        P = params_lst[i]['P']
        D = params_lst[i]['D']
        Q = params_lst[i]['Q']
        one_step_ahead = params_lst[i]['osa']
        transf = params_lst[i]['transf']
        power, log = TrainHelper.get_pw_l_for_transf(transf=transf)
        use_exog = params_lst[i]['exog']
        with_interc = params_lst[i]['wi']
        order = [p, d, q]
        seasonal_order = [P, D, Q, seasonal_periods]
        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(dataset=dataset, config=config,
                                                                   init_train_len=init_train_len,
                                                                   test_len=test_len, split_perc=split_perc,
                                                                   imputation=imputation,
                                                                   target_column=target_column,
                                                                   dimensionality_reduction=dim_reduction,
                                                                   featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsARIMA.ARIMA(target_column=target_column, order=order, seasonal_order=seasonal_order,
                                          one_step_ahead=one_step_ahead, power_transf=power, log=log,
                                          use_exog=use_exog, with_intercept=with_interc)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name, 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'order': order, 'seasonal_order': seasonal_order, 'one_step_ahead': one_step_ahead,
                           'power_transform': power, 'log_transform': log, 'use_exog': use_exog,
                           'with_intercept': with_interc}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(evaluation_dict=evaluation_dict,
                                                                           best_rmse=best_rmse, best_mape=best_mape,
                                                                           best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure', 'featureset': featureset,
                           'imputation': str('None' if imputation is None else imputation),
                           'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'order': order, 'seasonal_order': seasonal_order, 'one_step_ahead': one_step_ahead,
                           'power_transform': power, 'log_transform': log, 'use_exog': use_exog,
                           'with_intercept': with_interc}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='sarima-x', target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 featuresets=param_grid['featureset'], imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
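# Note (added for clarity): order and seasonal_order follow the usual SARIMA notation, i.e. a sampled
# configuration with p=1, d=0, q=1, P=1, D=1, Q=0 and seasonal_periods=52 (an assumed example value)
# corresponds to SARIMA(1, 0, 1)(1, 1, 0)_52, built above as order=[1, 0, 1] and seasonal_order=[1, 1, 0, 52].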
def run_es_optim(target_column: str, split_perc: float, imputation: str):
    """
    Run whole ES optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)
    # prepare parameter grid
    param_grid = {'dataset': datasets,
                  'imputation': [imputation],
                  'trend': ['add', None],
                  'damp': [False, True],
                  'seasonality': ['add', 'mul', None],
                  'remove_bias': [False, True],
                  'brute': [False, True],
                  'osa': [True],
                  'transf': [False, 'log', 'pw']
                  }
    # random sample from parameter grid
    params_lst = sorted(list(sklearn.model_selection.ParameterSampler(
        param_distributions=param_grid, n_iter=int(1 * MixedHelper.get_product_len_dict(dictionary=param_grid)),
        random_state=np.random.RandomState(42))),
        key=lambda d: (d['dataset'].name, d['imputation']))
    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        tr = params_lst[i]['trend']
        damp = params_lst[i]['damp']
        season = params_lst[i]['seasonality']
        remo_bias = params_lst[i]['remove_bias']
        brute = params_lst[i]['brute']
        one_step_ahead = params_lst[i]['osa']
        transf = params_lst[i]['transf']
        power, log = TrainHelper.get_pw_l_for_transf(transf=transf)
        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(dataset=dataset, config=config,
                                                                   init_train_len=init_train_len,
                                                                   test_len=test_len, split_perc=split_perc,
                                                                   imputation=imputation,
                                                                   target_column=target_column, reset_index=True)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsES.ExponentialSmoothing(target_column=target_column, trend=tr, damped=damp,
                                                      seasonal=season, seasonal_periods=seasonal_periods,
                                                      remove_bias=remo_bias, use_brute=brute,
                                                      one_step_ahead=one_step_ahead, power_transf=power, log=log)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {'dataset': dataset.name,
                           'imputation': str('None' if imputation is None else imputation),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'trend': tr, 'damped': damp, 'seasonal': season, 'seasonal_periods': seasonal_periods,
                           'remove_bias': remo_bias, 'use_brute': brute, 'one_step_ahead': one_step_ahead,
                           'power_transform': power, 'log': log}
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(evaluation_dict=evaluation_dict,
                                                                           best_rmse=best_rmse, best_mape=best_mape,
                                                                           best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {'dataset': 'Failure',
                           'imputation': str('None' if imputation is None else imputation),
                           'init_train_len': init_train_len, 'test_len': test_len, 'split_perc': split_perc,
                           'trend': tr, 'damped': damp, 'seasonal': season, 'seasonal_periods': seasonal_periods,
                           'remove_bias': remo_bias, 'use_brute': brute, 'one_step_ahead': one_step_ahead,
                           'power_transform': power, 'log': log}
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results, save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='es', target_column=target_column,
                                 seasonal_periods=seasonal_periods, datasets=datasets,
                                 imputations=param_grid['imputation'], split_perc=split_perc)
    print('Optimization Done. Saved Results.')
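# Usage sketch (added for illustration, not part of the original script): the ES loop above could be
# started directly like this; 'SoldUnits' is a purely hypothetical target column and the other values
# reuse settings that appear elsewhere in this file.
if __name__ == '__main__':
    run_es_optim(target_column='SoldUnits', split_perc=0.8, imputation='mean')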