Ejemplo n.º 1
0
def run_gp_optim(company: str, target_column: str, split_perc: float,
                 imputation: str, featureset: str):
    """
    Run GPR offline optimization loop
    :param company: prefix for data in case company data is also used
    :param target_column: target column to use
    :param split_perc: share of train data
    :param imputation: imputation method
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, company=company, target_column=target_column,
                                                    split_perc=split_perc)

    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         company=company,
                                         target_column=target_column)

    # prepare parameter grid
    kernels = []
    base_kernels = [
        ConstantKernel(constant_value=1000, constant_value_bounds=(1e-5, 1e5)),
        Matern(length_scale=1.0, length_scale_bounds=(1e-5, 1e5)),
        ExpSineSquared(length_scale=1.0,
                       periodicity=seasonal_periods,
                       length_scale_bounds=(1e-5, 1e5),
                       periodicity_bounds=(int(seasonal_periods * 0.8),
                                           int(seasonal_periods * 1.2))),
        RBF(length_scale=1.0, length_scale_bounds=(1e-5, 1e5)),
        RationalQuadratic(length_scale=1.0,
                          alpha=1.0,
                          length_scale_bounds=(1e-5, 1e5),
                          alpha_bounds=(1e-5, 1e5)),
        WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-5, 1e5))
    ]
    TrainHelper.extend_kernel_combinations(kernels=kernels,
                                           base_kernels=base_kernels)
    param_grid = {
        'dataset': [datasets[0]],
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'kernel': kernels,
        'alpha': [1e-5, 1e-3, 1e-1, 1, 1e1, 1e3],
        'n_restarts_optimizer': [0, 5, 10],
        'standardize': [False, True],
        'norm_y': [False, True],
        'osa': [False]
    }
    # random sample from parameter grid
    sample_share = 0.1
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=sample_share)

    doc_results = None
    best_rmse = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        kernel = params_lst[i]['kernel']
        alpha = params_lst[i]['alpha']
        n_restarts_optimizer = params_lst[i]['n_restarts_optimizer']
        stand = params_lst[i]['standardize']
        norm_y = params_lst[i]['norm_y']
        one_step_ahead = params_lst[i]['osa']

        # dim_reduction can only be done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # 'dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsGaussianProcessRegression.GaussianProcessRegression(
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    kernel=kernel,
                    alpha=alpha,
                    n_restarts_optimizer=n_restarts_optimizer,
                    one_step_ahead=one_step_ahead,
                    standardize=stand,
                    normalize_y=norm_y)
                cross_val_dict = model.train(train=train, cross_val_call=True)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset':
                dataset.name,
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'kernel':
                kernel,
                'alpha':
                alpha,
                'n_restarts_optimizer':
                n_restarts_optimizer,
                'standardize':
                stand,
                'normalize_y':
                norm_y,
                'one_step_ahead':
                one_step_ahead,
                'optimized_kernel':
                model.model.kernel_
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'kernel':
                kernel,
                'alpha':
                alpha,
                'n_restarts_optimizer':
                n_restarts_optimizer,
                'standardize':
                stand,
                'normalize_y':
                norm_y,
                'one_step_ahead':
                one_step_ahead,
                'optimized_kernel':
                'failed'
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc=company +
                                 '-gp-sklearn_raw',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
Ejemplo n.º 2
0
def run_gp_optim(target_column: str, split_perc: float, imputation: str,
                 featureset: str):
    """
    Run whole GPR optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    kernels = []
    base_kernels = [
        SquaredExponential(),
        Matern52(),
        White(),
        RationalQuadratic(),
        Polynomial()
    ]
    for kern in base_kernels:
        if isinstance(kern, IsotropicStationary):
            base_kernels.append(Periodic(kern, period=seasonal_periods))
    TrainHelper.extend_kernel_combinations(kernels=kernels,
                                           base_kernels=base_kernels)
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'kernel': kernels,
        'mean_function': [None, gpflow.mean_functions.Constant()],
        'noise_variance': [0.01, 1, 10, 100],
        'optimizer': [gpflow.optimizers.Scipy()],
        'standardize_x': [False, True],
        'standardize_y': [False, True],
        'osa': [True]
    }
    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=0.2)

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        # deepcopy to prevent impact of previous optimizations
        kernel = gpflow.utilities.deepcopy(params_lst[i]['kernel'])
        mean_fct = gpflow.utilities.deepcopy(params_lst[i]['mean_function'])
        noise_var = params_lst[i]['noise_variance']
        optimizer = gpflow.utilities.deepcopy(params_lst[i]['optimizer'])
        stand_x = params_lst[i]['standardize_x']
        stand_y = params_lst[i]['standardize_y']
        one_step_ahead = params_lst[i]['osa']

        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        kernel_string, mean_fct_string, optimizer_string = get_docresults_strings(
            kernel=kernel, mean_function=mean_fct, optimizer=optimizer)
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsGPR.GaussianProcessRegressionGPFlow(
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    kernel=kernel,
                    mean_function=mean_fct,
                    noise_variance=noise_var,
                    optimizer=optimizer,
                    standardize_x=stand_x,
                    standardize_y=stand_y,
                    one_step_ahead=one_step_ahead)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset':
                dataset.name,
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'kernel':
                kernel_string,
                'mean_function':
                mean_fct_string,
                'noise_variance':
                noise_var,
                'optimizer':
                optimizer_string,
                'standardize_x':
                stand_x,
                'standardize_y':
                stand_y,
                'one_step_ahead':
                one_step_ahead,
                'optim_mod_params':
                model.model.parameters
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            # print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'kernel':
                kernel_string,
                'mean_function':
                mean_fct_string,
                'noise_variance':
                noise_var,
                'optimizer':
                optimizer_string,
                'standardize_x':
                stand_x,
                'standardize_y':
                stand_y,
                'one_step_ahead':
                one_step_ahead,
                'optim_mod_params':
                'failed'
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='gpr',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
def run_ann_optim(target_column: str, split_perc: float, imputation: str,
                  featureset: str):
    """
    Run whole ANN optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'dropout_rate': [0.0, 0.5],
        'batch_size': [4, 8, 16, 32],
        'learning_rate': [1e-4, 1e-3, 1e-2, 1e-1],
        'min_val_loss_improvement': [100, 1000],
        'max_epochs_wo_improvement': [20, 50, 100],
        'n_hidden': [10, 20, 50, 100],
        'num_hidden_layer': [1, 2, 3],
        'osa': [True]
    }
    # random samples from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=0.1)

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        dropout_rate = params_lst[i]['dropout_rate']
        batch_size = params_lst[i]['batch_size']
        learning_rate = params_lst[i]['learning_rate']
        min_val_loss_improvement = params_lst[i]['min_val_loss_improvement']
        max_epochs_wo_improvement = params_lst[i]['max_epochs_wo_improvement']
        one_step_ahead = params_lst[i]['osa']
        n_hidden = params_lst[i]['n_hidden']
        num_hidden_layer = params_lst[i]['num_hidden_layer']

        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsANN.AnnRegression(
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    one_step_ahead=one_step_ahead,
                    n_feature=train.shape[1] - 1,
                    n_hidden=n_hidden,
                    num_hidden_layer=num_hidden_layer,
                    dropout_rate=dropout_rate,
                    batch_size=batch_size,
                    learning_rate=learning_rate,
                    min_val_loss_improvement=min_val_loss_improvement,
                    max_epochs_wo_improvement=max_epochs_wo_improvement)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset':
                dataset.name,
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'algo':
                model.name,
                'dropout_rate':
                dropout_rate,
                'batch_size':
                batch_size,
                'learning_rate':
                learning_rate,
                'min_val_loss_improvement':
                min_val_loss_improvement,
                'max_epochs_wo_improvement':
                max_epochs_wo_improvement,
                'n_hidden':
                n_hidden,
                'num_hidden_layer':
                num_hidden_layer,
                'one_step_ahead':
                one_step_ahead
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'algo':
                model.name,
                'dropout_rate':
                dropout_rate,
                'batch_size':
                batch_size,
                'learning_rate':
                learning_rate,
                'min_val_loss_improvement':
                min_val_loss_improvement,
                'max_epochs_wo_improvement':
                max_epochs_wo_improvement,
                'n_hidden':
                n_hidden,
                'num_hidden_layer':
                num_hidden_layer,
                'one_step_ahead':
                one_step_ahead
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='ANN',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
def run_regressions_optim(target_column: str, split_perc: float, algo: str):
    """
    Run whole multiple linear regression optimization loops
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param algo: algo to use for optimization (['lasso', 'ridge', 'elasticnet', 'bayesridge', 'ard'])
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    multiple_nans_raw_set = config[target_column].getboolean(
        'multiple_nans_raw_set')
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    # parameters relevant for all algos
    param_grid = {
        'dataset': datasets,
        'imputation': ['mean', 'iterative', 'knn'],
        'featureset': ['full', 'cal', 'stat', 'none'],
        'dim_reduction': ['None', 'pca'],
        'normalize': [False, True],
        'osa': [True]
    }
    # parameters relevant for lasso, ridge and elasticnet
    if algo in ['lasso', 'ridge', 'elasticnet']:
        param_grid['alpha'] = [10**x for x in range(-5, 5)]
        if algo == 'elasticnet':
            param_grid['l1_ratio'] = np.arange(0.1, 1, 0.1)
        # random sample from parameter grid: all combis for lasso, ridge, elasticnet
        params_lst = TrainHelper.random_sample_parameter_grid(
            param_grid=param_grid, sample_share=1)
    # parameters relevant for bayesian ridge and ard regression
    else:
        param_grid['alpha_1'] = [10**x for x in range(-6, 1)]
        param_grid['alpha_2'] = [10**x for x in range(-6, -4)]
        param_grid['lambda_1'] = [10**x for x in range(-6, 1)]
        param_grid['lambda_2'] = [10**x for x in range(-6, 1)]
        # random sample from parameter grid: 0.25 share for bayesridge
        params_lst = TrainHelper.random_sample_parameter_grid(
            param_grid=param_grid, sample_share=0.2)
        if algo == 'ard':
            param_grid['threshold_lambda'] = [10**x for x in range(2, 6)]
            # random sample from parameter grid: 0.2 share for ard
            params_lst = TrainHelper.random_sample_parameter_grid(
                param_grid=param_grid, sample_share=0.2)
    # remove non-relevant featureset imputation combis
    if not multiple_nans_raw_set:
        params_lst_small = params_lst.copy()
        for param_set in params_lst:
            feat = param_set['featureset']
            imp = param_set['imputation']
            if (feat == 'cal' or feat == 'none') and (imp == 'iterative'
                                                      or imp == 'knn'):
                params_lst_small.remove(param_set)
        params_lst = params_lst_small

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        normalize = params_lst[i]['normalize']
        one_step_ahead = params_lst[i]['osa']
        l1_ratio = params_lst[i]['l1_ratio'] if 'l1_ratio' in params_lst[
            i] else None
        alpha = params_lst[i]['alpha'] if 'alpha' in params_lst[i] else None
        alpha_1 = params_lst[i]['alpha_1'] if 'alpha_1' in params_lst[
            i] else None
        alpha_2 = params_lst[i]['alpha_2'] if 'alpha_2' in params_lst[
            i] else None
        lambda_1 = params_lst[i]['lambda_1'] if 'lambda_1' in params_lst[
            i] else None
        lambda_2 = params_lst[i]['lambda_2'] if 'lambda_2' in params_lst[
            i] else None
        threshold_lambda = params_lst[i][
            'threshold_lambda'] if 'threshold_lambda' in params_lst[i] else None

        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsMLR.MultipleLinearRegression(
                    model_to_use=algo,
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    one_step_ahead=one_step_ahead,
                    normalize=normalize,
                    l1_ratio=l1_ratio,
                    alpha=alpha,
                    alpha_1=alpha_1,
                    alpha_2=alpha_2,
                    lambda_1=lambda_1,
                    lambda_2=lambda_2,
                    threshold_lambda=threshold_lambda)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset':
                dataset.name,
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'algo':
                model.name,
                'normalize':
                normalize,
                'alpha':
                alpha,
                'l1_ratio':
                l1_ratio,
                'alpha_1':
                alpha_1,
                'alpha_2':
                alpha_2,
                'lambda_1':
                lambda_1,
                'lambda_2':
                lambda_2,
                'threshold_lambda':
                threshold_lambda,
                'one_step_ahead':
                one_step_ahead,
                'fitted_coef':
                model.model.coef_,
                'fitted_intercept':
                model.model.intercept_
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'algo':
                model.name,
                'normalize':
                normalize,
                'alpha':
                alpha,
                'l1_ratio':
                l1_ratio,
                'alpha_1':
                alpha_1,
                'alpha_2':
                alpha_2,
                'lambda_1':
                lambda_1,
                'lambda_2':
                lambda_2,
                'threshold_lambda':
                threshold_lambda,
                'one_step_ahead':
                one_step_ahead,
                'fitted_coef':
                'failed',
                'fitted_intercept':
                'failed'
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc=algo,
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
Ejemplo n.º 5
0
def run_xgb_optim(target_column: str, split_perc: float, imputation: str,
                  featureset: str):
    """
    Run whole XGB optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'learning_rate': [0.05, 0.1, 0.3],
        'max_depth': [3, 5, 10],
        'subsample': [0.3, 0.7, 1],
        'n_estimators': [10, 100, 1000],
        'gamma': [0, 1, 10],
        'alpha': [0, 0.1, 1, 10],
        'reg_lambda': [0, 0.1, 1, 10],
        'osa': [True]
    }

    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=0.2)

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        learning_rate = params_lst[i]['learning_rate']
        max_depth = params_lst[i]['max_depth']
        subsample = params_lst[i]['subsample']
        n_estimators = params_lst[i]['n_estimators']
        gamma = params_lst[i]['gamma']
        alpha = params_lst[i]['alpha']
        reg_lambda = params_lst[i]['reg_lambda']
        one_step_ahead = params_lst[i]['osa']

        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelXGBoost.XGBoostRegression(
                    target_column=target_column,
                    seasonal_periods=seasonal_periods,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    subsample=subsample,
                    n_estimators=n_estimators,
                    gamma=gamma,
                    alpha=alpha,
                    reg_lambda=reg_lambda,
                    one_step_ahead=one_step_ahead)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset':
                dataset.name,
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'learning_rate':
                learning_rate,
                'max_depth':
                max_depth,
                'subsample':
                subsample,
                'n_estimators':
                n_estimators,
                'gamma':
                gamma,
                'alpha':
                alpha,
                'lambda':
                reg_lambda,
                'one_step_ahead':
                one_step_ahead
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'learning_rate':
                learning_rate,
                'max_depth':
                max_depth,
                'subsample':
                subsample,
                'n_estimators':
                n_estimators,
                'gamma':
                gamma,
                'alpha':
                alpha,
                'lambda':
                reg_lambda,
                'one_step_ahead':
                one_step_ahead
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='xgb',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')
def run_sarimax_optim(target_column: str, split_perc: float, imputation: str,
                      featureset: str, univariate: bool):
    """
    Run whole (S)ARIMA(X) optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    :param univariate: use univariate version SARIMA as well
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')
    # get optim parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column, split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config,
                                         target_column=target_column)
    # prepare parameter grid
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'p': [0, 1, 2, 3],
        'd': [0, 1],
        'q': [0, 1, 2, 3],
        'P': [0, 1, 2, 3],
        'D': [0, 1],
        'Q': [0, 1, 2, 3],
        'osa': [True],
        'transf': [False, 'log', 'pw'],
        'exog': [True],
        'wi': [True]
    }
    if univariate:
        param_grid['exog'] = [False, True]
    # random sample from parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(
        param_grid=param_grid, sample_share=0.2)

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i][
            'dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        p = params_lst[i]['p']
        d = params_lst[i]['d']
        q = params_lst[i]['q']
        P = params_lst[i]['P']
        D = params_lst[i]['D']
        Q = params_lst[i]['Q']
        one_step_ahead = params_lst[i]['osa']
        transf = params_lst[i]['transf']
        power, log = TrainHelper.get_pw_l_for_transf(transf=transf)
        use_exog = params_lst[i]['exog']
        with_interc = params_lst[i]['wi']
        order = [p, d, q]
        seasonal_order = [P, D, Q, seasonal_periods]

        # dim_reduction only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and
                (imputation == imputation_last) and
                (dim_reduction == dim_reduction_last) and
                (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' +
                  str('None' if imputation is None else imputation) + ' ' +
                  str('None' if dim_reduction is None else dim_reduction) +
                  ' ' + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset,
                config=config,
                init_train_len=init_train_len,
                test_len=test_len,
                split_perc=split_perc,
                imputation=imputation,
                target_column=target_column,
                dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsARIMA.ARIMA(target_column=target_column,
                                          order=order,
                                          seasonal_order=seasonal_order,
                                          one_step_ahead=one_step_ahead,
                                          power_transf=power,
                                          log=log,
                                          use_exog=use_exog,
                                          with_intercept=with_interc)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {
                k: v / len(train_test_list)
                for k, v in sum_dict.items()
            }
            params_dict = {
                'dataset':
                dataset.name,
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'order':
                order,
                'seasonal_order':
                seasonal_order,
                'one_step_ahead':
                one_step_ahead,
                'power_transform':
                power,
                'log_transform':
                log,
                'use_exog':
                use_exog,
                'with_intercept':
                with_interc
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict,
                best_rmse=best_rmse,
                best_mape=best_mape,
                best_smape=best_smape,
                run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            print(exc)
            params_dict = {
                'dataset':
                'Failure',
                'featureset':
                featureset,
                'imputation':
                str('None' if imputation is None else imputation),
                'dim_reduction':
                str('None' if dim_reduction is None else dim_reduction),
                'init_train_len':
                init_train_len,
                'test_len':
                test_len,
                'split_perc':
                split_perc,
                'order':
                order,
                'seasonal_order':
                seasonal_order,
                'one_step_ahead':
                one_step_ahead,
                'power_transform':
                power,
                'log_transform':
                log,
                'use_exog':
                use_exog,
                'with_intercept':
                with_interc
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='sarima-x',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')