Beispiel #1
0
def column_transformer_and_cols(request):
    if "with_duplicated_name" in request.param:
        tran = TSColumnTransformer(
            transformers=[
                ("scaler", StandardScaler(), ["trend"]),
                ("raw_cols", "passthrough", ["trend", "one_hot"]),
            ]
        )
        cols = ["trend_scaler", "trend", "one_hot"]

    elif "with_transformer_creating_many_cols" in request.param:
        tran = TSColumnTransformer(
            transformers=[
                ("raw_cols", "passthrough", ["trend", "one_hot"]),
                ("one_hot", OneHotEncoder(), ["one_hot"],),
            ]
        )
        cols = ["trend", "one_hot", "x0_1", "x0_2", "x0_3", "x0_4"]

    elif "passthrough_columns_in_the_middle" in request.param:
        tran = TSColumnTransformer(
            transformers=[
                ("one_hot", OneHotEncoder(), ["one_hot"]),
                ("raw_cols", "passthrough", ["one_hot"]),
                ("scaler", StandardScaler(), ["trend"]),
            ]
        )
        cols = ["x0_1", "x0_2", "x0_3", "x0_4", "one_hot", "trend"]

    return tran, cols
def test_target_transformer():
    X, y = generate_tsdata(n_dates=365 * 2)
    X["trend"] = np.arange(len(X))

    preprocessing = TSColumnTransformer(transformers=[("scaler",
                                                       StandardScaler(),
                                                       ["trend"])])
    # define random forest model
    rf_model = get_sklearn_wrapper(RandomForestRegressor)
    # glue it together
    sklearn_model_pipeline = Pipeline([("preprocessing", preprocessing),
                                       ("model", rf_model)])

    scaled_pipeline = TargetTransformer(sklearn_model_pipeline,
                                        StandardScaler())

    preds = scaled_pipeline.fit(X[:-10], y[:-10]).predict(X[-10:])

    assert hasattr(scaled_pipeline, "named_steps")
    assert isinstance(scaled_pipeline.y_transformer, StandardScaler)
    assert isinstance(preds, pd.DataFrame)
def get_gridsearch(
    frequency,
    horizon=10,
    n_splits=5,
    between_split_lag=None,
    scoring="neg_mean_absolute_error",
    country_code_column=None,
    country_code=None,
    holidays_days_before=0,
    holidays_days_after=0,
    holidays_bridge_days=False,
    sklearn_models=True,
    sklearn_models_optimize_for_horizon=False,
    autosarimax_models=False,
    autoarima_dict=None,
    prophet_models=False,
    tbats_models=False,
    exp_smooth_models=False,
    theta_models=False,
    average_ensembles=False,
    stacking_ensembles=False,
    stacking_ensembles_train_horizon=10,
    stacking_ensembles_train_n_splits=20,
    clip_predictions_lower=None,
    clip_predictions_upper=None,
    exog_cols=None,
):
    """Get grid search object based on selection criteria.

    Parameters
    ----------
    frequency : str
        Frequency of timeseries. Pandas compatible frequncies

    horizon : int
        How many units of frequency (e.g. 4 quarters), should be used to find the best models

    n_splits : int
        How many cross-validation folds should be used in model selection

    between_split_lag : int
        How big lag of observations should cv_splits have
        If kept as None, horizon is used resulting in non-overlaping cv_splits

    scoring : str, callable
        String of sklearn regression metric name, or hcrystalball compatible scorer. For creation
        of hcrystalball compatible scorer use `make_ts_scorer` function.

    country_code_column : str, list
        Column(s) in data, that contain country code in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    country_code : str, list
        Country code(s) in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    holidays_days_before : int
        Number of days before the holiday which will be taken into account
        (i.e. 2 means that new bool column will be created and will be True for 2 days before holidays,
        otherwise False)

    holidays_days_after : int
        Number of days after the holiday which will be taken into account
        (i.e. 2 means that new bool column will be created and will be True for 2 days after holidays,
        otherwise False)

    holidays_bridge_days : bool
        Overlaping `holidays_days_before` and `holidays_days_after` feature which serves for modeling between
        holidays working days

    sklearn_models : bool
        Whether to consider sklearn models

    sklearn_models_optimize_for_horizon: bool
        Whether to add to default sklearn behavior also models, that optimize predictions for each horizon

    autosarimax_models : bool
        Whether to consider auto sarimax models

    autoarima_dict : dict
        Specification of pmdautoarima search space

    prophet_models : bool
        Whether to consider FB prophet models

    exp_smooth_models : bool
        Whether to consider exponential smoothing models

    average_ensembles : bool
        Whether to consider average ensemble models

    stacking_ensembles : bool
        Whether to consider stacking ensemble models

    stacking_ensembles_train_horizon : int
        Which horizon should be used in meta model in stacking ensembles

    stacking_ensembles_train_n_splits : int
        Number of splits used in meta model in stacking ensembles

    clip_predictions_lower : float, int
        Minimal number allowed in the predictions

    clip_predictions_upper : float, int
        Maximal number allowed in the predictions

    exog_cols : list
        List of columns to be used as exogenous variables

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        CV / Model selection configuration
    """
    exog_cols = exog_cols or []
    country_code_columns = ([country_code_column] if isinstance(
        country_code_column, str) else country_code_column)
    country_codes = [country_code] if isinstance(country_code,
                                                 str) else country_code

    # ensures only exogenous columns and country code column will be passed to model if provided
    # and columns names will be stored in TSColumnTransformer
    if exog_cols:
        cols = exog_cols + country_code_columns if country_code_columns else exog_cols
        exog_passthrough = TSColumnTransformer(transformers=[("raw_cols",
                                                              "passthrough",
                                                              cols)])
    else:
        exog_passthrough = "passthrough"
    # ensures holiday transformer is added to the pipeline if requested
    if country_codes:
        holiday = Pipeline([(
            f"holiday_{code}",
            HolidayTransformer(
                country_code=code,
                days_before=holidays_days_before,
                days_after=holidays_days_after,
                bridge_days=holidays_bridge_days,
            ),
        ) for code in country_codes])
    elif country_code_columns:
        holiday = Pipeline([(
            f"holiday_{col}",
            HolidayTransformer(
                country_code_column=col,
                days_before=holidays_days_before,
                days_after=holidays_days_after,
                bridge_days=holidays_bridge_days,
            ),
        ) for col in country_code_columns])
    else:
        holiday = "passthrough"

    estimator = Pipeline([("exog_passthrough", exog_passthrough),
                          ("holiday", holiday), ("model", "passthrough")])

    cv = FinerTimeSplit(n_splits=n_splits,
                        horizon=horizon,
                        between_split_lag=between_split_lag)

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=[],
        scoring=get_scorer(scoring),
        cv=cv,
        refit=False,
        error_score=np.nan,
    )

    if autosarimax_models:
        # adding autosarimax to param_grid might cause differently found models
        # for different splits and raise inconsistency based errors.
        # sarimax pipeline is added to new grid_search's attribute (`grid_search.autosarimax`)
        # and handled in `hcrystalball.model_seleciton.select_model` function in following way
        # 1. get best model for the data part on last split
        # 2. append this best model to original `param_grid`
        # 3. run full grid search with `param_grid` containing
        #    sarimax model selected from autosarimax in point 1
        from hcrystalball.wrappers import SarimaxWrapper

        if autoarima_dict is None:
            autoarima_dict = {}
        if "error_action" not in autoarima_dict:
            autoarima_dict.update({"error_action": "raise"})

        grid_search.autosarimax = Pipeline(estimator.steps[:-1])
        grid_search.autosarimax.steps.append((
            "model",
            SarimaxWrapper(
                init_with_autoarima=True,
                autoarima_dict=autoarima_dict,
                clip_predictions_lower=clip_predictions_lower,
                clip_predictions_upper=clip_predictions_upper,
            ),
        ))

    if stacking_ensembles or average_ensembles or sklearn_models:
        from sklearn.linear_model import ElasticNet
        from sklearn.ensemble import RandomForestRegressor

        # TODO when scoring time is fixed, add HistGradientBoostingRegressor
        # from sklearn.experimental import enable_hist_gradient_boosting
        # from sklearn.ensemble import HistGradientBoostingRegressor
        from hcrystalball.wrappers import get_sklearn_wrapper
        from hcrystalball.feature_extraction import SeasonalityTransformer

        sklearn_model = get_sklearn_wrapper(
            RandomForestRegressor,
            clip_predictions_lower=clip_predictions_lower,
            clip_predictions_upper=clip_predictions_upper,
        )

        sklearn_model_pipeline = Pipeline([
            ("seasonality", SeasonalityTransformer(auto=True, freq=frequency)),
            ("model", sklearn_model)
        ])
        # TODO make sure naming here works as expected
        sklearn_model_pipeline.name = f"seasonality_{sklearn_model.name}"

    if sklearn_models:
        classes = [ElasticNet, RandomForestRegressor]
        models = {
            model_class.__name__: get_sklearn_wrapper(
                model_class,
                clip_predictions_lower=clip_predictions_lower,
                clip_predictions_upper=clip_predictions_upper,
            )
            for model_class in classes
        }

        optimize_for_horizon = [
            False, True
        ] if sklearn_models_optimize_for_horizon else [False]

        grid_search.param_grid.append({
            "model": [sklearn_model_pipeline],
            "model__seasonality__weekly": [True, False],
            "model__model":
            list(models.values()),
            # TODO change add once HistGradientBoostingRegressor is back
            # "model__model": list(models.values()) + [sklearn_model]
            "model__model__optimize_for_horizon":
            optimize_for_horizon,
            "model__model__lags": [3, 7, 10, 14],
        })

        grid_search.param_grid.append({
            "model": [sklearn_model_pipeline],
            "model__seasonality__weekly": [True, False],
            "model__model__optimize_for_horizon":
            optimize_for_horizon,
            "model__model": [sklearn_model],
            "model__model__max_depth": [6],
        })

    if prophet_models:
        from hcrystalball.wrappers import ProphetWrapper

        extra_regressors = [None] if exog_cols is None else [None, exog_cols]

        grid_search.param_grid.append({
            "model": [
                ProphetWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__seasonality_mode": ["multiplicative", "additive"],
            "model__extra_regressors":
            extra_regressors,
        })

        grid_search.param_grid.append({
            "model": [
                ProphetWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__extra_seasonalities": [[{
                "name": "quarterly",
                "period": 90.0625,
                "fourier_order": 5,
                "prior_scale": 15.0,
                "mode": None,
            }]],
            "model__extra_regressors":
            extra_regressors,
        })

    if exp_smooth_models:
        from hcrystalball.wrappers import ExponentialSmoothingWrapper
        from hcrystalball.wrappers import HoltSmoothingWrapper
        from hcrystalball.wrappers import SimpleSmoothingWrapper

        # commented options show non deterministic behavior
        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": ["add"],
            "model__seasonal": [None, "add"],
            "model__damped": [True, False],
            "model__fit_params": [
                {
                    "use_boxcox": True,
                    "use_basinhopping": False
                },
                # {'use_boxcox':True, 'use_basinhopping':True},
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": ["add"],
            "model__seasonal": ["mul"],
            "model__damped": [True, False],
            "model__fit_params": [
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": [None],
            "model__seasonal": [None, "add", "mul"],
            "model__damped": [False],
            "model__fit_params": [
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                SimpleSmoothingWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                ),
                HoltSmoothingWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                ),
            ]
        })

    if theta_models:
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append({
            "model": [
                ThetaWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ]
        })

    if tbats_models:
        from hcrystalball.wrappers import TBATSWrapper

        grid_search.param_grid.append({
            "model": [
                TBATSWrapper(
                    use_arma_errors=False,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ]
        })

    if stacking_ensembles:
        from hcrystalball.ensemble import StackingEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from hcrystalball.wrappers import ThetaWrapper
        from sklearn.ensemble import RandomForestRegressor

        grid_search.param_grid.append({
            "model": [
                StackingEnsemble(
                    train_n_splits=stacking_ensembles_train_n_splits,
                    train_horizon=stacking_ensembles_train_horizon,
                    meta_model=ElasticNet(),
                    horizons_as_features=True,
                    weekdays_as_features=True,
                    base_learners=[],
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__meta_model": [ElasticNet(),
                                  RandomForestRegressor()],
            "model__base_learners": [
                [
                    ProphetWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                    sklearn_model_pipeline,
                    ThetaWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                ],
            ],
        })
    if average_ensembles:
        from hcrystalball.ensemble import SimpleEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append({
            "model": [
                SimpleEnsemble(
                    base_learners=[],
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__base_learners": [
                [
                    ProphetWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                    sklearn_model_pipeline,
                    ThetaWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                ],
            ],
        })

    return grid_search
def pipeline(request):
    if "passthrough_position" in request.param:
        return TSColumnTransformer(transformers=[
            ("scaler", StandardScaler(), ["one_hot"]),
            ("raw_cols_1", "passthrough", ["trend"]),
        ])
    if "col_name_clash" in request.param:
        return TSColumnTransformer(
            transformers=[("raw_cols_1", "passthrough",
                           ["trend"]), ("scaler", StandardScaler(),
                                        ["trend"])])
    if "more_dimensions_with_get_feature_names" in request.param:
        return TSColumnTransformer(
            transformers=[("raw_cols_1", "passthrough",
                           ["trend"]), ("scaler", OneHotEncoder(),
                                        ["one_hot"])])
    if "less_dimensions_without_get_feature_names" in request.param:
        return TSColumnTransformer(transformers=[
            ("raw_cols_1", "passthrough", ["trend"]),
            ("pca", PCA(n_components=1), ["one_hot", "trend"]),
        ])
    if "with_model" in request.param:
        return Pipeline([
            (
                "preproc",
                TSColumnTransformer(transformers=[
                    ("raw_cols_1", "passthrough", ["trend"]),
                    ("scaler", StandardScaler(), ["trend", "one_hot"]),
                ]),
            ),
            ("model", ExponentialSmoothingWrapper(trend="add")),
        ])
    if "more_layers_builtin_transformers" in request.param:
        return Pipeline([
            (
                "first",
                TSColumnTransformer(transformers=[
                    ("raw_cols_1", "passthrough", ["trend"]),
                    ("one_hot", OneHotEncoder(sparse=False), ["one_hot"]),
                    ("scaler", StandardScaler(), ["trend"]),
                ]),
            ),
            (
                "second",
                TSColumnTransformer(transformers=[
                    ("raw_cols_2", "passthrough", ["trend"]),
                    ("one_hot", StandardScaler(), ["x0_1"]),
                ]),
            ),
        ])
    if "more_layers_custom_transformers_same_level_country_code_country_col" in request.param:
        return Pipeline([
            (
                "first",
                TSColumnTransformer(transformers=[
                    ("raw_cols_1", "passthrough", ["trend", "country"]),
                    ("scaler", StandardScaler(), ["trend", "one_hot"]),
                ]),
            ),
            ("holiday", HolidayTransformer(country_code_column="country")),
            (
                "second",
                TSColumnTransformer(transformers=[
                    ("one_hot", OneHotEncoder(sparse=False), ["holiday"]),
                    ("raw_cols_2", "passthrough", ["trend"]),
                ]),
            ),
        ])

    if "more_layers_custom_transformers_same_level_country_code" in request.param:
        return Pipeline([
            (
                "first",
                TSColumnTransformer(transformers=[
                    ("raw_cols_1", "passthrough", ["trend"]),
                    ("scaler", StandardScaler(), ["trend", "one_hot"]),
                ]),
            ),
            ("holiday", HolidayTransformer(country_code="DE")),
            (
                "second",
                TSColumnTransformer(transformers=[
                    ("one_hot", OneHotEncoder(sparse=False), ["holiday"]),
                    ("raw_cols_2", "passthrough", ["trend"]),
                ]),
            ),
        ])

    if "more_layers_holiday_in_column_transformer" in request.param:
        return Pipeline([
            (
                "first",
                TSColumnTransformer(transformers=[
                    ("raw_cols_1", "passthrough", ["trend", "country"]),
                    (
                        "holiday",
                        HolidayTransformer(country_code_column="country"),
                        ["country"],
                    ),
                    ("scaler", StandardScaler(), ["trend", "one_hot"]),
                ]),
            ),
            (
                "second",
                TSColumnTransformer(transformers=[
                    ("one_hot", OneHotEncoder(sparse=False), ["holiday"]),
                    ("raw_cols_2", "passthrough", ["trend", "country"]),
                ]),
            ),
        ])