Exemple #1
0
def test_holidays_related_features(
    country_code,
    days_before,
    days_after,
    bridge_days,
    expected_result_holidays_related_features,
    extected_error,
):
    """Check the frame produced by ``HolidayTransformer`` for the given settings.

    A frame with a ten-day daily index starting on 2020-04-10 is transformed.
    When ``extected_error`` is ``None`` the result must equal
    ``expected_result_holidays_related_features``; otherwise building and
    running the transformer must raise ``extected_error``.
    """
    X = pd.DataFrame(index=pd.date_range(start="2020-04-10", periods=10))
    transformer_kwargs = {
        "country_code": country_code,
        "days_before": days_before,
        "days_after": days_after,
        "bridge_days": bridge_days,
    }

    if extected_error is not None:
        # The transformer is built inside the context manager so an error
        # raised by the constructor is caught as well.
        with pytest.raises(extected_error):
            HolidayTransformer(**transformer_kwargs).fit_transform(X)
        return

    df_result = HolidayTransformer(**transformer_kwargs).fit_transform(X)
    assert_frame_equal(df_result, expected_result_holidays_related_features)
def test_holiday_transformer_inputs(
    X_y_with_freq,
    country_code,
    country_code_column,
    country_code_column_value,
    extected_error,
):
    """Check which ``HolidayTransformer`` input combinations work and which raise.

    With ``extected_error`` set, building the transformer and calling
    ``fit_transform`` must raise it. Otherwise the call must succeed and,
    when a country code column is used, the ``country_code`` parameter must
    be ``None``.
    """
    X, _ = X_y_with_freq
    init_kwargs = {
        "country_code": country_code,
        "country_code_column": country_code_column,
    }

    if extected_error is None:
        holiday_transformer = HolidayTransformer(**init_kwargs)
        if country_code_column:
            X[country_code_column] = country_code_column_value
        holiday_transformer.fit_transform(X)

        if country_code_column:
            assert holiday_transformer.get_params()["country_code"] is None
        return

    with pytest.raises(extected_error):
        holiday_transformer = HolidayTransformer(**init_kwargs)
        if country_code_column:
            # The error path always writes to the literal "holiday_col"
            # column, whatever name `country_code_column` holds.
            X["holiday_col"] = country_code_column_value
        holiday_transformer.fit_transform(X)
def X_with_holidays(request):
    """Return a 300-day daily frame (from 2019-01-01) joined with German holiday features.

    The "DE" features are computed with ``days_before=2``, ``days_after=1``
    and ``bridge_days=1``. When ``"double_holidays"`` is in ``request.param``
    the frame is additionally joined with "BE" features (``days_before=0``,
    ``days_after=2``) before the German ones are attached.
    """
    from hcrystalball.feature_extraction import HolidayTransformer

    X = pd.DataFrame(index=pd.date_range(start="2019-01-01", periods=300))

    # German features are derived from the bare index-only frame.
    german_transformer = HolidayTransformer(
        country_code="DE", days_before=2, days_after=1, bridge_days=1
    )
    german_holidays = german_transformer.fit_transform(X)

    if "double_holidays" in request.param:
        belgian_transformer = HolidayTransformer(
            country_code="BE", days_before=0, days_after=2
        )
        X = X.join(belgian_transformer.fit_transform(X))

    return X.join(german_holidays)
def transformers(request):
    """Translate a comma-separated ``request.param`` into ``(name, transformer)`` pairs.

    Returns ``None`` when ``request.param`` is ``None``. Names that are not
    known (only ``"holiday"`` is) are silently dropped.
    """
    if request.param is None:
        return None

    available = {"holiday": ("holiday", HolidayTransformer(country_code="DE"))}
    selected = []
    for name in request.param.split(","):
        if name in available:
            selected.append(available[name])
    return selected
Exemple #5
0
def test_holiday_transformer_transform(country_code, country_code_column, country_code_column_value):
    """Check the single ``holiday`` column produced for ten days starting 2019-05-01.

    The country may be given directly (``country_code``) or through a column
    of ``X`` (``country_code_column`` filled with ``country_code_column_value``).
    """
    X = pd.DataFrame(index=pd.date_range(start="2019-05-01", periods=10))
    holiday_names = ["Labour Day", "", "", "", "", "", "", "Liberation Day", "", ""]
    df_expected = pd.DataFrame({"holiday": holiday_names}, index=X.index)

    if country_code_column:
        X[country_code_column] = country_code_column_value

    transformer = HolidayTransformer(
        country_code=country_code,
        country_code_column=country_code_column,
    )
    df_result = transformer.fit_transform(X)

    assert_frame_equal(df_result, df_expected)
def grid_search(request):
    """Build a ``GridSearchCV`` over a holiday/seasonality pipeline with two dummy models.

    The search compares a mean-predicting dummy regressor ("good_dummy") with
    one that always predicts 42 ("bad_dummy"), scored by negated mean
    absolute error on a two-split ``FinerTimeSplit`` with horizon 5.
    ``request`` is accepted but not used.
    """
    from sklearn.dummy import DummyRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from hcrystalball.feature_extraction import HolidayTransformer
    from hcrystalball.feature_extraction import SeasonalityTransformer
    from hcrystalball.metrics import make_ts_scorer
    from hcrystalball.model_selection import FinerTimeSplit
    from hcrystalball.wrappers import get_sklearn_wrapper

    mae_scorer = make_ts_scorer(mean_absolute_error, greater_is_better=False)

    constant_dummy = get_sklearn_wrapper(
        DummyRegressor, strategy="constant", constant=42, name="bad_dummy", lags=2
    )
    mean_dummy = get_sklearn_wrapper(
        DummyRegressor, strategy="mean", name="good_dummy", lags=2
    )

    param_grid = [
        {"model": [mean_dummy]},
        {
            "model": [constant_dummy],
            "model__strategy": ["constant"],
            "model__constant": [42],
        },
    ]

    steps = [
        ("holiday", HolidayTransformer(country_code_column="Holidays_code")),
        ("seasonality", SeasonalityTransformer(week_day=True, freq="D")),
        ("model", mean_dummy),
    ]
    holiday_model = Pipeline(steps)
    splitter = FinerTimeSplit(n_splits=2, horizon=5)

    return GridSearchCV(holiday_model, param_grid, cv=splitter, scoring=mae_scorer)
def get_gridsearch(
    frequency,
    horizon=10,
    n_splits=5,
    between_split_lag=None,
    scoring="neg_mean_absolute_error",
    country_code_column=None,
    country_code=None,
    holidays_days_before=0,
    holidays_days_after=0,
    holidays_bridge_days=False,
    sklearn_models=True,
    sklearn_models_optimize_for_horizon=False,
    autosarimax_models=False,
    autoarima_dict=None,
    prophet_models=False,
    tbats_models=False,
    exp_smooth_models=False,
    theta_models=False,
    average_ensembles=False,
    stacking_ensembles=False,
    stacking_ensembles_train_horizon=10,
    stacking_ensembles_train_n_splits=20,
    clip_predictions_lower=None,
    clip_predictions_upper=None,
    exog_cols=None,
):
    """Get grid search object based on selection criteria.

    Parameters
    ----------
    frequency : str
        Frequency of timeseries. Pandas compatible frequencies

    horizon : int
        How many units of frequency (e.g. 4 quarters), should be used to find the best models

    n_splits : int
        How many cross-validation folds should be used in model selection

    between_split_lag : int
        How big lag of observations should cv_splits have
        If kept as None, horizon is used resulting in non-overlapping cv_splits

    scoring : str, callable
        String of sklearn regression metric name, or hcrystalball compatible scorer. For creation
        of hcrystalball compatible scorer use `make_ts_scorer` function.

    country_code_column : str, list
        Column(s) in data, that contain country code in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    country_code : str, list
        Country code(s) in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    holidays_days_before : int
        Number of days before the holiday which will be taken into account
        (i.e. 2 means that new bool column will be created and will be True for 2 days before holidays,
        otherwise False)

    holidays_days_after : int
        Number of days after the holiday which will be taken into account
        (i.e. 2 means that new bool column will be created and will be True for 2 days after holidays,
        otherwise False)

    holidays_bridge_days : bool
        Overlapping `holidays_days_before` and `holidays_days_after` feature which serves for modeling
        between holidays working days

    sklearn_models : bool
        Whether to consider sklearn models

    sklearn_models_optimize_for_horizon: bool
        Whether to add to default sklearn behavior also models, that optimize predictions for each horizon

    autosarimax_models : bool
        Whether to consider auto sarimax models

    autoarima_dict : dict
        Specification of pmdautoarima search space. The passed dict is copied, not modified;
        `error_action` defaults to "raise" when not provided.

    prophet_models : bool
        Whether to consider FB prophet models

    tbats_models : bool
        Whether to consider TBATS models

    exp_smooth_models : bool
        Whether to consider exponential smoothing models

    theta_models : bool
        Whether to consider theta models

    average_ensembles : bool
        Whether to consider average ensemble models

    stacking_ensembles : bool
        Whether to consider stacking ensemble models

    stacking_ensembles_train_horizon : int
        Which horizon should be used in meta model in stacking ensembles

    stacking_ensembles_train_n_splits : int
        Number of splits used in meta model in stacking ensembles

    clip_predictions_lower : float, int
        Minimal number allowed in the predictions

    clip_predictions_upper : float, int
        Maximal number allowed in the predictions

    exog_cols : list
        List of columns to be used as exogenous variables

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        CV / Model selection configuration
    """
    # from here on `exog_cols` is always a list (possibly empty), never None
    exog_cols = exog_cols or []
    country_code_columns = (
        [country_code_column] if isinstance(country_code_column, str) else country_code_column
    )
    country_codes = [country_code] if isinstance(country_code, str) else country_code

    # ensures only exogenous columns and country code column will be passed to model if provided
    # and columns names will be stored in TSColumnTransformer
    if exog_cols:
        cols = exog_cols + country_code_columns if country_code_columns else exog_cols
        exog_passthrough = TSColumnTransformer(
            transformers=[("raw_cols", "passthrough", cols)]
        )
    else:
        exog_passthrough = "passthrough"

    # ensures holiday transformer is added to the pipeline if requested
    if country_codes:
        holiday = Pipeline([
            (
                f"holiday_{code}",
                HolidayTransformer(
                    country_code=code,
                    days_before=holidays_days_before,
                    days_after=holidays_days_after,
                    bridge_days=holidays_bridge_days,
                ),
            )
            for code in country_codes
        ])
    elif country_code_columns:
        holiday = Pipeline([
            (
                f"holiday_{col}",
                HolidayTransformer(
                    country_code_column=col,
                    days_before=holidays_days_before,
                    days_after=holidays_days_after,
                    bridge_days=holidays_bridge_days,
                ),
            )
            for col in country_code_columns
        ])
    else:
        holiday = "passthrough"

    # the "model" step is a placeholder that every param_grid entry below fills in
    estimator = Pipeline([
        ("exog_passthrough", exog_passthrough),
        ("holiday", holiday),
        ("model", "passthrough"),
    ])

    cv = FinerTimeSplit(
        n_splits=n_splits,
        horizon=horizon,
        between_split_lag=between_split_lag,
    )

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=[],
        scoring=get_scorer(scoring),
        cv=cv,
        refit=False,
        error_score=np.nan,
    )

    if autosarimax_models:
        # adding autosarimax to param_grid might cause differently found models
        # for different splits and raise inconsistency based errors.
        # sarimax pipeline is added to new grid_search's attribute (`grid_search.autosarimax`)
        # and handled in `hcrystalball.model_selection.select_model` function in following way
        # 1. get best model for the data part on last split
        # 2. append this best model to original `param_grid`
        # 3. run full grid search with `param_grid` containing
        #    sarimax model selected from autosarimax in point 1
        from hcrystalball.wrappers import SarimaxWrapper

        # work on a copy so the dict handed in by the caller is never mutated
        autoarima_dict = dict(autoarima_dict or {})
        autoarima_dict.setdefault("error_action", "raise")

        grid_search.autosarimax = Pipeline(estimator.steps[:-1])
        grid_search.autosarimax.steps.append((
            "model",
            SarimaxWrapper(
                init_with_autoarima=True,
                autoarima_dict=autoarima_dict,
                clip_predictions_lower=clip_predictions_lower,
                clip_predictions_upper=clip_predictions_upper,
            ),
        ))

    # the seasonality + sklearn pipeline is shared by the sklearn grid and both ensembles
    if stacking_ensembles or average_ensembles or sklearn_models:
        from sklearn.linear_model import ElasticNet
        from sklearn.ensemble import RandomForestRegressor

        # TODO when scoring time is fixed, add HistGradientBoostingRegressor
        # from sklearn.experimental import enable_hist_gradient_boosting
        # from sklearn.ensemble import HistGradientBoostingRegressor
        from hcrystalball.wrappers import get_sklearn_wrapper
        from hcrystalball.feature_extraction import SeasonalityTransformer

        sklearn_model = get_sklearn_wrapper(
            RandomForestRegressor,
            clip_predictions_lower=clip_predictions_lower,
            clip_predictions_upper=clip_predictions_upper,
        )

        sklearn_model_pipeline = Pipeline([
            ("seasonality", SeasonalityTransformer(auto=True, freq=frequency)),
            ("model", sklearn_model),
        ])
        # TODO make sure naming here works as expected
        sklearn_model_pipeline.name = f"seasonality_{sklearn_model.name}"

    if sklearn_models:
        classes = [ElasticNet, RandomForestRegressor]
        models = {
            model_class.__name__: get_sklearn_wrapper(
                model_class,
                clip_predictions_lower=clip_predictions_lower,
                clip_predictions_upper=clip_predictions_upper,
            )
            for model_class in classes
        }

        optimize_for_horizon = (
            [False, True] if sklearn_models_optimize_for_horizon else [False]
        )

        grid_search.param_grid.append({
            "model": [sklearn_model_pipeline],
            "model__seasonality__weekly": [True, False],
            "model__model": list(models.values()),
            # TODO change add once HistGradientBoostingRegressor is back
            # "model__model": list(models.values()) + [sklearn_model]
            "model__model__optimize_for_horizon": optimize_for_horizon,
            "model__model__lags": [3, 7, 10, 14],
        })

        grid_search.param_grid.append({
            "model": [sklearn_model_pipeline],
            "model__seasonality__weekly": [True, False],
            "model__model__optimize_for_horizon": optimize_for_horizon,
            "model__model": [sklearn_model],
            "model__model__max_depth": [6],
        })

    if prophet_models:
        from hcrystalball.wrappers import ProphetWrapper

        # `exog_cols` is a list here, so test emptiness; an empty list would only
        # add a second candidate without any extra regressors
        extra_regressors = [None, exog_cols] if exog_cols else [None]

        grid_search.param_grid.append({
            "model": [
                ProphetWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__seasonality_mode": ["multiplicative", "additive"],
            "model__extra_regressors": extra_regressors,
        })

        grid_search.param_grid.append({
            "model": [
                ProphetWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__extra_seasonalities": [[{
                "name": "quarterly",
                "period": 90.0625,
                "fourier_order": 5,
                "prior_scale": 15.0,
                "mode": None,
            }]],
            "model__extra_regressors": extra_regressors,
        })

    if exp_smooth_models:
        from hcrystalball.wrappers import ExponentialSmoothingWrapper
        from hcrystalball.wrappers import HoltSmoothingWrapper
        from hcrystalball.wrappers import SimpleSmoothingWrapper

        # commented options show non deterministic behavior
        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": ["add"],
            "model__seasonal": [None, "add"],
            "model__damped": [True, False],
            "model__fit_params": [
                {
                    "use_boxcox": True,
                    "use_basinhopping": False
                },
                # {'use_boxcox':True, 'use_basinhopping':True},
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": ["add"],
            "model__seasonal": ["mul"],
            "model__damped": [True, False],
            "model__fit_params": [
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": [None],
            "model__seasonal": [None, "add", "mul"],
            "model__damped": [False],
            "model__fit_params": [
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                SimpleSmoothingWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                ),
                HoltSmoothingWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                ),
            ]
        })

    if theta_models:
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append({
            "model": [
                ThetaWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ]
        })

    if tbats_models:
        from hcrystalball.wrappers import TBATSWrapper

        grid_search.param_grid.append({
            "model": [
                TBATSWrapper(
                    use_arma_errors=False,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ]
        })

    if stacking_ensembles:
        from hcrystalball.ensemble import StackingEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from hcrystalball.wrappers import ThetaWrapper
        from sklearn.ensemble import RandomForestRegressor

        grid_search.param_grid.append({
            "model": [
                StackingEnsemble(
                    train_n_splits=stacking_ensembles_train_n_splits,
                    train_horizon=stacking_ensembles_train_horizon,
                    meta_model=ElasticNet(),
                    horizons_as_features=True,
                    weekdays_as_features=True,
                    base_learners=[],
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__meta_model": [ElasticNet(), RandomForestRegressor()],
            "model__base_learners": [
                [
                    ProphetWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                    sklearn_model_pipeline,
                    ThetaWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                ],
            ],
        })

    if average_ensembles:
        from hcrystalball.ensemble import SimpleEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append({
            "model": [
                SimpleEnsemble(
                    base_learners=[],
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__base_learners": [
                [
                    ProphetWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                    sklearn_model_pipeline,
                    ThetaWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                ],
            ],
        })

    return grid_search
def pipeline(request):
    """Return the transformer or pipeline matching the scenario key found in ``request.param``.

    Scenario keys are tested with ``in`` in a fixed order and the first match
    wins; when no key matches, ``None`` is returned. Objects are only built
    for the matching scenario.
    """
    scenario = request.param

    if "passthrough_position" in scenario:
        column_steps = [
            ("scaler", StandardScaler(), ["one_hot"]),
            ("raw_cols_1", "passthrough", ["trend"]),
        ]
        return TSColumnTransformer(transformers=column_steps)

    if "col_name_clash" in scenario:
        column_steps = [
            ("raw_cols_1", "passthrough", ["trend"]),
            ("scaler", StandardScaler(), ["trend"]),
        ]
        return TSColumnTransformer(transformers=column_steps)

    if "more_dimensions_with_get_feature_names" in scenario:
        column_steps = [
            ("raw_cols_1", "passthrough", ["trend"]),
            ("scaler", OneHotEncoder(), ["one_hot"]),
        ]
        return TSColumnTransformer(transformers=column_steps)

    if "less_dimensions_without_get_feature_names" in scenario:
        column_steps = [
            ("raw_cols_1", "passthrough", ["trend"]),
            ("pca", PCA(n_components=1), ["one_hot", "trend"]),
        ]
        return TSColumnTransformer(transformers=column_steps)

    if "with_model" in scenario:
        preproc = TSColumnTransformer(transformers=[
            ("raw_cols_1", "passthrough", ["trend"]),
            ("scaler", StandardScaler(), ["trend", "one_hot"]),
        ])
        model = ExponentialSmoothingWrapper(trend="add")
        return Pipeline([("preproc", preproc), ("model", model)])

    if "more_layers_builtin_transformers" in scenario:
        first_layer = TSColumnTransformer(transformers=[
            ("raw_cols_1", "passthrough", ["trend"]),
            ("one_hot", OneHotEncoder(sparse=False), ["one_hot"]),
            ("scaler", StandardScaler(), ["trend"]),
        ])
        second_layer = TSColumnTransformer(transformers=[
            ("raw_cols_2", "passthrough", ["trend"]),
            ("one_hot", StandardScaler(), ["x0_1"]),
        ])
        return Pipeline([("first", first_layer), ("second", second_layer)])

    # This key must be tested before the shorter "..._same_level_country_code"
    # key below, which is a prefix of it.
    if "more_layers_custom_transformers_same_level_country_code_country_col" in scenario:
        first_layer = TSColumnTransformer(transformers=[
            ("raw_cols_1", "passthrough", ["trend", "country"]),
            ("scaler", StandardScaler(), ["trend", "one_hot"]),
        ])
        holiday_step = HolidayTransformer(country_code_column="country")
        second_layer = TSColumnTransformer(transformers=[
            ("one_hot", OneHotEncoder(sparse=False), ["holiday"]),
            ("raw_cols_2", "passthrough", ["trend"]),
        ])
        return Pipeline([
            ("first", first_layer),
            ("holiday", holiday_step),
            ("second", second_layer),
        ])

    if "more_layers_custom_transformers_same_level_country_code" in scenario:
        first_layer = TSColumnTransformer(transformers=[
            ("raw_cols_1", "passthrough", ["trend"]),
            ("scaler", StandardScaler(), ["trend", "one_hot"]),
        ])
        holiday_step = HolidayTransformer(country_code="DE")
        second_layer = TSColumnTransformer(transformers=[
            ("one_hot", OneHotEncoder(sparse=False), ["holiday"]),
            ("raw_cols_2", "passthrough", ["trend"]),
        ])
        return Pipeline([
            ("first", first_layer),
            ("holiday", holiday_step),
            ("second", second_layer),
        ])

    if "more_layers_holiday_in_column_transformer" in scenario:
        first_layer = TSColumnTransformer(transformers=[
            ("raw_cols_1", "passthrough", ["trend", "country"]),
            (
                "holiday",
                HolidayTransformer(country_code_column="country"),
                ["country"],
            ),
            ("scaler", StandardScaler(), ["trend", "one_hot"]),
        ])
        second_layer = TSColumnTransformer(transformers=[
            ("one_hot", OneHotEncoder(sparse=False), ["holiday"]),
            ("raw_cols_2", "passthrough", ["trend", "country"]),
        ])
        return Pipeline([("first", first_layer), ("second", second_layer)])
Exemple #9
0
def test_two_transformers(
    country_code_first,
    country_code_column_first,
    country_code_column_first_value,
    country_code_second,
    country_code_column_second,
    country_code_column_second_value,
):
    """Check two chained ``HolidayTransformer`` steps each yield their own suffixed column.

    Each transformer is configured by a country code or a country code
    column; the expected column name is ``_holiday_<code or column name>``.
    """
    first_suffix = country_code_first or country_code_column_first
    second_suffix = country_code_second or country_code_column_second

    # Both transformers are expected to report the same holidays for the
    # ten days starting 2019-05-01.
    holiday_names = ["Labour Day", "", "", "", "", "", "", "Liberation Day", "", ""]
    expected = {
        f"_holiday_{first_suffix}": list(holiday_names),
        f"_holiday_{second_suffix}": list(holiday_names),
    }

    X = pd.DataFrame(index=pd.date_range(start="2019-05-01", periods=10))
    df_expected = pd.DataFrame(expected, index=X.index)

    if country_code_column_first:
        X[country_code_column_first] = country_code_column_first_value
    if country_code_column_second:
        X[country_code_column_second] = country_code_column_second_value

    first_transformer = HolidayTransformer(
        country_code_column=country_code_column_first,
        country_code=country_code_first,
    )
    second_transformer = HolidayTransformer(
        country_code_column=country_code_column_second,
        country_code=country_code_second,
    )
    pipeline = Pipeline([
        (f"holidays_{first_suffix}", first_transformer),
        (f"holidays_{second_suffix}", second_transformer),
    ])

    df_result = pipeline.fit_transform(X)
    assert_frame_equal(df_result, df_expected)
Exemple #10
0
def X_with_holidays():
    """Return a 300-day daily frame (from 2019-01-01) joined with its "DE" holiday features."""
    from hcrystalball.feature_extraction import HolidayTransformer

    daily_index = pd.date_range(start="2019-01-01", periods=300)
    X = pd.DataFrame(index=daily_index)
    german_transformer = HolidayTransformer(country_code="DE")
    return X.join(german_transformer.fit_transform(X))