def wrapper_instance(request):

    if request.param == "prophet":
        return ProphetWrapper(daily_seasonality=False,
                              weekly_seasonality=False,
                              yearly_seasonality=False)
    elif request.param == "smoothing":
        return ExponentialSmoothingWrapper(trend="add")
    elif request.param == "tbats":
        return TBATSWrapper(use_arma_errors=False, use_box_cox=False)
    elif request.param == "sklearn":
        return get_sklearn_wrapper(LinearRegression, lags=4)
    elif request.param == "sarimax":
        return SarimaxWrapper(order=(1, 1, 0), seasonal_order=(1, 1, 1, 2))
    elif request.param == "stacking_ensemble":
        return StackingEnsemble(
            base_learners=[
                ExponentialSmoothingWrapper(name="smoot_exp1", trend="add"),
                ExponentialSmoothingWrapper(name="smoot_exp2"),
            ],
            meta_model=LinearRegression(),
            horizons_as_features=False,
            weekdays_as_features=False,
        )
    elif request.param == "simple_ensemble":
        return SimpleEnsemble(base_learners=[
            ExponentialSmoothingWrapper(name="smoot_exp1", trend="add"),
            ExponentialSmoothingWrapper(name="smoot_exp2"),
        ])
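
This reads as a pytest fixture whose `@pytest.fixture` decorator did not survive extraction. A minimal sketch of the presumed wiring plus a consuming test, with the params inferred from the branches above (the decorator, the test body, and the `X_y_linear_trend` fixture are assumptions, not from the source):

import pytest

@pytest.fixture(
    params=["prophet", "smoothing", "tbats", "sklearn", "sarimax",
            "stacking_ensemble", "simple_ensemble"]
)
def wrapper_instance(request):
    ...  # body as above


def test_fit_predict_shape(wrapper_instance, X_y_linear_trend):
    # hypothetical consumer: every wrapper follows the sklearn fit/predict API
    X, y = X_y_linear_trend
    preds = wrapper_instance.fit(X[:-5], y[:-5]).predict(X[-5:])
    assert len(preds) == 5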
Example #2
def test_get_best_not_failing_model(X_y_optional, negative_data, best_model_name, rank, expected_error):
    X, y = X_y_optional
    # data may contain zeros; raise them to 1 so the multiplicative-trend model can fit
    y[y < 1] = 1
    if negative_data:
        y[-1] = -1
    models = [
        ExponentialSmoothingWrapper(freq="D", trend="mul"),
        get_sklearn_wrapper(DummyRegressor, strategy="constant", constant=-5000),
    ]
    models = models if expected_error is None else models[:1]
    grid_search = GridSearchCV(
        estimator=Pipeline([("model", "passthrough")]),
        param_grid=[{"model": models}],
        scoring=get_scorer("neg_mean_absolute_error"),
        cv=FinerTimeSplit(n_splits=1, horizon=5),
        refit=False,
        error_score=np.nan,
    )

    grid_search.fit(X, y)

    if expected_error:
        with pytest.raises(expected_error):
            get_best_not_failing_model(grid_search, X, y)
    else:
        best_param_rank = get_best_not_failing_model(grid_search, X, y)
        assert isinstance(best_param_rank, dict)
        assert best_param_rank["params"]["model"].__class__.__name__ == best_model_name
        assert best_param_rank["rank"] == rank
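
`negative_data`, `best_model_name`, `rank`, and `expected_error` arrive via a parametrization that is not shown. A plausible (hypothetical) decorator, assuming the multiplicative-trend smoothing model fails on negative data so the constant dummy becomes the best non-failing model; names, ranks, and the error type below are illustrative only:

@pytest.mark.parametrize(
    "negative_data, best_model_name, rank, expected_error",
    [
        (False, "ExponentialSmoothingWrapper", 1, None),  # both models score
        (True, "SklearnWrapper", 2, None),   # mul trend fails on y < 0
        (True, None, None, ValueError),      # only the failing model remains
    ],
)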
def grid_search(request):
    from sklearn.dummy import DummyRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from hcrystalball.feature_extraction import HolidayTransformer
    from hcrystalball.feature_extraction import SeasonalityTransformer
    from hcrystalball.metrics import make_ts_scorer
    from hcrystalball.model_selection import FinerTimeSplit
    from hcrystalball.wrappers import get_sklearn_wrapper

    scoring = make_ts_scorer(mean_absolute_error, greater_is_better=False)

    bad_dummy = get_sklearn_wrapper(DummyRegressor,
                                    strategy="constant",
                                    constant=42,
                                    name="bad_dummy",
                                    lags=2)
    good_dummy = get_sklearn_wrapper(DummyRegressor,
                                     strategy="mean",
                                     name="good_dummy",
                                     lags=2)

    parameters = [
        {
            "model": [good_dummy]
        },
        {
            "model": [bad_dummy],
            "model__strategy": ["constant"],
            "model__constant": [42],
        },
    ]

    holiday_model = Pipeline([
        ("holiday", HolidayTransformer(country_code_column="Holidays_code")),
        ("seasonality", SeasonalityTransformer(week_day=True, freq="D")),
        ("model", good_dummy),
    ])
    cv = FinerTimeSplit(n_splits=2, horizon=5)
    grid_search = GridSearchCV(holiday_model,
                               parameters,
                               cv=cv,
                               scoring=scoring)

    return grid_search
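
A minimal usage sketch for this fixture (its missing `@pytest.fixture` decorator presumably lost in extraction), assuming the synthetic-data helper `generate_tsdata` from `hcrystalball.utils` used in the other examples and the `Holidays_code` column expected by the `HolidayTransformer` above:

from hcrystalball.utils import generate_tsdata

def test_grid_search_fits(grid_search):
    X, y = generate_tsdata(n_dates=100)
    X["Holidays_code"] = "DE"  # country code column consumed by HolidayTransformer
    grid_search.fit(X, y)      # searches both parameter grids over the 2 time splits
    assert grid_search.best_params_ is not None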
Example #4
def test_model_selector(tmp_path):

    n_regions = 1
    n_plants = 1
    n_products = 2
    target_col_name = "Quantity"
    persist_path = os.path.join(tmp_path, "results")

    df = generate_multiple_tsdata(
        n_dates=200, n_regions=n_regions, n_plants=n_plants, n_products=n_products
    )
    ms = ModelSelector(frequency="D", horizon=1, country_code_column="Country")
    assert ms.horizon == 1
    ms.create_gridsearch(
        n_splits=1,
        prophet_models=True,
        sklearn_models=False,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,
        tbats_models=False,
        exp_smooth_models=False,
        average_ensembles=False,
        stacking_ensembles=False,
        exog_cols=["Raining"],
    )
    assert hasattr(ms, "grid_search")
    ms.add_model_to_gridsearch(get_sklearn_wrapper(LinearRegression))
    ms.select_model(
        df=df,
        target_col_name=target_col_name,
        partition_columns=["Region", "Plant", "Product"],
    )

    assert len(ms.results) == n_regions * n_plants * n_products
    assert len(ms.partitions) == n_regions * n_plants * n_products

    ms.persist_results(persist_path)

    print(ms.partitions)

    ms_load = load_model_selector(folder_path=persist_path)

    # we do not guarantee the same order of results and partitions after loading, so we check they are all there
    assert all([partition in ms_load.partitions for partition in ms.partitions])
    # TODO redefine __eq__ for ModelSelectorResult to str(MSR).__dict__?
    assert all(
        [
            str(ms_load.get_result_for_partition(partition).__dict__)
            == str(ms.get_result_for_partition(partition).__dict__)
            for partition in ms.partitions
        ]
    )
    assert ms.horizon == ms_load.horizon
    assert ms.frequency == ms_load.frequency
def pipeline_instance_model_only(request):

    if request.param == "prophet":
        return Pipeline([(
            "regressor",
            ProphetWrapper(
                daily_seasonality=False,
                weekly_seasonality=False,
                yearly_seasonality=False,
            ),
        )])
    elif request.param == "smoothing":
        return Pipeline([("regressor",
                          ExponentialSmoothingWrapper(trend="add"))])

    elif request.param == "tbats":
        return Pipeline([("regressor",
                          TBATSWrapper(use_arma_errors=False,
                                       use_box_cox=False))])

    elif request.param == "sklearn":
        return Pipeline([("regressor",
                          get_sklearn_wrapper(LinearRegression, lags=4))])

    elif request.param == "sarimax":
        return Pipeline([(
            "regressor",
            SarimaxWrapper(order=(1, 1, 0), seasonal_order=(1, 1, 1, 1)),
        )])

    elif request.param == "stacking_ensemble":
        return Pipeline([(
            "regressor",
            StackingEnsemble(
                base_learners=[
                    ExponentialSmoothingWrapper(name="smoot_exp1",
                                                trend="add"),
                    ExponentialSmoothingWrapper(name="smoot_exp2"),
                ],
                meta_model=LinearRegression(),
            ),
        )])

    elif request.param == "simple_ensemble":
        return Pipeline([(
            "regressor",
            SimpleEnsemble(base_learners=[
                ExponentialSmoothingWrapper(name="smoot_exp1", trend="add"),
                ExponentialSmoothingWrapper(name="smoot_exp2"),
            ]),
        )])
    else:
        return None
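
Every branch (apart from the None fallback) returns a one-step Pipeline whose step is named "regressor", so a consuming test can address the wrapped model uniformly; a hypothetical example:

def test_pipeline_has_regressor_step(pipeline_instance_model_only):
    # hypothetical consumer of the fixture above, ignoring the None fallback
    model = pipeline_instance_model_only.named_steps["regressor"]
    assert hasattr(model, "fit") and hasattr(model, "predict")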
def test_sklearn_wrapper_overal(X_y_linear_trend, horizon, exp_error):
    CONSTANT = 50
    X, y = X_y_linear_trend
    model = get_sklearn_wrapper(DummyRegressor, lags=3, strategy="constant", constant=CONSTANT)
    model.fit(X[:-horizon], y[:-horizon])

    if exp_error is not None:
        with pytest.raises(exp_error):
            preds = model.predict(X[-horizon:])
    else:
        preds = model.predict(X[-horizon:])
        assert all(preds == CONSTANT)
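
`horizon` and `exp_error` are again supplied by a parametrization not shown here; the intent appears to be that a constant dummy wrapped with `lags=3` predicts the constant everywhere, while some horizons trigger an error. A hypothetical decorator (values illustrative, the failing case left elided):

@pytest.mark.parametrize("horizon, exp_error", [
    (1, None),  # prediction succeeds; all values equal CONSTANT
    (3, None),
    # (some_invalid_horizon, SomeError),  # the failing case is not recoverable here
])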
def estimators(request):
    if request.param is None:
        return ["no_estimator"]

    options = {
        "prophet": [(
            "prophet",
            ProphetWrapper(
                daily_seasonality=False,
                weekly_seasonality=False,
                yearly_seasonality=False,
            ),
        )],
        "sarimax": [("sarimax",
                     SarimaxWrapper(order=(1, 1, 1),
                                    seasonal_order=(1, 1, 1, 2)))],
        "smoothing": [("smoothing", ExponentialSmoothingWrapper())],
        "sklearn": [("sklearn", get_sklearn_wrapper(LinearRegression))],
        "tbats":
        [("tbats", TBATSWrapper(use_arma_errors=False, use_box_cox=False))],
        "stacking_ensemble": [(
            "stacking_ensemble",
            StackingEnsemble(
                base_learners=[
                    ExponentialSmoothingWrapper(name="smoot_exp1",
                                                trend="add"),
                    ExponentialSmoothingWrapper(name="smoot_exp2"),
                ],
                meta_model=LinearRegression(),
            ),
        )],
        "simple_ensemble": [(
            "simple_ensemble",
            SimpleEnsemble(base_learners=[
                ExponentialSmoothingWrapper(name="smoot_exp1", trend="add"),
                ExponentialSmoothingWrapper(name="smoot_exp2"),
            ]),
        )],
    }

    if "all" in request.param:
        models = []
        [models.extend(options[key]) for key in options]
        return models
    else:
        return options[request.param]
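
The (name, estimator) tuples are shaped exactly for `sklearn.pipeline.Pipeline`; a hypothetical consumer (the `X_y_linear_trend` fixture is assumed, as in the other tests):

from sklearn.pipeline import Pipeline

def test_estimator_fits_in_pipeline(estimators, X_y_linear_trend):
    if estimators == ["no_estimator"] or len(estimators) > 1:
        return  # skip the no-estimator and "all" cases in this sketch
    X, y = X_y_linear_trend
    preds = Pipeline(estimators).fit(X[:-5], y[:-5]).predict(X[-5:])
    assert len(preds) == 5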
def test_target_transformer():
    X, y = generate_tsdata(n_dates=365 * 2)
    X["trend"] = np.arange(len(X))

    preprocessing = TSColumnTransformer(transformers=[("scaler",
                                                       StandardScaler(),
                                                       ["trend"])])
    # define random forest model
    rf_model = get_sklearn_wrapper(RandomForestRegressor)
    # glue it together
    sklearn_model_pipeline = Pipeline([("preprocessing", preprocessing),
                                       ("model", rf_model)])

    scaled_pipeline = TargetTransformer(sklearn_model_pipeline,
                                        StandardScaler())

    preds = scaled_pipeline.fit(X[:-10], y[:-10]).predict(X[-10:])

    assert hasattr(scaled_pipeline, "named_steps")
    assert isinstance(scaled_pipeline.y_transformer, StandardScaler)
    assert isinstance(preds, pd.DataFrame)
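
Conceptually, `TargetTransformer` scales the target before fitting and inverse-transforms predictions back to the original units. A self-contained sketch of the idea (not the library internals):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

X_demo = np.arange(20).reshape(-1, 1)
y_demo = 3.0 * X_demo.ravel() + 7.0

scaler = StandardScaler()
y_scaled = scaler.fit_transform(y_demo.reshape(-1, 1)).ravel()  # scale y before fit
model = LinearRegression().fit(X_demo, y_scaled)
# predictions are mapped back to the original target units
preds = scaler.inverse_transform(model.predict(X_demo).reshape(-1, 1)).ravel()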
def get_gridsearch(
    frequency,
    horizon=10,
    n_splits=5,
    between_split_lag=None,
    scoring="neg_mean_absolute_error",
    country_code_column=None,
    country_code=None,
    holidays_days_before=0,
    holidays_days_after=0,
    holidays_bridge_days=False,
    sklearn_models=True,
    sklearn_models_optimize_for_horizon=False,
    autosarimax_models=False,
    autoarima_dict=None,
    prophet_models=False,
    tbats_models=False,
    exp_smooth_models=False,
    theta_models=False,
    average_ensembles=False,
    stacking_ensembles=False,
    stacking_ensembles_train_horizon=10,
    stacking_ensembles_train_n_splits=20,
    clip_predictions_lower=None,
    clip_predictions_upper=None,
    exog_cols=None,
):
    """Get grid search object based on selection criteria.

    Parameters
    ----------
    frequency : str
        Frequency of the time series. Pandas compatible frequencies

    horizon : int
        How many units of frequency (e.g. 4 quarters) should be used to find the best models

    n_splits : int
        How many cross-validation folds should be used in model selection

    between_split_lag : int
        Lag, in number of observations, between cross-validation splits.
        If kept as None, horizon is used, resulting in non-overlapping cv_splits

    scoring : str, callable
        Name of an sklearn regression metric, or an hcrystalball compatible scorer.
        To create an hcrystalball compatible scorer, use the `make_ts_scorer` function.

    country_code_column : str, list
        Column(s) in the data that contain a country code as str (e.g. 'DE').
        Used in the holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    country_code : str, list
        Country code(s) as str (e.g. 'DE'). Used in the holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    holidays_days_before : int
        Number of days before a holiday to take into account
        (e.g. 2 means a new bool column is created that is True for the 2 days
        before each holiday and False otherwise)

    holidays_days_after : int
        Number of days after a holiday to take into account
        (e.g. 2 means a new bool column is created that is True for the 2 days
        after each holiday and False otherwise)

    holidays_bridge_days : bool
        Overlap of the `holidays_days_before` and `holidays_days_after` features,
        used to model working days squeezed between holidays

    sklearn_models : bool
        Whether to consider sklearn models

    sklearn_models_optimize_for_horizon : bool
        Whether to also add, on top of the default sklearn behavior, models that
        optimize predictions for each horizon

    autosarimax_models : bool
        Whether to consider auto sarimax models

    autoarima_dict : dict
        Specification of the pmdarima (auto_arima) search space

    prophet_models : bool
        Whether to consider Facebook Prophet models

    tbats_models : bool
        Whether to consider TBATS models

    exp_smooth_models : bool
        Whether to consider exponential smoothing models

    theta_models : bool
        Whether to consider theta models

    average_ensembles : bool
        Whether to consider average ensemble models

    stacking_ensembles : bool
        Whether to consider stacking ensemble models

    stacking_ensembles_train_horizon : int
        Which horizon should be used in meta model in stacking ensembles

    stacking_ensembles_train_n_splits : int
        Number of splits used in meta model in stacking ensembles

    clip_predictions_lower : float, int
        Minimal number allowed in the predictions

    clip_predictions_upper : float, int
        Maximal number allowed in the predictions

    exog_cols : list
        List of columns to be used as exogenous variables

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        CV / Model selection configuration
    """
    exog_cols = exog_cols or []
    country_code_columns = (
        [country_code_column] if isinstance(country_code_column, str) else country_code_column
    )
    country_codes = [country_code] if isinstance(country_code, str) else country_code

    # ensures that only the exogenous columns and the country code column (if provided)
    # are passed to the model, and that column names are stored in the TSColumnTransformer
    if exog_cols:
        cols = exog_cols + country_code_columns if country_code_columns else exog_cols
        exog_passthrough = TSColumnTransformer(transformers=[("raw_cols",
                                                              "passthrough",
                                                              cols)])
    else:
        exog_passthrough = "passthrough"
    # ensures the holiday transformer is added to the pipeline if requested
    if country_codes:
        holiday = Pipeline([(
            f"holiday_{code}",
            HolidayTransformer(
                country_code=code,
                days_before=holidays_days_before,
                days_after=holidays_days_after,
                bridge_days=holidays_bridge_days,
            ),
        ) for code in country_codes])
    elif country_code_columns:
        holiday = Pipeline([(
            f"holiday_{col}",
            HolidayTransformer(
                country_code_column=col,
                days_before=holidays_days_before,
                days_after=holidays_days_after,
                bridge_days=holidays_bridge_days,
            ),
        ) for col in country_code_columns])
    else:
        holiday = "passthrough"

    estimator = Pipeline([("exog_passthrough", exog_passthrough),
                          ("holiday", holiday), ("model", "passthrough")])

    cv = FinerTimeSplit(n_splits=n_splits,
                        horizon=horizon,
                        between_split_lag=between_split_lag)

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=[],
        scoring=get_scorer(scoring),
        cv=cv,
        refit=False,
        error_score=np.nan,
    )

    if autosarimax_models:
        # Adding autosarimax directly to param_grid might lead to different models
        # being found on different splits and raise inconsistency-based errors.
        # The sarimax pipeline is therefore added as a new attribute of the grid search
        # (`grid_search.autosarimax`) and handled in
        # `hcrystalball.model_selection.select_model` in the following way:
        # 1. get the best model for the data in the last split
        # 2. append this best model to the original `param_grid`
        # 3. run the full grid search with `param_grid` containing
        #    the sarimax model selected by autosarimax in step 1
        from hcrystalball.wrappers import SarimaxWrapper

        if autoarima_dict is None:
            autoarima_dict = {}
        if "error_action" not in autoarima_dict:
            autoarima_dict.update({"error_action": "raise"})

        grid_search.autosarimax = Pipeline(estimator.steps[:-1])
        grid_search.autosarimax.steps.append((
            "model",
            SarimaxWrapper(
                init_with_autoarima=True,
                autoarima_dict=autoarima_dict,
                clip_predictions_lower=clip_predictions_lower,
                clip_predictions_upper=clip_predictions_upper,
            ),
        ))

    if stacking_ensembles or average_ensembles or sklearn_models:
        from sklearn.linear_model import ElasticNet
        from sklearn.ensemble import RandomForestRegressor

        # TODO when scoring time is fixed, add HistGradientBoostingRegressor
        # from sklearn.experimental import enable_hist_gradient_boosting
        # from sklearn.ensemble import HistGradientBoostingRegressor
        from hcrystalball.wrappers import get_sklearn_wrapper
        from hcrystalball.feature_extraction import SeasonalityTransformer

        sklearn_model = get_sklearn_wrapper(
            RandomForestRegressor,
            clip_predictions_lower=clip_predictions_lower,
            clip_predictions_upper=clip_predictions_upper,
        )

        sklearn_model_pipeline = Pipeline([
            ("seasonality", SeasonalityTransformer(auto=True, freq=frequency)),
            ("model", sklearn_model)
        ])
        # TODO make sure naming here works as expected
        sklearn_model_pipeline.name = f"seasonality_{sklearn_model.name}"

    if sklearn_models:
        classes = [ElasticNet, RandomForestRegressor]
        models = {
            model_class.__name__: get_sklearn_wrapper(
                model_class,
                clip_predictions_lower=clip_predictions_lower,
                clip_predictions_upper=clip_predictions_upper,
            )
            for model_class in classes
        }

        optimize_for_horizon = [False, True] if sklearn_models_optimize_for_horizon else [False]

        grid_search.param_grid.append({
            "model": [sklearn_model_pipeline],
            "model__seasonality__weekly": [True, False],
            "model__model": list(models.values()),
            # TODO: change/add once HistGradientBoostingRegressor is back
            # "model__model": list(models.values()) + [sklearn_model]
            "model__model__optimize_for_horizon": optimize_for_horizon,
            "model__model__lags": [3, 7, 10, 14],
        })

        grid_search.param_grid.append({
            "model": [sklearn_model_pipeline],
            "model__seasonality__weekly": [True, False],
            "model__model__optimize_for_horizon": optimize_for_horizon,
            "model__model": [sklearn_model],
            "model__model__max_depth": [6],
        })

    if prophet_models:
        from hcrystalball.wrappers import ProphetWrapper

        # exog_cols was coerced to a list above, so check for emptiness rather than None
        extra_regressors = [None] if not exog_cols else [None, exog_cols]

        grid_search.param_grid.append({
            "model": [
                ProphetWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__seasonality_mode": ["multiplicative", "additive"],
            "model__extra_regressors":
            extra_regressors,
        })

        grid_search.param_grid.append({
            "model": [
                ProphetWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__extra_seasonalities": [[{
                "name": "quarterly",
                "period": 90.0625,
                "fourier_order": 5,
                "prior_scale": 15.0,
                "mode": None,
            }]],
            "model__extra_regressors":
            extra_regressors,
        })

    if exp_smooth_models:
        from hcrystalball.wrappers import ExponentialSmoothingWrapper
        from hcrystalball.wrappers import HoltSmoothingWrapper
        from hcrystalball.wrappers import SimpleSmoothingWrapper

        # commented-out options show non-deterministic behavior
        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": ["add"],
            "model__seasonal": [None, "add"],
            "model__damped": [True, False],
            "model__fit_params": [
                {
                    "use_boxcox": True,
                    "use_basinhopping": False
                },
                # {'use_boxcox':True, 'use_basinhopping':True},
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": ["add"],
            "model__seasonal": ["mul"],
            "model__damped": [True, False],
            "model__fit_params": [
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": [None],
            "model__seasonal": [None, "add", "mul"],
            "model__damped": [False],
            "model__fit_params": [
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                SimpleSmoothingWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                ),
                HoltSmoothingWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                ),
            ]
        })

    if theta_models:
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append({
            "model": [
                ThetaWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ]
        })

    if tbats_models:
        from hcrystalball.wrappers import TBATSWrapper

        grid_search.param_grid.append({
            "model": [
                TBATSWrapper(
                    use_arma_errors=False,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ]
        })

    if stacking_ensembles:
        from hcrystalball.ensemble import StackingEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from hcrystalball.wrappers import ThetaWrapper
        from sklearn.ensemble import RandomForestRegressor

        grid_search.param_grid.append({
            "model": [
                StackingEnsemble(
                    train_n_splits=stacking_ensembles_train_n_splits,
                    train_horizon=stacking_ensembles_train_horizon,
                    meta_model=ElasticNet(),
                    horizons_as_features=True,
                    weekdays_as_features=True,
                    base_learners=[],
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__meta_model": [ElasticNet(),
                                  RandomForestRegressor()],
            "model__base_learners": [
                [
                    ProphetWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                    sklearn_model_pipeline,
                    ThetaWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                ],
            ],
        })
    if average_ensembles:
        from hcrystalball.ensemble import SimpleEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append({
            "model": [
                SimpleEnsemble(
                    base_learners=[],
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__base_learners": [
                [
                    ProphetWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                    sklearn_model_pipeline,
                    ThetaWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                ],
            ],
        })

    return grid_search
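
A minimal usage sketch for `get_gridsearch`, assuming the `generate_tsdata` helper from `hcrystalball.utils` used elsewhere in these examples:

from hcrystalball.utils import generate_tsdata

X, y = generate_tsdata(n_dates=365)
gs = get_gridsearch(frequency="D", horizon=7, n_splits=2, exp_smooth_models=True)
gs.fit(X, y)  # refit=False above, but best_params_ is still populated for a single metric
print(gs.best_params_["model"])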
Example #10
def test_model_selector(tmp_path):

    n_regions = 1
    n_plants = 1
    n_products = 2
    target_col_name = "Quantity"
    persist_path = os.path.join(tmp_path, "results")

    df = generate_multiple_tsdata(n_dates=200,
                                  n_regions=n_regions,
                                  n_plants=n_plants,
                                  n_products=n_products)
    ms = ModelSelector(frequency="D", horizon=1, country_code_column="Country")

    with pytest.raises(ValueError):
        ms.results
    with pytest.raises(ValueError):
        ms.partitions
    with pytest.raises(ValueError):
        ms.stored_path
    with pytest.raises(ValueError):
        ms.get_result_for_partition(partition="non existing partition")
    assert ms.horizon == 1

    ms.create_gridsearch(
        n_splits=1,
        prophet_models=True,
        sklearn_models=False,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,
        tbats_models=False,
        exp_smooth_models=False,
        average_ensembles=False,
        stacking_ensembles=False,
        exog_cols=["Raining"],
    )
    assert hasattr(ms, "grid_search")
    assert isinstance(ms.grid_search.estimator.named_steps["holiday"],
                      HolidayTransformer)

    ms.add_model_to_gridsearch(get_sklearn_wrapper(LinearRegression))
    ms.select_model(
        df=df,
        target_col_name=target_col_name,
        partition_columns=["Region", "Plant", "Product"],
    )

    assert len(ms.results) == n_regions * n_plants * n_products
    assert len(ms.partitions) == n_regions * n_plants * n_products

    ms.persist_results(persist_path)

    print(ms.partitions)

    ms_load = load_model_selector(folder_path=persist_path)

    # we do not guarantee the same order of results and partitions after loading,
    # so we check they are all there
    assert all(
        [partition in ms_load.partitions for partition in ms.partitions])
    # TODO redefine __eq__ for ModelSelectorResult to str(MSR).__dict__?
    assert all([
        str(ms_load.get_result_for_partition(partition).__dict__) == str(
            ms.get_result_for_partition(partition).__dict__)
        for partition in ms.partitions
    ])
    assert ms.horizon == ms_load.horizon
    assert ms.frequency == ms_load.frequency

    ms.plot_best_wrapper_classes()
    ms.plot_results()
    assert "ModelSelector" in repr(ms)
    assert "ModelSelectorResults" in repr(ms)
    assert "ModelSelectorResult" in repr(ms.results[0])

    with pytest.raises(ValueError):
        ms.results[0].persist(attribute_name="non_existing_attribute")

    assert ms.results[0].cv_splits_overlap is False

    ms.results[0].plot_error()
Example #11
def wrapper_instance_capped(request):
    # the fixture param encodes "model;clip_lower;clip_upper"
    model_name, lower, upper = request.param.split(";")
    clip_lower, clip_upper = float(lower), float(upper)

    if model_name == "prophet":
        return ProphetWrapper(
            daily_seasonality=False,
            weekly_seasonality=False,
            yearly_seasonality=False,
            clip_predictions_lower=clip_lower,
            clip_predictions_upper=clip_upper,
        )
    elif model_name == "smoothing":
        return ExponentialSmoothingWrapper(
            trend="add",
            clip_predictions_lower=clip_lower,
            clip_predictions_upper=clip_upper,
        )
    elif model_name == "tbats":
        return TBATSWrapper(
            use_arma_errors=False,
            use_box_cox=False,
            clip_predictions_lower=clip_lower,
            clip_predictions_upper=clip_upper,
        )
    elif model_name == "sklearn":
        return get_sklearn_wrapper(
            LinearRegression,
            lags=4,
            clip_predictions_lower=clip_lower,
            clip_predictions_upper=clip_upper,
        )
    elif model_name == "sarimax":
        return SarimaxWrapper(
            order=(1, 1, 0),
            seasonal_order=(1, 1, 1, 2),
            clip_predictions_lower=clip_lower,
            clip_predictions_upper=clip_upper,
        )
    elif model_name == "stacking_ensemble":
        return StackingEnsemble(
            base_learners=[
                ExponentialSmoothingWrapper(
                    name="smoot_exp1",
                    trend="add",
                    clip_predictions_lower=clip_lower,
                    clip_predictions_upper=clip_upper,
                ),
                ExponentialSmoothingWrapper(
                    name="smoot_exp2",
                    clip_predictions_lower=clip_lower,
                    clip_predictions_upper=clip_upper,
                ),
            ],
            meta_model=LinearRegression(),
            horizons_as_features=False,
            weekdays_as_features=False,
            train_n_splits=1,
            train_horizon=10,
            clip_predictions_lower=clip_lower,
            clip_predictions_upper=clip_upper,
        )
    elif model_name == "simple_ensemble":
        return SimpleEnsemble(base_learners=[
            ExponentialSmoothingWrapper(
                name="smoot_exp1",
                trend="add",
                clip_predictions_lower=clip_lower,
                clip_predictions_upper=clip_upper,
            ),
            ExponentialSmoothingWrapper(
                name="smoot_exp2",
                clip_predictions_lower=clip_lower,
                clip_predictions_upper=clip_upper,
            ),
        ])
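
A sketch of how such a capped fixture would plausibly be parametrized and exercised (the decorator, the param ids, and the test below are assumptions, not from the source):

import pytest

@pytest.fixture(params=["smoothing;-1.0;1.0", "sklearn;-1.0;1.0"])  # hypothetical ids
def wrapper_instance_capped(request):
    ...  # body as above


def test_predictions_respect_caps(wrapper_instance_capped, X_y_linear_trend):
    X, y = X_y_linear_trend
    preds = wrapper_instance_capped.fit(X[:-10], y[:-10]).predict(X[-10:])
    assert preds.iloc[:, 0].between(-1.0, 1.0).all()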
Example #12
def seleciona_modelo_horizonte3(dtf_train='',
                                target_col='',
                                seed=42,
                                horizonte=1,
                                exog_features_list='',
                                lags=3,
                                pack=''):
    # if horizonte < 3:
    #     lags = horizonte

    ms = ModelSelector(
        horizon=horizonte,
        frequency='D',
        country_code_column=None,  # 'country' --> leave as None, otherwise an execution error occurs
    )
    ms.create_gridsearch(
        sklearn_models=False,
        n_splits=2,  # 2 cross-validation splits
        between_split_lag=None,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,  # autosarimax now works, with pmdarima==1.5.3
        prophet_models=False,  # do not enable; NeuralProphet is used in its place
        tbats_models=False,  # TBATS working OK (pip install tbats)
        exp_smooth_models=False,  # exp_smooth working OK
        average_ensembles=False,  # average_ensembles working OK
        stacking_ensembles=False,  # not used; takes too long and gives poor results
        exog_cols=exog_features_list,
        # exog_cols=None,
        # holidays_days_before=2,
        # holidays_days_after=1,
        # holidays_bridge_days=True,
    )
    # ms.add_model_to_gridsearch(NeuralProphetWrapper(exog_cols=exog_features.columns.tolist()))
    use_scikit = True

    huber, ridge, xgb_sq, xgb_hb = pack
    if use_scikit:
        if target_col in xgb_hb:
            xgb_r = get_sklearn_wrapper(XGBRegressor, lags=lags,
                                        objective="reg:pseudohubererror")  # , random_state=seed
            # xgb_r = get_sklearn_wrapper(XGBRegressor, lags=lags, objective="reg:tweedie")
            xgb_r.name = 'XGBRegressor_Huber'
            ms.add_model_to_gridsearch(xgb_r)
        elif target_col in xgb_sq:
            xgb_r = get_sklearn_wrapper(XGBRegressor, lags=lags)  # , random_state=seed
            # xgb_r = get_sklearn_wrapper(XGBRegressor, lags=lags, objective="reg:tweedie")
            xgb_r.name = 'XGBRegressor_Squared_Loss'
            ms.add_model_to_gridsearch(xgb_r)
        elif target_col in ridge:
            ridge_r = get_sklearn_wrapper(Ridge, random_state=seed, lags=lags)
            ridge_r.name = 'Ridge'
            ms.add_model_to_gridsearch(ridge_r)
        else:
            huber_r = get_sklearn_wrapper(HuberRegressor, max_iter=160)
            huber_r.name = 'Huber'
            ms.add_model_to_gridsearch(huber_r)
    # `select_model` does the majority of the magic: it creates a forecast for each
    # combination of the columns specified in `partition_columns`, running the grid
    # search above for each of the resulting time series. Optionally, one can select a
    # list of columns over which model selection runs in parallel using prefect
    # (`parallel_over_columns`).
    # Required data format: a Datetime index and, unsurprisingly, a numerical
    # `target_col_name` column; all other columns except `partition_columns` are used
    # as exogenous variables, i.e. as additional features for modeling.
    ms.select_model(
        df=dtf_train,
        target_col_name=target_col,
        partition_columns=None,
        #                 parallel_over_columns=['Assortment'],
        #                 persist_model_selector_results=False,
        #                 output_path='my_results',
        #                 executor = LocalDaskExecutor(),
    )

    ms.persist_results('results')
    #mlflow.log_metric("score", 0.75)

    # ============================  Train model  ==================================
    result = ms.results[0]
    print('Model selection result: \n', str(result))
    best_model = result.best_model
    return ms, best_model, result
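
A hypothetical call, assuming a prepared daily training frame and the four column groupings that make up `pack` (all names below are placeholders):

ms, best_model, result = seleciona_modelo_horizonte3(
    dtf_train=train_df,             # assumed: Datetime-indexed daily DataFrame
    target_col="Quantity",
    horizonte=3,
    exog_features_list=["Raining"],
    pack=(huber_cols, ridge_cols, xgb_sq_cols, xgb_hb_cols),  # assumed groupings
)
print(best_model)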