def wrapper_instance(request):
    """Build the forecasting wrapper selected by ``request.param``.

    Parameters
    ----------
    request : object
        Object exposing a ``param`` attribute with the wrapper key
        (``"prophet"``, ``"smoothing"``, ``"tbats"``, ``"sklearn"``,
        ``"sarimax"``, ``"stacking_ensemble"`` or ``"simple_ensemble"``).

    Returns
    -------
    object or None
        A freshly constructed wrapper / ensemble, or ``None`` when the key
        matches none of the names above.
    """
    kind = request.param

    def smoothing_learners():
        # Both ensembles are built on the same pair of smoothing models.
        return [
            ExponentialSmoothingWrapper(name="smoot_exp1", trend="add"),
            ExponentialSmoothingWrapper(name="smoot_exp2"),
        ]

    if kind == "prophet":
        return ProphetWrapper(
            daily_seasonality=False,
            weekly_seasonality=False,
            yearly_seasonality=False,
        )
    if kind == "smoothing":
        return ExponentialSmoothingWrapper(trend="add")
    if kind == "tbats":
        return TBATSWrapper(use_arma_errors=False, use_box_cox=False)
    if kind == "sklearn":
        return get_sklearn_wrapper(LinearRegression, lags=4)
    if kind == "sarimax":
        return SarimaxWrapper(order=(1, 1, 0), seasonal_order=(1, 1, 1, 2))
    if kind == "stacking_ensemble":
        return StackingEnsemble(
            base_learners=smoothing_learners(),
            meta_model=LinearRegression(),
            horizons_as_features=False,
            weekdays_as_features=False,
        )
    if kind == "simple_ensemble":
        return SimpleEnsemble(base_learners=smoothing_learners())
    # Unknown key: mirror the implicit fall-through of an if/elif chain.
    return None
def test_get_best_not_failing_model(X_y_optional, negative_data, best_model_name, rank, expected_error):
    """Check ``get_best_not_failing_model`` on a grid search with a possibly failing model.

    Parameters
    ----------
    X_y_optional : tuple
        ``(X, y)`` pair supplied by the fixture.
    negative_data : bool
        When truthy, the last target value is set to ``-1``.
    best_model_name : str
        Expected class name of the selected model.
    rank : int
        Expected rank of the selected parameter set.
    expected_error : type or None
        Exception expected from ``get_best_not_failing_model``; ``None`` when
        a best model should be found.
    """
    X, y = X_y_optional
    # the fixture data contains 0, so everything below 1 is lifted to 1
    y[y < 1] = 1
    if negative_data:
        y[-1] = -1

    candidates = [
        ExponentialSmoothingWrapper(freq="D", trend="mul"),
        get_sklearn_wrapper(DummyRegressor, strategy="constant", constant=-5000),
    ]
    if expected_error is not None:
        # only the smoothing model is searched when an error is expected
        candidates = candidates[:1]

    search = GridSearchCV(
        estimator=Pipeline([("model", "passthrough")]),
        param_grid=[{"model": candidates}],
        scoring=get_scorer("neg_mean_absolute_error"),
        cv=FinerTimeSplit(n_splits=1, horizon=5),
        refit=False,
        error_score=np.nan,
    )
    search.fit(X, y)

    if expected_error:
        with pytest.raises(expected_error):
            get_best_not_failing_model(search, X, y)
        return

    best_param_rank = get_best_not_failing_model(search, X, y)
    assert isinstance(best_param_rank, dict)
    selected_model = best_param_rank["params"]["model"]
    assert selected_model.__class__.__name__ == best_model_name
    assert best_param_rank["rank"] == rank
def grid_search(request):
    """Return an unfitted ``GridSearchCV`` over two dummy regressors.

    The searched pipeline consists of a holiday transformer, a seasonality
    transformer and a model step. The parameter grid swaps the model step
    between a mean-strategy dummy (``"good_dummy"``) and a constant-strategy
    dummy (``"bad_dummy"``). Scoring is mean absolute error (lower is better)
    over a two-split ``FinerTimeSplit`` with horizon 5.

    Parameters
    ----------
    request : object
        Not used by the body.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The configured, not yet fitted, grid search.
    """
    from sklearn.dummy import DummyRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from hcrystalball.feature_extraction import HolidayTransformer
    from hcrystalball.feature_extraction import SeasonalityTransformer
    from hcrystalball.metrics import make_ts_scorer
    from hcrystalball.model_selection import FinerTimeSplit
    from hcrystalball.wrappers import get_sklearn_wrapper

    mae_scorer = make_ts_scorer(mean_absolute_error, greater_is_better=False)

    bad_dummy = get_sklearn_wrapper(
        DummyRegressor,
        strategy="constant",
        constant=42,
        name="bad_dummy",
        lags=2,
    )
    good_dummy = get_sklearn_wrapper(
        DummyRegressor,
        strategy="mean",
        name="good_dummy",
        lags=2,
    )

    param_grid = [
        {"model": [good_dummy]},
        {
            "model": [bad_dummy],
            "model__strategy": ["constant"],
            "model__constant": [42],
        },
    ]

    steps = [
        ("holiday", HolidayTransformer(country_code_column="Holidays_code")),
        ("seasonality", SeasonalityTransformer(week_day=True, freq="D")),
        ("model", good_dummy),
    ]
    pipeline = Pipeline(steps)
    splitter = FinerTimeSplit(n_splits=2, horizon=5)

    return GridSearchCV(pipeline, param_grid, cv=splitter, scoring=mae_scorer)
def test_model_selector(tmp_path):
    """Run ``ModelSelector`` end to end and verify a persist / load round trip.

    Parameters
    ----------
    tmp_path : path-like
        Temporary directory; results are persisted into its ``results``
        sub-folder.
    """
    n_regions, n_plants, n_products = 1, 1, 2
    expected_partitions = n_regions * n_plants * n_products
    target_col_name = "Quantity"
    persist_path = os.path.join(tmp_path, "results")

    df = generate_multiple_tsdata(
        n_dates=200,
        n_regions=n_regions,
        n_plants=n_plants,
        n_products=n_products,
    )

    ms = ModelSelector(frequency="D", horizon=1, country_code_column="Country")
    assert ms.horizon == 1

    # only prophet models are switched on in the generated grid
    ms.create_gridsearch(
        n_splits=1,
        prophet_models=True,
        sklearn_models=False,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,
        tbats_models=False,
        exp_smooth_models=False,
        average_ensembles=False,
        stacking_ensembles=False,
        exog_cols=["Raining"],
    )
    assert hasattr(ms, "grid_search")

    ms.add_model_to_gridsearch(get_sklearn_wrapper(LinearRegression))
    ms.select_model(
        df=df,
        target_col_name=target_col_name,
        partition_columns=["Region", "Plant", "Product"],
    )
    assert len(ms.results) == expected_partitions
    assert len(ms.partitions) == expected_partitions

    ms.persist_results(persist_path)
    print(ms.partitions)
    ms_load = load_model_selector(folder_path=persist_path)

    # order of results and partitions is not guaranteed after loading,
    # so only check that every partition is present
    found = [partition in ms_load.partitions for partition in ms.partitions]
    assert all(found)

    # TODO redefine __eq__ for ModelSelectorResult to str(MSR).__dict__?
    same_result = [
        str(ms_load.get_result_for_partition(partition).__dict__)
        == str(ms.get_result_for_partition(partition).__dict__)
        for partition in ms.partitions
    ]
    assert all(same_result)

    assert ms.horizon == ms_load.horizon
    assert ms.frequency == ms_load.frequency
def pipeline_instance_model_only(request):
    """Build a single-step pipeline around the model named by ``request.param``.

    Parameters
    ----------
    request : object
        Object exposing a ``param`` attribute with the model key
        (``"prophet"``, ``"smoothing"``, ``"tbats"``, ``"sklearn"``,
        ``"sarimax"``, ``"stacking_ensemble"`` or ``"simple_ensemble"``).

    Returns
    -------
    Pipeline or None
        A pipeline whose only step is named ``"regressor"``, or ``None`` for
        an unrecognised key.
    """
    kind = request.param

    def smoothing_learners():
        # Base learners shared by both ensemble variants.
        return [
            ExponentialSmoothingWrapper(name="smoot_exp1", trend="add"),
            ExponentialSmoothingWrapper(name="smoot_exp2"),
        ]

    if kind == "prophet":
        regressor = ProphetWrapper(
            daily_seasonality=False,
            weekly_seasonality=False,
            yearly_seasonality=False,
        )
    elif kind == "smoothing":
        regressor = ExponentialSmoothingWrapper(trend="add")
    elif kind == "tbats":
        regressor = TBATSWrapper(use_arma_errors=False, use_box_cox=False)
    elif kind == "sklearn":
        regressor = get_sklearn_wrapper(LinearRegression, lags=4)
    elif kind == "sarimax":
        regressor = SarimaxWrapper(order=(1, 1, 0), seasonal_order=(1, 1, 1, 1))
    elif kind == "stacking_ensemble":
        regressor = StackingEnsemble(
            base_learners=smoothing_learners(),
            meta_model=LinearRegression(),
        )
    elif kind == "simple_ensemble":
        regressor = SimpleEnsemble(base_learners=smoothing_learners())
    else:
        return None

    return Pipeline([("regressor", regressor)])
def test_sklearn_wrapper_overal(X_y_linear_trend, horizon, exp_error):
    """Check that a constant ``DummyRegressor`` wrapper predicts its constant.

    Parameters
    ----------
    X_y_linear_trend : tuple
        ``(X, y)`` pair supplied by the fixture.
    horizon : int
        Number of trailing observations held out for prediction.
    exp_error : type or None
        Exception expected from ``predict``; ``None`` when the prediction
        should succeed.
    """
    CONSTANT = 50
    X, y = X_y_linear_trend

    model = get_sklearn_wrapper(DummyRegressor, lags=3, strategy="constant", constant=CONSTANT)
    model.fit(X[:-horizon], y[:-horizon])

    if exp_error is not None:
        with pytest.raises(exp_error):
            model.predict(X[-horizon:])
        return

    preds = model.predict(X[-horizon:])
    # The builtin all() iterates a DataFrame's column labels, not its values,
    # so all(preds == CONSTANT) would pass regardless of the predictions when
    # preds is a DataFrame. Reduce element-wise instead; the chained .all()
    # also works if preds is a Series or an ndarray.
    assert (preds == CONSTANT).all().all()
def estimators(request):
    """Return the ``(name, estimator)`` pairs selected by ``request.param``.

    Parameters
    ----------
    request : object
        Object exposing a ``param`` attribute. ``None`` yields the
        ``["no_estimator"]`` placeholder; a value containing ``"all"`` yields
        every available estimator; any other value is looked up as a single
        estimator key.

    Returns
    -------
    list
        ``["no_estimator"]`` or a list of ``(name, estimator)`` tuples.

    Raises
    ------
    KeyError
        If ``request.param`` neither contains ``"all"`` nor is a known key.
    """
    selection = request.param
    if selection is None:
        return ["no_estimator"]

    def smoothing_learners():
        # Base learners shared by both ensemble variants.
        return [
            ExponentialSmoothingWrapper(name="smoot_exp1", trend="add"),
            ExponentialSmoothingWrapper(name="smoot_exp2"),
        ]

    prophet = ProphetWrapper(
        daily_seasonality=False,
        weekly_seasonality=False,
        yearly_seasonality=False,
    )
    sarimax = SarimaxWrapper(order=(1, 1, 1), seasonal_order=(1, 1, 1, 2))
    smoothing = ExponentialSmoothingWrapper()
    sklearn_model = get_sklearn_wrapper(LinearRegression)
    tbats = TBATSWrapper(use_arma_errors=False, use_box_cox=False)
    stacking = StackingEnsemble(
        base_learners=smoothing_learners(),
        meta_model=LinearRegression(),
    )
    simple = SimpleEnsemble(base_learners=smoothing_learners())

    available = {
        "prophet": [("prophet", prophet)],
        "sarimax": [("sarimax", sarimax)],
        "smoothing": [("smoothing", smoothing)],
        "sklearn": [("sklearn", sklearn_model)],
        "tbats": [("tbats", tbats)],
        "stacking_ensemble": [("stacking_ensemble", stacking)],
        "simple_ensemble": [("simple_ensemble", simple)],
    }

    if "all" in selection:
        # flatten every group, keeping the dictionary's insertion order
        return [pair for group in available.values() for pair in group]
    return available[selection]
def test_target_transformer():
    """Check ``TargetTransformer`` wrapped around a preprocessing + model pipeline.

    Verifies that the wrapped object still exposes ``named_steps``, keeps the
    supplied ``StandardScaler`` as ``y_transformer`` and returns predictions
    as a pandas DataFrame.
    """
    X, y = generate_tsdata(n_dates=365 * 2)
    X["trend"] = np.arange(len(X))

    # scale the synthetic trend column before it reaches the model
    scaler_step = ("scaler", StandardScaler(), ["trend"])
    preprocessing = TSColumnTransformer(transformers=[scaler_step])

    # random forest model wrapped for time-series use
    rf_model = get_sklearn_wrapper(RandomForestRegressor)

    # glue preprocessing and model together, then scale the target as well
    sklearn_model_pipeline = Pipeline([
        ("preprocessing", preprocessing),
        ("model", rf_model),
    ])
    scaled_pipeline = TargetTransformer(sklearn_model_pipeline, StandardScaler())

    fitted = scaled_pipeline.fit(X[:-10], y[:-10])
    preds = fitted.predict(X[-10:])

    assert hasattr(scaled_pipeline, "named_steps")
    assert isinstance(scaled_pipeline.y_transformer, StandardScaler)
    assert isinstance(preds, pd.DataFrame)
def get_gridsearch(
    frequency,
    horizon=10,
    n_splits=5,
    between_split_lag=None,
    scoring="neg_mean_absolute_error",
    country_code_column=None,
    country_code=None,
    holidays_days_before=0,
    holidays_days_after=0,
    holidays_bridge_days=False,
    sklearn_models=True,
    sklearn_models_optimize_for_horizon=False,
    autosarimax_models=False,
    autoarima_dict=None,
    prophet_models=False,
    tbats_models=False,
    exp_smooth_models=False,
    theta_models=False,
    average_ensembles=False,
    stacking_ensembles=False,
    stacking_ensembles_train_horizon=10,
    stacking_ensembles_train_n_splits=20,
    clip_predictions_lower=None,
    clip_predictions_upper=None,
    exog_cols=None,
):
    """Get grid search object based on selection criteria.

    Parameters
    ----------
    frequency : str
        Frequency of timeseries. Pandas compatible frequencies

    horizon : int
        How many units of frequency (e.g. 4 quarters), should be used to find the best models

    n_splits : int
        How many cross-validation folds should be used in model selection

    between_split_lag : int
        How big lag of observations should cv_splits have
        If kept as None, horizon is used resulting in non-overlapping cv_splits

    scoring : str, callable
        String of sklearn regression metric name, or hcrystalball compatible scorer. For creation
        of hcrystalball compatible scorer use `make_ts_scorer` function.

    country_code_column : str, list
        Column(s) in data, that contain country code in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    country_code : str, list
        Country code(s) in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    holidays_days_before : int
        Number of days before the holiday which will be taken into account
        (i.e. 2 means that new bool column will be created and will be True for 2 days before holidays,
        otherwise False)

    holidays_days_after : int
        Number of days after the holiday which will be taken into account
        (i.e. 2 means that new bool column will be created and will be True for 2 days after holidays,
        otherwise False)

    holidays_bridge_days : bool
        Overlapping `holidays_days_before` and `holidays_days_after` feature which serves for modeling
        between holidays working days

    sklearn_models : bool
        Whether to consider sklearn models

    sklearn_models_optimize_for_horizon: bool
        Whether to add to default sklearn behavior also models, that optimize predictions for each horizon

    autosarimax_models : bool
        Whether to consider auto sarimax models

    autoarima_dict : dict
        Specification of pmdautoarima search space. The passed dictionary is copied,
        never modified; `error_action` defaults to "raise" when not given.

    prophet_models : bool
        Whether to consider FB prophet models

    tbats_models : bool
        Whether to consider TBATS models

    exp_smooth_models : bool
        Whether to consider exponential smoothing models

    theta_models : bool
        Whether to consider theta models

    average_ensembles : bool
        Whether to consider average ensemble models

    stacking_ensembles : bool
        Whether to consider stacking ensemble models

    stacking_ensembles_train_horizon : int
        Which horizon should be used in meta model in stacking ensembles

    stacking_ensembles_train_n_splits : int
        Number of splits used in meta model in stacking ensembles

    clip_predictions_lower : float, int
        Minimal number allowed in the predictions

    clip_predictions_upper : float, int
        Maximal number allowed in the predictions

    exog_cols : list
        List of columns to be used as exogenous variables

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        CV / Model selection configuration
    """
    exog_cols = exog_cols or []
    country_code_columns = (
        [country_code_column] if isinstance(country_code_column, str) else country_code_column
    )
    country_codes = [country_code] if isinstance(country_code, str) else country_code

    # keyword groups repeated for every wrapper / holiday transformer below
    clip_kwargs = {
        "clip_predictions_lower": clip_predictions_lower,
        "clip_predictions_upper": clip_predictions_upper,
    }
    holiday_kwargs = {
        "days_before": holidays_days_before,
        "days_after": holidays_days_after,
        "bridge_days": holidays_bridge_days,
    }

    # ensures only exogenous columns and country code column will be passed to model if provided
    # and columns names will be stored in TSColumnTransformer
    if exog_cols:
        cols = exog_cols + country_code_columns if country_code_columns else exog_cols
        exog_passthrough = TSColumnTransformer(transformers=[("raw_cols", "passthrough", cols)])
    else:
        exog_passthrough = "passthrough"

    # ensures holiday transformer is added to the pipeline if requested;
    # explicit country codes take precedence over country code columns
    if country_codes:
        holiday = Pipeline([
            (f"holiday_{code}", HolidayTransformer(country_code=code, **holiday_kwargs))
            for code in country_codes
        ])
    elif country_code_columns:
        holiday = Pipeline([
            (f"holiday_{col}", HolidayTransformer(country_code_column=col, **holiday_kwargs))
            for col in country_code_columns
        ])
    else:
        holiday = "passthrough"

    estimator = Pipeline([
        ("exog_passthrough", exog_passthrough),
        ("holiday", holiday),
        ("model", "passthrough"),
    ])

    cv = FinerTimeSplit(n_splits=n_splits, horizon=horizon, between_split_lag=between_split_lag)

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=[],
        scoring=get_scorer(scoring),
        cv=cv,
        refit=False,
        error_score=np.nan,
    )

    if autosarimax_models:
        # adding autosarimax to param_grid might cause differently found models
        # for different splits and raise inconsistency based errors.
        # sarimax pipeline is added to new grid_search's attribute (`grid_search.autosarimax`)
        # and handled in `hcrystalball.model_selection.select_model` function in following way
        # 1. get best model for the data part on last split
        # 2. append this best model to original `param_grid`
        # 3. run full grid search with `param_grid` containing
        #    sarimax model selected from autosarimax in point 1
        from hcrystalball.wrappers import SarimaxWrapper

        # work on a copy so the caller's dictionary is never mutated
        autoarima_dict = dict(autoarima_dict or {})
        autoarima_dict.setdefault("error_action", "raise")

        grid_search.autosarimax = Pipeline(estimator.steps[:-1])
        grid_search.autosarimax.steps.append((
            "model",
            SarimaxWrapper(
                init_with_autoarima=True,
                autoarima_dict=autoarima_dict,
                **clip_kwargs,
            ),
        ))

    if stacking_ensembles or average_ensembles or sklearn_models:
        from sklearn.linear_model import ElasticNet
        from sklearn.ensemble import RandomForestRegressor

        # TODO when scoring time is fixed, add HistGradientBoostingRegressor
        # from sklearn.experimental import enable_hist_gradient_boosting
        # from sklearn.ensemble import HistGradientBoostingRegressor
        from hcrystalball.wrappers import get_sklearn_wrapper
        from hcrystalball.feature_extraction import SeasonalityTransformer

        sklearn_model = get_sklearn_wrapper(RandomForestRegressor, **clip_kwargs)

        sklearn_model_pipeline = Pipeline([
            ("seasonality", SeasonalityTransformer(auto=True, freq=frequency)),
            ("model", sklearn_model),
        ])
        # TODO make sure naming here works as expected
        sklearn_model_pipeline.name = f"seasonality_{sklearn_model.name}"

    if sklearn_models:
        classes = [ElasticNet, RandomForestRegressor]
        models = {
            model_class.__name__: get_sklearn_wrapper(model_class, **clip_kwargs)
            for model_class in classes
        }

        optimize_for_horizon = [False, True] if sklearn_models_optimize_for_horizon else [False]

        grid_search.param_grid.append({
            "model": [sklearn_model_pipeline],
            "model__seasonality__weekly": [True, False],
            "model__model": list(models.values()),
            # TODO change add once HistGradientBoostingRegressor is back
            # "model__model": list(models.values()) + [sklearn_model]
            "model__model__optimize_for_horizon": optimize_for_horizon,
            "model__model__lags": [3, 7, 10, 14],
        })

        grid_search.param_grid.append({
            "model": [sklearn_model_pipeline],
            "model__seasonality__weekly": [True, False],
            "model__model__optimize_for_horizon": optimize_for_horizon,
            "model__model": [sklearn_model],
            "model__model__max_depth": [6],
        })

    if prophet_models:
        from hcrystalball.wrappers import ProphetWrapper

        # `exog_cols` was normalised to a list above, so it is never None here;
        # only offer the exogenous variant when there are columns to use,
        # otherwise the grid would hold the redundant pair `None` and `[]`
        extra_regressors = [None, exog_cols] if exog_cols else [None]

        grid_search.param_grid.append({
            "model": [ProphetWrapper(**clip_kwargs)],
            "model__seasonality_mode": ["multiplicative", "additive"],
            "model__extra_regressors": extra_regressors,
        })

        grid_search.param_grid.append({
            "model": [ProphetWrapper(**clip_kwargs)],
            "model__extra_seasonalities": [[{
                "name": "quarterly",
                "period": 90.0625,
                "fourier_order": 5,
                "prior_scale": 15.0,
                "mode": None,
            }]],
            "model__extra_regressors": extra_regressors,
        })

    if exp_smooth_models:
        from hcrystalball.wrappers import ExponentialSmoothingWrapper
        from hcrystalball.wrappers import HoltSmoothingWrapper
        from hcrystalball.wrappers import SimpleSmoothingWrapper

        # `use_basinhopping=True` variants are intentionally left out of all
        # `fit_params` below: they show non deterministic behavior
        grid_search.param_grid.append({
            "model": [ExponentialSmoothingWrapper(freq=frequency, **clip_kwargs)],
            "model__trend": ["add"],
            "model__seasonal": [None, "add"],
            "model__damped": [True, False],
            "model__fit_params": [
                {"use_boxcox": True, "use_basinhopping": False},
                {"use_boxcox": False, "use_basinhopping": False},
            ],
        })

        grid_search.param_grid.append({
            "model": [ExponentialSmoothingWrapper(freq=frequency, **clip_kwargs)],
            "model__trend": ["add"],
            "model__seasonal": ["mul"],
            "model__damped": [True, False],
            "model__fit_params": [
                {"use_boxcox": False, "use_basinhopping": False},
            ],
        })

        grid_search.param_grid.append({
            "model": [ExponentialSmoothingWrapper(freq=frequency, **clip_kwargs)],
            "model__trend": [None],
            "model__seasonal": [None, "add", "mul"],
            "model__damped": [False],
            "model__fit_params": [
                {"use_boxcox": False, "use_basinhopping": False},
            ],
        })

        grid_search.param_grid.append({
            "model": [
                SimpleSmoothingWrapper(**clip_kwargs),
                HoltSmoothingWrapper(**clip_kwargs),
            ]
        })

    if theta_models:
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append({"model": [ThetaWrapper(**clip_kwargs)]})

    if tbats_models:
        from hcrystalball.wrappers import TBATSWrapper

        grid_search.param_grid.append({
            "model": [TBATSWrapper(use_arma_errors=False, **clip_kwargs)]
        })

    if stacking_ensembles:
        # ElasticNet / RandomForestRegressor and `sklearn_model_pipeline` come
        # from the shared block above, which always runs when this flag is set
        from hcrystalball.ensemble import StackingEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append({
            "model": [
                StackingEnsemble(
                    train_n_splits=stacking_ensembles_train_n_splits,
                    train_horizon=stacking_ensembles_train_horizon,
                    meta_model=ElasticNet(),
                    horizons_as_features=True,
                    weekdays_as_features=True,
                    base_learners=[],
                    **clip_kwargs,
                )
            ],
            "model__meta_model": [ElasticNet(), RandomForestRegressor()],
            "model__base_learners": [
                [
                    ProphetWrapper(**clip_kwargs),
                    sklearn_model_pipeline,
                    ThetaWrapper(**clip_kwargs),
                ],
            ],
        })

    if average_ensembles:
        from hcrystalball.ensemble import SimpleEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append({
            "model": [SimpleEnsemble(base_learners=[], **clip_kwargs)],
            "model__base_learners": [
                [
                    ProphetWrapper(**clip_kwargs),
                    sklearn_model_pipeline,
                    ThetaWrapper(**clip_kwargs),
                ],
            ],
        })

    return grid_search
def test_model_selector(tmp_path):
    """Exercise ``ModelSelector`` end to end: guards, selection, persistence, plots.

    Parameters
    ----------
    tmp_path : path-like
        Temporary directory; results are persisted into its ``results``
        sub-folder.
    """
    n_regions, n_plants, n_products = 1, 1, 2
    expected_partitions = n_regions * n_plants * n_products
    target_col_name = "Quantity"
    persist_path = os.path.join(tmp_path, "results")

    df = generate_multiple_tsdata(
        n_dates=200,
        n_regions=n_regions,
        n_plants=n_plants,
        n_products=n_products,
    )
    ms = ModelSelector(frequency="D", horizon=1, country_code_column="Country")

    # nothing has been selected yet, so result accessors must raise
    for attribute in ("results", "partitions", "stored_path"):
        with pytest.raises(ValueError):
            getattr(ms, attribute)
    with pytest.raises(ValueError):
        ms.get_result_for_partition(partition="non existing partition")

    assert ms.horizon == 1

    # only prophet models are switched on in the generated grid
    ms.create_gridsearch(
        n_splits=1,
        prophet_models=True,
        sklearn_models=False,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,
        tbats_models=False,
        exp_smooth_models=False,
        average_ensembles=False,
        stacking_ensembles=False,
        exog_cols=["Raining"],
    )
    assert hasattr(ms, "grid_search")
    holiday_step = ms.grid_search.estimator.named_steps["holiday"]
    assert isinstance(holiday_step, HolidayTransformer)

    ms.add_model_to_gridsearch(get_sklearn_wrapper(LinearRegression))
    ms.select_model(
        df=df,
        target_col_name=target_col_name,
        partition_columns=["Region", "Plant", "Product"],
    )
    assert len(ms.results) == expected_partitions
    assert len(ms.partitions) == expected_partitions

    ms.persist_results(persist_path)
    print(ms.partitions)
    ms_load = load_model_selector(folder_path=persist_path)

    # order of results and partitions is not guaranteed after loading,
    # so only check that every partition is present
    found = [partition in ms_load.partitions for partition in ms.partitions]
    assert all(found)

    # TODO redefine __eq__ for ModelSelectorResult to str(MSR).__dict__?
    same_result = [
        str(ms_load.get_result_for_partition(partition).__dict__)
        == str(ms.get_result_for_partition(partition).__dict__)
        for partition in ms.partitions
    ]
    assert all(same_result)

    assert ms.horizon == ms_load.horizon
    assert ms.frequency == ms_load.frequency

    ms.plot_best_wrapper_classes()
    ms.plot_results()

    selector_repr = repr(ms)
    assert "ModelSelector" in selector_repr
    assert "ModelSelectorResults" in selector_repr
    assert "ModelSelectorResult" in repr(ms.results[0])

    with pytest.raises(ValueError):
        ms.results[0].persist(attribute_name="non_existing_attribute")

    assert ms.results[0].cv_splits_overlap is False
    ms.results[0].plot_error()
def wrapper_instance_capped(request):
    """Build a wrapper with clipped predictions from a ``"name;lower;upper"`` key.

    Parameters
    ----------
    request : object
        Object exposing a string ``param`` attribute of the form
        ``"<name>;<lower>;<upper>"``. ``<name>`` is one of ``"prophet"``,
        ``"smoothing"``, ``"tbats"``, ``"sklearn"``, ``"sarimax"``,
        ``"stacking_ensemble"`` or ``"simple_ensemble"``; the two bounds are
        parsed with ``float``.

    Returns
    -------
    object or None
        The configured wrapper / ensemble, or ``None`` for an unknown name.
        For ``"simple_ensemble"`` only the base learners receive the clip
        bounds, not the ensemble itself.
    """
    fields = request.param.split(";")
    kind = fields[0]

    def clip():
        # Parsed lazily: an unknown name returns None without the bounds
        # ever being read or converted.
        return {
            "clip_predictions_lower": float(fields[1]),
            "clip_predictions_upper": float(fields[2]),
        }

    def capped_smoothers():
        # Base learners shared by both ensemble variants.
        return [
            ExponentialSmoothingWrapper(name="smoot_exp1", trend="add", **clip()),
            ExponentialSmoothingWrapper(name="smoot_exp2", **clip()),
        ]

    if kind == "prophet":
        return ProphetWrapper(
            daily_seasonality=False,
            weekly_seasonality=False,
            yearly_seasonality=False,
            **clip(),
        )
    if kind == "smoothing":
        return ExponentialSmoothingWrapper(trend="add", **clip())
    if kind == "tbats":
        return TBATSWrapper(use_arma_errors=False, use_box_cox=False, **clip())
    if kind == "sklearn":
        return get_sklearn_wrapper(LinearRegression, lags=4, **clip())
    if kind == "sarimax":
        return SarimaxWrapper(order=(1, 1, 0), seasonal_order=(1, 1, 1, 2), **clip())
    if kind == "stacking_ensemble":
        return StackingEnsemble(
            base_learners=capped_smoothers(),
            meta_model=LinearRegression(),
            horizons_as_features=False,
            weekdays_as_features=False,
            train_n_splits=1,
            train_horizon=10,
            **clip(),
        )
    if kind == "simple_ensemble":
        return SimpleEnsemble(base_learners=capped_smoothers())
    # Unknown name: mirror the implicit fall-through of an if/elif chain.
    return None
def seleciona_modelo_horizonte3(dtf_train='',
                                target_col='',
                                seed=42,
                                horizonte=1,
                                exog_features_list='',
                                lags=3,
                                pack=''):
    """Select a single sklearn-style model for ``target_col`` via ``ModelSelector``.

    Every built-in model family of the grid search is switched off; exactly
    one regressor is added, chosen by which group in ``pack`` contains
    ``target_col``. Results are persisted to the ``'results'`` folder.

    Parameters
    ----------
    dtf_train : DataFrame
        Training data passed to ``ModelSelector.select_model``.
    target_col : str
        Name of the target column; also decides which regressor is used.
    seed : int
        Random state, used for the Ridge regressor only.
    horizonte : int
        Forecast horizon passed to ``ModelSelector``.
    exog_features_list : list
        Exogenous column names passed to ``create_gridsearch``.
    lags : int
        Number of lags for the XGBoost and Ridge wrappers.
    pack : iterable
        Exactly four collections ``(huber, ridge, xgb_sq, xgb_hb)`` of target
        names. Membership is tested in the order ``xgb_hb``, ``xgb_sq``,
        ``ridge``; anything else falls back to the Huber regressor.

    Returns
    -------
    tuple
        ``(ms, best_model, result)`` where ``result`` is ``ms.results[0]``.

    Raises
    ------
    ValueError
        If ``pack`` does not unpack into exactly four items.
    """
    ms = ModelSelector(
        horizon=horizonte,
        frequency='D',
        # 'country' --> leave as None, otherwise a runtime error occurs
        country_code_column=None,
    )

    ms.create_gridsearch(
        sklearn_models=False,
        n_splits=2,  # number of cross-validation splits
        between_split_lag=None,
        sklearn_models_optimize_for_horizon=False,
        # Autosarimax works now, with pmdarima=1.5.3
        autosarimax_models=False,
        # Do not enable: NeuralProphet is meant to be used in its place
        prophet_models=False,
        tbats_models=False,  # TBATS works OK (pip install tbats)
        exp_smooth_models=False,  # exp_smooth works OK
        average_ensembles=False,  # average_ensembles works OK
        # Not used: takes too long and does not give good results
        stacking_ensembles=False,
        exog_cols=exog_features_list,
        # holiday options (holidays_days_before / holidays_days_after /
        # holidays_bridge_days) are left at their defaults
    )

    # the first group (Huber targets) is only the fallback, so it is not consulted
    _huber_targets, ridge_targets, xgb_sq_targets, xgb_hb_targets = pack

    if target_col in xgb_hb_targets:
        chosen = get_sklearn_wrapper(XGBRegressor, lags=lags, objective="reg:pseudohubererror")
        chosen.name = 'XGBRegressor_Huber'
    elif target_col in xgb_sq_targets:
        chosen = get_sklearn_wrapper(XGBRegressor, lags=lags)
        chosen.name = 'XGBRegressor_Squared_Loss'
    elif target_col in ridge_targets:
        chosen = get_sklearn_wrapper(Ridge, random_state=seed, lags=lags)
        chosen.name = 'Ridge'
    else:
        # NOTE(review): unlike the other branches, `lags` is not forwarded here — confirm intended
        chosen = get_sklearn_wrapper(HuberRegressor, max_iter=160)
        chosen.name = 'Huber'
    ms.add_model_to_gridsearch(chosen)

    # `select_model` does most of the work: it creates a forecast for each combination of
    # columns specified in `partition_columns` and runs the grid search configured above for
    # each of those time series. Optionally one can select a list of columns over which the
    # model selection runs in parallel using prefect (`parallel_over_columns`).
    # Required data format: Datetime index and a numerical `target_col_name` column; all other
    # columns except `partition_columns` are used as exogenous variables (additional features).
    ms.select_model(
        df=dtf_train,
        target_col_name=target_col,
        partition_columns=None,
    )

    ms.persist_results('results')

    # ============================ Train model ==================================
    result = ms.results[0]
    print('Model selection result: \n', str(result))

    return ms, result.best_model, result