# Runs the forecast
result = forecaster.run_forecast_config(
    df=df,
    config=ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=365,  # forecasts 365 steps ahead
        coverage=0.95,         # 95% prediction intervals
        metadata_param=metadata,
        evaluation_period_param=evaluation_period))

# Summarizes the CV result
cv_results = summarize_grid_search_results(
    grid_search=result.grid_search,
    decimals=1,
    # The below saves space in the printed output. Remove to show all available metrics and columns.
    cv_report_metrics=None,
    column_order=["rank", "mean_test", "split_test", "mean_train", "split_train",
                  "mean_fit_time", "mean_score_time", "params"])
# Transposes to save space in the printed output
cv_results["params"] = cv_results["params"].astype(str)
cv_results.set_index("params", drop=True, inplace=True)
cv_results.transpose()

# %%
# By default, all metrics in `~greykite.common.evaluation.ElementwiseEvaluationMetricEnum`
# are computed on each CV train/test split.
# The configuration of CV evaluation metrics can be found at
# `Evaluation Metric <../../pages/stepbystep/0400_configuration.html#evaluation-metric>`_.
# Here, we show the Mean Absolute Percentage Error (MAPE).
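
# %%
# A minimal sketch of one way to inspect the MAPE columns (an illustration added
# here, not part of the original example): combined metric columns follow the
# ``mean_test_<metric>`` / ``split_test_<metric>`` naming pattern, so they can be
# selected by name from the summary computed above.
mape_columns = [col for col in cv_results.columns if "MAPE" in col]
cv_results[mape_columns]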
def test_run_forecast_config_with_single_simple_silverkite_template():
    # The generic names of single simple silverkite templates are not added to
    # `ModelTemplateEnum`, therefore we test whether these are recognized.
    data = generate_df_for_tests(freq="D", periods=365)
    df = data["df"]
    metric = EvaluationMetricEnum.MeanAbsoluteError
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=metric.name,
        agg_periods=7,
        agg_func=np.max,
        null_model_params={
            "strategy": "quantile",
            "constant": None,
            "quantile": 0.5})
    evaluation_period = EvaluationPeriodParam(
        test_horizon=10,
        periods_between_train_test=5,
        cv_horizon=4,
        cv_min_train_periods=80,
        cv_expanding_window=False,
        cv_periods_between_splits=20,
        cv_periods_between_train_test=3,
        cv_max_splits=2)
    model_components = ModelComponentsParam(
        hyperparameter_override=[
            {"estimator__yearly_seasonality": 1},
            {"estimator__yearly_seasonality": 2}])
    computation = ComputationParam(verbose=2)
    forecast_horizon = 27
    coverage = 0.90
    single_template_class = SimpleSilverkiteTemplateOptions(
        freq=SILVERKITE_COMPONENT_KEYWORDS.FREQ.value.DAILY,
        seas=SILVERKITE_COMPONENT_KEYWORDS.SEAS.value.NONE)
    forecast_config = ForecastConfig(
        model_template=[single_template_class, "DAILY_ALGO_SGD", "SILVERKITE_DAILY_90"],
        computation_param=computation,
        coverage=coverage,
        evaluation_metric_param=evaluation_metric,
        evaluation_period_param=evaluation_period,
        forecast_horizon=forecast_horizon,
        model_components_param=model_components)
    forecaster = Forecaster()
    result = forecaster.run_forecast_config(df=df, config=forecast_config)
    summary = summarize_grid_search_results(result.grid_search)
    # `single_template_class` is 1 template,
    # "DAILY_ALGO_SGD" is 1 template, and "SILVERKITE_DAILY_90" has 4 templates.
    # With 2 items in `hyperparameter_override`, there should be a total of 12 cases.
    assert summary.shape[0] == 12

    # Tests functionality for the single template class only.
    forecast_config = ForecastConfig(
        model_template=single_template_class,
        computation_param=computation,
        coverage=coverage,
        evaluation_metric_param=evaluation_metric,
        evaluation_period_param=evaluation_period,
        forecast_horizon=forecast_horizon)
    forecaster = Forecaster()
    pipeline_parameters = forecaster.apply_forecast_config(
        df=df,
        config=forecast_config)
    assert_equal(
        actual=pipeline_parameters["hyperparameter_grid"],
        expected={
            "estimator__time_properties": [None],
            "estimator__origin_for_time_vars": [None],
            "estimator__train_test_thresh": [None],
            "estimator__training_fraction": [None],
            "estimator__fit_algorithm_dict": [{
                "fit_algorithm": "linear",
                "fit_algorithm_params": None}],
            "estimator__holidays_to_model_separately": [[]],
            "estimator__holiday_lookup_countries": [[]],
            "estimator__holiday_pre_num_days": [0],
            "estimator__holiday_post_num_days": [0],
            "estimator__holiday_pre_post_num_dict": [None],
            "estimator__daily_event_df_dict": [None],
            "estimator__changepoints_dict": [None],
            "estimator__seasonality_changepoints_dict": [None],
            "estimator__yearly_seasonality": [0],
            "estimator__quarterly_seasonality": [0],
            "estimator__monthly_seasonality": [0],
            "estimator__weekly_seasonality": [0],
            "estimator__daily_seasonality": [0],
            "estimator__max_daily_seas_interaction_order": [0],
            "estimator__max_weekly_seas_interaction_order": [2],
            "estimator__autoreg_dict": [None],
            "estimator__min_admissible_value": [None],
            "estimator__max_admissible_value": [None],
            "estimator__uncertainty_dict": [None],
            "estimator__growth_term": ["linear"],
            "estimator__regressor_cols": [[]],
            "estimator__feature_sets_enabled": [False],
            "estimator__extra_pred_cols": [[]]},
        ignore_keys={"estimator__time_properties": None})
def assert_proper_grid_search(
        grid_search,
        expected_grid_size=None,
        lower_bound=None,
        upper_bound=None,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        greater_is_better=False,
        cv_report_metrics_names=None):
    """Checks fitted hyperparameter grid search result.

    Parameters
    ----------
    grid_search : `sklearn.model_selection.RandomizedSearchCV`
        Fitted RandomizedSearchCV object.
    expected_grid_size : `int` or None, default None
        Expected number of options evaluated in grid search.
        If None, does not check the expected size.
    lower_bound : `float` or None, default None
        Lower bound on CV test set error.
        If None, does not check the test error.
    upper_bound : `float` or None, default None
        Upper bound on CV test set error.
        If None, does not check the test error.
    score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name``
        Score function used to select the optimal model in CV.
        The same as passed to ``forecast_pipeline`` and grid search.
        If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float.
        If a string, must be either a
        `~greykite.common.evaluation.EvaluationMetricEnum` member name
        or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.
    greater_is_better : `bool`, default False
        Whether higher values are better.
        Must be explicitly passed for testing (not derived from ``score_func``).
    cv_report_metrics_names : `list` [`str`] or None, default None
        Additional metrics besides ``score_func`` calculated during CV.
        If None, no other metrics are checked in the result.
        Unlike in ``forecast_pipeline``, these are the expected names
        in the CV output, such as:

            - ``enum.get_metric_name()``
            - ``"CUSTOM_SCORE_FUNC_NAME"``
            - ``"FRACTION_OUTSIDE_TOLERANCE_NAME"``

    Raises
    ------
    AssertionError
        If grid search did not run as expected.
    """
    _, _, short_name = get_score_func_with_aggregation(
        score_func=score_func,  # string or callable
        greater_is_better=greater_is_better,  # dummy value; doesn't matter because we ignore the returned `score_func`
        relative_error_tolerance=0.01)
    # Attributes are populated
    assert hasattr(grid_search, "best_estimator_")
    assert hasattr(grid_search, "cv_results_")
    if callable(grid_search.refit):
        # `grid_search.refit` is a callable if `grid_search` comes from
        # `forecast_pipeline`.
        # Checks if `best_index_` and `refit` match `metric` and `greater_is_better`.
        assert grid_search.best_index_ == grid_search.refit(grid_search.cv_results_)
        split_scores = grid_search.cv_results_[f"mean_test_{short_name}"]
        expected_best_score = max(split_scores) if greater_is_better else min(split_scores)
        assert split_scores[grid_search.best_index_] == expected_best_score
        assert split_scores[grid_search.best_index_] is not None
        assert not np.isnan(split_scores[grid_search.best_index_])
        assert_refit(
            grid_search.refit,
            expected_metric=short_name,
            expected_greater_is_better=greater_is_better)
    elif grid_search.refit is True:
        # In single metric evaluation, refit_metric is "score".
        short_name = "score"
        # `best_score_` is populated, and the optimal score is the highest
        # test set score. Metrics where `greater_is_better=False` are
        # assumed to be negated in the ``scoring`` parameter so that
        # higher values are better.
        assert hasattr(grid_search, "best_score_")
        best_score = grid_search.best_score_
        test_scores = grid_search.cv_results_[f"mean_test_{short_name}"]
        best_score2 = test_scores[grid_search.best_index_]
        assert best_score == max(test_scores)
        assert best_score2 == max(test_scores)

    if expected_grid_size is not None:
        assert len(grid_search.cv_results_[f"mean_test_{short_name}"]) == expected_grid_size
    # Parameters are populated
    assert_equal(
        grid_search.cv_results_["params"][grid_search.best_index_],
        grid_search.best_params_)
    # All metrics are computed
    if cv_report_metrics_names is None:
        cv_report_metrics_names = []
    for expected_metric in cv_report_metrics_names + [short_name]:
        assert f"mean_test_{expected_metric}" in grid_search.cv_results_.keys()
        assert f"std_test_{expected_metric}" in grid_search.cv_results_.keys()
        assert f"mean_train_{expected_metric}" in grid_search.cv_results_.keys()
        assert f"std_train_{expected_metric}" in grid_search.cv_results_.keys()

    if lower_bound is not None or upper_bound is not None:
        grid_results = summarize_grid_search_results(
            grid_search,
            score_func=score_func)
        if lower_bound is not None:
            assert all(grid_results[f"mean_test_{short_name}"] >= lower_bound)
        if upper_bound is not None:
            assert all(grid_results[f"mean_test_{short_name}"] <= upper_bound)
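

# A minimal usage sketch (an illustration, not from the original tests): how the
# helper above might be applied to a `forecast_pipeline` result scored by MAPE.
# The grid size and error bound below are hypothetical.
def example_assert_proper_grid_search(result):
    """Illustrative only; `result` is assumed to be a fitted ForecastResult."""
    assert_proper_grid_search(
        grid_search=result.grid_search,
        expected_grid_size=4,  # hypothetical: 4 candidates evaluated in the grid
        upper_bound=30.0,      # hypothetical: mean CV test MAPE at most 30%
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        greater_is_better=False)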
def check_forecast_pipeline_result(
        result,
        coverage=0.95,
        strategy=None,
        interactive=False,
        expected_grid_size=None,
        lower_bound_cv=None,
        upper_bound_cv=None,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        greater_is_better=False,
        cv_report_metrics_names=None,
        relative_error_tolerance=None):
    """Helper function that validates ``forecast_pipeline`` output.

    Raises an AssertionError if results do not match the expected values.

    Parameters
    ----------
    result : :class:`~greykite.framework.pipeline.pipeline.ForecastResult`
        ``forecast_pipeline`` output to check.
    coverage : `float` or None, default 0.95
        The ``coverage`` passed to ``forecast_pipeline``.
    strategy : `str` or None, default None
        Null model strategy. If None, not checked.
    interactive : `bool`, default False
        Whether to plot and print results.
    expected_grid_size : `int` or None, default None
        Expected number of options evaluated in grid search.
        If None, does not check the expected size.
    lower_bound_cv : `float` or None, default None
        Lower bound on CV test set error.
        If None, does not check the test error.
    upper_bound_cv : `float` or None, default None
        Upper bound on CV test set error.
        If None, does not check the test error.
    score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name``
        Score function used to select the optimal model in CV.
        The same as passed to ``forecast_pipeline`` and grid search.
        If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float.
        If a string, must be either a
        `~greykite.common.evaluation.EvaluationMetricEnum` member name
        or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.
    greater_is_better : `bool`, default False
        Whether higher values are better.
        Must be explicitly passed for testing (not derived from ``score_func``).
    cv_report_metrics_names : `list` [`str`] or None, default None
        Additional metrics besides ``score_func`` calculated during CV.
        If None, no other metrics are checked in the result.
        Unlike in ``forecast_pipeline``, these are the expected names
        in the CV output, such as:

            - ``enum.get_metric_name()``
            - ``"CUSTOM_SCORE_FUNC_NAME"``
            - ``"FRACTION_OUTSIDE_TOLERANCE_NAME"``

    relative_error_tolerance : `float` or None
        The ``relative_error_tolerance`` passed to ``forecast_pipeline``.
    """
    assert isinstance(result.grid_search, RandomizedSearchCV)
    assert isinstance(result.model, Pipeline)
    assert isinstance(result.backtest, UnivariateForecast)
    assert isinstance(result.forecast, UnivariateForecast)
    assert_proper_grid_search(
        result.grid_search,
        expected_grid_size=expected_grid_size,
        lower_bound=lower_bound_cv,
        upper_bound=upper_bound_cv,
        score_func=score_func,
        greater_is_better=greater_is_better,
        cv_report_metrics_names=cv_report_metrics_names)
    ts = result.timeseries
    assert ts.df[VALUE_COL].equals(ts.y)
    assert result.backtest.train_evaluation is not None
    assert result.backtest.test_evaluation is not None
    if coverage is None:
        assert result.forecast.coverage is None
        assert result.backtest.coverage is None
        assert result.backtest.train_evaluation[PREDICTION_BAND_COVERAGE] is None
        assert result.backtest.test_evaluation[PREDICTION_BAND_COVERAGE] is None
        expected_cols = [TIME_COL, ACTUAL_COL, PREDICTED_COL]
        assert list(result.backtest.df.columns) == expected_cols
        assert list(result.forecast.df.columns) == expected_cols
    else:
        assert round(result.forecast.coverage, 3) == round(coverage, 3)
        assert round(result.backtest.coverage, 3) == round(coverage, 3)
        assert result.backtest.train_evaluation[PREDICTION_BAND_COVERAGE] is not None
        assert result.backtest.test_evaluation[PREDICTION_BAND_COVERAGE] is not None
        assert result.forecast.train_evaluation is not None
    # Tests if null model params are set for CV
    estimator = result.model.steps[-1][-1]
    if estimator.null_model is not None and strategy is not None:
        assert estimator.null_model.strategy == strategy
    # Tests if relative_error_tolerance is set for backtest/forecast
    if relative_error_tolerance is not None:
        assert result.backtest.relative_error_tolerance == relative_error_tolerance
        assert result.forecast.relative_error_tolerance == relative_error_tolerance
    if interactive:
        print("backtest_train_evaluation", result.backtest.train_evaluation)
        print("backtest_test_evaluation", result.backtest.test_evaluation)
        print("forecast_train_evaluation", result.forecast.train_evaluation)
        print("forecast_test_evaluation", result.forecast.test_evaluation)
        print(summarize_grid_search_results(
            result.grid_search,
            score_func=score_func,
            score_func_greater_is_better=greater_is_better))
        plotly.offline.plot(ts.plot())
        plotly.offline.plot(result.backtest.plot())
        plotly.offline.plot(result.forecast.plot())
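

# An illustrative usage sketch (an assumption, not from the original tests):
# validating a pipeline result that was run with 95% coverage and a quantile
# null model. The strategy and CV bound below are hypothetical.
def example_check_forecast_pipeline_result(result):
    """Illustrative only; `result` is assumed to be a fitted ForecastResult."""
    check_forecast_pipeline_result(
        result,
        coverage=0.95,
        strategy="quantile",   # hypothetical null model strategy
        upper_bound_cv=30.0,   # hypothetical upper bound on mean CV test MAPE
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        greater_is_better=False)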
def test_summarize_grid_search_results(pipeline_results):
    """Tests summarize_grid_search_results"""
    # Tests EvaluationMetricEnum `score_func`, `cv_report_metrics=CV_REPORT_METRICS_ALL`
    grid_search = pipeline_results["1"].grid_search
    metric = EvaluationMetricEnum.MeanAbsolutePercentError
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        combine_splits=True,
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better())
    assert cv_result.shape == (4, 60)
    # The proper scores are extracted
    short_name = metric.get_metric_name()
    expected = grid_search.cv_results_[f"mean_test_{short_name}"]
    assert_equal(np.array(cv_result[f"mean_test_{short_name}"]), expected)
    # Rank direction is correct
    assert cv_result[f"rank_test_{short_name}"].idxmin() == cv_result[f"mean_test_{short_name}"].idxmin()
    assert all(cv_result[f"mean_test_{short_name}"] > 0)
    assert [
        "rank_test_MAE", "rank_test_MSE", "rank_test_MedAPE", "rank_test_MAPE",
        "mean_test_MAE", "mean_test_MSE", "mean_test_MedAPE", "mean_test_MAPE",
        "split_test_MAPE", "split_test_MSE", "split_test_MAE", "split_test_MedAPE",
        "mean_train_MAE", "mean_train_MSE", "mean_train_MedAPE", "mean_train_MAPE",
        "params", "param_estimator__strategy", "param_estimator__quantile",
        "param_estimator__constant",
        "split_train_MAPE", "split_train_MSE", "split_train_MAE", "split_train_MedAPE",
        "mean_fit_time", "std_fit_time", "mean_score_time", "std_score_time",
        "split0_test_MAE", "split1_test_MAE", "split2_test_MAE", "std_test_MAE",
        "split0_train_MAE", "split1_train_MAE", "split2_train_MAE", "std_train_MAE",
        "split0_test_MSE", "split1_test_MSE", "split2_test_MSE", "std_test_MSE",
        "split0_train_MSE", "split1_train_MSE", "split2_train_MSE", "std_train_MSE",
        "split0_test_MedAPE", "split1_test_MedAPE", "split2_test_MedAPE", "std_test_MedAPE",
        "split0_train_MedAPE", "split1_train_MedAPE", "split2_train_MedAPE", "std_train_MedAPE",
        "split0_test_MAPE", "split1_test_MAPE", "split2_test_MAPE", "std_test_MAPE",
        "split0_train_MAPE", "split1_train_MAPE", "split2_train_MAPE", "std_train_MAPE"
    ] == list(cv_result.columns)

    # `combine_splits=False`
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        combine_splits=False,
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=CV_REPORT_METRICS_ALL)
    assert cv_result.shape == (4, 52)  # no train/test split summary for the 4 metrics
    assert "split_test_MedAPE" not in cv_result.columns

    # cv_report_metrics=list, different column_order
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        combine_splits=False,
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=[EvaluationMetricEnum.MeanSquaredError.name],
        column_order=["mean", "time", ".*"])
    assert cv_result.shape == (4, 30)  # only two metrics in the summary
    assert [
        "mean_fit_time", "mean_score_time", "mean_test_MSE", "mean_train_MSE",
        "mean_test_MAPE", "mean_train_MAPE", "std_fit_time", "std_score_time",
        "param_estimator__strategy", "param_estimator__quantile",
        "param_estimator__constant", "params",
        "split0_test_MSE", "split1_test_MSE", "split2_test_MSE", "std_test_MSE",
        "rank_test_MSE",
        "split0_train_MSE", "split1_train_MSE", "split2_train_MSE", "std_train_MSE",
        "split0_test_MAPE", "split1_test_MAPE", "split2_test_MAPE", "std_test_MAPE",
        "rank_test_MAPE",
        "split0_train_MAPE", "split1_train_MAPE", "split2_train_MAPE", "std_train_MAPE"
    ] == list(cv_result.columns)
    # These metrics are computed but not requested in the summary
    assert "rank_test_MedAPE" not in cv_result.columns
    assert "mean_test_MAE" not in cv_result.columns

    # cv_report_metrics=None, different column order
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        combine_splits=False,
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=None,
        column_order=["split", "rank", "mean", "params"])
    assert cv_result.shape == (4, 12)  # only one metric in the summary
    assert [
        "split0_test_MAPE", "split1_test_MAPE", "split2_test_MAPE",
        "split0_train_MAPE", "split1_train_MAPE", "split2_train_MAPE",
        "rank_test_MAPE", "mean_fit_time", "mean_score_time",
        "mean_test_MAPE", "mean_train_MAPE", "params"
    ] == list(cv_result.columns)
    assert "rank_test_MSE" not in cv_result.columns

    # Tests FRACTION_OUTSIDE_TOLERANCE `score_func`
    grid_search = pipeline_results["2"].grid_search
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        score_func=FRACTION_OUTSIDE_TOLERANCE,
        score_func_greater_is_better=False)
    assert cv_result.shape == (4, 242)
    # The proper scores are extracted
    short_name = FRACTION_OUTSIDE_TOLERANCE_NAME
    expected = grid_search.cv_results_[f"mean_test_{short_name}"]
    assert_equal(np.array(cv_result[f"mean_test_{short_name}"]), expected)
    # Rank direction is correct
    assert cv_result[f"rank_test_{short_name}"].idxmin() == cv_result[f"mean_test_{short_name}"].idxmin()
    assert all(cv_result[f"mean_test_{short_name}"] > 0)

    # Tests callable `score_func`, greater_is_better=True, split scores
    grid_search = pipeline_results["3"].grid_search
    cv_max_splits = 2
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        score_func=mean_absolute_error,
        score_func_greater_is_better=True)
    assert cv_result.shape == (4, 20)
    # The proper scores are extracted
    short_name = CUSTOM_SCORE_FUNC_NAME
    expected = grid_search.cv_results_[f"mean_test_{short_name}"]
    assert_equal(np.array(cv_result[f"mean_test_{short_name}"]), expected)
    # Rank direction is correct
    assert cv_result[f"rank_test_{short_name}"].idxmin() == cv_result[f"mean_test_{short_name}"].idxmax()  # NB: max
    assert all(cv_result[f"mean_test_{short_name}"] > 0)
    assert len(cv_result["params"][0]) == 2  # two params have multiple options in the grid
    assert len(cv_result[f"split_test_{short_name}"][0]) == cv_max_splits
    # No rounding is applied
    assert cv_result[f"mean_test_{short_name}"][1] == pytest.approx(2.430402, rel=1e-5)
    assert cv_result[f"mean_train_{short_name}"][1] == pytest.approx(1.839883, rel=1e-5)
    assert cv_result[f"std_test_{short_name}"][1] == pytest.approx(0.16548, rel=1e-5)
    assert cv_result[f"split_test_{short_name}"][1][0] == pytest.approx(2.26492, rel=1e-5)
    assert cv_result[f"split_train_{short_name}"][1][0] == pytest.approx(1.84082, rel=1e-5)
    expected = grid_search.cv_results_
    for k, v in cv_result.items():
        if k in expected and k not in ("params", f"rank_test_{short_name}"):
            assert_equal(pd.Series(expected[k], name=k), v)

    # decimals=2, and only_changing_params=False
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=False,
        decimals=2,
        score_func=mean_absolute_error,
        score_func_greater_is_better=False)
    assert cv_result.shape == (4, 20)
    # only_changing_params=False, so all params in hyperparameter_grid are included
    assert len(cv_result["params"][0]) == 4
    # Rounding is applied
cv_result[f"mean_test_{short_name}"][1] == 2.43 assert cv_result[f"mean_train_{short_name}"][1] == 1.84 assert cv_result[f"std_test_{short_name}"][1] == 0.17 assert cv_result[f"split_test_{short_name}"][1][0] == 2.26 assert cv_result[f"split_train_{short_name}"][1][0] == 1.84
def test_forecast_pipeline_rolling_evaluation_prophet():
    """Checks the output of rolling evaluation with the Prophet template"""
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=30,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor3"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]
    hyperparameter_grid = {
        "estimator__weekly_seasonality": [True],
        "estimator__daily_seasonality": [True, False],
        "estimator__n_changepoints": [0],  # to speed up the test case; remove for a better fit
        "estimator__uncertainty_samples": [10],  # to speed up the test case
        "estimator__add_regressor_dict": [{
            "regressor1": {
                "prior_scale": 10,
                "standardize": True,
                "mode": "additive"},
            "regressor2": {
                "prior_scale": 15,
                "standardize": False,
                "mode": "additive"},
            "regressor3": {}
        }]
    }
    pipeline_params = mock_pipeline(
        df=df,
        forecast_horizon=3,
        regressor_cols=reg_cols,
        estimator=ProphetEstimator(),
        hyperparameter_grid=hyperparameter_grid)
    tscv = RollingTimeSeriesSplit(
        forecast_horizon=3,
        expanding_window=True,
        max_splits=1)
    rolling_evaluation = forecast_pipeline_rolling_evaluation(
        pipeline_params=pipeline_params,
        tscv=tscv)
    expected_splits_n = tscv.max_splits
    assert len(rolling_evaluation.keys()) == expected_splits_n
    assert set(rolling_evaluation.keys()) == {"split_0"}
    split0_output = rolling_evaluation["split_0"]
    assert round(split0_output["runtime_sec"], 3) == split0_output["runtime_sec"]
    pipeline_result = split0_output["pipeline_result"]

    # Calculates the expected pipeline
    train, test = list(tscv.split(X=df))[0]
    df_train = df.loc[train]
    pipeline_params_updated = pipeline_params
    pipeline_params_updated["test_horizon"] = 0
    pipeline_params_updated["df"] = df_train
    expected_pipeline_result = forecast_pipeline(**pipeline_params_updated)

    assert pipeline_result.backtest is None
    # Checks that the output is identical when there is only 1 split
    pipeline_grid_search = summarize_grid_search_results(
        pipeline_result.grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name)
    expected_grid_search = summarize_grid_search_results(
        expected_pipeline_result.grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name)
    assert_equal(pipeline_grid_search["mean_test_MAPE"],
                 expected_grid_search["mean_test_MAPE"])
    assert_equal(pipeline_result.grid_search.cv.__dict__,
                 expected_pipeline_result.grid_search.cv.__dict__)
    # Checks that the forecast df has the correct number of rows
    expected_rows = pipeline_result.timeseries.fit_df.shape[0] + tscv.forecast_horizon
    assert pipeline_result.forecast.df.shape[0] == expected_rows