Code Example #1
def test_add_time_features_df():
    """Tests add_time_features_df"""
    # create indexed input
    date_list = pd.date_range(start=datetime.datetime(2019, 1, 1),
                              periods=100,
                              freq="H").tolist()
    df0 = pd.DataFrame({TIME_COL: date_list}, index=date_list)

    df = add_time_features_df(df=df0,
                              time_col=TIME_COL,
                              conti_year_origin=2018)
    assert df["year"][0] == 2019
    assert df.shape[0] == df0.shape[0]

    hourly_data = generate_df_with_reg_for_tests(
        freq="H",
        periods=24 * 500,
        train_start_date=datetime.datetime(2018, 7, 1),
        conti_year_origin=2018)
    cols = [TIME_COL, "regressor1", "regressor_bool", "regressor_categ"]
    train_df = hourly_data["train_df"]
    df = add_time_features_df(df=train_df[cols],
                              time_col=TIME_COL,
                              conti_year_origin=2018)
    assert df["year"][0] == 2018
    assert (df["dow_hr"][:3] == ["7_00", "7_01", "7_02"]).all()
    assert df.shape[0] == train_df.shape[0]
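
The assertions above show that `add_time_features_df` appends derived columns such as "year" and "dow_hr" to the input frame. As a minimal standalone sketch (assuming the same imports and the TIME_COL constant used by these tests):

# Minimal sketch of calling add_time_features_df directly; assumes the
# same imports and TIME_COL constant as the test above.
import datetime
import pandas as pd

dates = pd.date_range(start=datetime.datetime(2019, 1, 1), periods=3, freq="H")
df_in = pd.DataFrame({TIME_COL: dates})
df_out = add_time_features_df(df=df_in, time_col=TIME_COL, conti_year_origin=2018)
# Derived columns such as "year" and "dow_hr" are added alongside the input.
print(df_out[["year", "dow_hr"]])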
Code Example #2
@pytest.fixture
def df():
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=20 * 7,
                                          train_frac=0.9,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor3"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]
    return df
Code Example #3
@pytest.fixture
def df_config():
    data = generate_df_with_reg_for_tests(freq="W-MON",
                                          periods=140,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]

    model_template = "SILVERKITE"
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=EvaluationMetricEnum.MeanAbsoluteError.name,
        agg_periods=7,
        agg_func=np.max,
        null_model_params={
            "strategy": "quantile",
            "constant": None,
            "quantile": 0.5
        })
    evaluation_period = EvaluationPeriodParam(test_horizon=10,
                                              periods_between_train_test=5,
                                              cv_horizon=4,
                                              cv_min_train_periods=80,
                                              cv_expanding_window=False,
                                              cv_periods_between_splits=20,
                                              cv_periods_between_train_test=3,
                                              cv_max_splits=3)
    model_components = ModelComponentsParam(
        regressors={"regressor_cols": reg_cols},
        custom={
            "fit_algorithm_dict": {
                "fit_algorithm": "ridge",
                "fit_algorithm_params": {
                    "cv": 2
                }
            }
        })
    computation = ComputationParam(verbose=2)
    forecast_horizon = 27
    coverage = 0.90
    config = ForecastConfig(model_template=model_template,
                            computation_param=computation,
                            coverage=coverage,
                            evaluation_metric_param=evaluation_metric,
                            evaluation_period_param=evaluation_period,
                            forecast_horizon=forecast_horizon,
                            model_components_param=model_components)
    return {
        "df": df,
        "config": config,
        "model_template": model_template,
        "reg_cols": reg_cols,
    }
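
The returned dictionary is shaped for direct use with `Forecaster`. A hedged sketch of a consuming test, mirroring the `run_forecast_config` calls in Code Examples #8, #9, and #12 (the test name here is hypothetical):

# Sketch of consuming the fixture above; mirrors the Forecaster usage in
# Code Examples #8, #9, and #12. The test name is hypothetical.
def test_forecast_with_df_config(df_config):
    result = Forecaster().run_forecast_config(
        df=df_config["df"],
        config=df_config["config"])
    assert result.forecast is not None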
Code Example #4
def test_generate_df_with_reg_for_tests():
    """Basic test of generate_df_with_reg_for_tests"""
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=20,
                                          train_frac=0.75,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    # test remove_extra_cols
    assert data["df"].shape == (20, 7)
    # test mask_test_actuals
    assert not data["train_df"][TIME_COL].isna().any()
    assert not data["train_df"][VALUE_COL].isna().any()
    assert not data["test_df"][TIME_COL].isna().any()
    assert data["test_df"][VALUE_COL].isna().all()
Code Example #5
@pytest.fixture
def df():
    data = generate_df_with_reg_for_tests(freq="H",
                                          periods=300 * 24,
                                          train_start_date=datetime.datetime(
                                              2018, 7, 1),
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    df = data["df"]
    time_col = NEW_TIME_COL
    value_col = NEW_VALUE_COL
    df.rename({TIME_COL: time_col, VALUE_COL: value_col}, axis=1, inplace=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [time_col, value_col] + regressor_cols
    return df[keep_cols]
Code Example #6
def test_get_basic_pipeline_apply_reg():
    """Tests get_basic_pipeline fit and predict methods on
    a dataset with regressors, and checks if pipeline parameters
    can be set.
    """
    df = generate_df_with_reg_for_tests("D", 50)
    # adds degenerate columns
    df["train_df"]["cst1"] = "constant"
    df["train_df"]["cst2"] = 1.0
    df["test_df"]["cst1"] = "constant"
    df["test_df"]["cst2"] = 1.0
    pipeline = get_basic_pipeline(
        estimator=SilverkiteEstimator(),
        score_func=EvaluationMetricEnum.MeanSquaredError.name,
        score_func_greater_is_better=False,
        agg_periods=None,
        agg_func=None,
        relative_error_tolerance=None,
        coverage=0.95,
        null_model_params=None,
        regressor_cols=[
            "regressor1", "regressor2", "regressor3", "regressor_bool",
            "regressor_categ", "cst1", "cst2"
        ])
    pipeline.fit(df["train_df"])
    assert pipeline.named_steps["degenerate"].drop_cols == []
    pipeline.predict(df["test_df"])

    # drops degenerate columns, normalizes
    pipeline.set_params(
        degenerate__drop_degenerate=True,
        input__regressors_numeric__normalize__normalize_algorithm="PowerTransformer",
    )
    pipeline.fit(df["train_df"])
    # (column order is swapped by column selectors and feature union)
    assert pipeline.named_steps["degenerate"].drop_cols == ["cst2", "cst1"]
    predictions = pipeline.predict(df["test_df"])
    assert predictions.shape[0] == df["test_df"].shape[0]

    with pytest.raises(
            ValueError,
            match="Invalid parameter unknown_param for estimator NormalizeTransformer"):
        pipeline.set_params(
            degenerate__drop_degenerate=True,
            input__regressors_numeric__normalize__unknown_param="PowerTransformer",
        )
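
The double-underscore paths passed to `set_params` above follow scikit-learn's standard convention for addressing parameters of nested pipeline steps. A minimal sketch with plain scikit-learn objects (not Greykite's pipeline):

# Minimal sketch of sklearn's nested-parameter convention, which paths like
# `input__regressors_numeric__normalize__normalize_algorithm` rely on.
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([("scale", StandardScaler()), ("model", Ridge())])
# <step name>__<parameter name> addresses a parameter of that step.
pipe.set_params(scale__with_mean=False, model__alpha=0.5)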
Code Example #7
@pytest.fixture
def data():
    """Generates dataset for test cases
    :return: pd.DataFrame with columns of type:
        datetime, number, number, boolean, object, category
    """
    df = generate_df_with_reg_for_tests(freq="D",
                                        periods=50,
                                        remove_extra_cols=False)["df"]
    df["dow_categorical"] = df["str_dow"].astype("category")
    df = df[[
        TIME_COL, "regressor1", "regressor2", "regressor_bool", "str_dow",
        "dow_categorical"
    ]]
    return df
Code Example #8
def test_run_template_4():
    """Runs custom template with monthly data and auto-regression"""
    data = generate_df_with_reg_for_tests(
        freq="MS",
        periods=48,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]
    forecast_horizon = data["test_df"].shape[0]

    model_components = ModelComponentsParam(
        custom=dict(
            fit_algorithm_dict=dict(fit_algorithm="linear"),
            extra_pred_cols=["ct2"]),
        autoregression=dict(autoreg_dict=dict(lag_dict=dict(orders=[1]))),
        uncertainty=dict(uncertainty_dict=None))
    config = ForecastConfig(
        model_template=ModelTemplateEnum.SK.name,
        forecast_horizon=forecast_horizon,
        coverage=0.9,
        model_components_param=model_components,
    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = Forecaster().run_forecast_config(
            df=df,
            config=config,
        )
        rmse = EvaluationMetricEnum.RootMeanSquaredError.get_metric_name()
        assert result.backtest.test_evaluation[rmse] == pytest.approx(4.95, rel=1e-1)
        check_forecast_pipeline_result(
            result,
            coverage=0.9,
            strategy=None,
            score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
            greater_is_better=False)
Code Example #9
def test_run_template_2():
    """Runs custom template with all options"""
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=400,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]
    forecast_horizon = data["test_df"].shape[0]

    daily_event_df_dict = generate_holiday_events(
        countries=["UnitedStates"],
        holidays_to_model_separately=["New Year's Day"],
        year_start=2017,
        year_end=2022,
        pre_num=2,
        post_num=2)
    event_pred_cols = get_event_pred_cols(daily_event_df_dict)
    model_components = ModelComponentsParam(
        seasonality={
            "fs_components_df": pd.DataFrame({
                "name": ["tow", "tom", "toq", "toy"],
                "period": [7.0, 1.0, 1.0, 1.0],
                "order": [2, 1, 1, 5],
                "seas_names": ["weekly", "monthly", "quarterly", "yearly"]
            })
        },
        events={
            "daily_event_df_dict": daily_event_df_dict
        },
        changepoints={
            "changepoints_dict": {
                "method": "auto",
                "yearly_seasonality_order": 3,
                "regularization_strength": 0.5,
                "resample_freq": "14D",
                "potential_changepoint_distance": "56D",
                "no_changepoint_proportion_from_end": 0.2
            },
            "seasonality_changepoints_dict": {
                "potential_changepoint_distance": "60D",
                "regularization_strength": 0.5,
                "no_changepoint_proportion_from_end": 0.2
            },
        },
        autoregression=None,
        uncertainty={
            "uncertainty_dict": None,
        },
        custom={
            "origin_for_time_vars": None,
            "extra_pred_cols": [["ct1"] + reg_cols + event_pred_cols],  # growth, regressors, events
            "fit_algorithm_dict": {
                "fit_algorithm": "ridge",
                "fit_algorithm_params": {"cv": 2}
            },
            "min_admissible_value": min(df[VALUE_COL]) - abs(max(df[VALUE_COL])),
            "max_admissible_value": max(df[VALUE_COL]) * 2,
        }
    )
    config = ForecastConfig(
        model_template=ModelTemplateEnum.SK.name,
        forecast_horizon=forecast_horizon,
        coverage=0.9,
        model_components_param=model_components,
    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = Forecaster().run_forecast_config(
            df=df,
            config=config,
        )
        rmse = EvaluationMetricEnum.RootMeanSquaredError.get_metric_name()
        q80 = EvaluationMetricEnum.Quantile80.get_metric_name()
        assert result.backtest.test_evaluation[rmse] == pytest.approx(2.692, rel=1e-2)
        assert result.backtest.test_evaluation[q80] == pytest.approx(1.531, rel=1e-2)
        assert result.backtest.test_evaluation[PREDICTION_BAND_COVERAGE] == pytest.approx(0.823, rel=1e-2)
        assert result.forecast.train_evaluation[rmse] == pytest.approx(2.304, rel=1e-2)
        assert result.forecast.train_evaluation[q80] == pytest.approx(0.921, rel=1e-2)
        assert result.forecast.train_evaluation[PREDICTION_BAND_COVERAGE] == pytest.approx(0.897, rel=1e-2)
        check_forecast_pipeline_result(
            result,
            coverage=0.9,
            strategy=None,
            score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
            greater_is_better=False)
Code Example #10
def test_silverkite_template_custom(model_components_param):
    """"Tests simple_silverkite_template with custom parameters,
    and data that has regressors"""
    data = generate_df_with_reg_for_tests(
        freq="H",
        periods=300*24,
        remove_extra_cols=True,
        mask_test_actuals=True)
    df = data["df"]
    time_col = "some_time_col"
    value_col = "some_value_col"
    df.rename({
        TIME_COL: time_col,
        VALUE_COL: value_col
    }, axis=1, inplace=True)

    metric = EvaluationMetricEnum.MeanAbsoluteError
    # anomaly adjustment adds 10.0 to every record
    adjustment_size = 10.0
    anomaly_df = pd.DataFrame({
        START_DATE_COL: [df[time_col].min()],
        END_DATE_COL: [df[time_col].max()],
        ADJUSTMENT_DELTA_COL: [adjustment_size],
        METRIC_COL: [value_col]
    })
    anomaly_info = {
        "value_col": VALUE_COL,
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {METRIC_COL: VALUE_COL},
        "adjustment_method": "add"
    }
    metadata = MetadataParam(
        time_col=time_col,
        value_col=value_col,
        freq="H",
        date_format="%Y-%m-%d-%H",
        train_end_date=datetime.datetime(2019, 7, 1),
        anomaly_info=anomaly_info
    )
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=metric.name,
        cv_report_metrics=[EvaluationMetricEnum.MedianAbsolutePercentError.name],
        agg_periods=24,
        agg_func=np.max,
        null_model_params={
            "strategy": "quantile",
            "constant": None,
            "quantile": 0.8
        },
        relative_error_tolerance=0.01
    )
    evaluation_period = EvaluationPeriodParam(
        test_horizon=1,
        periods_between_train_test=2,
        cv_horizon=3,
        cv_min_train_periods=4,
        cv_expanding_window=True,
        cv_periods_between_splits=5,
        cv_periods_between_train_test=6,
        cv_max_splits=7
    )
    computation = ComputationParam(
        hyperparameter_budget=10,
        n_jobs=None,
        verbose=1
    )
    forecast_horizon = 20
    coverage = 0.7
    template = SilverkiteTemplate()
    params = template.apply_template_for_pipeline_params(
        df=df,
        config=ForecastConfig(
            model_template=ModelTemplateEnum.SK.name,
            metadata_param=metadata,
            forecast_horizon=forecast_horizon,
            coverage=coverage,
            evaluation_metric_param=evaluation_metric,
            evaluation_period_param=evaluation_period,
            model_components_param=model_components_param,
            computation_param=computation
        )
    )
    pipeline = params.pop("pipeline", None)
    expected_params = dict(
        df=df,
        time_col=time_col,
        value_col=value_col,
        date_format=metadata.date_format,
        freq=metadata.freq,
        train_end_date=metadata.train_end_date,
        anomaly_info=metadata.anomaly_info,
        # model
        regressor_cols=template.regressor_cols,
        estimator=None,
        hyperparameter_grid=template.hyperparameter_grid,
        hyperparameter_budget=computation.hyperparameter_budget,
        n_jobs=computation.n_jobs,
        verbose=computation.verbose,
        # forecast
        forecast_horizon=forecast_horizon,
        coverage=coverage,
        test_horizon=evaluation_period.test_horizon,
        periods_between_train_test=evaluation_period.periods_between_train_test,
        agg_periods=evaluation_metric.agg_periods,
        agg_func=evaluation_metric.agg_func,
        relative_error_tolerance=evaluation_metric.relative_error_tolerance,
        # evaluation
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=evaluation_metric.cv_report_metrics,
        null_model_params=evaluation_metric.null_model_params,
        # CV
        cv_horizon=evaluation_period.cv_horizon,
        cv_min_train_periods=evaluation_period.cv_min_train_periods,
        cv_expanding_window=evaluation_period.cv_expanding_window,
        cv_periods_between_splits=evaluation_period.cv_periods_between_splits,
        cv_periods_between_train_test=evaluation_period.cv_periods_between_train_test,
        cv_max_splits=evaluation_period.cv_max_splits
    )
    assert_basic_pipeline_equal(pipeline, template.pipeline)
    assert_equal(params, expected_params)
Code Example #11
def test_gcd_train_end_date_regressor():
    """Tests train_end_date for data with regressors"""
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=30,
                                          train_start_date=datetime.datetime(
                                              2018, 1, 1),
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + regressor_cols
    df = data["df"][keep_cols].copy()
    # Setting NaN values at the end
    df.loc[df.tail(2).index, "regressor1"] = np.nan
    df.loc[df.tail(4).index, "regressor2"] = np.nan
    df.loc[df.tail(6).index, "regressor_categ"] = np.nan
    df.loc[df.tail(8).index, VALUE_COL] = np.nan

    # last date with a value
    result_train_end_date = datetime.datetime(2018, 1, 22)

    # default train_end_date, default regressor_cols
    with pytest.warns(UserWarning) as record:
        canonical_data_dict = get_canonical_data(df=df,
                                                 train_end_date=None,
                                                 regressor_cols=None)
        assert f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." in record[0].message.args[0]
        assert canonical_data_dict["df"].shape == df.shape
        assert canonical_data_dict["fit_df"].shape == (22, 2)
        assert canonical_data_dict["regressor_cols"] == []
        assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL]
        assert canonical_data_dict["train_end_date"] == result_train_end_date
        assert canonical_data_dict["last_date_for_val"] == result_train_end_date
        assert canonical_data_dict["last_date_for_reg"] is None

    # train_end_date later than last date in df, all available regressor_cols
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 2, 10)
        canonical_data_dict = get_canonical_data(df=df,
                                                 train_end_date=train_end_date,
                                                 regressor_cols=regressor_cols)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." in record[0].message.args[0]
        assert canonical_data_dict["fit_df"].shape == (22, 5)
        assert canonical_data_dict["regressor_cols"] == regressor_cols
        assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL
                                                   ] + regressor_cols
        assert canonical_data_dict["train_end_date"] == result_train_end_date
        assert canonical_data_dict["last_date_for_val"] == datetime.datetime(
            2018, 1, 22)
        assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(
            2018, 1, 28)

    # train_end_date in between last date in df and last date before null
    # user passes no regressor_cols
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 25)
        canonical_data_dict = get_canonical_data(df=df,
                                                 train_end_date=train_end_date,
                                                 regressor_cols=None)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." in record[0].message.args[0]
        assert canonical_data_dict["fit_df"].shape == (22, 2)
        assert canonical_data_dict["regressor_cols"] == []
        assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL]
        assert canonical_data_dict["train_end_date"] == datetime.datetime(
            2018, 1, 22)
        assert canonical_data_dict["last_date_for_val"] == datetime.datetime(
            2018, 1, 22)
        assert canonical_data_dict["last_date_for_reg"] is None

    # train end date equal to last date before null
    # user requests a subset of the regressor_cols
    train_end_date = datetime.datetime(2018, 1, 22)
    regressor_cols = ["regressor2"]
    canonical_data_dict = get_canonical_data(df=df,
                                             train_end_date=train_end_date,
                                             regressor_cols=regressor_cols)
    assert canonical_data_dict["fit_df"].shape == (22, 3)
    assert canonical_data_dict["regressor_cols"] == regressor_cols
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL
                                               ] + regressor_cols
    assert canonical_data_dict["train_end_date"] == datetime.datetime(
        2018, 1, 22)
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(
        2018, 1, 22)
    assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(
        2018, 1, 26)

    # train_end_date smaller than last date before null
    # user requests regressor_cols that does not exist in df
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 20)
        regressor_cols = ["regressor1", "regressor4", "regressor5"]
        canonical_data_dict = get_canonical_data(df=df,
                                                 train_end_date=train_end_date,
                                                 regressor_cols=regressor_cols)
        assert canonical_data_dict["fit_df"].shape == (20, 3)
        assert canonical_data_dict["regressor_cols"] == ["regressor1"]
        assert canonical_data_dict["fit_cols"] == [
            TIME_COL, VALUE_COL, "regressor1"
        ]
        assert canonical_data_dict["train_end_date"] == datetime.datetime(
            2018, 1, 20)
        assert canonical_data_dict["last_date_for_val"] == datetime.datetime(
            2018, 1, 22)
        assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(
            2018, 1, 28)
        assert (f"The following columns are not available to use as "
                f"regressors: ['regressor4', 'regressor5']"
                ) in record[0].message.args[0]
Code Example #12
def test_run_template_5():
    """Runs custom template with monthly data, auto-regression and lagged regressors"""
    data = generate_df_with_reg_for_tests(
        freq="MS",
        periods=48,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols_all = ["regressor1", "regressor2", "regressor_categ"]
    reg_cols = ["regressor1"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols_all
    df = data["df"][keep_cols]
    forecast_horizon = data["test_df"].shape[0]

    model_components = ModelComponentsParam(
        custom=dict(
            fit_algorithm_dict=dict(fit_algorithm="linear"),
            extra_pred_cols=reg_cols),
        autoregression=dict(autoreg_dict=dict(lag_dict=dict(orders=[1]))),
        lagged_regressors={
            "lagged_regressor_dict": [
                {"regressor2": "auto"},
                {"regressor_categ": {"lag_dict": {"orders": [5]}}}
            ]},
        uncertainty=dict(uncertainty_dict=None))
    config = ForecastConfig(
        model_template=ModelTemplateEnum.SK.name,
        forecast_horizon=forecast_horizon,
        coverage=0.9,
        model_components_param=model_components,
    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = Forecaster().run_forecast_config(
            df=df,
            config=config,
        )
        rmse = EvaluationMetricEnum.RootMeanSquaredError.get_metric_name()
        assert result.backtest.test_evaluation[rmse] == pytest.approx(4.46, rel=1e-1)
        check_forecast_pipeline_result(
            result,
            coverage=0.9,
            strategy=None,
            score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
            greater_is_better=False)
        # Checks lagged regressor columns
        actual_pred_cols = set(result.model[-1].model_dict["pred_cols"])
        actual_x_mat_cols = set(result.model[-1].model_dict["x_mat"].columns)
        expected_pred_cols = {
            'regressor1',
            'y_lag1',
            'regressor_categ_lag5'
        }
        expected_x_mat_cols = {
            'regressor1',
            'y_lag1',
            'regressor_categ_lag5[T.c2]'
        }
        assert expected_pred_cols.issubset(actual_pred_cols)
        assert expected_x_mat_cols.issubset(actual_x_mat_cols)
Code Example #13
def test_forecast_pipeline_rolling_evaluation_prophet():
    """Checks the output rolling evaluation with Prophet template"""
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=30,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor3"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]

    hyperparameter_grid = {
        "estimator__weekly_seasonality": [True],
        "estimator__daily_seasonality": [True, False],
        "estimator__n_changepoints":
        [0],  # to speed up test case, remove for better fit
        "estimator__uncertainty_samples": [10],  # to speed up test case
        "estimator__add_regressor_dict": [{
            "regressor1": {
                "prior_scale": 10,
                "standardize": True,
                "mode": 'additive'
            },
            "regressor2": {
                "prior_scale": 15,
                "standardize": False,
                "mode": 'additive'
            },
            "regressor3": {}
        }]
    }
    pipeline_params = mock_pipeline(
        df=df,
        forecast_horizon=3,
        regressor_cols=["regressor1", "regressor2", "regressor3"],
        estimator=ProphetEstimator(),
        hyperparameter_grid=hyperparameter_grid)
    tscv = RollingTimeSeriesSplit(forecast_horizon=3,
                                  expanding_window=True,
                                  max_splits=1)
    rolling_evaluation = forecast_pipeline_rolling_evaluation(
        pipeline_params=pipeline_params, tscv=tscv)

    expected_splits_n = tscv.max_splits
    assert len(rolling_evaluation.keys()) == expected_splits_n
    assert set(rolling_evaluation.keys()) == {"split_0"}

    split0_output = rolling_evaluation["split_0"]
    assert round(split0_output["runtime_sec"],
                 3) == split0_output["runtime_sec"]

    pipeline_result = split0_output["pipeline_result"]
    # Calculates expected pipeline
    train, test = list(tscv.split(X=df))[0]
    df_train = df.loc[train]
    pipeline_params_updated = pipeline_params
    pipeline_params_updated["test_horizon"] = 0
    pipeline_params_updated["df"] = df_train
    expected_pipeline_result = forecast_pipeline(**pipeline_params_updated)

    assert pipeline_result.backtest is None
    # Checks output is identical when there is only 1 split
    pipeline_grid_search = summarize_grid_search_results(
        pipeline_result.grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name)
    expected_grid_search = summarize_grid_search_results(
        expected_pipeline_result.grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name)
    assert_equal(pipeline_grid_search["mean_test_MAPE"],
                 expected_grid_search["mean_test_MAPE"])
    assert_equal(pipeline_result.grid_search.cv.__dict__,
                 expected_pipeline_result.grid_search.cv.__dict__)
    # Checks forecast df has the correct number of rows
    expected_rows = pipeline_result.timeseries.fit_df.shape[0] + tscv.forecast_horizon
    assert pipeline_result.forecast.df.shape[0] == expected_rows
Code Example #14
@pytest.fixture
def daily_data_reg():
    return generate_df_with_reg_for_tests(
        freq="D",
        periods=500)
Code Example #15
def test_train_end_date_with_regressors():
    """Tests make_future_dataframe and train_end_date with regressors"""
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=30,
                                          train_start_date=datetime.datetime(
                                              2018, 1, 1),
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + regressor_cols
    df = data["df"][keep_cols].copy()
    # Setting NaN values at the end
    df.loc[df.tail(2).index, "regressor1"] = np.nan
    df.loc[df.tail(4).index, "regressor2"] = np.nan
    df.loc[df.tail(6).index, "regressor_categ"] = np.nan
    df.loc[df.tail(8).index, VALUE_COL] = np.nan

    # default train_end_date, default regressor_cols
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        ts.load_data(df,
                     TIME_COL,
                     VALUE_COL,
                     train_end_date=None,
                     regressor_cols=None)
        assert f"{ts.original_value_col} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]
        assert ts.train_end_date == dt(2018, 1, 22)
        assert ts.fit_df.shape == (22, 2)
        assert ts.last_date_for_val == df[df[VALUE_COL].notnull()][TIME_COL].max()
        assert ts.last_date_for_reg is None
        result = ts.make_future_dataframe(periods=10, include_history=True)
        expected = pd.DataFrame({
            TIME_COL: pd.date_range(start=dt(2018, 1, 1), periods=32, freq="D"),
            VALUE_COL: np.concatenate([ts.fit_y, np.repeat(np.nan, 10)])
        })
        expected.index = expected[TIME_COL]
        expected.index.name = None
        assert_frame_equal(result, expected)

    # train_end_date later than last date in df, all available regressor_cols
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 2, 10)
        ts.load_data(df,
                     TIME_COL,
                     VALUE_COL,
                     train_end_date=train_end_date,
                     regressor_cols=regressor_cols)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]
        assert ts.last_date_for_val == dt(2018, 1, 22)
        assert ts.last_date_for_reg == dt(2018, 1, 28)
        result = ts.make_future_dataframe(periods=10, include_history=False)
        expected = df.copy()[22:28]
        expected.loc[expected.tail(6).index, VALUE_COL] = np.nan
        expected.index = expected[TIME_COL]
        expected.index.name = None
        assert_frame_equal(result, expected)

    # train_end_date in between last date in df and last date before null
    # user passes no regressor_cols
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 1, 25)
        regressor_cols = []
        ts.load_data(df,
                     TIME_COL,
                     VALUE_COL,
                     train_end_date=train_end_date,
                     regressor_cols=regressor_cols)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]
        assert ts.train_end_date == dt(2018, 1, 22)
        assert ts.last_date_for_reg is None
        result = ts.make_future_dataframe(periods=10, include_history=True)
        expected = pd.DataFrame({
            TIME_COL: pd.date_range(start=dt(2018, 1, 1), periods=32, freq="D"),
            VALUE_COL: np.concatenate([ts.fit_y, np.repeat(np.nan, 10)])
        })
        expected.index = expected[TIME_COL]
        expected.index.name = None
        assert_frame_equal(result, expected)

    # train end date equal to last date before null
    # user requests a subset of the regressor_cols
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 1, 22)
        regressor_cols = ["regressor2"]
        ts.load_data(df,
                     TIME_COL,
                     VALUE_COL,
                     train_end_date=train_end_date,
                     regressor_cols=regressor_cols)
        assert ts.train_end_date == dt(2018, 1, 22)
        assert ts.last_date_for_reg == dt(2018, 1, 26)
        result = ts.make_future_dataframe(periods=10, include_history=True)
        assert "Provided periods '10' is more than allowed ('4') due to the length of " \
               "regressor columns. Using '4'." in record[0].message.args[0]
        expected = ts.df.copy()[[TIME_COL, VALUE_COL, "regressor2"]]
        expected = expected[expected.index <= ts.last_date_for_reg]
        assert_frame_equal(result, expected)

    # train_end_date smaller than last date before null
    # user requests regressor_cols that does not exist in df
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 1, 20)
        regressor_cols = ["regressor1", "regressor4", "regressor5"]
        ts.load_data(df,
                     TIME_COL,
                     VALUE_COL,
                     train_end_date=train_end_date,
                     regressor_cols=regressor_cols)
        assert ts.train_end_date == dt(2018, 1, 20)
        assert ts.last_date_for_reg == dt(2018, 1, 28)
        assert (f"The following columns are not available to use as "
                f"regressors: ['regressor4', 'regressor5']"
                ) in record[0].message.args[0]
        result = ts.make_future_dataframe(periods=10, include_history=True)
        expected = ts.df.copy()[[TIME_COL, VALUE_COL, "regressor1"]]
        expected = expected[expected.index <= ts.last_date_for_reg]
        assert_frame_equal(result, expected)
Code Example #16
def test_prophet_template_custom():
    """Tests prophet_template with custom values, with long range input"""
    # prepares input data
    data = generate_df_with_reg_for_tests(freq="H",
                                          periods=300 * 24,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    df = data["df"]
    time_col = "some_time_col"
    value_col = "some_value_col"
    df.rename({
        cst.TIME_COL: time_col,
        cst.VALUE_COL: value_col
    }, axis=1, inplace=True)
    # prepares params and calls template
    metric = EvaluationMetricEnum.MeanAbsoluteError
    # anomaly adjustment adds 10.0 to every record
    adjustment_size = 10.0
    anomaly_df = pd.DataFrame({
        cst.START_DATE_COL: [df[time_col].min()],
        cst.END_DATE_COL: [df[time_col].max()],
        cst.ADJUSTMENT_DELTA_COL: [adjustment_size],
        cst.METRIC_COL: [value_col]
    })
    anomaly_info = {
        "value_col": cst.VALUE_COL,
        "anomaly_df": anomaly_df,
        "start_date_col": cst.START_DATE_COL,
        "end_date_col": cst.END_DATE_COL,
        "adjustment_delta_col": cst.ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {
            cst.METRIC_COL: cst.VALUE_COL
        },
        "adjustment_method": "add"
    }
    metadata = MetadataParam(
        time_col=time_col,
        value_col=value_col,
        freq="H",
        date_format="%Y-%m-%d-%H",
        train_end_date=datetime.datetime(2019, 7, 1),
        anomaly_info=anomaly_info,
    )
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=metric.name,
        cv_report_metrics=[
            EvaluationMetricEnum.MedianAbsolutePercentError.name
        ],
        agg_periods=24,
        agg_func=np.max,
        null_model_params={
            "strategy": "quantile",
            "constant": None,
            "quantile": 0.8
        },
        relative_error_tolerance=0.01)
    evaluation_period = EvaluationPeriodParam(test_horizon=1,
                                              periods_between_train_test=2,
                                              cv_horizon=3,
                                              cv_min_train_periods=4,
                                              cv_expanding_window=True,
                                              cv_periods_between_splits=5,
                                              cv_periods_between_train_test=6,
                                              cv_max_splits=7)
    model_components = ModelComponentsParam(
        seasonality={
            "yearly_seasonality": [True],
            "weekly_seasonality": [False],
            "daily_seasonality": [4],
            "add_seasonality_dict": [{
                "yearly": {
                    "period": 365.25,
                    "fourier_order": 20,
                    "prior_scale": 20.0
                },
                "quarterly": {
                    "period": 365.25 / 4,
                    "fourier_order": 15
                },
                "weekly": {
                    "period": 7,
                    "fourier_order": 35,
                    "prior_scale": 30.0
                }
            }]
        },
        growth={"growth_term": "linear"},
        events={
            "holiday_lookup_countries":
            ["UnitedStates", "UnitedKingdom", "India"],
            "holiday_pre_num_days": [2],
            "holiday_post_num_days": [3],
            "holidays_prior_scale": [5.0]
        },
        regressors={
            "add_regressor_dict": [{
                "regressor1": {
                    "prior_scale": 10.0,
                    "mode": 'additive'
                },
                "regressor2": {
                    "prior_scale": 20.0,
                    "mode": 'multiplicative'
                },
            }]
        },
        changepoints={
            "changepoint_prior_scale": [0.05],
            "changepoints": [None],
            "n_changepoints": [50],
            "changepoint_range": [0.9]
        },
        uncertainty={
            "mcmc_samples": [500],
            "uncertainty_samples": [2000]
        },
        hyperparameter_override={
            "input__response__null__impute_algorithm": "ts_interpolate",
            "input__response__null__impute_params": {"orders": [7, 14]},
            "input__regressors_numeric__normalize__normalize_algorithm": "RobustScaler",
        })
    computation = ComputationParam(hyperparameter_budget=10,
                                   n_jobs=None,
                                   verbose=1)
    forecast_horizon = 20
    coverage = 0.7
    config = ForecastConfig(model_template=ModelTemplateEnum.PROPHET.name,
                            metadata_param=metadata,
                            forecast_horizon=forecast_horizon,
                            coverage=coverage,
                            evaluation_metric_param=evaluation_metric,
                            evaluation_period_param=evaluation_period,
                            model_components_param=model_components,
                            computation_param=computation)
    template = ProphetTemplate()
    params = template.apply_template_for_pipeline_params(df=df, config=config)
    pipeline = params.pop("pipeline", None)
    # Adding start_year and end_year based on the input df
    model_components.events["start_year"] = df[time_col].min().year
    model_components.events["end_year"] = df[time_col].max().year
    expected_params = dict(
        df=df,
        time_col=time_col,
        value_col=value_col,
        date_format=metadata.date_format,
        freq=metadata.freq,
        train_end_date=metadata.train_end_date,
        anomaly_info=metadata.anomaly_info,
        # model
        regressor_cols=template.regressor_cols,
        estimator=None,
        hyperparameter_grid=template.hyperparameter_grid,
        hyperparameter_budget=computation.hyperparameter_budget,
        n_jobs=computation.n_jobs,
        verbose=computation.verbose,
        # forecast
        forecast_horizon=forecast_horizon,
        coverage=coverage,
        test_horizon=evaluation_period.test_horizon,
        periods_between_train_test=evaluation_period.periods_between_train_test,
        agg_periods=evaluation_metric.agg_periods,
        agg_func=evaluation_metric.agg_func,
        # evaluation
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=evaluation_metric.cv_report_metrics,
        null_model_params=evaluation_metric.null_model_params,
        relative_error_tolerance=evaluation_metric.relative_error_tolerance,
        # CV
        cv_horizon=evaluation_period.cv_horizon,
        cv_min_train_periods=evaluation_period.cv_min_train_periods,
        cv_expanding_window=evaluation_period.cv_expanding_window,
        cv_periods_between_splits=evaluation_period.cv_periods_between_splits,
        cv_periods_between_train_test=evaluation_period.cv_periods_between_train_test,
        cv_max_splits=evaluation_period.cv_max_splits)
    assert_basic_pipeline_equal(pipeline, template.pipeline)
    assert_equal(params, expected_params)
Code Example #17
def test_forecast_pipeline_rolling_evaluation_silverkite():
    """Checks the output rolling evaluation with Silverkite template"""
    data = generate_df_with_reg_for_tests(
        freq="1D",
        periods=20 * 7,  # short-term: 20 weeks of data
        remove_extra_cols=True,
        mask_test_actuals=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + regressor_cols
    df = data["df"][keep_cols]

    coverage = 0.1
    hyperparameter_grid = {
        "estimator__origin_for_time_vars": [None],  # inferred from training data
        "estimator__fs_components_df": [
            pd.DataFrame({
                "name": ["tow"],
                "period": [7.0],
                "order": [3],
                "seas_names": ["weekly"]
            })
        ],
        # two cases: no growth term and single growth term
        "estimator__extra_pred_cols": [regressor_cols, regressor_cols + ["ct_sqrt"]],
        "estimator__fit_algorithm_dict": [{"fit_algorithm": "linear"}]
    }
    pipeline_params = mock_pipeline(
        df=df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        date_format=None,  # not recommended, but possible to specify
        freq=None,
        regressor_cols=regressor_cols,
        estimator=SilverkiteEstimator(),
        hyperparameter_grid=hyperparameter_grid,
        hyperparameter_budget=1,
        n_jobs=1,
        forecast_horizon=2 * 7,
        coverage=coverage,
        test_horizon=2 * 7,
        periods_between_train_test=2 * 7,
        agg_periods=7,
        agg_func=np.mean,
        score_func=mean_absolute_error,  # callable score_func
        null_model_params=None,
        cv_horizon=1 * 7,
        cv_expanding_window=True,
        cv_min_train_periods=8 * 7,
        cv_periods_between_splits=7,
        cv_periods_between_train_test=3 * 7,
        cv_max_splits=2)
    tscv = RollingTimeSeriesSplit(forecast_horizon=2 * 7,
                                  min_train_periods=10 * 7,
                                  expanding_window=True,
                                  use_most_recent_splits=True,
                                  periods_between_splits=2 * 7,
                                  periods_between_train_test=2 * 7,
                                  max_splits=3)
    rolling_evaluation = forecast_pipeline_rolling_evaluation(
        pipeline_params=pipeline_params, tscv=tscv)

    expected_splits_n = tscv.max_splits
    assert len(rolling_evaluation.keys()) == expected_splits_n
    assert set(rolling_evaluation.keys()) == {"split_0", "split_1", "split_2"}

    time_col = pipeline_params["time_col"]
    for split_num, (train, test) in enumerate(tscv.split(X=df)):
        split_output = rolling_evaluation[f"split_{split_num}"]
        assert round(split_output["runtime_sec"],
                     3) == split_output["runtime_sec"]

        pipeline_result = split_output["pipeline_result"]

        # Checks every split uses all the available data for training
        ts = pipeline_result.timeseries
        train_end_date = df.iloc[train[-1]][time_col]
        assert ts.train_end_date == train_end_date

        assert pipeline_result.backtest is None

        # Checks every split has forecast for train+test periods passed by tscv
        forecast = pipeline_result.forecast
        assert forecast.df.shape[0] == (
            ts.fit_df.shape[0] + tscv.periods_between_train_test + tscv.forecast_horizon)
Code Example #18
def test_silverkite_with_components_daily_data():
    """Tests get_components, plot_components, plot_trend,
    plot_seasonalities with daily data and missing input values.
    """
    daily_data = generate_df_with_reg_for_tests(
        freq="D",
        periods=20,
        train_start_date=datetime.datetime(2018, 1, 1),
        conti_year_origin=2018)
    train_df = daily_data["train_df"].copy()
    train_df.loc[[2, 4, 7], cst.VALUE_COL] = np.nan  # creates missing values

    params_daily = params_components()  # SilverkiteEstimator parameters
    # converts into parameters for `forecast_silverkite`
    coverage = params_daily.pop("coverage")
    # removes daily seasonality terms
    params_daily["fs_components_df"] = pd.DataFrame({
        "name": ["tow", "ct1"],
        "period": [7.0, 1.0],
        "order": [4, 5],
        "seas_names": ["weekly", "yearly"]
    })

    model = BaseSilverkiteEstimator(
        coverage=coverage, uncertainty_dict=params_daily["uncertainty_dict"])

    with pytest.raises(NotFittedError,
                       match="Call `fit` before calling `plot_components`."):
        model.plot_components()

    with pytest.warns(Warning):
        # suppress warnings from conf_interval.py and sklearn
        # a subclass's fit() method will have these steps
        model.fit(X=train_df, time_col=cst.TIME_COL, value_col=cst.VALUE_COL)
        silverkite = SilverkiteForecast()
        model.model_dict = silverkite.forecast(df=train_df,
                                               time_col=cst.TIME_COL,
                                               value_col=cst.VALUE_COL,
                                               **params_daily)
        model.finish_fit()

    # Tests plot_components
    with pytest.warns(Warning) as record:
        title = "Custom component plot"
        model._set_silverkite_diagnostics_params()
        fig = model.plot_components(
            names=["trend", "YEARLY_SEASONALITY", "DUMMY"], title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows + 1  # includes changepoints
        assert [fig.data[i].name for i in range(expected_rows)] == \
               [cst.VALUE_COL, "trend", "YEARLY_SEASONALITY"]

        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis3.title["text"] == "Time of year"

        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "trend"
        assert fig.layout.yaxis3.title["text"] == "yearly"

        assert fig.layout.title["text"] == title
        assert f"The following components have not been specified in the model: " \
               f"{{'DUMMY'}}, plotting the rest." in record[0].message.args[0]

    # Missing component error
    with pytest.raises(
            ValueError,
            match="None of the provided components have been specified in the model."):
        model.plot_components(names=["DUMMY"])

    # Tests plot_trend
    title = "Custom trend plot"
    fig = model.plot_trend(title=title)
    expected_rows = 2
    assert len(fig.data) == expected_rows + 1  # includes changepoints
    assert [fig.data[i].name for i in range(expected_rows)] == [cst.VALUE_COL, "trend"]

    assert fig.layout.xaxis.title["text"] == cst.TIME_COL
    assert fig.layout.xaxis2.title["text"] == cst.TIME_COL

    assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
    assert fig.layout.yaxis2.title["text"] == "trend"

    assert fig.layout.title["text"] == title

    # Tests plot_seasonalities
    with pytest.warns(Warning):
        # suppresses the warning on seasonalities removed
        title = "Custom seasonality plot"
        fig = model.plot_seasonalities(title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
               [cst.VALUE_COL, "WEEKLY_SEASONALITY", "YEARLY_SEASONALITY"]

        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == "Day of week"
        assert fig.layout.xaxis3.title["text"] == "Time of year"

        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "weekly"
        assert fig.layout.yaxis3.title["text"] == "yearly"

        assert fig.layout.title["text"] == title

    # Component plot error if `fit_algorithm` is "rf" or "gradient_boosting"
    params_daily["fit_algorithm"] = "rf"
    model = BaseSilverkiteEstimator(
        coverage=coverage, uncertainty_dict=params_daily["uncertainty_dict"])
    with pytest.warns(Warning):
        # suppress warnings from conf_interval.py and sklearn
        # a subclass's fit() method will have these steps
        model.fit(X=train_df, time_col=cst.TIME_COL, value_col=cst.VALUE_COL)
        model.model_dict = silverkite.forecast(df=train_df,
                                               time_col=cst.TIME_COL,
                                               value_col=cst.VALUE_COL,
                                               **params_daily)
        model.finish_fit()
    assert model.coef_ is None
    with pytest.raises(
            NotImplementedError,
            match="Component plot has only been implemented for additive linear models."):
        model.plot_components()

    with pytest.raises(
            NotImplementedError,
            match="Component plot has only been implemented for additive linear models."):
        model.plot_trend()

    with pytest.raises(
            NotImplementedError,
            match="Component plot has only been implemented for additive linear models."):
        model.plot_seasonalities()
Code Example #19
def test_run_auto_arima_template_custom():
    """Tests running auto arima template through the pipeline"""
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=50,
                                          train_frac=0.8,
                                          conti_year_origin=2018,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    # select relevant columns for testing
    relevant_cols = [
        cst.TIME_COL, cst.VALUE_COL, "regressor1", "regressor2", "regressor3"
    ]
    df = data["df"][relevant_cols]
    forecast_horizon = data["fut_time_num"]

    # Model components - custom holidays; other params as defaults
    model_components = ModelComponentsParam(
        # Everything except `custom` and `hyperparameter_override` are ignored
        seasonality={
            "seasonality_mode": ["additive"],
            "yearly_seasonality": ["auto"],
            "weekly_seasonality": [True],
            "daily_seasonality": ["auto"],
        },
        growth={"growth_term": ["linear"]},
        events={
            "holiday_pre_num_days": [1],
            "holiday_post_num_days": [1],
            "holidays_prior_scale": [1.0]
        },
        changepoints={
            "changepoint_prior_scale": [0.05],
            "n_changepoints": [1],
            "changepoint_range": [0.5],
        },
        regressors={
            "add_regressor_dict": [{
                "regressor1": {
                    "prior_scale": 10,
                    "standardize": True,
                    "mode": "additive"
                },
                "regressor2": {
                    "prior_scale": 15,
                    "standardize": False,
                    "mode": "additive"
                },
                "regressor3": {}
            }]
        },
        uncertainty={"uncertainty_samples": [10]},
        custom={
            "max_order": [10],
            "information_criterion": ["bic"]
        })

    metadata = MetadataParam(
        time_col=cst.TIME_COL,
        value_col=cst.VALUE_COL,
        freq="D",
    )
    evaluation_period = EvaluationPeriodParam(
        test_horizon=5,  # speeds up test case
        periods_between_train_test=5,
        cv_horizon=0,  # speeds up test case
    )
    config = ForecastConfig(
        model_template=ModelTemplateEnum.AUTO_ARIMA.name,
        metadata_param=metadata,
        forecast_horizon=forecast_horizon,
        coverage=0.95,
        model_components_param=model_components,
        evaluation_period_param=evaluation_period,
    )
    result = Forecaster().run_forecast_config(
        df=df,
        config=config,
    )

    forecast_df = result.forecast.df_test.reset_index(drop=True)
    expected_cols = [
        "ts", "actual", "forecast", "forecast_lower", "forecast_upper"
    ]
    assert list(forecast_df.columns) == expected_cols
    assert result.backtest.coverage == 0.95, "coverage is not correct"
    # NB: coverage is poor because of very small dataset size and low uncertainty_samples
    assert result.backtest.train_evaluation[
        cst.PREDICTION_BAND_COVERAGE] is not None
    assert result.backtest.test_evaluation[
        cst.PREDICTION_BAND_COVERAGE] is not None
    assert result.backtest.train_evaluation["MSE"] is not None
    assert result.backtest.test_evaluation["MSE"] is not None
    assert result.forecast.train_evaluation[
        cst.PREDICTION_BAND_COVERAGE] is not None
    assert result.forecast.train_evaluation["MSE"] is not None
Code example #20
def test_plot_components():
    """Tests plot_components.
    Because component plots are implemented in `base_silverkite_estimator.py`, the bulk of
    the testing is done there. This file only tests inheritance and compatibility of the
    trained_model generated by this estimator's fit.
    """
    daily_data = generate_df_with_reg_for_tests(
        freq="D",
        periods=20,
        train_start_date=datetime.datetime(2018, 1, 1),
        conti_year_origin=2018)
    train_df = daily_data.get("train_df").copy()
    params_daily = params_components()
    fit_algorithm = params_daily.pop("fit_algorithm", "linear")
    fit_algorithm_params = params_daily.pop("fit_algorithm_params", None)
    params_daily["fit_algorithm_dict"] = {
        "fit_algorithm": fit_algorithm,
        "fit_algorithm_params": fit_algorithm_params,
    }
    # removes daily seasonality terms, keeping only weekly and yearly components
    params_daily["fs_components_df"] = pd.DataFrame({
        "name": ["tow", "ct1"],
        "period": [7.0, 1.0],
        "order": [4, 5],
        "seas_names": ["weekly", "yearly"]})
    model = SilverkiteEstimator(**params_daily)
    with pytest.warns(Warning):
        # catches the sklearn warning on the `iid` parameter from the ridge hyperparameter grid search
        model.fit(train_df)

    # Test plot_components
    with pytest.warns(Warning) as record:
        title = "Custom component plot"
        fig = model.plot_components(names=["trend", "YEARLY_SEASONALITY", "DUMMY"], title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows + 1  # includes changepoints
        assert [fig.data[i].name for i in range(expected_rows)] == \
               [cst.VALUE_COL, "trend", "YEARLY_SEASONALITY"]

        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis3.title["text"] == "Time of year"

        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "trend"
        assert fig.layout.yaxis3.title["text"] == "yearly"

        assert fig.layout.title["text"] == title
        assert f"The following components have not been specified in the model: " \
               f"{{'DUMMY'}}, plotting the rest." in record[0].message.args[0]

    # Test plot_trend
    title = "Custom trend plot"
    fig = model.plot_trend(title=title)
    expected_rows = 2
    assert len(fig.data) == expected_rows + 1  # includes changepoints
    assert [fig.data[i].name for i in range(expected_rows)] == [cst.VALUE_COL, "trend"]

    assert fig.layout.xaxis.title["text"] == cst.TIME_COL
    assert fig.layout.xaxis2.title["text"] == cst.TIME_COL

    assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
    assert fig.layout.yaxis2.title["text"] == "trend"

    assert fig.layout.title["text"] == title

    # Test plot_seasonalities
    with pytest.warns(Warning):
        # catches the warning about seasonality components removed from the model
        title = "Custom seasonality plot"
        fig = model.plot_seasonalities(title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
               [cst.VALUE_COL, "WEEKLY_SEASONALITY", "YEARLY_SEASONALITY"]

        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == "Day of week"
        assert fig.layout.xaxis3.title["text"] == "Time of year"

        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "weekly"
        assert fig.layout.yaxis3.title["text"] == "yearly"

        assert fig.layout.title["text"] == title
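
The figures returned by `plot_components`, `plot_trend`, and `plot_seasonalities` appear to be plotly figures (the tests inspect `fig.data` and `fig.layout`), so outside of a test they can be rendered directly; a minimal sketch, assuming the fitted `model` from above:

fig = model.plot_components()  # observed values plus one subplot per fitted component
fig.show()                     # renders the interactive plotly figure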
Code example #21
def test_get_quantiles_and_overlays():
    """Tests get_quantiles_and_overlays"""
    dl = DataLoaderTS()
    peyton_manning_ts = dl.load_peyton_manning_ts()

    # no columns are requested
    with pytest.raises(
            ValueError,
            match=
            "Must enable at least one of: show_mean, show_quantiles, show_overlays."
    ):
        peyton_manning_ts.get_quantiles_and_overlays(
            groupby_time_feature="doy")

    # show_mean only
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dow",
        show_mean=True,
        mean_col_name="custom_name")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays([[MEAN_COL_GROUP], ["custom_name"]],
                                  names=["category", "name"]))
    assert grouped_df.index.name == "dow"
    assert grouped_df.shape == (7, 1)
    assert grouped_df.index[0] == 1

    # show_quantiles only (bool)
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180, show_quantiles=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[QUANTILE_COL_GROUP, QUANTILE_COL_GROUP], ["Q0.1", "Q0.9"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (17, 2)
    assert grouped_df.index[0] == pd.Timestamp(2007, 12, 10)

    # show_quantiles only (list)
    custom_col = pd.Series(
        np.random.choice(list("abcd"), size=peyton_manning_ts.df.shape[0]))
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_custom_column=custom_col,
        show_quantiles=[0, 0.25, 0.5, 0.75, 1],
        quantile_col_prefix="prefix")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[QUANTILE_COL_GROUP] * 5,
             ["prefix0", "prefix0.25", "prefix0.5", "prefix0.75", "prefix1"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "groups"
    assert grouped_df.shape == (4, 5)
    assert grouped_df.index[0] == "a"
    # checks quantile computation
    df = peyton_manning_ts.df.copy()
    df["custom_col"] = custom_col.values
    quantile_df = df.groupby("custom_col")[VALUE_COL].agg(
        [np.nanmin, np.nanmedian, np.nanmax])
    assert_equal(grouped_df["quantile"]["prefix0"],
                 quantile_df["nanmin"],
                 check_names=False)
    assert_equal(grouped_df["quantile"]["prefix0.5"],
                 quantile_df["nanmedian"],
                 check_names=False)
    assert_equal(grouped_df["quantile"]["prefix1"],
                 quantile_df["nanmax"],
                 check_names=False)

    # show_overlays only (bool), no overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="doy", show_overlays=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 9, [f"overlay{i}" for i in range(9)]],
            names=["category", "name"]))
    assert grouped_df.index.name == "doy"
    assert grouped_df.shape == (366, 9)
    assert grouped_df.index[0] == 1

    # show_overlays only (int below the available number), time feature overlay label
    np.random.seed(123)
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="doy",
        show_overlays=4,
        overlay_label_time_feature="year")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 4, ["2007", "2011", "2012", "2014"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "doy"
    assert grouped_df.shape == (366, 4)
    assert grouped_df.index[0] == 1

    # show_overlays only (int above the available number), custom overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=200,
        overlay_label_custom_column=custom_col)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 4, ["a", "b", "c", "d"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 4)
    assert grouped_df.index[0] == 1

    # show_overlays only (list of indices), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=[0, 4],
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 2,
             ["2007-12-10 00:00:00", "2015-12-08 00:00:00"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 2)
    assert grouped_df.index[0] == 1

    # show_overlays only (np.ndarray), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=np.arange(0, 6, 2),
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 3,
             [
                 "2007-12-10 00:00:00", "2011-12-09 00:00:00",
                 "2015-12-08 00:00:00"
             ]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 3)
    assert grouped_df.index[0] == 1

    # show_overlays only (list of column names), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=["2007-12-10 00:00:00", "2015-12-08 00:00:00"],
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 2,
             ["2007-12-10 00:00:00", "2015-12-08 00:00:00"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 2)
    assert grouped_df.index[0] == 1

    # Show all 3 (no overlay label)
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=50,  # 50 per group (50 overlays)
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],  # 3 quantiles
        show_overlays=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[MEAN_COL_GROUP] + [QUANTILE_COL_GROUP] * 3 +
             [OVERLAY_COL_GROUP] * 50, ["mean", "Q0.05", "Q0.5", "Q0.95"] +
             [f"overlay{i}" for i in range(50)]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (60, 54)
    assert grouped_df.index[-1] == pd.Timestamp(2016, 1, 7)

    # Show all 3 (with overlay label).
    # Pass overlay_pivot_table_kwargs.
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],  # 3 quantiles
        show_overlays=True,
        overlay_label_time_feature="dow",  # 7 possible values
        aggfunc="median")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[MEAN_COL_GROUP] + [QUANTILE_COL_GROUP] * 3 +
             [OVERLAY_COL_GROUP] * 7,
             [
                 "mean", "Q0.05", "Q0.5", "Q0.95", "1", "2", "3", "4", "5",
                 "6", "7"
             ]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (17, 11)
    assert grouped_df.index[-1] == pd.Timestamp(2015, 10, 29)
    assert np.linalg.norm(
        grouped_df[OVERLAY_COL_GROUP].mean()) > 1.0  # not centered

    with pytest.raises(
            TypeError,
            match="pivot_table\\(\\) got an unexpected keyword argument 'aggfc'"
    ):
        peyton_manning_ts.get_quantiles_and_overlays(
            groupby_sliding_window_size=180,
            show_mean=True,
            show_quantiles=[0.05, 0.5, 0.95],
            show_overlays=True,
            overlay_label_time_feature="dow",
            aggfc=np.nanmedian)  # unrecognized parameter

    # center_values with show_mean=True
    centered_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],
        show_overlays=True,
        overlay_label_time_feature="dow",
        aggfunc="median",
        center_values=True)
    assert np.linalg.norm(centered_df[[MEAN_COL_GROUP, OVERLAY_COL_GROUP
                                       ]].mean()) < 1e-8  # centered at 0
    assert_equal(
        centered_df[QUANTILE_COL_GROUP],
        grouped_df[QUANTILE_COL_GROUP] - grouped_df[MEAN_COL_GROUP].mean()[0])

    # center_values with show_mean=False
    centered_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=False,
        show_quantiles=[0.05, 0.5, 0.95],
        show_overlays=True,
        overlay_label_time_feature="dow",
        aggfunc="median",
        center_values=True)
    assert np.linalg.norm(centered_df[[OVERLAY_COL_GROUP
                                       ]].mean()) < 1e-8  # centered at 0
    overall_mean = peyton_manning_ts.df[VALUE_COL].mean()
    assert_equal(centered_df[QUANTILE_COL_GROUP],
                 grouped_df[QUANTILE_COL_GROUP] - overall_mean)

    # new value_col
    df = generate_df_with_reg_for_tests(freq="D", periods=700)["df"]
    ts = UnivariateTimeSeries()
    ts.load_data(df=df)
    grouped_df = ts.get_quantiles_and_overlays(
        groupby_time_feature="dow",
        show_mean=True,
        show_quantiles=True,
        show_overlays=True,
        overlay_label_time_feature="woy",
        value_col="regressor1")

    df_dow = add_groupby_column(df=ts.df,
                                time_col=TIME_COL,
                                groupby_time_feature="dow")
    dow_mean = df_dow["df"].groupby("dow").agg(
        mean=pd.NamedAgg(column="regressor1", aggfunc=np.nanmean))
    assert_equal(grouped_df["mean"], dow_mean, check_names=False)
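
In its simplest form, `get_quantiles_and_overlays` needs a grouping dimension and at least one of the three display flags; a minimal sketch, assuming the loaded `UnivariateTimeSeries` object `ts` from above:

grouped_df = ts.get_quantiles_and_overlays(
    groupby_time_feature="dow",     # group by day of week
    show_mean=True,                 # one mean column per group
    show_quantiles=[0.1, 0.9])      # lower/upper quantile columns
# columns form a two-level MultiIndex: (category, name)
print(grouped_df.head())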
Code example #22
def test_get_forecast_time_properties():
    """Tests get_forecast_time_properties"""
    num_training_points = 365  # one year of daily data
    data = generate_df_for_tests(freq="D", periods=num_training_points)
    df = data["df"]
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="D",
                                          forecast_horizon=0)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_DAY_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.DAY,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points,
        "days_per_observation": 1,
        "forecast_horizon": 0,
        "forecast_horizon_in_timedelta": timedelta(days=0),
        "forecast_horizon_in_days": 0,
        "start_year": 2018,
        "end_year": 2019,
        "origin_for_time_vars": default_origin
    }

    # longer forecast_horizon
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="D",
                                          forecast_horizon=365)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_DAY_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.DAY,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points,
        "days_per_observation": 1,
        "forecast_horizon": 365,
        "forecast_horizon_in_timedelta": timedelta(days=365),
        "forecast_horizon_in_days": 365,
        "start_year": 2018,
        "end_year": 2020,
        "origin_for_time_vars": default_origin
    }

    # two years of hourly data
    num_training_points = 2 * 365 * 24
    data = generate_df_for_tests(freq="H", periods=num_training_points)
    df = data["df"]
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="H",
                                          forecast_horizon=0)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_HOUR_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.HOUR,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points / 24,
        "days_per_observation": 1 / 24,
        "forecast_horizon": 0,
        "forecast_horizon_in_timedelta": timedelta(days=0),
        "forecast_horizon_in_days": 0,
        "start_year": 2018,
        "end_year": 2020,
        "origin_for_time_vars": default_origin
    }

    # longer forecast_horizon
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="H",
                                          forecast_horizon=365 * 24)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_HOUR_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.HOUR,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points / 24,
        "days_per_observation": 1 / 24,
        "forecast_horizon": 365 * 24,
        "forecast_horizon_in_timedelta": timedelta(days=365),
        "forecast_horizon_in_days": 365,
        "start_year": 2018,
        "end_year": 2021,
        "origin_for_time_vars": default_origin
    }
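    # two years of hourly history starting in 2018 plus the one-year horizon pushes `end_year` to 2021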

    # ``forecast_horizon=None`` falls back to the default horizon for hourly data (one day)
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="H",
                                          forecast_horizon=None)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_HOUR_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.HOUR,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points / 24,
        "days_per_observation": 1 / 24,
        "forecast_horizon": 24,
        "forecast_horizon_in_timedelta": timedelta(days=1),
        "forecast_horizon_in_days": 1,
        "start_year": 2018,
        "end_year": 2020,
        "origin_for_time_vars": default_origin
    }

    # weekly df with regressors
    num_training_points = 50
    data = generate_df_with_reg_for_tests(freq="W-SUN",
                                          periods=num_training_points,
                                          train_start_date=datetime.datetime(
                                              2018, 11, 30),
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    df = data["df"]
    train_df = data["train_df"]
    forecast_horizon = data["fut_time_num"]
    regressor_cols = [
        col for col in df.columns if col not in [TIME_COL, VALUE_COL]
    ]
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="W-SUN",
                                          regressor_cols=regressor_cols,
                                          forecast_horizon=forecast_horizon)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_WEEK_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.WEEK,
        "num_training_points": train_df.shape[0],  # size of training set
        "num_training_days": train_df.shape[0] * 7,
        "days_per_observation": 7,
        "forecast_horizon": 9,
        "forecast_horizon_in_timedelta": timedelta(days=63),
        "forecast_horizon_in_days": 63.0,
        "start_year": 2018,
        "end_year": 2019,
        "origin_for_time_vars": default_origin
    }
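    # 9 weekly points at 7 days per observation give the 63-day horizon asserted above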

    # checks `num_training_days` with `train_end_date`
    data = generate_df_with_reg_for_tests(freq="H",
                                          periods=300 * 24,
                                          train_start_date=datetime.datetime(
                                              2018, 7, 1),
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    df = data["df"]
    train_end_date = datetime.datetime(2019, 2, 1)
    result = get_forecast_time_properties(
        df=df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="H",
        regressor_cols=data["regressor_cols"],
        train_end_date=train_end_date,
        forecast_horizon=forecast_horizon)
    period = 3600  # seconds between observations
    time_delta = (train_end_date - df[TIME_COL].min()
                  )  # train end - train start
    num_training_days = (
        time_delta.days +
        (time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value)
    assert result["num_training_days"] == num_training_days

    # checks `num_training_days` without `train_end_date`
    result = get_forecast_time_properties(
        df=df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="H",
        regressor_cols=data["regressor_cols"],
        train_end_date=None,
        forecast_horizon=forecast_horizon)
    time_delta = (
        datetime.datetime(2019, 2, 26) - df[TIME_COL].min()
    )  # by default, train end is the last date with nonnull value_col
    num_training_days = (
        time_delta.days +
        (time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value)
    assert result["num_training_days"] == num_training_days
Code example #23
def test_run_prophet_template_custom():
    """Tests running prophet template through the pipeline"""
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=50,
                                          train_frac=0.8,
                                          conti_year_origin=2018,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    # select relevant columns for testing
    relevant_cols = [
        cst.TIME_COL, cst.VALUE_COL, "regressor1", "regressor2", "regressor3"
    ]
    df = data["df"][relevant_cols]
    forecast_horizon = data["fut_time_num"]

    # Model components - custom holidays; other params as defaults
    model_components = ModelComponentsParam(
        seasonality={
            "seasonality_mode": ["additive"],
            "yearly_seasonality": ["auto"],
            "weekly_seasonality": [True],
            "daily_seasonality": ["auto"],
        },
        growth={"growth_term": ["linear"]},
        events={
            "holiday_pre_num_days": [1],
            "holiday_post_num_days": [1],
            "holidays_prior_scale": [1.0]
        },
        changepoints={
            "changepoint_prior_scale": [0.05],
            "n_changepoints": [1],
            "changepoint_range": [0.5],
        },
        regressors={
            "add_regressor_dict": [{
                "regressor1": {
                    "prior_scale": 10,
                    "standardize": True,
                    "mode": "additive"
                },
                "regressor2": {
                    "prior_scale": 15,
                    "standardize": False,
                    "mode": "additive"
                },
                "regressor3": {}
            }]
        },
        uncertainty={"uncertainty_samples": [10]})

    metadata = MetadataParam(
        time_col=cst.TIME_COL,
        value_col=cst.VALUE_COL,
        freq="D",
    )
    evaluation_period = EvaluationPeriodParam(
        test_horizon=5,  # speeds up test case
        periods_between_train_test=5,
        cv_horizon=0,  # speeds up test case
    )
    config = ForecastConfig(
        model_template=ModelTemplateEnum.PROPHET.name,
        metadata_param=metadata,
        forecast_horizon=forecast_horizon,
        coverage=0.95,
        model_components_param=model_components,
        evaluation_period_param=evaluation_period,
    )
    result = Forecaster().run_forecast_config(
        df=df,
        config=config,
    )

    forecast_df = result.forecast.df_test.reset_index(drop=True)
    expected_cols = [
        "ts", "actual", "forecast", "forecast_lower", "forecast_upper"
    ]
    assert list(forecast_df.columns) == expected_cols
    assert result.backtest.coverage == 0.95, "coverage is not correct"
    # NB: coverage is poor because of very small dataset size and low uncertainty_samples
    assert result.backtest.train_evaluation[cst.PREDICTION_BAND_COVERAGE] == pytest.approx(0.677, rel=1e-3), \
        "training coverage does not match the expected value"
    assert result.backtest.test_evaluation[cst.PREDICTION_BAND_COVERAGE] == pytest.approx(0.800, rel=1e-3), \
        "testing coverage does not match the expected value"
    assert result.backtest.train_evaluation["MSE"] == pytest.approx(3.7849, rel=1e-3), \
        "training MSE does not match the expected value"
    assert result.backtest.test_evaluation["MSE"] == pytest.approx(2.9609, rel=1e-3), \
        "testing MSE does not match the expected value"
    assert result.forecast.train_evaluation[cst.PREDICTION_BAND_COVERAGE] == pytest.approx(0.7805, rel=1e-3), \
        "forecast coverage does not match the expected value"
    assert result.forecast.train_evaluation["MSE"] == pytest.approx(4.1806, rel=1e-3), \
        "forecast MSE does not match the expected value"

    # ensure regressors were used in the model
    prophet_estimator = result.model.steps[-1][-1]
    regressors = prophet_estimator.model.extra_regressors
    assert regressors.keys() == {"regressor1", "regressor2", "regressor3"}
    assert regressors["regressor1"]["prior_scale"] == 10.0
    assert regressors["regressor1"]["standardize"] is True
    assert regressors["regressor1"]["mode"] == "additive"
    assert regressors["regressor2"]["prior_scale"] == 15.0
    assert regressors["regressor3"]["standardize"] == "auto"
Code example #24
def test_plot_components():
    """Tests plot_components.
    Because component plots are implemented in `base_silverkite_estimator.py`, the bulk of
    the testing is done there. This file only tests inheritance and compatibility of the
    trained_model generated by this estimator's fit.
    """
    daily_data = generate_df_with_reg_for_tests(
        freq="D",
        periods=20,
        train_start_date=datetime.datetime(2018, 1, 1),
        conti_year_origin=2018)
    train_df = daily_data.get("train_df").copy()
    model = SimpleSilverkiteEstimator(
        fit_algorithm_dict={"fit_algorithm": "linear"},
        yearly_seasonality=True,
        quarterly_seasonality=False,
        monthly_seasonality=False,
        weekly_seasonality=True,
        daily_seasonality=False,
    )
    model.fit(train_df)

    # Test plot_components
    with pytest.warns(Warning) as record:
        title = "Custom component plot"
        fig = model.plot_components(
            names=["trend", "YEARLY_SEASONALITY", "DUMMY"], title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
               [cst.VALUE_COL, "trend", "YEARLY_SEASONALITY"]

        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis3.title["text"] == "Time of year"

        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "trend"
        assert fig.layout.yaxis3.title["text"] == "yearly"

        assert fig.layout.title["text"] == title
        assert f"The following components have not been specified in the model: " \
               f"{{'DUMMY'}}, plotting the rest." in record[0].message.args[0]

    # Test plot_trend
    title = "Custom trend plot"
    fig = model.plot_trend(title=title)
    expected_rows = 2
    assert len(fig.data) == expected_rows
    assert [fig.data[i].name
            for i in range(expected_rows)] == [cst.VALUE_COL, "trend"]

    assert fig.layout.xaxis.title["text"] == cst.TIME_COL
    assert fig.layout.xaxis2.title["text"] == cst.TIME_COL

    assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
    assert fig.layout.yaxis2.title["text"] == "trend"

    assert fig.layout.title["text"] == title

    # Test plot_seasonalities
    with pytest.warns(Warning):
        # catches the warning about seasonality components removed from the model
        title = "Custom seasonality plot"
        fig = model.plot_seasonalities(title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
               [cst.VALUE_COL, "WEEKLY_SEASONALITY", "YEARLY_SEASONALITY"]

        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == "Day of week"
        assert fig.layout.xaxis3.title["text"] == "Time of year"

        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "weekly"
        assert fig.layout.yaxis3.title["text"] == "yearly"

        assert fig.layout.title["text"] == title
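
Note that unlike the `SilverkiteEstimator` variant of this test in Code example #20, no `+ 1` is added to `expected_rows` here: this `SimpleSilverkiteEstimator` fit produces no "trend change point" trace in the component figure, presumably because no changepoint detection was configured.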
Code example #25
def test_run_forecast_config_custom():
    """Tests `run_forecast_config` on weekly data with custom config:

     - numeric and categorical regressors
     - coverage
     - null model
    """
    data = generate_df_with_reg_for_tests(freq="W-MON",
                                          periods=140,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]

    metric = EvaluationMetricEnum.MeanAbsoluteError
    evaluation_metric = EvaluationMetricParam(cv_selection_metric=metric.name,
                                              agg_periods=7,
                                              agg_func=np.max,
                                              null_model_params={
                                                  "strategy": "quantile",
                                                  "constant": None,
                                                  "quantile": 0.5
                                              })

    evaluation_period = EvaluationPeriodParam(test_horizon=10,
                                              periods_between_train_test=5,
                                              cv_horizon=4,
                                              cv_min_train_periods=80,
                                              cv_expanding_window=False,
                                              cv_periods_between_splits=20,
                                              cv_periods_between_train_test=3,
                                              cv_max_splits=3)

    model_components = ModelComponentsParam(
        regressors={"regressor_cols": reg_cols},
        custom={
            "fit_algorithm_dict": {
                "fit_algorithm": "ridge",
                "fit_algorithm_params": {
                    "cv": 2
                }
            }
        })
    computation = ComputationParam(verbose=2)
    forecast_horizon = 27
    coverage = 0.90

    forecast_config = ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        computation_param=computation,
        coverage=coverage,
        evaluation_metric_param=evaluation_metric,
        evaluation_period_param=evaluation_period,
        forecast_horizon=forecast_horizon,
        model_components_param=model_components)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        forecaster = Forecaster()
        result = forecaster.run_forecast_config(df=df, config=forecast_config)

        rmse = EvaluationMetricEnum.RootMeanSquaredError.get_metric_name()
        q80 = EvaluationMetricEnum.Quantile80.get_metric_name()
        assert result.backtest.test_evaluation[rmse] == pytest.approx(2.976,
                                                                      rel=1e-2)
        assert result.backtest.test_evaluation[q80] == pytest.approx(1.360,
                                                                     rel=1e-2)
        assert result.forecast.train_evaluation[rmse] == pytest.approx(2.224,
                                                                       rel=1e-2)
        assert result.forecast.train_evaluation[q80] == pytest.approx(0.941,
                                                                      rel=1e-2)
        check_forecast_pipeline_result(result,
                                       coverage=coverage,
                                       strategy=None,
                                       score_func=metric.name,
                                       greater_is_better=False)

    with pytest.raises(KeyError, match="missing_regressor"):
        model_components = ModelComponentsParam(
            regressors={"regressor_cols": ["missing_regressor"]})
        forecaster = Forecaster()
        result = forecaster.run_forecast_config(
            df=df,
            config=ForecastConfig(
                model_template=ModelTemplateEnum.SILVERKITE.name,
                model_components_param=model_components))
        check_forecast_pipeline_result(result,
                                       coverage=None,
                                       strategy=None,
                                       score_func=metric.get_metric_func(),
                                       greater_is_better=False)
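
For reference, a minimal sketch of inspecting the pipeline result outside of assertions, using only attributes already exercised in these examples (`result` is the return value of `run_forecast_config`):

rmse = EvaluationMetricEnum.RootMeanSquaredError.get_metric_name()
print(result.backtest.test_evaluation[rmse])  # holdout metrics, keyed by metric name
print(result.forecast.df_test.head())         # ts / actual / forecast / forecast_lower / forecast_upper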
Code example #26
def test_silverkite_with_components_hourly_data():
    """Tests get_components, plot_components, plot_trend,
    plot_seasonalities with hourly data
    """
    hourly_data = generate_df_with_reg_for_tests(
        freq="H",
        periods=24 * 4,
        train_start_date=datetime.datetime(2018, 1, 1),
        conti_year_origin=2018)
    train_df = hourly_data.get("train_df").copy()
    params_hourly = params_components()

    # converts into parameters for `forecast_silverkite`
    coverage = params_hourly.pop("coverage")
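    # Note: `BaseSilverkiteEstimator` is assembled by hand here: `fit` records the training
    # data, `SilverkiteForecast.forecast` builds the fitted `model_dict`, and `finish_fit`
    # finalizes the estimator so the component plotting methods below can be called.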
    model = BaseSilverkiteEstimator(
        coverage=coverage, uncertainty_dict=params_hourly["uncertainty_dict"])
    model.fit(X=train_df, time_col=cst.TIME_COL, value_col=cst.VALUE_COL)
    silverkite = SilverkiteForecast()
    model.model_dict = silverkite.forecast(df=train_df,
                                           time_col=cst.TIME_COL,
                                           value_col=cst.VALUE_COL,
                                           **params_hourly)
    model.finish_fit()

    # Test plot_components
    with pytest.warns(Warning) as record:
        title = "Custom component plot"
        fig = model.plot_components(
            names=["trend", "DAILY_SEASONALITY", "DUMMY"], title=title)
        expected_rows = 3 + 1  # includes changepoints
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
               [cst.VALUE_COL, "trend", "DAILY_SEASONALITY", "trend change point"]

        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis3.title["text"] == "Hour of day"

        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "trend"
        assert fig.layout.yaxis3.title["text"] == "daily"

        assert fig.layout.title["text"] == title
        assert f"The following components have not been specified in the model: " \
               f"{{'DUMMY'}}, plotting the rest." in record[0].message.args[0]

    # Test plot_trend
    title = "Custom trend plot"
    fig = model.plot_trend(title=title)
    expected_rows = 2
    assert len(fig.data) == expected_rows + 1  # includes changepoints
    assert [fig.data[i].name
            for i in range(expected_rows)] == [cst.VALUE_COL, "trend"]

    assert fig.layout.xaxis.title["text"] == cst.TIME_COL
    assert fig.layout.xaxis2.title["text"] == cst.TIME_COL

    assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
    assert fig.layout.yaxis2.title["text"] == "trend"

    assert fig.layout.title["text"] == title

    # Test plot_seasonalities
    with pytest.warns(Warning):
        # catches the warning about seasonality components removed from the model
        title = "Custom seasonality plot"
        fig = model.plot_seasonalities(title=title)
        expected_rows = 4
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
               [cst.VALUE_COL, "DAILY_SEASONALITY", "WEEKLY_SEASONALITY", "YEARLY_SEASONALITY"]

        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == "Hour of day"
        assert fig.layout.xaxis3.title["text"] == "Day of week"
        assert fig.layout.xaxis4.title["text"] == "Time of year"

        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "daily"
        assert fig.layout.yaxis3.title["text"] == "weekly"
        assert fig.layout.yaxis4.title["text"] == "yearly"

        assert fig.layout.title["text"] == title