Example #1
def test_benchmark_silverkite_template_with_real_data():
    # setting every list to 1 item to speed up test case
    forecast_horizons = [30]
    max_cvs = [3]
    fit_algorithms = ["linear"]
    metric = EvaluationMetricEnum.MeanSquaredError
    evaluation_metric = EvaluationMetricParam(cv_selection_metric=metric.name)

    # real data
    dl = DataLoader()
    data_path = dl.get_data_home(data_sub_dir="daily")
    data_name = "daily_female_births"
    df = dl.get_df(data_path=data_path, data_name=data_name)
    time_col = "Date"
    value_col = "Births"
    metadata = MetadataParam(time_col=time_col, value_col=value_col, freq="D")
    result_silverkite_real = benchmark_silverkite_template(
        data_name=data_name,
        df=df,
        metadata=metadata,
        evaluation_metric=evaluation_metric,
        forecast_horizons=forecast_horizons,
        fit_algorithms=fit_algorithms,
        max_cvs=max_cvs)

    result_silverkite_real = result_silverkite_real[0]
    assert result_silverkite_real["data_name"] == data_name
    assert result_silverkite_real["forecast_model_name"] == "silverkite_linear"
    assert result_silverkite_real["train_period"] == df.shape[0]
    assert result_silverkite_real["forecast_horizon"] == 30
    assert result_silverkite_real["cv_folds"] == 3
Example #2
def test_benchmark_silverkite_template_with_simulated_data():
    # setting every list to 1 item to speed up test case
    forecast_horizons = [30]
    max_cvs = [3]
    fit_algorithms = ["linear"]
    metric = EvaluationMetricEnum.MeanSquaredError
    evaluation_metric = EvaluationMetricParam(cv_selection_metric=metric.name)

    # Simulated data
    data_name = "daily_simulated"
    train_period = 365
    data = generate_df_for_tests(freq="D", periods=train_period)
    df = data["df"]
    time_col, value_col = df.columns
    metadata = MetadataParam(time_col=time_col, value_col=value_col, freq="D")
    result_silverkite_simulated = benchmark_silverkite_template(
        data_name=data_name,
        df=df,
        metadata=metadata,
        evaluation_metric=evaluation_metric,
        forecast_horizons=forecast_horizons,
        fit_algorithms=fit_algorithms,
        max_cvs=max_cvs)

    result_silverkite_simulated = result_silverkite_simulated[0]
    assert result_silverkite_simulated["data_name"] == data_name
    assert result_silverkite_simulated["forecast_model_name"] == "silverkite_linear"
    assert result_silverkite_simulated["train_period"] == train_period
    assert result_silverkite_simulated["forecast_horizon"] == 30
    assert result_silverkite_simulated["cv_folds"] == 3
Example #3
def test_apply_template_for_pipeline_params(df):
    mt = MyTemplate()
    config = ForecastConfig(
        metadata_param=MetadataParam(
            time_col=NEW_TIME_COL,
            value_col=NEW_VALUE_COL,
        ),
        evaluation_metric_param=EvaluationMetricParam(
            cv_selection_metric="MeanSquaredError"))
    original_config = dataclasses.replace(config)

    # Tests apply_template_for_pipeline_params
    pipeline_params = mt.apply_template_for_pipeline_params(df=df, config=config)

    assert_equal(pipeline_params["df"], df)
    assert pipeline_params["train_end_date"] is None
    estimator = pipeline_params["pipeline"].steps[-1][-1]
    assert isinstance(estimator, SilverkiteEstimator)
    assert estimator.coverage == mt.config.coverage
    assert mt.estimator is not estimator
    assert mt.estimator.coverage is None
    assert (
        pipeline_params["pipeline"].named_steps["input"].transformer_list[2]
        [1].named_steps["select_reg"].column_names == mt.get_regressor_cols())

    # Tests `apply_template_decorator`
    assert mt.config == mt.apply_forecast_config_defaults(config)
    assert mt.config != config  # `mt.config` has default values added
    assert config == original_config  # `config` is not modified by the function
Example #4
def test_get_forecast_time_properties(df):
    mt = MyTemplate()
    mt.df = df

    # with `train_end_date` (masking applied)
    mt.config = ForecastConfig(
        coverage=0.9,
        forecast_horizon=20,
        metadata_param=MetadataParam(
            time_col=NEW_TIME_COL,
            value_col=NEW_VALUE_COL,
            freq="H",
            date_format="%Y-%m-%d-%H",
            train_end_date=datetime.datetime(2019, 2, 1),
        )
    )
    mt.regressor_cols = mt.get_regressor_cols()
    mt.lagged_regressor_cols = mt.get_lagged_regressor_info()["lagged_regressor_cols"]
    time_properties = mt.get_forecast_time_properties()

    period = 3600  # seconds between observations
    time_delta = (mt.config.metadata_param.train_end_date - df[mt.config.metadata_param.time_col].min())  # train end - train start
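    # `train_end_date` is inclusive, so one extra `period` is counted below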
    num_training_days = (time_delta.days + (time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value)
    assert time_properties["num_training_days"] == num_training_days

    # without `train_end_date`
    mt.config.metadata_param.train_end_date = None
    time_properties = mt.get_forecast_time_properties()
    time_delta = (datetime.datetime(2019, 2, 26) - df[mt.config.metadata_param.time_col].min())  # by default, train end is the last date with nonnull value_col
    num_training_days = (time_delta.days + (time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value)
    assert time_properties["num_training_days"] == num_training_days
Example #5
def test_forecast_config():
    """Tests ForecastConfig dataclass"""
    config = ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        metadata_param=MetadataParam(time_col="custom_time_col",
                                     anomaly_info=[{
                                         "key": "value"
                                     }, {
                                         "key2": "value2"
                                     }]),
        evaluation_period_param=EvaluationPeriodParam(
            test_horizon=10,
            periods_between_train_test=5,
            cv_min_train_periods=20),
        evaluation_metric_param=EvaluationMetricParam(
            cv_selection_metric=EvaluationMetricEnum.MeanSquaredError.name,
            cv_report_metrics=[
                EvaluationMetricEnum.MeanAbsoluteError.name,
                EvaluationMetricEnum.MeanAbsolutePercentError.name
            ],
            relative_error_tolerance=0.02),
        model_components_param=ModelComponentsParam(
            autoregression={"autoreg_dict": {"autoreg_param": 0}},
            changepoints=None,
            custom={"custom_param": 1},
            growth={"growth_param": 2},
            events={"events_param": 3},
            hyperparameter_override=[{"h1": 4}, {"h2": 5}, None],
            regressors={"names": ["regressor1", "regressor2"]},
            lagged_regressors={"lagged_regressor_dict": {"lag_reg_param": 0}},
            seasonality={"seas_param": 6},
            uncertainty={"uncertainty_param": 7}),
        computation_param=ComputationParam(n_jobs=None))
    assert_forecast_config(config)

    # Tests a string passed to `cv_report_metrics`
    assert ForecastConfig(
        evaluation_metric_param=EvaluationMetricParam(
            cv_report_metrics=CV_REPORT_METRICS_ALL)).to_dict()
Example #6
def test_estimator_plot_components_from_forecaster():
    """Tests estimator's plot_components function after the Forecaster has set everything up at the top most level"""
    # Test with real data (Female-births) via model template
    dl = DataLoader()
    data_path = dl.get_data_home(data_sub_dir="daily")
    df = dl.get_df(data_path=data_path, data_name="daily_female_births")
    metadata = MetadataParam(time_col="Date", value_col="Births", freq="D")
    model_components = ModelComponentsParam(
        seasonality={
            "yearly_seasonality": True,
            "quarterly_seasonality": True,
            "weekly_seasonality": True,
            "daily_seasonality": False
        })
    result = Forecaster().run_forecast_config(
        df=df,
        config=ForecastConfig(
            model_template=ModelTemplateEnum.SILVERKITE.name,
            forecast_horizon=30,  # forecast 1 month
            coverage=0.95,  # 95% prediction intervals
            metadata_param=metadata,
            model_components_param=model_components))
    estimator = result.model.steps[-1][-1]
    assert estimator.plot_components()
Example #7
    def apply_metadata_defaults(
            metadata: Optional[MetadataParam] = None) -> MetadataParam:
        """Applies the default MetadataParam values to the given object.
        If an expected attribute value is provided, the value is unchanged. Otherwise the default value for it is used.
        Other attributes are untouched.
        If the input object is None, it creates a MetadataParam object.

        Parameters
        ----------
        metadata : `~greykite.framework.templates.autogen.forecast_config.MetadataParam` or None
            The MetadataParam object.

        Returns
        -------
        metadata : `~greykite.framework.templates.autogen.forecast_config.MetadataParam`
            Valid MetadataParam object with the provided attribute values, and default values for any attributes that were not provided.
        """
        if metadata is None:
            metadata = MetadataParam()
        if metadata.time_col is None:
            metadata.time_col = TIME_COL
        if metadata.value_col is None:
            metadata.value_col = VALUE_COL
        return metadata
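A minimal usage sketch (assuming the method is reachable as a plain function or staticmethod; TIME_COL and VALUE_COL are Greykite's default column-name constants, as used in the body above):

# Minimal usage sketch for `apply_metadata_defaults`.
metadata = apply_metadata_defaults(MetadataParam(time_col="custom_time"))
assert metadata.time_col == "custom_time"  # provided value is kept
assert metadata.value_col == VALUE_COL     # missing value falls back to the default
assert apply_metadata_defaults(None).time_col == TIME_COL  # None creates a new object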
Example #8
def test_prophet_template_custom():
    """Tests prophet_template with custom values, with long range input"""
    # prepares input data
    data = generate_df_with_reg_for_tests(freq="H",
                                          periods=300 * 24,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    df = data["df"]
    time_col = "some_time_col"
    value_col = "some_value_col"
    df.rename(
        {cst.TIME_COL: time_col, cst.VALUE_COL: value_col},
        axis=1,
        inplace=True)
    # prepares params and calls template
    metric = EvaluationMetricEnum.MeanAbsoluteError
    # anomaly adjustment adds 10.0 to every record
    adjustment_size = 10.0
    anomaly_df = pd.DataFrame({
        cst.START_DATE_COL: [df[time_col].min()],
        cst.END_DATE_COL: [df[time_col].max()],
        cst.ADJUSTMENT_DELTA_COL: [adjustment_size],
        cst.METRIC_COL: [value_col]
    })
    anomaly_info = {
        "value_col": cst.VALUE_COL,
        "anomaly_df": anomaly_df,
        "start_date_col": cst.START_DATE_COL,
        "end_date_col": cst.END_DATE_COL,
        "adjustment_delta_col": cst.ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {
            cst.METRIC_COL: cst.VALUE_COL
        },
        "adjustment_method": "add"
    }
    metadata = MetadataParam(
        time_col=time_col,
        value_col=value_col,
        freq="H",
        date_format="%Y-%m-%d-%H",
        train_end_date=datetime.datetime(2019, 7, 1),
        anomaly_info=anomaly_info,
    )
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=metric.name,
        cv_report_metrics=[
            EvaluationMetricEnum.MedianAbsolutePercentError.name
        ],
        agg_periods=24,
        agg_func=np.max,
        null_model_params={
            "strategy": "quantile",
            "constant": None,
            "quantile": 0.8
        },
        relative_error_tolerance=0.01)
    evaluation_period = EvaluationPeriodParam(test_horizon=1,
                                              periods_between_train_test=2,
                                              cv_horizon=3,
                                              cv_min_train_periods=4,
                                              cv_expanding_window=True,
                                              cv_periods_between_splits=5,
                                              cv_periods_between_train_test=6,
                                              cv_max_splits=7)
    model_components = ModelComponentsParam(
        seasonality={
            "yearly_seasonality": [True],
            "weekly_seasonality": [False],
            "daily_seasonality": [4],
            "add_seasonality_dict": [{
                "yearly": {
                    "period": 365.25,
                    "fourier_order": 20,
                    "prior_scale": 20.0
                },
                "quarterly": {
                    "period": 365.25 / 4,
                    "fourier_order": 15
                },
                "weekly": {
                    "period": 7,
                    "fourier_order": 35,
                    "prior_scale": 30.0
                }
            }]
        },
        growth={"growth_term": "linear"},
        events={
            "holiday_lookup_countries":
            ["UnitedStates", "UnitedKingdom", "India"],
            "holiday_pre_num_days": [2],
            "holiday_post_num_days": [3],
            "holidays_prior_scale": [5.0]
        },
        regressors={
            "add_regressor_dict": [{
                "regressor1": {
                    "prior_scale": 10.0,
                    "mode": 'additive'
                },
                "regressor2": {
                    "prior_scale": 20.0,
                    "mode": 'multiplicative'
                },
            }]
        },
        changepoints={
            "changepoint_prior_scale": [0.05],
            "changepoints": [None],
            "n_changepoints": [50],
            "changepoint_range": [0.9]
        },
        uncertainty={
            "mcmc_samples": [500],
            "uncertainty_samples": [2000]
        },
        hyperparameter_override={
            "input__response__null__impute_algorithm": "ts_interpolate",
            "input__response__null__impute_params": {"orders": [7, 14]},
            "input__regressors_numeric__normalize__normalize_algorithm": "RobustScaler",
        })
    computation = ComputationParam(hyperparameter_budget=10,
                                   n_jobs=None,
                                   verbose=1)
    forecast_horizon = 20
    coverage = 0.7
    config = ForecastConfig(model_template=ModelTemplateEnum.PROPHET.name,
                            metadata_param=metadata,
                            forecast_horizon=forecast_horizon,
                            coverage=coverage,
                            evaluation_metric_param=evaluation_metric,
                            evaluation_period_param=evaluation_period,
                            model_components_param=model_components,
                            computation_param=computation)
    template = ProphetTemplate()
    params = template.apply_template_for_pipeline_params(df=df, config=config)
    pipeline = params.pop("pipeline", None)
    # Adding start_year and end_year based on the input df
    model_components.events["start_year"] = df[time_col].min().year
    model_components.events["end_year"] = df[time_col].max().year
    expected_params = dict(
        df=df,
        time_col=time_col,
        value_col=value_col,
        date_format=metadata.date_format,
        freq=metadata.freq,
        train_end_date=metadata.train_end_date,
        anomaly_info=metadata.anomaly_info,
        # model
        regressor_cols=template.regressor_cols,
        estimator=None,
        hyperparameter_grid=template.hyperparameter_grid,
        hyperparameter_budget=computation.hyperparameter_budget,
        n_jobs=computation.n_jobs,
        verbose=computation.verbose,
        # forecast
        forecast_horizon=forecast_horizon,
        coverage=coverage,
        test_horizon=evaluation_period.test_horizon,
        periods_between_train_test=evaluation_period.periods_between_train_test,
        agg_periods=evaluation_metric.agg_periods,
        agg_func=evaluation_metric.agg_func,
        # evaluation
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=evaluation_metric.cv_report_metrics,
        null_model_params=evaluation_metric.null_model_params,
        relative_error_tolerance=evaluation_metric.relative_error_tolerance,
        # CV
        cv_horizon=evaluation_period.cv_horizon,
        cv_min_train_periods=evaluation_period.cv_min_train_periods,
        cv_expanding_window=evaluation_period.cv_expanding_window,
        cv_periods_between_splits=evaluation_period.cv_periods_between_splits,
        cv_periods_between_train_test=evaluation_period.cv_periods_between_train_test,
        cv_max_splits=evaluation_period.cv_max_splits)
    assert_basic_pipeline_equal(pipeline, template.pipeline)
    assert_equal(params, expected_params)
Example #9
def test_run_prophet_template_custom():
    """Tests running prophet template through the pipeline"""
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=50,
                                          train_frac=0.8,
                                          conti_year_origin=2018,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    # select relevant columns for testing
    relevant_cols = [
        cst.TIME_COL, cst.VALUE_COL, "regressor1", "regressor2", "regressor3"
    ]
    df = data["df"][relevant_cols]
    forecast_horizon = data["fut_time_num"]

    # Model components - custom holidays; other params as defaults
    model_components = ModelComponentsParam(
        seasonality={
            "seasonality_mode": ["additive"],
            "yearly_seasonality": ["auto"],
            "weekly_seasonality": [True],
            "daily_seasonality": ["auto"],
        },
        growth={"growth_term": ["linear"]},
        events={
            "holiday_pre_num_days": [1],
            "holiday_post_num_days": [1],
            "holidays_prior_scale": [1.0]
        },
        changepoints={
            "changepoint_prior_scale": [0.05],
            "n_changepoints": [1],
            "changepoint_range": [0.5],
        },
        regressors={
            "add_regressor_dict": [{
                "regressor1": {
                    "prior_scale": 10,
                    "standardize": True,
                    "mode": "additive"
                },
                "regressor2": {
                    "prior_scale": 15,
                    "standardize": False,
                    "mode": "additive"
                },
                "regressor3": {}
            }]
        },
        uncertainty={"uncertainty_samples": [10]})

    metadata = MetadataParam(
        time_col=cst.TIME_COL,
        value_col=cst.VALUE_COL,
        freq="D",
    )
    evaluation_period = EvaluationPeriodParam(
        test_horizon=5,  # speeds up test case
        periods_between_train_test=5,
        cv_horizon=0,  # speeds up test case
    )
    config = ForecastConfig(
        model_template=ModelTemplateEnum.PROPHET.name,
        metadata_param=metadata,
        forecast_horizon=forecast_horizon,
        coverage=0.95,
        model_components_param=model_components,
        evaluation_period_param=evaluation_period,
    )
    result = Forecaster().run_forecast_config(
        df=df,
        config=config,
    )

    forecast_df = result.forecast.df_test.reset_index(drop=True)
    expected_cols = [
        "ts", "actual", "forecast", "forecast_lower", "forecast_upper"
    ]
    assert list(forecast_df.columns) == expected_cols
    assert result.backtest.coverage == 0.95, "coverage is not correct"
    # NB: coverage is poor because of very small dataset size and low uncertainty_samples
    assert result.backtest.train_evaluation[cst.PREDICTION_BAND_COVERAGE] == pytest.approx(0.677, rel=1e-3), \
        "training coverage does not match the expected value"
    assert result.backtest.test_evaluation[cst.PREDICTION_BAND_COVERAGE] == pytest.approx(0.800, rel=1e-3), \
        "testing coverage does not match the expected value"
    assert result.backtest.train_evaluation["MSE"] == pytest.approx(3.7849, rel=1e-3), \
        "training MSE does not match the expected value"
    assert result.backtest.test_evaluation["MSE"] == pytest.approx(2.9609, rel=1e-3), \
        "testing MSE does not match the expected value"
    assert result.forecast.train_evaluation[cst.PREDICTION_BAND_COVERAGE] == pytest.approx(0.7805, rel=1e-3), \
        "forecast coverage does not match the expected value"
    assert result.forecast.train_evaluation["MSE"] == pytest.approx(4.1806, rel=1e-3), \
        "forecast MSE does not match the expected value"

    # ensure regressors were used in the model
    prophet_estimator = result.model.steps[-1][-1]
    regressors = prophet_estimator.model.extra_regressors
    assert regressors.keys() == {"regressor1", "regressor2", "regressor3"}
    assert regressors["regressor1"]["prior_scale"] == 10.0
    assert regressors["regressor1"]["standardize"] is True
    assert regressors["regressor1"]["mode"] == "additive"
    assert regressors["regressor2"]["prior_scale"] == 15.0
    assert regressors["regressor3"]["standardize"] == "auto"
Example #10
# Loads dataset into pandas DataFrame
dl = DataLoader()
df = dl.load_peyton_manning()

# %%
# Then we create a forecast model with the ``SILVERKITE`` template.
# For a simple example of creating a forecast model, see
# `Simple Forecast <./0100_simple_forecast.html>`_.
# For a detailed tuning tutorial, see
# `Forecast Model Tuning <../tutorials/0100_forecast_tutorial.html>`_.

# Specifies dataset information
metadata = MetadataParam(
    time_col="ts",  # name of the time column
    value_col="y",  # name of the value column
    freq="D"  # "H" for hourly, "D" for daily, "W" for weekly, etc.
)

# Specifies model parameters
model_components = ModelComponentsParam(
    changepoints={
        "changepoints_dict": {
            "method": "auto",
            "potential_changepoint_n": 25,
            "regularization_strength": 0.5,
            "resample_freq": "7D",
            "no_changepoint_distance_from_end": "365D"
        }
    },
    uncertainty={
        "uncertainty_dict": "auto"
    })
Example #11
def test_silverkite_template_custom(model_components_param):
    """"Tests simple_silverkite_template with custom parameters,
    and data that has regressors"""
    data = generate_df_with_reg_for_tests(
        freq="H",
        periods=300*24,
        remove_extra_cols=True,
        mask_test_actuals=True)
    df = data["df"]
    time_col = "some_time_col"
    value_col = "some_value_col"
    df.rename({
        TIME_COL: time_col,
        VALUE_COL: value_col
    }, axis=1, inplace=True)

    metric = EvaluationMetricEnum.MeanAbsoluteError
    # anomaly adjustment adds 10.0 to every record
    adjustment_size = 10.0
    anomaly_df = pd.DataFrame({
        START_DATE_COL: [df[time_col].min()],
        END_DATE_COL: [df[time_col].max()],
        ADJUSTMENT_DELTA_COL: [adjustment_size],
        METRIC_COL: [value_col]
    })
    anomaly_info = {
        "value_col": VALUE_COL,
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {METRIC_COL: VALUE_COL},
        "adjustment_method": "add"
    }
    metadata = MetadataParam(
        time_col=time_col,
        value_col=value_col,
        freq="H",
        date_format="%Y-%m-%d-%H",
        train_end_date=datetime.datetime(2019, 7, 1),
        anomaly_info=anomaly_info
    )
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=metric.name,
        cv_report_metrics=[EvaluationMetricEnum.MedianAbsolutePercentError.name],
        agg_periods=24,
        agg_func=np.max,
        null_model_params={
            "strategy": "quantile",
            "constant": None,
            "quantile": 0.8
        },
        relative_error_tolerance=0.01
    )
    evaluation_period = EvaluationPeriodParam(
        test_horizon=1,
        periods_between_train_test=2,
        cv_horizon=3,
        cv_min_train_periods=4,
        cv_expanding_window=True,
        cv_periods_between_splits=5,
        cv_periods_between_train_test=6,
        cv_max_splits=7
    )
    computation = ComputationParam(
        hyperparameter_budget=10,
        n_jobs=None,
        verbose=1
    )
    forecast_horizon = 20
    coverage = 0.7
    template = SilverkiteTemplate()
    params = template.apply_template_for_pipeline_params(
        df=df,
        config=ForecastConfig(
            model_template=ModelTemplateEnum.SK.name,
            metadata_param=metadata,
            forecast_horizon=forecast_horizon,
            coverage=coverage,
            evaluation_metric_param=evaluation_metric,
            evaluation_period_param=evaluation_period,
            model_components_param=model_components_param,
            computation_param=computation
        )
    )
    pipeline = params.pop("pipeline", None)
    expected_params = dict(
        df=df,
        time_col=time_col,
        value_col=value_col,
        date_format=metadata.date_format,
        freq=metadata.freq,
        train_end_date=metadata.train_end_date,
        anomaly_info=metadata.anomaly_info,
        # model
        regressor_cols=template.regressor_cols,
        estimator=None,
        hyperparameter_grid=template.hyperparameter_grid,
        hyperparameter_budget=computation.hyperparameter_budget,
        n_jobs=computation.n_jobs,
        verbose=computation.verbose,
        # forecast
        forecast_horizon=forecast_horizon,
        coverage=coverage,
        test_horizon=evaluation_period.test_horizon,
        periods_between_train_test=evaluation_period.periods_between_train_test,
        agg_periods=evaluation_metric.agg_periods,
        agg_func=evaluation_metric.agg_func,
        relative_error_tolerance=evaluation_metric.relative_error_tolerance,
        # evaluation
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=evaluation_metric.cv_report_metrics,
        null_model_params=evaluation_metric.null_model_params,
        # CV
        cv_horizon=evaluation_period.cv_horizon,
        cv_min_train_periods=evaluation_period.cv_min_train_periods,
        cv_expanding_window=evaluation_period.cv_expanding_window,
        cv_periods_between_splits=evaluation_period.cv_periods_between_splits,
        cv_periods_between_train_test=evaluation_period.cv_periods_between_train_test,
        cv_max_splits=evaluation_period.cv_max_splits
    )
    assert_basic_pipeline_equal(pipeline, template.pipeline)
    assert_equal(params, expected_params)
Example #12
def seek_the_oracle(
    df_index,
    series,
    col,
    forecast_length,
    freq,
    prediction_interval=0.9,
    model_template='silverkite',
    growth=None,
    holiday=True,
    holiday_country="UnitedStates",
    regressors=None,
    verbose=0,
    inner_n_jobs=1,
    **kwargs
):
    """Internal. For loop or parallel version of Greykite."""
    inner_df = pd.DataFrame(
        {
            'ts': df_index,
            'y': series,
        }
    )
    if regressors is not None:
        inner_regr = regressors.copy()
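        # Prefix regressor names that collide with `inner_df` columns ("ts"/"y").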
        new_names = [
            'rrrr' + str(x) if x in inner_df.columns else str(x)
            for x in inner_regr.columns
        ]
        inner_regr.columns = new_names
        inner_regr.index.name = 'ts'
        inner_regr.reset_index(drop=False, inplace=True)
        inner_df = inner_df.merge(inner_regr, left_on='ts', right_on='ts', how='outer')
    metadata = MetadataParam(
        time_col="ts",  # name of the time column ("date" in example above)
        value_col="y",  # name of the value column ("sessions" in example above)
        freq=freq,  # "H" for hourly, "D" for daily, "W" for weekly, etc.
    )
    # INCLUDE forecast_length lagged mean and std of other features!
    model_template = ModelTemplateEnum.SILVERKITE.name
    forecaster = Forecaster()  # Creates forecasts and stores the result
    if regressors is not None:
        model_components = ModelComponentsParam(
            growth=growth, regressors={"regressor_cols": new_names}
        )
    else:
        model_components = ModelComponentsParam(
            growth=growth,  # 'linear', 'quadratic', 'sqrt'
        )
    computation = ComputationParam(n_jobs=inner_n_jobs, verbose=verbose)
    if holiday:  # `holiday` may also be the string 'auto', which is truthy
        model_components.events = {
            # These holidays as well as their pre/post dates are modeled as individual events.
            "holidays_to_model_separately": SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES,  # all holidays in "holiday_lookup_countries"
            "holiday_lookup_countries": [
                holiday_country
            ],  # only look up holidays in the United States
            "holiday_pre_num_days": 1,  # also mark the 1 days before a holiday as holiday
            "holiday_post_num_days": 1,  # also mark the 1 days after a holiday as holiday
        }
    config = ForecastConfig(
        model_template=model_template,
        forecast_horizon=forecast_length,
        coverage=prediction_interval,
        model_components_param=model_components,
        metadata_param=metadata,
        computation_param=computation,
    )
    result = forecaster.run_forecast_config(  # result is also stored as `forecaster.forecast_result`.
        df=inner_df,
        config=config,
    )
    res_df = result.forecast.df.tail(forecast_length).drop(columns=['actual'])
    res_df['series_id'] = col
    return res_df
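A hedged usage sketch; the index, values, and series name below are invented for illustration:

# Hypothetical call; all data below is illustrative.
idx = pd.date_range("2021-01-01", periods=100, freq="D")
vals = pd.Series(range(100), index=idx, dtype=float)
res = seek_the_oracle(
    df_index=idx,
    series=vals.to_numpy(),
    col="series_a",
    forecast_length=14,
    freq="D")
# `res` holds the last 14 forecast rows plus a `series_id` column.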
Example #13
# `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.plot_quantiles_and_overlays`.
# A tutorial on using the function can be found at `Seasonality <../quickstart/0300_seasonality.html>`_.

# %%
# Baseline model
# --------------------
# A simple forecast can be created on the dataset;
# see details in `Simple Forecast <../quickstart/0100_simple_forecast.html>`_.
# Note that if you do not provide any extra parameters, all model parameters take their default values.
# The default parameters are chosen conservatively, so consider this a baseline
# model to assess forecast difficulty and make further improvements if necessary.

# Specifies dataset information
metadata = MetadataParam(
    time_col="ts",  # name of the time column
    value_col="y",  # name of the value column
    freq="D"  # "H" for hourly, "D" for daily, "W" for weekly, etc.
)

forecaster = Forecaster()
result = forecaster.run_forecast_config(
    df=df,
    config=ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=365,  # forecasts 365 steps ahead
        coverage=0.95,  # 95% prediction intervals
        metadata_param=metadata))

# %%
# For detailed documentation of the output from
# :py:meth:`~greykite.framework.templates.forecaster.Forecaster.run_forecast_config`,
Example #14
def valid_configs():
    metadata = MetadataParam(time_col=TIME_COL, value_col=VALUE_COL, freq="D")
    computation = ComputationParam(hyperparameter_budget=10,
                                   n_jobs=None,
                                   verbose=1)
    forecast_horizon = 2 * 7
    coverage = 0.90
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=EvaluationMetricEnum.MeanAbsoluteError.name,
        cv_report_metrics=None,
        agg_periods=7,
        agg_func=np.mean,
        null_model_params=None)
    evaluation_period = EvaluationPeriodParam(
        test_horizon=2 * 7,
        periods_between_train_test=2 * 7,
        cv_horizon=1 * 7,
        cv_min_train_periods=8 * 7,
        cv_expanding_window=True,
        cv_periods_between_splits=7,
        cv_periods_between_train_test=3 * 7,
        cv_max_splits=2)

    silverkite_components = ModelComponentsParam(
        seasonality={
            "yearly_seasonality": False,
            "weekly_seasonality": True
        },
        growth={"growth_term": "quadratic"},
        events={
            "holidays_to_model_separately": SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES,
            "holiday_lookup_countries": ["UnitedStates"],
            "holiday_pre_num_days": 3,
        },
        changepoints={
            "changepoints_dict": {
                "method": "uniform",
                "n_changepoints": 20,
            }
        },
        regressors={
            "regressor_cols": ["regressor1", "regressor2", "regressor3"]
        },
        uncertainty={
            "uncertainty_dict": "auto",
        },
        hyperparameter_override={"input__response__null__max_frac": 0.1},
        custom={
            "fit_algorithm_dict": {
                "fit_algorithm": "ridge",
                "fit_algorithm_params": {
                    "normalize": True
                },
            },
            "feature_sets_enabled": False
        })

    prophet_components = ModelComponentsParam(
        seasonality={
            "seasonality_mode": ["additive"],
            "yearly_seasonality": ["auto"],
            "weekly_seasonality": [True],
            "daily_seasonality": ["auto"],
        },
        growth={"growth_term": ["linear"]},
        events={
            "holiday_pre_num_days": [1],
            "holiday_post_num_days": [1],
            "holidays_prior_scale": [1.0]
        },
        changepoints={
            "changepoint_prior_scale": [0.05],
            "n_changepoints": [1],
            "changepoint_range": [0.5],
        },
        regressors={
            "add_regressor_dict": [{
                "regressor1": {
                    "prior_scale": 10,
                    "standardize": True,
                    "mode": 'additive'
                },
                "regressor2": {
                    "prior_scale": 15,
                    "standardize": False,
                    "mode": 'additive'
                },
                "regressor3": {}
            }]
        },
        uncertainty={"uncertainty_samples": [10]})

    valid_prophet = ForecastConfig(
        model_template=ModelTemplateEnum.PROPHET.name,
        metadata_param=metadata,
        computation_param=computation,
        coverage=coverage,
        evaluation_metric_param=evaluation_metric,
        evaluation_period_param=evaluation_period,
        forecast_horizon=forecast_horizon,
        model_components_param=prophet_components)

    valid_silverkite = ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        metadata_param=metadata,
        computation_param=computation,
        coverage=coverage,
        evaluation_metric_param=evaluation_metric,
        evaluation_period_param=evaluation_period,
        forecast_horizon=forecast_horizon,
        model_components_param=silverkite_components)

    configs = {
        "valid_prophet": valid_prophet,
        "valid_silverkite": valid_silverkite
    }
    return configs
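A hedged usage sketch for the configs above, generating test data the same way as the other examples in this listing:

# Hypothetical usage; the generated df contains regressor1..regressor3.
data = generate_df_with_reg_for_tests(freq="D", periods=200, remove_extra_cols=True, mask_test_actuals=True)
configs = valid_configs()
result = Forecaster().run_forecast_config(df=data["df"], config=configs["valid_silverkite"])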
Example #15
def test_run_auto_arima_template_custom():
    """Tests running auto arima template through the pipeline"""
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=50,
                                          train_frac=0.8,
                                          conti_year_origin=2018,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    # select relevant columns for testing
    relevant_cols = [
        cst.TIME_COL, cst.VALUE_COL, "regressor1", "regressor2", "regressor3"
    ]
    df = data["df"][relevant_cols]
    forecast_horizon = data["fut_time_num"]

    # Model components - custom holidays; other params as defaults
    model_components = ModelComponentsParam(
        # Everything except `custom` and `hyperparameter_override` is ignored
        seasonality={
            "seasonality_mode": ["additive"],
            "yearly_seasonality": ["auto"],
            "weekly_seasonality": [True],
            "daily_seasonality": ["auto"],
        },
        growth={"growth_term": ["linear"]},
        events={
            "holiday_pre_num_days": [1],
            "holiday_post_num_days": [1],
            "holidays_prior_scale": [1.0]
        },
        changepoints={
            "changepoint_prior_scale": [0.05],
            "n_changepoints": [1],
            "changepoint_range": [0.5],
        },
        regressors={
            "add_regressor_dict": [{
                "regressor1": {
                    "prior_scale": 10,
                    "standardize": True,
                    "mode": "additive"
                },
                "regressor2": {
                    "prior_scale": 15,
                    "standardize": False,
                    "mode": "additive"
                },
                "regressor3": {}
            }]
        },
        uncertainty={"uncertainty_samples": [10]},
        custom={
            "max_order": [10],
            "information_criterion": ["bic"]
        })

    metadata = MetadataParam(
        time_col=cst.TIME_COL,
        value_col=cst.VALUE_COL,
        freq="D",
    )
    evaluation_period = EvaluationPeriodParam(
        test_horizon=5,  # speeds up test case
        periods_between_train_test=5,
        cv_horizon=0,  # speeds up test case
    )
    config = ForecastConfig(
        model_template=ModelTemplateEnum.AUTO_ARIMA.name,
        metadata_param=metadata,
        forecast_horizon=forecast_horizon,
        coverage=0.95,
        model_components_param=model_components,
        evaluation_period_param=evaluation_period,
    )
    result = Forecaster().run_forecast_config(
        df=df,
        config=config,
    )

    forecast_df = result.forecast.df_test.reset_index(drop=True)
    expected_cols = [
        "ts", "actual", "forecast", "forecast_lower", "forecast_upper"
    ]
    assert list(forecast_df.columns) == expected_cols
    assert result.backtest.coverage == 0.95, "coverage is not correct"
    # NB: coverage is poor because of very small dataset size and low uncertainty_samples
    assert result.backtest.train_evaluation[
        cst.PREDICTION_BAND_COVERAGE] is not None
    assert result.backtest.test_evaluation[
        cst.PREDICTION_BAND_COVERAGE] is not None
    assert result.backtest.train_evaluation["MSE"] is not None
    assert result.backtest.test_evaluation["MSE"] is not None
    assert result.forecast.train_evaluation[
        cst.PREDICTION_BAND_COVERAGE] is not None
    assert result.forecast.train_evaluation["MSE"] is not None
Example #16
from greykite.common.data_loader import DataLoader
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results

# Loads dataset into pandas DataFrame
dl = DataLoader()
df = dl.load_peyton_manning()

# Specifies dataset information
metadata = MetadataParam(
    time_col="ts",  # name of the time column ("date" in example above)
    value_col="y",  # name of the value column ("sessions" in example above)
    freq="D"  # "H" for hourly, "D" for daily, "W" for weekly, etc.
    # Any format accepted by `pandas.date_range`
)

# %%
# Create a forecast
# -----------------
# You can pick the ``PROPHET`` or ``SILVERKITE``
# forecasting model template (see :doc:`/pages/stepbystep/0100_choose_model`).
#
# In this example, we use ``SILVERKITE``.
# You may also use ``PROPHET`` to see how a third-party library
# is leveraged in the same framework.
forecaster = Forecaster()  # Creates forecasts and stores the result
result = forecaster.run_forecast_config(  # result is also stored as `forecaster.forecast_result`.
    df=df,
    config=ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=365,  # forecasts 365 steps ahead
        coverage=0.95,  # 95% prediction intervals
        metadata_param=metadata))
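The excerpt imports summarize_grid_search_results but ends before using it; a typical follow-up (a sketch, assuming the call above completed) is:

# Sketch: inspect the cross-validation results of the grid search.
cv_results = summarize_grid_search_results(grid_search=result.grid_search)
print(cv_results.head())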