def test_train_end_date_without_regressors():
    """Tests make_future_dataframe and train_end_date without regressors"""
    ts = UnivariateTimeSeries()
    df = pd.DataFrame({
        TIME_COL: [
            dt(2018, 1, 1, 3, 0, 0),
            dt(2018, 1, 1, 4, 0, 0),
            dt(2018, 1, 1, 5, 0, 0),
            dt(2018, 1, 1, 6, 0, 0),
            dt(2018, 1, 1, 7, 0, 0)
        ],
        VALUE_COL: [1, None, 3, None, None],
    })

    # train_end_date later than last date in df
    with pytest.warns(UserWarning) as record:
        train_end_date = dt(2018, 1, 1, 8, 0, 0)
        ts.load_data(df, TIME_COL, VALUE_COL, train_end_date=train_end_date)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]
        assert ts.train_end_date == dt(2018, 1, 1, 5, 0, 0)
        result = ts.make_future_dataframe(periods=10, include_history=True)
        expected = pd.DataFrame({
            TIME_COL:
            pd.date_range(start=dt(2018, 1, 1, 3, 0, 0), periods=13, freq="H"),
            VALUE_COL:
            np.concatenate((ts.fit_y, np.repeat(np.nan, 10)))
        })
        expected.index = expected[TIME_COL]
        expected.index.name = None
        assert_frame_equal(result, expected)


def test_plot_grouping_evaluation():
    """Tests plot_grouping_evaluation function"""
    df = generate_df_for_tests(freq="D", periods=20)["df"]
    df.rename(columns={
        TIME_COL: "custom_time_column",
        VALUE_COL: "custom_value_column"
    },
              inplace=True)

    ts = UnivariateTimeSeries()
    ts.load_data(df,
                 time_col="custom_time_column",
                 value_col="custom_value_column")

    # groupby_time_feature
    fig = ts.plot_grouping_evaluation(aggregation_func=np.mean,
                                      aggregation_func_name="mean",
                                      groupby_time_feature="dow")

    assert fig.data[0].name == f"mean of {VALUE_COL}"
    assert fig.layout.xaxis.title.text == "dow"
    assert fig.layout.yaxis.title.text == f"mean of {VALUE_COL}"
    assert fig.layout.title.text == f"mean of {VALUE_COL} vs dow"
    assert fig.data[0].x.shape[0] == 7

    # groupby_sliding_window_size
    fig = ts.plot_grouping_evaluation(
        aggregation_func=np.max,
        aggregation_func_name="max",
        groupby_sliding_window_size=7
    )  # there are 20 training points, so this creates groups of size (6, 7, 7)
    assert fig.data[0].name == f"max of {VALUE_COL}"
    assert fig.layout.xaxis.title.text == f"{TIME_COL}_downsample"
    assert fig.layout.yaxis.title.text == f"max of {VALUE_COL}"
    assert fig.layout.title.text == f"max of {VALUE_COL} vs {TIME_COL}_downsample"
    assert fig.data[0].x.shape[0] == 3

    # groupby_custom_column
    custom_groups = pd.Series(["g1", "g2", "g3", "g4", "g5"],
                              name="custom_groups").repeat(4)
    fig = ts.plot_grouping_evaluation(aggregation_func=np.min,
                                      aggregation_func_name="min",
                                      groupby_custom_column=custom_groups)
    assert fig.data[0].name == f"min of {VALUE_COL}"
    assert fig.layout.xaxis.title.text == "custom_groups"
    assert fig.layout.yaxis.title.text == f"min of {VALUE_COL}"
    assert fig.layout.title.text == f"min of {VALUE_COL} vs custom_groups"
    assert fig.data[0].x.shape[0] == 5

    # custom xlabel, ylabel and title
    fig = ts.plot_grouping_evaluation(aggregation_func=np.mean,
                                      aggregation_func_name="mean",
                                      groupby_time_feature="dow",
                                      xlabel="Day of Week",
                                      ylabel="Average of y",
                                      title="Average of y by Day of week")
    assert fig.layout.xaxis.title.text == "Day of Week"
    assert fig.layout.yaxis.title.text == "Average of y"
    assert fig.layout.title.text == "Average of y by Day of week"


def test_plot():
    """Checks plot function"""
    # Plots with `color`
    ts = UnivariateTimeSeries()
    df = pd.DataFrame({
        TIME_COL: [
            dt(2018, 1, 1, 3, 0, 0),
            dt(2018, 1, 1, 4, 0, 0),
            dt(2018, 1, 1, 5, 0, 0)
        ],
        VALUE_COL: [1, 2, 3]
    })
    ts.load_data(df, TIME_COL, VALUE_COL)
    fig = ts.plot(color="green")
    assert len(fig.data) == 1
    assert fig.data[0].line.color == "green"
    with pytest.raises(
            ValueError,
            match=
            "There is no `anomaly_info` to show. `show_anomaly_adjustment` must be False."
    ):
        ts.plot(show_anomaly_adjustment=True)

    # Plots with `show_anomaly_adjustment`
    dl = DataLoaderTS()
    df = dl.load_beijing_pm()
    value_col = "pm"
    # Masks up to 2011-02-04-03, and adds 100.0 to the rest
    anomaly_df = pd.DataFrame({
        START_DATE_COL: ["2010-01-01-00", "2011-02-04-03"],
        END_DATE_COL: ["2011-02-04-03", "2014-12-31-23"],
        ADJUSTMENT_DELTA_COL: [np.nan, 100.0],
        METRIC_COL: [value_col, value_col]
    })
    anomaly_info = {
        "value_col": value_col,
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {
            METRIC_COL: value_col
        },
        "adjustment_method": "add"
    }
    ts = UnivariateTimeSeries()
    ts.load_data(df=df, value_col="pm", anomaly_info=anomaly_info)
    fig = ts.plot(show_anomaly_adjustment=True)
    assert len(fig.data) == 2
    assert fig.data[0].name == value_col
    assert fig.data[1].name == f"{value_col}_unadjusted"
    assert fig.layout.xaxis.title.text == ts.original_time_col
    assert fig.layout.yaxis.title.text == ts.original_value_col
    assert fig.data[0].y.shape[0] == df.shape[0]
    assert fig.data[1].y.shape[0] == df.shape[0]
    # adjusted data has more NaNs, since anomalies are replaced with NaN
    assert sum(np.isnan(fig.data[0].y)) == 10906
    assert sum(np.isnan(fig.data[1].y)) == 2067


def test_make_univariate_time_series(df):
    """Tests make_univariate_time_series function"""
    forecast = UnivariateForecast(df, train_end_date=datetime.datetime(2018, 1, 2))
    ts = UnivariateTimeSeries()
    ts.load_data(pd.DataFrame({
        cst.TIME_COL: df[cst.TIME_COL],
        cst.VALUE_COL: df[cst.PREDICTED_COL]
    }), cst.TIME_COL, cst.VALUE_COL)
    assert forecast.make_univariate_time_series().df.equals(ts.df)


def test_get_grouping_evaluation():
    """Tests get_grouping_evaluation function"""
    df = pd.DataFrame({
        "custom_time_column": [
            datetime.datetime(2018, 1, 1),
            datetime.datetime(2018, 1, 2),
            datetime.datetime(2018, 1, 3),
            datetime.datetime(2018, 1, 4),
            datetime.datetime(2018, 1, 5)
        ],
        "custom_value_column": [1.0, 2.0, 3.0, 4.0, 5.0],
    })

    ts = UnivariateTimeSeries()
    ts.load_data(df,
                 time_col="custom_time_column",
                 value_col="custom_value_column")

    # mean, groupby_time_feature
    grouped_df = ts.get_grouping_evaluation(aggregation_func=np.mean,
                                            aggregation_func_name="mean",
                                            groupby_time_feature="dow")
    expected = pd.DataFrame({
        # 1=Monday, 2=Tuesday, etc. The time feature is used as the column name.
        "dow": [1, 2, 3, 4, 5],
        f"mean of {VALUE_COL}": [1.0, 2.0, 3.0, 4.0, 5.0]
    })
    assert_equal(grouped_df, expected)

    # max, groupby_sliding_window_size
    grouped_df = ts.get_grouping_evaluation(aggregation_func=np.max,
                                            aggregation_func_name="max",
                                            groupby_sliding_window_size=2)
    expected = pd.DataFrame({
        f"{TIME_COL}_downsample": [
            datetime.datetime(2018, 1, 1),
            datetime.datetime(2018, 1, 3),
            datetime.datetime(2018, 1, 5)
        ],
        f"max of {VALUE_COL}": [1.0, 3.0, 5.0]
    })
    assert_equal(grouped_df, expected)

    # min, groupby_custom_column
    grouped_df = ts.get_grouping_evaluation(aggregation_func=np.min,
                                            aggregation_func_name=None,
                                            groupby_custom_column=pd.Series(
                                                ["g1", "g2", "g1", "g3", "g2"],
                                                name="custom_groups"))
    expected = pd.DataFrame({
        "custom_groups": ["g1", "g2", "g3"],
        f"aggregation of {VALUE_COL}": [1.0, 2.0, 4.0]
    })
    assert_equal(grouped_df, expected)


    def make_univariate_time_series(self):
        """Converts prediction into a UnivariateTimeSeries
        Useful to convert a forecast into the input regressor for a subsequent forecast.

        :return: UnivariateTimeSeries
        """
        ts = UnivariateTimeSeries()
        df = (self.df[[self.time_col, self.predicted_col]]
              .rename({self.predicted_col: self.ylabel}, axis=1))
        ts.load_data(df, self.time_col, self.ylabel)
        return ts
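

# For example (a sketch): given a fitted `UnivariateForecast` named `forecast`
# (hypothetical here), its predictions can be converted back into a time series
# and inspected before being used as a regressor input for a subsequent forecast.
ts = forecast.make_univariate_time_series()
fig = ts.plot()
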
def test_load_data_anomaly():
    """Checks anomaly_info parameter"""
    dl = DataLoaderTS()
    df = dl.load_beijing_pm()
    value_col = "pm"

    # no anomaly adjustment
    ts = UnivariateTimeSeries()
    ts.load_data(df=df, value_col=value_col)
    assert ts.df_before_adjustment is None

    # adjusts two columns
    dim_one = "one"
    dim_two = "two"
    anomaly_df = pd.DataFrame({
        START_DATE_COL: ["2011-04-04-10", "2011-10-10-00", "2012-12-20-10"],
        END_DATE_COL: ["2011-04-05-20", "2011-10-11-23", "2012-12-20-13"],
        ADJUSTMENT_DELTA_COL: [np.nan, 100.0, -100.0],
        METRIC_COL: [dim_one, dim_one, dim_two]
    })
    anomaly_info = [{
        "value_col": value_col,
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {
            METRIC_COL: dim_one
        },
        "adjustment_method": "add"
    }, {
        "value_col": "pres",
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {
            METRIC_COL: dim_two
        },
        "adjustment_method": "subtract"
    }]
    ts = UnivariateTimeSeries()
    ts.load_data(df=df, value_col=value_col, anomaly_info=anomaly_info)
    canonical_data_dict = get_canonical_data(df=df,
                                             value_col=value_col,
                                             anomaly_info=anomaly_info)
    assert_equal(ts.df, canonical_data_dict["df"])
    assert_equal(ts.df_before_adjustment,
                 canonical_data_dict["df_before_adjustment"])


def test_check_time_series_gaps():
    """Checks gaps filled for non-regular input"""
    ts = UnivariateTimeSeries()
    df = pd.DataFrame({
        TIME_COL: [
            dt(2018, 1, 1, 0, 0, 1),
            dt(2018, 1, 1, 0, 0, 2),
            dt(2018, 1, 1, 0, 0, 10),  # intentionally out of order
            dt(2018, 1, 1, 0, 0, 4)
        ],
        VALUE_COL: [1, 2, 3, 4]
    })

    expected = pd.Series([
        dt(2018, 1, 1, 0, 0, 1),
        dt(2018, 1, 1, 0, 0, 2),
        dt(2018, 1, 1, 0, 0, 3),
        dt(2018, 1, 1, 0, 0, 4),
        dt(2018, 1, 1, 0, 0, 5),
        dt(2018, 1, 1, 0, 0, 6),
        dt(2018, 1, 1, 0, 0, 7),
        dt(2018, 1, 1, 0, 0, 8),
        dt(2018, 1, 1, 0, 0, 9),
        dt(2018, 1, 1, 0, 0, 10)
    ])
    expected.index = expected
    ts.load_data(
        df, TIME_COL, VALUE_COL,
        freq="S")  # the frequency should be provided when there are gaps
    assert ts.df[TIME_COL].equals(expected)
    assert ts.time_stats["data_points"] == 10  # after filling in gaps
    assert ts.value_stats["count"] == 4  # before filling in gaps
    assert ts.time_stats["added_timepoints"] == 6
    assert ts.time_stats["dropped_timepoints"] == 0
    assert ts.df[VALUE_COL].equals(ts.y)

    expected_gaps = pd.DataFrame({
        "right_before_gap":
        pd.Series([dt(2018, 1, 1, 0, 0, 2),
                   dt(2018, 1, 1, 0, 0, 4)]),
        "right_after_gap":
        pd.Series([dt(2018, 1, 1, 0, 0, 4),
                   dt(2018, 1, 1, 0, 0, 10)]),
        "gap_size": [1.0, 5.0]
    })
    assert ts.time_stats["gaps"].equals(expected_gaps)


def test_check_time_series_tz_local():
    """Checks date parsing with localization"""
    expected = pd.Series([
        dt(2018, 1, 1, 0, 0, 0),
        dt(2018, 1, 5, 0, 0, 0),
        dt(2018, 1, 9, 0, 0, 0)
    ])
    expected.index = expected
    expected = expected.tz_localize("US/Pacific")
    ts = UnivariateTimeSeries()
    df = pd.DataFrame({
        TIME_COL: ["2018-01-01", "2018-01-05", "2018-01-09"],
        VALUE_COL: [1, 2, 3]
    })
    ts.load_data(df, TIME_COL, VALUE_COL, tz="US/Pacific", freq="4D")
    assert ts.time_stats["added_timepoints"] == 0
    assert ts.time_stats["dropped_timepoints"] == 0
    assert ts.df[TIME_COL].equals(expected)
    assert ts.df[VALUE_COL].equals(ts.y)


def test_check_time_series2():
    """Checks value column stats"""
    ts = UnivariateTimeSeries()
    df = pd.DataFrame({
        TIME_COL: [
            dt(2018, 1, 1, 0, 0, 1),
            dt(2018, 1, 1, 0, 0, 2),
            dt(2018, 1, 1, 0, 0, 3)
        ],
        VALUE_COL: [1, 2, 3]
    })
    ts.load_data(df, TIME_COL, VALUE_COL)
    assert ts.value_stats["mean"] == 2.0
    assert ts.value_stats["std"] == 1.0
    assert ts.value_stats["min"] == 1.0
    assert ts.value_stats["25%"] == 1.5
    assert ts.value_stats["50%"] == 2.0
    assert ts.value_stats["75%"] == 2.5
    assert ts.value_stats["max"] == 3.0
    assert ts.df[VALUE_COL].equals(ts.y)


def test_check_time_series1():
    """Checks if regular data can be properly loaded. Checks time column stats"""
    ts = UnivariateTimeSeries()
    df = pd.DataFrame({
        "time": [
            dt(2018, 1, 1, 0, 0, 1),
            dt(2018, 1, 1, 0, 0, 2),
            dt(2018, 1, 1, 0, 0, 3)
        ],
        "val": [1, 2, 3]
    })
    ts.load_data(df, "time", "val")
    assert ts.original_time_col == "time"
    assert ts.original_value_col == "val"
    assert ts.time_stats["data_points"] == 3
    assert ts.time_stats["mean_increment_secs"] == 1.0
    assert ts.time_stats["min_timestamp"] == df.min()[0]
    assert ts.time_stats["max_timestamp"] == df.max()[0]
    assert ts.time_stats["added_timepoints"] == 0
    assert ts.time_stats["dropped_timepoints"] == 0
    assert ts.freq == "S"
    assert ts.df[VALUE_COL].equals(ts.y)
    assert ts.df.index.name is None


def forecast_pipeline(
        # input
        df: pd.DataFrame,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        date_format=None,
        tz=None,
        freq=None,
        train_end_date=None,
        anomaly_info=None,
        # model
        pipeline=None,
        regressor_cols=None,
        lagged_regressor_cols=None,
        estimator=SimpleSilverkiteEstimator(),
        hyperparameter_grid=None,
        hyperparameter_budget=None,
        n_jobs=COMPUTATION_N_JOBS,
        verbose=1,
        # forecast
        forecast_horizon=None,
        coverage=0.95,
        test_horizon=None,
        periods_between_train_test=None,
        agg_periods=None,
        agg_func=None,
        # evaluation
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        score_func_greater_is_better=False,
        cv_report_metrics=CV_REPORT_METRICS_ALL,
        null_model_params=None,
        relative_error_tolerance=None,
        # CV
        cv_horizon=None,
        cv_min_train_periods=None,
        cv_expanding_window=False,
        cv_use_most_recent_splits=False,
        cv_periods_between_splits=None,
        cv_periods_between_train_test=None,
        cv_max_splits=3):
    """Computation pipeline for end-to-end forecasting.

    Trains a forecast model end-to-end:

        1. checks input data
        2. runs cross-validation to select optimal hyperparameters e.g. best model
        3. evaluates best model on test set
        4. provides forecast of best model (re-trained on all data) into the future

    Returns forecasts with methods to plot and see diagnostics.
    Also returns the fitted pipeline and CV results.

    Provides a high degree of customization over training and evaluation parameters:

        1. model
        2. cross validation
        3. evaluation
        4. forecast horizon

    See test cases for examples.
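
    For example (a minimal sketch; the column names are illustrative)::

        result = forecast_pipeline(
            df=df,
            time_col="ts",
            value_col="y",
            forecast_horizon=30)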

    Parameters
    ----------
    df : `pandas.DataFrame`
        Timeseries data to forecast.
        Contains columns [`time_col`, `value_col`], and optional regressor columns.
        Regressor columns should include future values for prediction.

    time_col : `str`, default TIME_COL in constants.py
        Name of the timestamp column in ``df``.

    value_col : `str`, default VALUE_COL in constants.py
        Name of the value column in ``df`` (the values to forecast).

    date_format : `str` or None, default None
        strftime format to parse the time column, e.g. ``%m/%d/%Y``.
        Note that ``%f`` will parse all the way up to nanoseconds.
        If None (recommended), inferred by `pandas.to_datetime`.

    tz : `str` or None, default None
        Passed to `pandas.tz_localize` to localize the timestamp.

    freq : `str` or None, default None
        Frequency of input data. Used to generate future dates for prediction.
        Frequency strings can have multiples, e.g. '5H'.
        See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
        for a list of frequency aliases.
        If None, inferred by `pandas.infer_freq`.
        Provide this parameter if ``df`` has missing timepoints.

    train_end_date : `datetime.datetime`, optional, default None
        Last date to use for fitting the model. Forecasts are generated after this date.
        If None, it is set to the last date with a non-null value in
        ``value_col`` of ``df``.

    anomaly_info : `dict` or `list` [`dict`] or None, default None
        Anomaly adjustment info. Anomalies in ``df``
        are corrected before any forecasting is done.

        If None, no adjustments are made.

        A dictionary containing the parameters to
        `~greykite.common.features.adjust_anomalous_data.adjust_anomalous_data`.
        See that function for details.
        The possible keys are:

            ``"value_col"`` : `str`
                The name of the column in ``df`` to adjust. You may adjust the value
                to forecast as well as any numeric regressors.
            ``"anomaly_df"`` : `pandas.DataFrame`
                Adjustments to correct the anomalies.
            ``"start_date_col"``: `str`, default START_DATE_COL
                Start date column in ``anomaly_df``.
            ``"end_date_col"``: `str`, default END_DATE_COL
                End date column in ``anomaly_df``.
            ``"adjustment_delta_col"``: `str` or None, default None
                Impact column in ``anomaly_df``.
            ``"filter_by_dict"``: `dict` or None, default None
                Used to filter ``anomaly_df`` to the relevant anomalies for
                the ``value_col`` in this dictionary.
                Key specifies the column name, value specifies the filter value.
            ``"filter_by_value_col""``: `str` or None, default None
                Adds ``{filter_by_value_col: value_col}`` to ``filter_by_dict``
                if not None, for the ``value_col`` in this dictionary.
            ``"adjustment_method"`` : `str` ("add" or "subtract"), default "add"
                How to make the adjustment, if ``adjustment_delta_col`` is provided.

        Accepts a list of such dictionaries to adjust multiple columns in ``df``.
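
        For example (a sketch using the column-name constants from
        ``greykite.common.constants``)::

            anomaly_info = {
                "value_col": "y",
                "anomaly_df": anomaly_df,
                "start_date_col": START_DATE_COL,
                "end_date_col": END_DATE_COL,
                "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
                "filter_by_dict": {METRIC_COL: "y"},
                "adjustment_method": "add",
            }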

    pipeline : `sklearn.pipeline.Pipeline` or None, default None
        Pipeline to fit. The final named step must be called "estimator".
        If None, will use the default Pipeline from
        `~greykite.framework.pipeline.utils.get_basic_pipeline`.

    regressor_cols : `list` [`str`] or None, default None
        A list of regressor columns used in the training and prediction DataFrames.
        It should contain only the regressors that are being used in the grid search.
        If None, no regressor columns are used.
        Regressor columns that are unavailable in ``df`` are dropped.

    lagged_regressor_cols : `list` [`str`] or None, default None
        A list of additional columns needed for lagged regressors in the training and prediction DataFrames.
        This list can have overlap with ``regressor_cols``.
        If None, no additional columns are added to the DataFrame.
        Lagged regressor columns that are unavailable in ``df`` are dropped.

    estimator : instance of an estimator that implements `greykite.algo.models.base_forecast_estimator.BaseForecastEstimator`
        Estimator to use as the final step in the pipeline.
        Ignored if ``pipeline`` is provided.

    forecast_horizon : `int` or None, default None
        Number of periods to forecast into the future. Must be > 0.
        If None, the default is determined from the input data frequency.

    coverage : `float` or None, default 0.95
        Intended coverage of the prediction bands (0.0 to 1.0).
        If None, the upper/lower predictions are not returned.
        Ignored if ``pipeline`` is provided; the coverage of the ``pipeline`` estimator is used instead.

    test_horizon : `int` or None, default None
        Number of periods held back from the end of ``df`` for testing.
        The rest is used for cross validation.
        If None, the default is ``forecast_horizon``. Set to 0 to skip backtest.

    periods_between_train_test : `int` or None, default None
        Number of periods for the gap between train and test data.
        If None, default is 0.

    agg_periods : `int` or None, default None
        Number of periods to aggregate before evaluation.

        Model is fit and forecasted on the dataset's original frequency.

        Before evaluation, the actual and forecasted values are aggregated,
        using rolling windows of size ``agg_periods`` and the function
        ``agg_func``. (e.g. if the dataset is hourly, use ``agg_periods=24, agg_func=np.sum``,
        to evaluate performance on the daily totals).

        If None, does not aggregate before evaluation.

        Currently, this is only used when calculating CV metrics and
        the R2_null_model_score metric in backtest/forecast. No pre-aggregation
        is applied for the other backtest/forecast evaluation metrics.

    agg_func : callable or None, default None
        Takes an array and returns a number, e.g. np.max, np.sum.

        Defines how to aggregate rolling windows of actual and predicted values
        before evaluation.

        Ignored if ``agg_periods`` is None.

        Currently, this is only used when calculating CV metrics and
        the R2_null_model_score metric in backtest/forecast. No pre-aggregation
        is applied for the other backtest/forecast evaluation metrics.

    score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name``
        Score function used to select optimal model in CV.
        If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float.
        If a string, must be either a
        `~greykite.common.evaluation.EvaluationMetricEnum` member name
        or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.
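
        For example, a custom loss function (a sketch; pair it with
        ``score_func_greater_is_better=False``)::

            def score_func(y_true, y_pred):
                # Mean absolute error; lower is better.
                return np.mean(np.abs(np.array(y_true) - np.array(y_pred)))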

    score_func_greater_is_better : `bool`, default False
        True if ``score_func`` is a score function, meaning higher is better,
        and False if it is a loss function, meaning lower is better.
        Must be provided if ``score_func`` is a callable (custom function).
        Ignored if ``score_func`` is a string, because the direction is known.

    cv_report_metrics : `str`, or `list` [`str`], or None, default `~greykite.framework.constants.CV_REPORT_METRICS_ALL`
        Additional metrics to compute during CV, besides the one specified by ``score_func``.

            - If the string constant `greykite.framework.constants.CV_REPORT_METRICS_ALL`,
              computes all metrics in ``EvaluationMetricEnum``. Also computes
              ``FRACTION_OUTSIDE_TOLERANCE`` if ``relative_error_tolerance`` is not None.
              The results are reported by the short name (``.get_metric_name()``) for ``EvaluationMetricEnum``
              members and ``FRACTION_OUTSIDE_TOLERANCE_NAME`` for ``FRACTION_OUTSIDE_TOLERANCE``.
              These names appear in the keys of ``forecast_result.grid_search.cv_results_``
              returned by this function.
            - If a list of strings, each of the listed metrics is computed. Valid strings are
              `~greykite.common.evaluation.EvaluationMetricEnum` member names
              and `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.

              For example::

                ["MeanSquaredError", "MeanAbsoluteError", "MeanAbsolutePercentError", "MedianAbsolutePercentError", "FractionOutsideTolerance2"]

            - If None, no additional metrics are computed.

    null_model_params : `dict` or None, default None
        Defines baseline model to compute ``R2_null_model_score`` evaluation metric.
        ``R2_null_model_score`` is the improvement in the loss function relative
        to a null model. It can be used to evaluate model quality with respect to
        a simple baseline. For details, see
        `~greykite.common.evaluation.r2_null_model_score`.

        The null model is a `~sklearn.dummy.DummyRegressor`,
        which returns constant predictions.

        Valid keys are "strategy", "constant", "quantile".
        See `~sklearn.dummy.DummyRegressor`. For example::

            null_model_params = {
                "strategy": "mean",
            }
            null_model_params = {
                "strategy": "median",
            }
            null_model_params = {
                "strategy": "quantile",
                "quantile": 0.8,
            }
            null_model_params = {
                "strategy": "constant",
                "constant": 2.0,
            }

        If None, ``R2_null_model_score`` is not calculated.

        Note: CV model selection always optimizes ``score_func``, not
        the ``R2_null_model_score``.

    relative_error_tolerance : `float` or None, default None
        Threshold to compute the ``Outside Tolerance`` metric,
        defined as the fraction of forecasted values whose relative
        error is strictly greater than ``relative_error_tolerance``.
        For example, 0.05 allows for 5% relative error.
        If `None`, the metric is not computed.

    hyperparameter_grid : `dict`, `list` [`dict`] or None, default None
        Sets properties of the steps in the pipeline,
        and specifies combinations to search over.
        Should be valid input to `sklearn.model_selection.GridSearchCV` (param_grid)
        or `sklearn.model_selection.RandomizedSearchCV` (param_distributions).

        Prefix transform/estimator attributes by the name of the step in the pipeline.
        See details at: https://scikit-learn.org/stable/modules/compose.html#nested-parameters

        If None, uses the default pipeline parameters.
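
        For example (a sketch; valid parameter names depend on the estimator)::

            hyperparameter_grid = {
                "estimator__weekly_seasonality": [True, False],
                "estimator__fit_algorithm_dict": [{"fit_algorithm": "linear"}],
            }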

    hyperparameter_budget : `int` or None, default None
        Max number of hyperparameter sets to try within the ``hyperparameter_grid`` search space.

        Runs a full grid search if ``hyperparameter_budget`` is sufficient to exhaust the full
        ``hyperparameter_grid``; otherwise, samples uniformly at random from the space.

        If None, uses defaults:

            * full grid search if all values are constant
            * 10 if any value is a distribution to sample from

    n_jobs : `int` or None, default `~greykite.framework.constants.COMPUTATION_N_JOBS`
        Number of jobs to run in parallel
        (the maximum number of concurrently running workers).
        ``-1`` uses all CPUs. ``-2`` uses all CPUs but one.
        ``None`` is treated as 1 unless in a `joblib.Parallel` backend context
        that specifies otherwise.

    verbose : `int`, default 1
        Verbosity level during CV.
        If > 0, prints the number of fits.
        If > 1, prints fit parameters, total score, and fit time.
        If > 2, prints train/test scores.

    cv_horizon : `int` or None, default None
        Number of periods in each CV test set.
        If None, the default is ``forecast_horizon``.
        Set either ``cv_horizon`` or ``cv_max_splits`` to 0 to skip CV.

    cv_min_train_periods : `int` or None, default None
        Minimum number of periods for training each CV fold.
        If ``cv_expanding_window`` is False, every training window is this size.
        If None, the default is 2 * ``cv_horizon``.

    cv_expanding_window : `bool`, default False
        If True, training window for each CV split is fixed to the first available date.
        Otherwise, train start date is sliding, determined by ``cv_min_train_periods``.

    cv_use_most_recent_splits : `bool`, default False
        If True, splits from the end of the dataset are used.
        Else a sampling strategy is applied. Check
        `~greykite.sklearn.cross_validation.RollingTimeSeriesSplit._sample_splits`
        for details.

    cv_periods_between_splits : `int` or None, default None
        Number of periods to slide the test window between CV splits.
        If None, the default is ``cv_horizon``.

    cv_periods_between_train_test : `int` or None, default None
        Number of periods for the gap between train and test in a CV split.
        If None, default is ``periods_between_train_test``.

    cv_max_splits : `int` or None, default 3
        Maximum number of CV splits.
        Given the above configuration, samples up to ``cv_max_splits`` train/test splits,
        preferring splits toward the end of the available data. If None, uses all splits.
        Set either ``cv_horizon`` or ``cv_max_splits`` to 0 to skip CV.

    Returns
    -------
    forecast_result : :class:`~greykite.framework.pipeline.pipeline.ForecastResult`
        Forecast result. See :class:`~greykite.framework.pipeline.pipeline.ForecastResult`
        for details.

            * If ``cv_horizon=0``, ``forecast_result.grid_search.best_estimator_``
              and ``forecast_result.grid_search.best_params_`` attributes are defined
              according to the provided single set of parameters. There must be a single
              set of parameters to skip cross-validation.
            * If ``test_horizon=0``, ``forecast_result.backtest`` is None.
    """
    if hyperparameter_grid is None or hyperparameter_grid == []:
        hyperparameter_grid = {}
    # When hyperparameter_grid is a singleton list, unlist it
    if isinstance(hyperparameter_grid, list) and len(hyperparameter_grid) == 1:
        hyperparameter_grid = hyperparameter_grid[0]

    # Loads full dataset
    ts = UnivariateTimeSeries()
    ts.load_data(
        df=df,
        time_col=time_col,
        value_col=value_col,
        freq=freq,
        date_format=date_format,
        tz=tz,
        train_end_date=train_end_date,
        regressor_cols=regressor_cols,
        lagged_regressor_cols=lagged_regressor_cols,
        anomaly_info=anomaly_info)

    # Splits data into training and test sets. ts.df uses standardized column names
    if test_horizon == 0:
        train_df = ts.fit_df
        train_y = ts.fit_y
        test_df = pd.DataFrame(columns=list(df.columns))
    else:
        # Make sure to refit best_pipeline appropriately
        train_df, test_df, train_y, test_y = train_test_split(
            ts.fit_df,
            ts.fit_y,
            train_size=ts.fit_df.shape[0] - test_horizon - periods_between_train_test,
            test_size=test_horizon + periods_between_train_test,
            shuffle=False)  # this is important since this is timeseries forecasting!
    log_message(f"Train size: {train_df.shape[0]}. Test size: {test_df.shape[0]}", LoggingLevelEnum.INFO)

    # Defines default training pipeline
    if pipeline is None:
        pipeline = get_basic_pipeline(
            estimator=estimator,
            score_func=score_func,
            score_func_greater_is_better=score_func_greater_is_better,
            agg_periods=agg_periods,
            agg_func=agg_func,
            relative_error_tolerance=relative_error_tolerance,
            coverage=coverage,
            null_model_params=null_model_params,
            regressor_cols=ts.regressor_cols,
            lagged_regressor_cols=ts.lagged_regressor_cols)

    # Searches for the best parameters, and refits model with selected parameters on the entire training set
    if cv_horizon == 0 or cv_max_splits == 0:
        # No cross-validation. Only one set of hyperparameters is allowed.
        try:
            if len(ParameterGrid(hyperparameter_grid)) > 1:
                raise ValueError(
                    "CV is required to identify the best model because there are multiple options "
                    "in `hyperparameter_grid`. Either provide a single option or set `cv_horizon` and `cv_max_splits` "
                    "to nonzero values.")
        except TypeError:  # Parameter value is not iterable
            raise ValueError(
                "CV is required to identify the best model because `hyperparameter_grid` contains "
                "a distribution. Either remove the distribution or set `cv_horizon` and `cv_max_splits` "
                "to nonzero values.")

        # Fits model to entire train set. Params must be set manually since it's not done by grid search
        params = {k: v[0] for k, v in hyperparameter_grid.items()}  # unpack lists, `v` is a singleton list with the parameter value
        best_estimator = pipeline.set_params(**params).fit(train_df, train_y)

        # Wraps this model in a dummy RandomizedSearchCV object to return the backtest model
        grid_search = get_hyperparameter_searcher(
            hyperparameter_grid=hyperparameter_grid,
            model=pipeline,
            cv=None,  # no cross-validation
            hyperparameter_budget=hyperparameter_budget,
            n_jobs=n_jobs,
            verbose=verbose,
            score_func=score_func,
            score_func_greater_is_better=score_func_greater_is_better,
            cv_report_metrics=cv_report_metrics,
            agg_periods=agg_periods,
            agg_func=agg_func,
            relative_error_tolerance=relative_error_tolerance)
        # Sets relevant attributes. Others are undefined (cv_results_, best_score_, best_index_, scorer_, refit_time_)
        grid_search.best_estimator_ = best_estimator
        grid_search.best_params_ = params
        grid_search.n_splits_ = 0
    else:
        # Defines cross-validation splitter
        cv = RollingTimeSeriesSplit(
            forecast_horizon=cv_horizon,
            min_train_periods=cv_min_train_periods,
            expanding_window=cv_expanding_window,
            use_most_recent_splits=cv_use_most_recent_splits,
            periods_between_splits=cv_periods_between_splits,
            periods_between_train_test=cv_periods_between_train_test,
            max_splits=cv_max_splits)

        # Defines grid search approach for CV
        grid_search = get_hyperparameter_searcher(
            hyperparameter_grid=hyperparameter_grid,
            model=pipeline,
            cv=cv,
            hyperparameter_budget=hyperparameter_budget,
            n_jobs=n_jobs,
            verbose=verbose,
            score_func=score_func,
            score_func_greater_is_better=score_func_greater_is_better,
            cv_report_metrics=cv_report_metrics,
            agg_periods=agg_periods,
            agg_func=agg_func,
            relative_error_tolerance=relative_error_tolerance)
        grid_search.fit(train_df, train_y)
        best_estimator = grid_search.best_estimator_

    # Evaluates historical performance, fits model to all data (train+test)
    if test_horizon > 0:
        backtest_train_end_date = train_df[TIME_COL].max()
        # Uses pd.date_range because pd.Timedelta does not work for complicated frequencies e.g. "W-MON"
        backtest_test_start_date = pd.date_range(
            start=backtest_train_end_date,
            periods=periods_between_train_test + 2,  # Adds 2 because the `start` parameter is inclusive
            freq=ts.freq)[-1]
        backtest = get_forecast(
            df=ts.fit_df,  # Backtest needs to happen on fit_df, not on the entire df
            trained_model=best_estimator,
            train_end_date=backtest_train_end_date,
            test_start_date=backtest_test_start_date,
            forecast_horizon=test_horizon,
            xlabel=time_col,
            ylabel=value_col,
            relative_error_tolerance=relative_error_tolerance)
        best_pipeline = clone(best_estimator)  # Copies optimal parameters
        best_pipeline.fit(ts.fit_df, ts.y)  # Refits this model on entire training dataset
    else:
        backtest = None  # Backtest training metrics are the same as forecast training metrics
        best_pipeline = best_estimator  # best_model is already fit to all data

    # Makes future predictions
    periods = forecast_horizon + periods_between_train_test
    future_df = ts.make_future_dataframe(
        periods=periods,
        include_history=True)

    forecast_train_end_date = ts.train_end_date
    # Uses pd.date_range because pd.Timedelta does not work for complicated frequencies e.g. "W-MON"
    forecast_test_start_date = pd.date_range(
        start=forecast_train_end_date,
        periods=periods_between_train_test + 2,  # Adds 2 because the `start` parameter is inclusive
        freq=ts.freq)[-1]
    forecast = get_forecast(
        df=future_df,
        trained_model=best_pipeline,
        train_end_date=forecast_train_end_date,
        test_start_date=forecast_test_start_date,
        forecast_horizon=forecast_horizon,
        xlabel=time_col,
        ylabel=value_col,
        relative_error_tolerance=relative_error_tolerance)

    result = ForecastResult(
        timeseries=ts,
        grid_search=grid_search,
        model=best_pipeline,
        backtest=backtest,
        forecast=forecast
    )
    return result
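

# For example (a sketch; assumes `df` has a timestamp column "ts" and a daily
# value column "y"):
result = forecast_pipeline(
    df=df,
    time_col="ts",
    value_col="y",
    forecast_horizon=30,
    coverage=0.95)
print(result.grid_search.best_params_)  # hyperparameters selected by CV
fig = result.forecast.plot()            # forecasted vs. actual values
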
def test_make_future_dataframe():
    """Checks future dataframe creation"""
    ts = UnivariateTimeSeries()
    df = pd.DataFrame({
        TIME_COL: [
            dt(2018, 1, 1, 3, 0, 0),
            dt(2018, 1, 1, 4, 0, 0),
            dt(2018, 1, 1, 5, 0, 0),
            dt(2018, 1, 1, 6, 0, 0),
            dt(2018, 1, 1, 7, 0, 0)
        ],
        VALUE_COL: [1, None, 3, None, None],
    })
    with pytest.warns(UserWarning) as record:
        ts.load_data(df, TIME_COL, VALUE_COL, regressor_cols=None)
        assert f"{ts.original_value_col} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]

    # test regressor_cols from load_data
    assert ts.regressor_cols == []

    # tests last_date_for_val from load_data
    assert ts.last_date_for_val == dt(2018, 1, 1, 5, 0, 0)
    assert ts.train_end_date == dt(2018, 1, 1, 5, 0, 0)

    # tests last_date_for_reg from load_data
    assert ts.last_date_for_reg is None

    # tests fit_df from load_data
    result_fit_df = ts.fit_df.reset_index(
        drop=True)  # fit_df's index is time_col
    expected_fit_df = df[df[TIME_COL] <= dt(2018, 1, 1, 5, 0, 0)]
    assert_frame_equal(result_fit_df, expected_fit_df)

    # tests fit_y
    result_fit_y = ts.fit_y.reset_index(drop=True)  # fit_y's index is time_col
    expected_fit_y = expected_fit_df[VALUE_COL]
    assert result_fit_y.equals(expected_fit_y)

    # with history, default value for periods
    result = ts.make_future_dataframe(periods=None, include_history=True)
    expected = pd.DataFrame({
        TIME_COL:
        pd.date_range(start=dt(2018, 1, 1, 3, 0, 0), periods=33, freq="H"),
        VALUE_COL:
        np.concatenate((result_fit_y, np.repeat(np.nan, 30)))
    })
    expected.index = expected[TIME_COL]
    expected.index.name = None
    assert_frame_equal(result, expected)

    # without history
    result = ts.make_future_dataframe(periods=10, include_history=False)
    expected = pd.DataFrame({
        TIME_COL:
        pd.date_range(start=dt(2018, 1, 1, 6, 0, 0), periods=10, freq="H"),
        VALUE_COL:
        np.repeat(np.nan, 10)
    })
    expected.index = expected[TIME_COL]
    expected.index.name = None
    expected.index.freq = "H"
    assert_frame_equal(result, expected)


import numpy as np
import plotly

from greykite.framework.input.univariate_time_series import UnivariateTimeSeries
from greykite.framework.constants import MEAN_COL_GROUP, OVERLAY_COL_GROUP
from greykite.common.constants import TIME_COL
from greykite.common.data_loader import DataLoader
from greykite.common.viz.timeseries_plotting import add_groupby_column, plot_multivariate, plot_univariate

# Loads dataset into pandas DataFrame
dl = DataLoader()
df = dl.load_peyton_manning()
df.rename(columns={"y": "log(pageviews)"},
          inplace=True)  # uses a more informative name

# Plots the dataset
ts = UnivariateTimeSeries()
ts.load_data(df=df, time_col="ts", value_col="log(pageviews)", freq="D")
fig = ts.plot()
plotly.io.show(fig)

# %%
# Yearly seasonality
# ------------------
# Because the observations are at daily frequency,
# it is possible to see yearly, quarterly, monthly, and weekly seasonality.
# The name of the seasonality refers to the length of one cycle. For example,
# yearly seasonality is a pattern that repeats once a year.
#
# .. tip::
#   It's helpful to start with the longest cycle to see the big picture.
#
# To examine yearly seasonality, plot the average value by day of year.
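# A minimal sketch of one way to do this, using the ``plot_grouping_evaluation``
# method exercised in the tests above (the full tutorial may use a dedicated
# seasonality plot):
fig = ts.plot_grouping_evaluation(
    aggregation_func=np.mean,
    aggregation_func_name="mean",
    groupby_time_feature="doy")  # "doy" = day of year
plotly.io.show(fig)
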
def test_get_quantiles_and_overlays():
    """Tests get_quantiles_and_overlays"""
    dl = DataLoaderTS()
    peyton_manning_ts = dl.load_peyton_manning_ts()

    # no columns are requested
    with pytest.raises(
            ValueError,
            match=
            "Must enable at least one of: show_mean, show_quantiles, show_overlays."
    ):
        peyton_manning_ts.get_quantiles_and_overlays(
            groupby_time_feature="doy")

    # show_mean only
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dow",
        show_mean=True,
        mean_col_name="custom_name")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays([[MEAN_COL_GROUP], ["custom_name"]],
                                  names=["category", "name"]))
    assert grouped_df.index.name == "dow"
    assert grouped_df.shape == (7, 1)
    assert grouped_df.index[0] == 1

    # show_quantiles only (bool)
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180, show_quantiles=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[QUANTILE_COL_GROUP, QUANTILE_COL_GROUP], ["Q0.1", "Q0.9"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (17, 2)
    assert grouped_df.index[0] == pd.Timestamp(2007, 12, 10)

    # show_quantiles only (list)
    custom_col = pd.Series(
        np.random.choice(list("abcd"), size=peyton_manning_ts.df.shape[0]))
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_custom_column=custom_col,
        show_quantiles=[0, 0.25, 0.5, 0.75, 1],
        quantile_col_prefix="prefix")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[QUANTILE_COL_GROUP] * 5,
             ["prefix0", "prefix0.25", "prefix0.5", "prefix0.75", "prefix1"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "groups"
    assert grouped_df.shape == (4, 5)
    assert grouped_df.index[0] == "a"
    # checks quantile computation
    df = peyton_manning_ts.df.copy()
    df["custom_col"] = custom_col.values
    quantile_df = df.groupby("custom_col")[VALUE_COL].agg(
        [np.nanmin, np.nanmedian, np.nanmax])
    assert_equal(grouped_df["quantile"]["prefix0"],
                 quantile_df["nanmin"],
                 check_names=False)
    assert_equal(grouped_df["quantile"]["prefix0.5"],
                 quantile_df["nanmedian"],
                 check_names=False)
    assert_equal(grouped_df["quantile"]["prefix1"],
                 quantile_df["nanmax"],
                 check_names=False)

    # show_overlays only (bool), no overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="doy", show_overlays=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 9, [f"overlay{i}" for i in range(9)]],
            names=["category", "name"]))
    assert grouped_df.index.name == "doy"
    assert grouped_df.shape == (366, 9)
    assert grouped_df.index[0] == 1

    # show_overlays only (int below the available number), time feature overlay label
    np.random.seed(123)
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="doy",
        show_overlays=4,
        overlay_label_time_feature="year")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 4, ["2007", "2011", "2012", "2014"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "doy"
    assert grouped_df.shape == (366, 4)
    assert grouped_df.index[0] == 1

    # show_overlays only (int above the available number), custom overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=200,
        overlay_label_custom_column=custom_col)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 4, ["a", "b", "c", "d"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 4)
    assert grouped_df.index[0] == 1

    # show_overlays only (list of indices), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=[0, 4],
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 2,
             ["2007-12-10 00:00:00", "2015-12-08 00:00:00"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 2)
    assert grouped_df.index[0] == 1

    # show_overlays only (np.ndarray), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=np.arange(0, 6, 2),
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 3,
             [
                 "2007-12-10 00:00:00", "2011-12-09 00:00:00",
                 "2015-12-08 00:00:00"
             ]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 3)
    assert grouped_df.index[0] == 1

    # show_overlays only (list of column names), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=["2007-12-10 00:00:00", "2015-12-08 00:00:00"],
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 2,
             ["2007-12-10 00:00:00", "2015-12-08 00:00:00"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 2)
    assert grouped_df.index[0] == 1

    # Show all 3 (no overlay label)
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=50,  # 50 per group (50 overlays)
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],  # 3 quantiles
        show_overlays=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[MEAN_COL_GROUP] + [QUANTILE_COL_GROUP] * 3 +
             [OVERLAY_COL_GROUP] * 50, ["mean", "Q0.05", "Q0.5", "Q0.95"] +
             [f"overlay{i}" for i in range(50)]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (60, 54)
    assert grouped_df.index[-1] == pd.Timestamp(2016, 1, 7)

    # Show all 3 (with overlay label).
    # Pass overlay_pivot_table_kwargs.
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],  # 3 quantiles
        show_overlays=True,
        overlay_label_time_feature="dow",  # 7 possible values
        aggfunc="median")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[MEAN_COL_GROUP] + [QUANTILE_COL_GROUP] * 3 +
             [OVERLAY_COL_GROUP] * 7,
             [
                 "mean", "Q0.05", "Q0.5", "Q0.95", "1", "2", "3", "4", "5",
                 "6", "7"
             ]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (17, 11)
    assert grouped_df.index[-1] == pd.Timestamp(2015, 10, 29)
    assert np.linalg.norm(
        grouped_df[OVERLAY_COL_GROUP].mean()) > 1.0  # not centered

    with pytest.raises(
            TypeError,
            match="pivot_table\\(\\) got an unexpected keyword argument 'aggfc'"
    ):
        peyton_manning_ts.get_quantiles_and_overlays(
            groupby_sliding_window_size=180,
            show_mean=True,
            show_quantiles=[0.05, 0.5, 0.95],
            show_overlays=True,
            overlay_label_time_feature="dow",
            aggfc=np.nanmedian)  # unrecognized parameter

    # center_values with show_mean=True
    centered_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],
        show_overlays=True,
        overlay_label_time_feature="dow",
        aggfunc="median",
        center_values=True)
    assert np.linalg.norm(centered_df[[MEAN_COL_GROUP, OVERLAY_COL_GROUP
                                       ]].mean()) < 1e-8  # centered at 0
    assert_equal(
        centered_df[QUANTILE_COL_GROUP],
        grouped_df[QUANTILE_COL_GROUP] - grouped_df[MEAN_COL_GROUP].mean()[0])

    # center_values with show_mean=False
    centered_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=False,
        show_quantiles=[0.05, 0.5, 0.95],
        show_overlays=True,
        overlay_label_time_feature="dow",
        aggfunc="median",
        center_values=True)
    assert np.linalg.norm(centered_df[[OVERLAY_COL_GROUP
                                       ]].mean()) < 1e-8  # centered at 0
    overall_mean = peyton_manning_ts.df[VALUE_COL].mean()
    assert_equal(centered_df[QUANTILE_COL_GROUP],
                 grouped_df[QUANTILE_COL_GROUP] - overall_mean)

    # new value_col
    df = generate_df_with_reg_for_tests(freq="D", periods=700)["df"]
    ts = UnivariateTimeSeries()
    ts.load_data(df=df)
    grouped_df = ts.get_quantiles_and_overlays(
        groupby_time_feature="dow",
        show_mean=True,
        show_quantiles=True,
        show_overlays=True,
        overlay_label_time_feature="woy",
        value_col="regressor1")

    df_dow = add_groupby_column(df=ts.df,
                                time_col=TIME_COL,
                                groupby_time_feature="dow")
    dow_mean = df_dow["df"].groupby("dow").agg(
        mean=pd.NamedAgg(column="regressor1", aggfunc=np.nanmean))
    assert_equal(grouped_df["mean"], dow_mean, check_names=False)


def test_make_future_dataframe_with_regressor():
    """Checks future dataframe creation"""
    ts = UnivariateTimeSeries()
    df = pd.DataFrame({
        TIME_COL: [
            dt(2018, 1, 1, 3, 0, 0),
            dt(2018, 1, 1, 4, 0, 0),
            dt(2018, 1, 1, 5, 0, 0),
            dt(2018, 1, 1, 6, 0, 0),
            dt(2018, 1, 1, 7, 0, 0),
            dt(2018, 1, 1, 8, 0, 0)
        ],
        VALUE_COL: [1, None, 3, None, None, None],
        "regressor1": [0.01, None, 0.014, 0.016, 0.017, None],
        "regressor2": [0.11, 0.112, 0.114, 0.116, None, None],
        "regressor3": [0.21, 0.212, 0.214, 0.216, 0.217, None]
    })
    regressor_cols = [
        col for col in df.columns if col not in [TIME_COL, VALUE_COL]
    ]

    with pytest.warns(Warning) as record:
        ts.load_data(df, TIME_COL, VALUE_COL, regressor_cols=regressor_cols)
        assert "y column of the provided TimeSeries contains null " \
               "values at the end" in record[0].message.args[0]

    # test regressor_cols from load_data
    assert ts.regressor_cols == ["regressor1", "regressor2", "regressor3"]

    # tests last_date_for_fit from load_data (same as without regressor)
    assert ts.train_end_date == dt(2018, 1, 1, 5, 0, 0)

    # tests last_date_for_reg from load_data
    assert ts.last_date_for_reg == dt(2018, 1, 1, 7, 0, 0)

    # tests fit_df from load_data
    result_fit_df = ts.fit_df.reset_index(drop=True)  # fit_df's index is time_col
    expected_fit_df = df[df[TIME_COL] <= dt(2018, 1, 1, 5, 0, 0)]
    assert_frame_equal(result_fit_df, expected_fit_df)

    # tests fit_y
    result_fit_y = ts.fit_y.reset_index(drop=True)  # fit_y's index is time_col
    expected_fit_y = expected_fit_df[VALUE_COL]
    assert result_fit_y.equals(expected_fit_y)

    # with history
    result = ts.make_future_dataframe(
        periods=2, include_history=True).reset_index(drop=True)
    expected = pd.DataFrame({
        TIME_COL:
        pd.date_range(start=dt(2018, 1, 1, 3, 0, 0), periods=5, freq="H"),
        VALUE_COL: [1, None, 3, None, None],
        "regressor1": [0.01, None, 0.014, 0.016, 0.017],
        "regressor2": [0.11, 0.112, 0.114, 0.116, None],
        "regressor3": [0.21, 0.212, 0.214, 0.216, 0.217]
    })
    assert_frame_equal(result, expected)

    # without history
    result = ts.make_future_dataframe(
        periods=2, include_history=False).reset_index(drop=True)
    expected = pd.DataFrame({
        TIME_COL:
        pd.date_range(start=dt(2018, 1, 1, 6, 0, 0), periods=2, freq="H"),
        VALUE_COL:
        np.repeat(np.nan, 2),
        "regressor1": [0.016, 0.017],
        "regressor2": [0.116, None],
        "regressor3": [0.216, 0.217]
    })
    assert result.equals(expected)

    # user doesn't request any future periods
    result = ts.make_future_dataframe(
        periods=None, include_history=False).reset_index(drop=True)
    expected = pd.DataFrame({
        TIME_COL:
        pd.date_range(start=dt(2018, 1, 1, 6, 0, 0), periods=2, freq="H"),
        VALUE_COL:
        np.repeat(np.nan, 2),
        "regressor1": [0.016, 0.017],
        "regressor2": [0.116, None],
        "regressor3": [0.216, 0.217]
    })
    assert result.equals(expected)

    # user requests fewer than the available periods
    result = ts.make_future_dataframe(
        periods=1, include_history=False).reset_index(drop=True)
    expected = pd.DataFrame({
        TIME_COL:
        pd.date_range(start=dt(2018, 1, 1, 6, 0, 0), periods=1, freq="H"),
        VALUE_COL:
        np.repeat(np.nan, 1),
        "regressor1": [0.016],
        "regressor2": [0.116],
        "regressor3": [0.216]
    })
    assert result.equals(expected)

    # user requests more than 2 periods
    with pytest.warns(Warning) as record:
        result = ts.make_future_dataframe(
            periods=4, include_history=False).reset_index(drop=True)
        expected = pd.DataFrame({
            TIME_COL:
            pd.date_range(start=dt(2018, 1, 1, 6, 0, 0), periods=2, freq="H"),
            VALUE_COL:
            np.repeat(np.nan, 2),
            "regressor1": [0.016, 0.017],
            "regressor2": [0.116, None],
            "regressor3": [0.216, 0.217]
        })
        assert result.equals(expected)
        assert "Provided periods '4' is more than allowed ('2') due to the length of regressor columns. " \
               "Using '2'." in record[0].message.args[0]


def test_train_end_date_with_regressors():
    """Tests make_future_dataframe and train_end_date with regressors"""
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=30,
                                          train_start_date=dt(2018, 1, 1),
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + regressor_cols
    df = data["df"][keep_cols].copy()
    # Setting NaN values at the end
    df.loc[df.tail(2).index, "regressor1"] = np.nan
    df.loc[df.tail(4).index, "regressor2"] = np.nan
    df.loc[df.tail(6).index, "regressor_categ"] = np.nan
    df.loc[df.tail(8).index, VALUE_COL] = np.nan
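    # resulting non-null extents: value through Jan 22, regressor1 through
    # Jan 28, regressor2 through Jan 26, regressor_categ through Jan 24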

    # default train_end_date, default regressor_cols
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        ts.load_data(df,
                     TIME_COL,
                     VALUE_COL,
                     train_end_date=None,
                     regressor_cols=None)
        assert f"{ts.original_value_col} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]
        assert ts.train_end_date == dt(2018, 1, 22)
        assert ts.fit_df.shape == (22, 2)
        assert ts.last_date_for_val == df[
            df[VALUE_COL].notnull()][TIME_COL].max()
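        # regressor_cols=None, so no regressor end date is tracked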
        assert ts.last_date_for_reg is None
        result = ts.make_future_dataframe(periods=10, include_history=True)
        expected = pd.DataFrame({
            TIME_COL:
            pd.date_range(start=dt(2018, 1, 1), periods=32, freq="D"),
            VALUE_COL:
            np.concatenate([ts.fit_y, np.repeat(np.nan, 10)])
        })
        expected.index = expected[TIME_COL]
        expected.index.name = None
        assert_frame_equal(result, expected)

    # train_end_date later than last date in df, all available regressor_cols
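    # Feb 10 is past the last date in df (Jan 30), so train_end_date falls
    # back to the last timestamp with a non-null value (Jan 22)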
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 2, 10)
        ts.load_data(df,
                     TIME_COL,
                     VALUE_COL,
                     train_end_date=train_end_date,
                     regressor_cols=regressor_cols)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]
        assert ts.last_date_for_val == dt(2018, 1, 22)
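        # among the requested regressors, regressor1 has the latest
        # non-null value (Jan 28)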
        assert ts.last_date_for_reg == dt(2018, 1, 28)
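        # 10 periods requested, but only 6 rows (Jan 23-28) are available
        # through last_date_for_reg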
        result = ts.make_future_dataframe(periods=10, include_history=False)
        expected = df.copy()[22:28]
        expected.loc[expected.tail(6).index, VALUE_COL] = np.nan
        expected.index = expected[TIME_COL]
        expected.index.name = None
        assert_frame_equal(result, expected)

    # train_end_date between the last date with a non-null value and the last date in df
    # user passes no regressor_cols
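    # an empty list means no regressors are used, so last_date_for_reg
    # stays None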
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 1, 25)
        regressor_cols = []
        ts.load_data(df,
                     TIME_COL,
                     VALUE_COL,
                     train_end_date=train_end_date,
                     regressor_cols=regressor_cols)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]
        assert ts.train_end_date == dt(2018, 1, 22)
        assert ts.last_date_for_reg is None
        result = ts.make_future_dataframe(periods=10, include_history=True)
        expected = pd.DataFrame({
            TIME_COL:
            pd.date_range(start=dt(2018, 1, 1), periods=32, freq="D"),
            VALUE_COL:
            np.concatenate([ts.fit_y, np.repeat(np.nan, 10)])
        })
        expected.index = expected[TIME_COL]
        expected.index.name = None
        assert_frame_equal(result, expected)

    # train_end_date equal to the last date with a non-null value
    # user requests a subset of the regressor_cols
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 1, 22)
        regressor_cols = ["regressor2"]
        ts.load_data(df,
                     TIME_COL,
                     VALUE_COL,
                     train_end_date=train_end_date,
                     regressor_cols=regressor_cols)
        assert ts.train_end_date == dt(2018, 1, 22)
        assert ts.last_date_for_reg == dt(2018, 1, 26)
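        # only 4 future rows (Jan 23-26) are available through
        # last_date_for_reg, which triggers the warning below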
        result = ts.make_future_dataframe(periods=10, include_history=True)
        assert "Provided periods '10' is more than allowed ('4') due to the length of " \
               "regressor columns. Using '4'." in record[0].message.args[0]
        expected = ts.df.copy()[[TIME_COL, VALUE_COL, "regressor2"]]
        expected = expected[expected.index <= ts.last_date_for_reg]
        assert_frame_equal(result, expected)

    # train_end_date earlier than the last date with a non-null value
    # user requests regressor_cols that do not exist in df
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 1, 20)
        regressor_cols = ["regressor1", "regressor4", "regressor5"]
        ts.load_data(df,
                     TIME_COL,
                     VALUE_COL,
                     train_end_date=train_end_date,
                     regressor_cols=regressor_cols)
        assert ts.train_end_date == dt(2018, 1, 20)
        assert ts.last_date_for_reg == dt(2018, 1, 28)
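        # regressor4 and regressor5 are not in df and are dropped with a
        # warning; regressor1 (non-null through Jan 28) drives last_date_for_reg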
        assert (f"The following columns are not available to use as "
                f"regressors: ['regressor4', 'regressor5']"
                ) in record[0].message.args[0]
        result = ts.make_future_dataframe(periods=10, include_history=True)
        expected = ts.df.copy()[[TIME_COL, VALUE_COL, "regressor1"]]
        expected = expected[expected.index <= ts.last_date_for_reg]
        assert_frame_equal(result, expected)