# Example 1
def test_get_default_origin_for_time_vars(hourly_data):
    """Tests get_default_origin_for_time_vars"""
    # Origin computed from the hourly fixture's training frame.
    origin = get_default_origin_for_time_vars(hourly_data["train_df"], TIME_COL)
    assert round(origin, 3) == 2018.496

    # A minimal frame starting on the same date yields the same origin,
    # regardless of the time column's name or the number of rows.
    small_df = pd.DataFrame({"time": ["2018-07-01", "2018-08-01"]})
    origin = get_default_origin_for_time_vars(small_df, "time")
    assert round(origin, 3) == 2018.496
# Example 2
def test_get_changepoint_values_from_config(hourly_data):
    """Tests get_changepoint_values_from_config"""
    train_df = hourly_data["train_df"]
    origin = get_default_origin_for_time_vars(train_df, TIME_COL)
    time_features_df = build_time_features_df(
        dt=train_df[TIME_COL], conti_year_origin=origin)

    # A config without "method" is rejected.
    with pytest.raises(Exception,
                       match="changepoint method must be specified"):
        get_changepoint_values_from_config(
            changepoints_dict={"n_changepoints": 2},
            time_features_df=time_features_df,
            time_col="datetime")

    # An unrecognized method is rejected.
    with pytest.raises(NotImplementedError,
                       match="changepoint method.*not recognized"):
        get_changepoint_values_from_config(
            changepoints_dict={"method": "not implemented"},
            time_features_df=time_features_df,
            time_col="datetime")

    # tests uniform method, with the default and a custom continuous time column
    for extra in [{}, {"continuous_time_col": "ct2"}]:
        actual = get_changepoint_values_from_config(
            changepoints_dict=dict(
                {"method": "uniform", "n_changepoints": 20}, **extra),
            time_features_df=time_features_df,
            time_col="datetime")
        expected = get_evenly_spaced_changepoints_values(
            df=time_features_df, n_changepoints=20, **extra)
        assert np.array_equal(actual, expected)

    # tests custom method, with the default and a custom continuous time column
    dates = ["2018-01-01", "2019-01-02-16", "2019-01-03", "2019-02-01"]
    for extra in [{}, {"continuous_time_col": "ct2"}]:
        actual = get_changepoint_values_from_config(
            changepoints_dict=dict(
                {"method": "custom", "dates": dates}, **extra),
            time_features_df=time_features_df,
            time_col="datetime")
        expected = get_custom_changepoints_values(
            df=time_features_df,
            changepoint_dates=dates,
            time_col="datetime",
            **extra)
        assert np.array_equal(actual, expected)
def get_forecast_time_properties(
        df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq=None,
        regressor_cols=None,
        lagged_regressor_cols=None,
        train_end_date=None,
        forecast_horizon=None):
    """Computes time properties of the training data and forecast period:
    the number of training points in ``df``, the start year, and the
    prediction end year, among others.

    Parameters
    ----------
    df : `pandas.DataFrame` with columns [``time_col``, ``value_col``]
        Univariate timeseries data to forecast.
    time_col : `str`, default ``TIME_COL`` in constants.py
        Name of timestamp column in ``df``.
    value_col : `str`, default ``VALUE_COL`` in constants.py
        Name of value column in ``df`` (the values to forecast).
    freq : `str` or None, default None
        Frequency of input data, used to generate future dates for prediction.
        Frequency strings can have multiples, e.g. '5H'.
        See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
        for a list of frequency aliases.
        If None, inferred by pd.infer_freq.
        Provide this parameter if ``df`` has missing timepoints.
    regressor_cols : `list` [`str`] or None, optional, default None
        Regressor columns used in the training and prediction DataFrames.
        Columns unavailable in ``df`` are dropped. If None, no regressor
        columns are used.
    lagged_regressor_cols : `list` [`str`] or None, optional, default None
        Lagged regressor columns used in the training and prediction
        DataFrames. Columns unavailable in ``df`` are dropped. If None,
        no lagged regressor columns are used.
    train_end_date : `datetime.datetime`, optional, default None
        Last date to use for fitting the model. Forecasts are generated
        after this date. If None, set to the last date with a non-null
        value in ``value_col`` of ``df``.
    forecast_horizon : `int` or None, default None
        Number of periods to forecast into the future. Must be > 0.
        If None, a default is determined from the input data frequency.

    Returns
    -------
    time_properties : `dict` [`str`, `any`]
        Time properties dictionary with keys:

        ``"period"`` : `int`
            Period of each observation (i.e. minimum time between observations, in seconds).
        ``"simple_freq"`` : `SimpleTimeFrequencyEnum`
            ``SimpleTimeFrequencyEnum`` member corresponding to data frequency.
        ``"num_training_points"`` : `int`
            Number of observations for training.
        ``"num_training_days"`` : `int`
            Number of days for training.
        ``"days_per_observation"``: `float`
            The time frequency in day units.
        ``"forecast_horizon"``: `int`
            The number of time intervals for which forecast is needed.
        ``"forecast_horizon_in_timedelta"``: `datetime.timedelta`
            The forecast horizon length in timedelta units.
        ``"forecast_horizon_in_days"``: `float`
            The forecast horizon length in day units.
        ``"start_year"`` : `int`
            Start year of the training period.
        ``"end_year"`` : `int`
            End year of the forecast period.
        ``"origin_for_time_vars"`` : `float`
            Continuous time representation of the first date in ``df``.
    """
    if regressor_cols is None:
        regressor_cols = []

    # Canonicalizes the input and extracts the data available for fitting,
    # with its time column in `datetime.datetime` format.
    fit_df = get_canonical_data(
        df=df,
        time_col=time_col,
        value_col=value_col,
        freq=freq,
        train_end_date=train_end_date,
        regressor_cols=regressor_cols,
        lagged_regressor_cols=lagged_regressor_cols)["fit_df"]

    # Basic time properties of the fitting window
    train_start = fit_df[TIME_COL].min()
    train_end = fit_df[TIME_COL].max()
    start_year = int(train_start.strftime("%Y"))
    origin_for_time_vars = get_default_origin_for_time_vars(fit_df, TIME_COL)
    period = min_gap_in_seconds(df=fit_df, time_col=TIME_COL)
    simple_freq = get_simple_time_frequency_from_period(period)
    num_training_points = fit_df.shape[0]

    # Fractional number of days covered by the training set
    # (one extra period is added so the last observation counts).
    train_span = train_end - train_start
    num_training_days = (
            train_span.days
            + (train_span.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value)

    if forecast_horizon is None:
        # expected to be kept in sync with default value set in ``get_default_time_parameters``
        forecast_horizon = get_default_horizon_from_period(
            period=period,
            num_observations=num_training_points)

    # Expresses the horizon in day units
    days_per_observation = period / TimeEnum.ONE_DAY_IN_SECONDS.value
    forecast_horizon_in_days = forecast_horizon * days_per_observation
    forecast_horizon_in_timedelta = timedelta(days=forecast_horizon_in_days)

    # Forecast end year: training end plus the (rounded-up) forecast span
    future_end = train_end + timedelta(
        days=math.ceil(forecast_horizon * days_per_observation))
    end_year = int(future_end.strftime("%Y"))

    return {
        "period": period,
        "simple_freq": simple_freq,
        "num_training_points": num_training_points,
        "num_training_days": num_training_days,
        "days_per_observation": days_per_observation,
        "forecast_horizon": forecast_horizon,
        "forecast_horizon_in_timedelta": forecast_horizon_in_timedelta,
        "forecast_horizon_in_days": forecast_horizon_in_days,
        "start_year": start_year,
        "end_year": end_year,
        "origin_for_time_vars": origin_for_time_vars
    }
# Example 4
def generate_df_for_tests(freq,
                          periods,
                          train_start_date=datetime.datetime(2018, 7, 1),
                          train_end_date=None,
                          train_frac=0.8,
                          conti_year_origin=None,
                          noise_std=2.0,
                          remove_extra_cols=True,
                          autoreg_coefs=None,
                          fs_coefs=None,
                          growth_coef=3.0,
                          growth_pow=1.1,
                          intercept=0.0):
    """Generates dataset for unit tests.

    :param freq: str
        pd.date_range freq parameter, e.g. H or D
    :param periods: int
        number of periods to generate
    :param train_start_date: datetime.datetime
        train start date
    :param train_end_date: Optional[datetime.datetime]
        train end date
    :param train_frac: Optional[float]
        fraction of data to use for training
        only used if train_end_date isn't provided
    :param noise_std: float
        standard deviation of gaussian noise
    :param conti_year_origin: float
        the time origin for continuous time variables
    :param remove_extra_cols: bool
        whether to remove extra columns besides TIME_COL, VALUE_COL
    :param autoreg_coefs: Optional[List[int]]
        The coefficients for the autoregressive terms.
        If provided the generated series denoted mathematically by Y(t) will be
        converted as follows:
        Y(t) -> Y(t) + c1 Y(t-1) + c2 Y(t-2) + c3 Y(t-3) + ...
        where autoreg_coefs = [c1, c2, c3, ...]
        In this fashion, the obtained series will have autoregressive
        properties not explained by seasonality and growth.
    :param fs_coefs: Optional[List[float]]
        The fourier series coefficients used.
        If None, defaults to [-1, 3, 4].
    :param growth_coef: float
        Multiplier for growth
    :param growth_pow: float
        Power for growth, as function of continuous time
    :param intercept: float
        Constant term added to Y(t)

    :return: Dict[str, any]
        contains full dataframe, train dataframe, test dataframe,
        and nrows in test dataframe
    """
    if fs_coefs is None:
        # Sentinel instead of a mutable default argument; equivalent to the
        # previous default of ``fs_coefs=[-1, 3, 4]``.
        fs_coefs = [-1, 3, 4]
    np.random.seed(123)

    date_list = pd.date_range(start=train_start_date,
                              periods=periods,
                              freq=freq).tolist()

    df0 = pd.DataFrame({TIME_COL: date_list})
    if conti_year_origin is None:
        conti_year_origin = get_default_origin_for_time_vars(df0, TIME_COL)
    time_df = build_time_features_df(dt=df0[TIME_COL],
                                     conti_year_origin=conti_year_origin)
    df = pd.concat([df0, time_df], axis=1)
    # Polynomial growth as a function of continuous time ("ct1")
    df["growth"] = growth_coef * (df["ct1"]**growth_pow)

    # Yearly/weekly/daily seasonality via first-order fourier terms
    func = fourier_series_multi_fcn(col_names=["toy", "tow", "tod"],
                                    periods=[1.0, 7.0, 24.0],
                                    orders=[1, 1, 1],
                                    seas_names=None)

    res = func(df)
    df_seas = res["df"]
    df = pd.concat([df, df_seas], axis=1)

    # Value = intercept + growth + seasonality + gaussian noise
    df[VALUE_COL] = (
        intercept + df["growth"] +
        fs_coefs[0] * df[get_fourier_col_name(1, "tod", function_name="sin")] +
        fs_coefs[1] * df[get_fourier_col_name(1, "tow", function_name="sin")] +
        fs_coefs[2] * df[get_fourier_col_name(1, "toy", function_name="sin")] +
        noise_std * np.random.normal(size=df.shape[0]))

    if autoreg_coefs is not None:
        df["temporary_new_value"] = df[VALUE_COL]
        k = len(autoreg_coefs)
        # NOTE(review): ``shift(-i)`` with i starting at 0 mixes in
        # Y(t), Y(t+1), ... rather than the lagged Y(t-1), Y(t-2), ...
        # described in the docstring. Behavior is intentionally kept
        # unchanged since tests depend on it -- confirm intended semantics.
        for i in range(k):
            df["temporary_new_value"] = (
                df["temporary_new_value"] +
                autoreg_coefs[i] * df[VALUE_COL].shift(-i)).bfill()
        df[VALUE_COL] = df["temporary_new_value"]
        del df["temporary_new_value"]

    if train_end_date is None:
        # Splits by row fraction when no explicit end date is given
        train_rows = np.floor(train_frac * df.shape[0]).astype(int)
        train_end_date = df[TIME_COL][train_rows]

    if remove_extra_cols:
        df = df[[TIME_COL, VALUE_COL]]
    train_df = df.loc[df[TIME_COL] <= train_end_date]
    test_df = df.loc[df[TIME_COL] > train_end_date]
    fut_time_num = test_df.shape[0]

    return {
        "df": df,
        "train_df": train_df.reset_index(drop=True),
        "test_df": test_df.reset_index(drop=True),
        "fut_time_num": fut_time_num,
    }
def test_get_forecast_time_properties():
    """Tests get_forecast_time_properties"""
    # Daily data, zero-length forecast horizon
    num_training_points = 365  # one year of daily data
    data = generate_df_for_tests(freq="D", periods=num_training_points)
    df = data["df"]
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="D",
                                          forecast_horizon=0)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_DAY_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.DAY,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points,
        "days_per_observation": 1,
        "forecast_horizon": 0,
        "forecast_horizon_in_timedelta": timedelta(days=0),
        "forecast_horizon_in_days": 0,
        # data starts 2018-07-01 (generator default), so 365 daily points end in 2019
        "start_year": 2018,
        "end_year": 2019,
        "origin_for_time_vars": default_origin
    }

    # longer forecast_horizon
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="D",
                                          forecast_horizon=365)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_DAY_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.DAY,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points,
        "days_per_observation": 1,
        "forecast_horizon": 365,
        "forecast_horizon_in_timedelta": timedelta(days=365),
        "forecast_horizon_in_days": 365,
        "start_year": 2018,
        # forecasting 365 more days pushes the end year to 2020
        "end_year": 2020,
        "origin_for_time_vars": default_origin
    }

    # two years of hourly data
    num_training_points = 2 * 365 * 24
    data = generate_df_for_tests(freq="H", periods=num_training_points)
    df = data["df"]
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="H",
                                          forecast_horizon=0)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_HOUR_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.HOUR,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points / 24,  # 24 observations per day
        "days_per_observation": 1 / 24,
        "forecast_horizon": 0,
        "forecast_horizon_in_timedelta": timedelta(days=0),
        "forecast_horizon_in_days": 0,
        "start_year": 2018,
        "end_year": 2020,
        "origin_for_time_vars": default_origin
    }

    # longer forecast_horizon
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="H",
                                          forecast_horizon=365 * 24)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_HOUR_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.HOUR,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points / 24,
        "days_per_observation": 1 / 24,
        "forecast_horizon": 365 * 24,
        # 365*24 hourly periods correspond to exactly 365 days
        "forecast_horizon_in_timedelta": timedelta(days=365),
        "forecast_horizon_in_days": 365,
        "start_year": 2018,
        "end_year": 2021,
        "origin_for_time_vars": default_origin
    }

    # ``forecast_horizon=None``
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="H",
                                          forecast_horizon=None)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_HOUR_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.HOUR,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points / 24,
        "days_per_observation": 1 / 24,
        # default horizon for hourly data is one day (24 periods)
        "forecast_horizon": 24,
        "forecast_horizon_in_timedelta": timedelta(days=1),
        "forecast_horizon_in_days": 1,
        "start_year": 2018,
        "end_year": 2020,
        "origin_for_time_vars": default_origin
    }

    # weekly df with regressors
    num_training_points = 50
    data = generate_df_with_reg_for_tests(freq="W-SUN",
                                          periods=num_training_points,
                                          train_start_date=datetime.datetime(
                                              2018, 11, 30),
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    df = data["df"]
    train_df = data["train_df"]
    forecast_horizon = data["fut_time_num"]
    # every non-time, non-value column is treated as a regressor
    regressor_cols = [
        col for col in df.columns if col not in [TIME_COL, VALUE_COL]
    ]
    result = get_forecast_time_properties(df,
                                          time_col=TIME_COL,
                                          value_col=VALUE_COL,
                                          freq="W-SUN",
                                          regressor_cols=regressor_cols,
                                          forecast_horizon=forecast_horizon)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_WEEK_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.WEEK,
        "num_training_points": train_df.shape[0],  # size of training set
        "num_training_days": train_df.shape[0] * 7,
        "days_per_observation": 7,
        "forecast_horizon": 9,
        "forecast_horizon_in_timedelta": timedelta(days=63),  # 9 weeks * 7 days
        "forecast_horizon_in_days": 63.0,
        "start_year": 2018,
        "end_year": 2019,
        "origin_for_time_vars": default_origin
    }

    # checks `num_training_days` with `train_end_date`
    data = generate_df_with_reg_for_tests(freq="H",
                                          periods=300 * 24,
                                          train_start_date=datetime.datetime(
                                              2018, 7, 1),
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    df = data["df"]
    train_end_date = datetime.datetime(2019, 2, 1)
    result = get_forecast_time_properties(
        df=df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="H",
        regressor_cols=data["regressor_cols"],
        train_end_date=train_end_date,
        forecast_horizon=forecast_horizon)
    period = 3600  # seconds between observations
    time_delta = (train_end_date - df[TIME_COL].min()
                  )  # train end - train start
    # mirrors the formula in get_forecast_time_properties: whole days plus
    # the fractional remainder, counting one extra period for the last point
    num_training_days = (
        time_delta.days +
        (time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value)
    assert result["num_training_days"] == num_training_days

    # checks `num_training_days` without `train_end_date`
    result = get_forecast_time_properties(
        df=df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="H",
        regressor_cols=data["regressor_cols"],
        train_end_date=None,
        forecast_horizon=forecast_horizon)
    time_delta = (
        datetime.datetime(2019, 2, 26) - df[TIME_COL].min()
    )  # by default, train end is the last date with nonnull value_col
    num_training_days = (
        time_delta.days +
        (time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value)
    assert result["num_training_days"] == num_training_days