Example #1
# Imports assumed by this snippet (`dt` is taken to be an alias for `datetime.datetime`):
import datetime
from datetime import datetime as dt

import pandas as pd

from greykite.common.features.timeseries_features import convert_date_to_continuous_time


def test_convert_date_to_continuous_time():
    assert convert_date_to_continuous_time(dt(2019, 1, 1)) == 2019.0
    assert convert_date_to_continuous_time(dt(2019, 7, 1)) == 2019 + 181 / 365
    assert convert_date_to_continuous_time(dt(2020, 7, 1)) == 2020 + 182 / 366  # leap year
    assert convert_date_to_continuous_time(dt(2019, 7, 1, 7, 4, 24)) == 2019 + (181  # day
                                                                                + 7 / 24  # hour
                                                                                + 4 / (24 * 60)  # minute
                                                                                + 24 / (24 * 60 * 60)) / 365  # second
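
# The assertions above encode a simple convention: continuous time is the calendar
# year plus the elapsed fraction of that year, with whole days and the time of day
# both measured in days before dividing by the year length (365 or 366).
# A minimal sketch of that convention (illustrative only, not the library implementation):
def continuous_time_sketch(date):
    """Illustrative sketch: year + elapsed fraction of the year."""
    days_in_year = 366 if date.year % 4 == 0 and (date.year % 100 != 0 or date.year % 400 == 0) else 365
    whole_days = date.timetuple().tm_yday - 1  # whole days elapsed since Jan 1
    day_fraction = (date.hour + (date.minute + date.second / 60) / 60) / 24
    return date.year + (whole_days + day_fraction) / days_in_year


assert continuous_time_sketch(datetime.datetime(2020, 7, 1)) == 2020 + 182 / 366
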
def params2():
    autoreg_dict = {
        "lag_dict": {"orders": [7]},
        "agg_lag_dict": {
            "orders_list": [[7, 7*2, 7*3]],
            "interval_list": [(7, 7*2)]},
        "series_na_fill_func": lambda s: s.bfill().ffill()}
    lagged_regressor_dict = {
        "regressor1": {
            "lag_dict": {"orders": [1, 2, 3]},
            "agg_lag_dict": {
                "orders_list": [[7, 7 * 2, 7 * 3]],
                "interval_list": [(8, 7 * 2)]},
            "series_na_fill_func": lambda s: s.bfill().ffill()},
        "regressor2": "auto"}
    uncertainty_dict = {
        "uncertainty_method": "simple_conditional_residuals",
        "params": {
            "conditional_cols": ["dow"],
            "quantiles": [0.025, 0.975],
            "quantile_estimation_method": "normal_fit",
            "sample_size_thresh": 5,
            "small_sample_size_method": "std_quantiles",
            "small_sample_size_quantile": 0.98}}
    return {
        "origin_for_time_vars": convert_date_to_continuous_time(datetime.datetime(2018, 1, 1)),
        "extra_pred_cols": ["ct1", "regressor1", "regressor2"],
        "train_test_thresh": None,
        "training_fraction": None,
        "fit_algorithm_dict": {
            "fit_algorithm": "ridge",
            "fit_algorithm_params": None,
        },
        "daily_event_df_dict": None,
        "changepoints_dict": None,
        "changepoint_detector": None,
        "fs_components_df": pd.DataFrame({
            "name": ["tow"],
            "period": [7.0],
            "order": [3],
            "seas_names": [None]}),
        "autoreg_dict": autoreg_dict,
        "lagged_regressor_dict": lagged_regressor_dict,
        "min_admissible_value": None,
        "max_admissible_value": None,
        "uncertainty_dict": uncertainty_dict,
        "normalize_method": "min_max",
        "adjust_anomalous_dict": None,
        "impute_dict": {
            "func": impute_with_lags,
            "params": {"orders": [7]}},
        "regression_weight_col": "ct2",
        "forecast_horizon": 5,
        "simulation_based": True,
    }
Example #3
def params():
    autoreg_dict = {
        "lag_dict": {
            "orders": [7]
        },
        "agg_lag_dict": {
            "orders_list": [[7, 7 * 2, 7 * 3]],
            "interval_list": [(7, 7 * 2)]
        },
        "series_na_fill_func": lambda s: s.bfill().ffill()
    }
    uncertainty_dict = {
        "uncertainty_method": "simple_conditional_residuals",
        "params": {
            "conditional_cols": ["dow"],
            "quantiles": [0.025, 0.975],
            "quantile_estimation_method": "normal_fit",
            "sample_size_thresh": 5,
            "small_sample_size_method": "std_quantiles",
            "small_sample_size_quantile": 0.98
        }
    }
    return {
        "origin_for_time_vars": convert_date_to_continuous_time(datetime.datetime(2018, 1, 3)),
        "extra_pred_cols": ["ct1", "regressor1", "regressor2"],
        "train_test_thresh": None,
        "training_fraction": None,
        "fit_algorithm": "sgd",
        "fit_algorithm_params": {"alpha": 0.1},
        "daily_event_df_dict": None,
        "changepoints_dict": None,
        "fs_components_df": pd.DataFrame({
            "name": ["tow"],
            "period": [7.0],
            "order": [3],
            "seas_names": [None]}),
        "autoreg_dict": autoreg_dict,
        "min_admissible_value": None,
        "max_admissible_value": None,
        "uncertainty_dict": uncertainty_dict
    }
Example #4
def fit(self, X, y=None):
    """Sets the time origin for the input time series."""
    assert isinstance(X, pd.DataFrame)
    dt = X[self.time_col]
    self.origin_for_time_vars = convert_date_to_continuous_time(dt[0])
    return self
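
# This `fit` is presumably a method of a scikit-learn-style transformer. The class
# below is a hypothetical sketch (its name and the `transform` body are assumptions)
# showing how the origin fixed at fit time would be consumed, so that training and
# prediction features line up:
class TimeOriginTransformer:
    """Hypothetical transformer wrapping the `fit` above."""

    def __init__(self, time_col="ts"):
        self.time_col = time_col
        self.origin_for_time_vars = None

    def fit(self, X, y=None):
        """Sets the time origin for the input time series."""
        assert isinstance(X, pd.DataFrame)
        dt = X[self.time_col]
        self.origin_for_time_vars = convert_date_to_continuous_time(dt[0])
        return self

    def transform(self, X):
        # reuses the origin stored at fit time so continuous-time features are consistent
        return build_time_features_df(
            dt=X[self.time_col],
            conti_year_origin=self.origin_for_time_vars)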
Example #5
def test_get_changepoint_features_and_values_from_config(hourly_data):
    """Tests get_changepoint_features_and_values_from_config"""
    # no changepoints
    train_df = hourly_data["train_df"]
    changepoints = get_changepoint_features_and_values_from_config(
        df=train_df,
        time_col=TIME_COL,
        changepoints_dict=None,
        origin_for_time_vars=None)
    assert changepoints["changepoint_df"] is None
    assert changepoints["changepoint_values"] is None
    assert changepoints["continuous_time_col"] is None
    assert changepoints["growth_func"] is None
    assert changepoints["changepoint_cols"] == []

    # uniform method
    train_df = hourly_data["train_df"]
    n_changepoints = 20
    changepoints_dict = {
        "method": "uniform",
        "n_changepoints": n_changepoints,
        "continuous_time_col": "ct2"
    }
    changepoints = get_changepoint_features_and_values_from_config(
        df=train_df,
        time_col=TIME_COL,
        changepoints_dict=changepoints_dict,
        origin_for_time_vars=None)
    changepoint_dates = get_changepoint_dates_from_changepoints_dict(
        changepoints_dict=changepoints_dict, df=train_df, time_col=TIME_COL)

    assert sorted(list(changepoints.keys())) == sorted([
        "changepoint_df", "changepoint_values", "continuous_time_col",
        "growth_func", "changepoint_cols"
    ])
    assert changepoints["changepoint_df"].shape == (train_df.shape[0],
                                                    n_changepoints)
    assert changepoints["changepoint_values"].shape == (n_changepoints, )
    assert changepoints["continuous_time_col"] == "ct2"
    assert changepoints["growth_func"] is None
    assert changepoints["changepoint_cols"] == [
        f"{CHANGEPOINT_COL_PREFIX}{i}_{pd.to_datetime(date).strftime('%Y_%m_%d_%H')}"
        for i, date in enumerate(changepoint_dates)
    ]

    # custom method, no changepoint in range
    changepoints_dict = {
        "method": "custom",
        "dates": ["2048-03-02"],
        "growth_func": signed_sqrt
    }
    origin_for_time_vars = convert_date_to_continuous_time(
        datetime.datetime(2018, 1, 1))
    changepoints = get_changepoint_features_and_values_from_config(
        df=train_df,
        time_col=TIME_COL,
        changepoints_dict=changepoints_dict,
        origin_for_time_vars=origin_for_time_vars)

    assert sorted(list(changepoints.keys())) == sorted([
        "changepoint_df", "changepoint_values", "continuous_time_col",
        "growth_func", "changepoint_cols"
    ])
    assert changepoints["changepoint_df"] is None
    assert changepoints["changepoint_values"] is None
    assert changepoints["continuous_time_col"] is None
    assert changepoints["growth_func"] == signed_sqrt
    assert changepoints["changepoint_cols"] == []
Example #6
#
# If a feature is not automatically created by ``SILVERKITE``, we need to create it
# beforehand and append it to the data df.
# Here we create the "is_football_season" feature.
# Note that the custom column must also cover the forecast horizon period.
# To do this, we first create a df whose timestamps cover the forecast horizon.
# This can be done with the `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.make_future_dataframe`
# method of the `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries` class.
# Then we compute the custom regressor column on this augmented df.

# Makes augmented df with forecast horizon 365 days
df_full = ts.make_future_dataframe(periods=365)
# Builds "df_features" that contains datetime information of the "df"
df_features = build_time_features_df(
    dt=df_full["ts"],
    conti_year_origin=convert_date_to_continuous_time(df_full["ts"][0]))

# Roughly approximates the football season.
# "woy" is short for "week of year", created above.
# Football season is roughly the first 6 weeks and last 17 weeks in a year.
is_football_season = (df_features["woy"] <= 6) | (df_features["woy"] >= 36)
# Adds the new feature to the dataframe.
df_full["is_football_season"] = is_football_season.astype(int).tolist()
df_full.reset_index(drop=True, inplace=True)

# Configures regressor column.
regressors = {"regressor_cols": ["is_football_season"]}

# %%
# Interactions
# """"""""""""
def params():
    silverkite = SimpleSilverkiteForecast()
    daily_event_df_dict = silverkite._SimpleSilverkiteForecast__get_silverkite_holidays(
        holiday_lookup_countries=["India"],
        holidays_to_model_separately=["Easter Sunday", "Republic Day"],
        start_year=2017,
        end_year=2025,
        pre_num=2,
        post_num=2)
    autoreg_dict = {
        "lag_dict": {
            "orders": [7]
        },
        "agg_lag_dict": {
            "orders_list": [[7, 7 * 2, 7 * 3]],
            "interval_list": [(7, 7 * 2)]
        },
        "series_na_fill_func": lambda s: s.bfill().ffill()
    }
    lagged_regressor_dict = {
        "regressor1": {
            "lag_dict": {
                "orders": [1, 2, 3]
            },
            "agg_lag_dict": {
                "orders_list": [[7, 7 * 2, 7 * 3]],
                "interval_list": [(8, 7 * 2)]
            },
            "series_na_fill_func": lambda s: s.bfill().ffill()
        },
        "regressor2": "auto"
    }
    uncertainty_dict = {
        "uncertainty_method": "simple_conditional_residuals",
        "params": {
            "conditional_cols": ["dow"],
            "quantiles": [0.05, 0.95],
            "quantile_estimation_method": "normal_fit",
            "sample_size_thresh": 5,
            "small_sample_size_method": "std_quantiles",
            "small_sample_size_quantile": 0.98
        }
    }

    return {
        "time_properties": None,
        "freq": None,
        "forecast_horizon": None,
        "origin_for_time_vars": convert_date_to_continuous_time(datetime.datetime(2018, 1, 3)),
        "train_test_thresh": None,
        "training_fraction": None,
        "fit_algorithm_dict": {
            "fit_algorithm": "sgd",
            "fit_algorithm_params": {"alpha": 0.1}
        },
        "holidays_to_model_separately": ["New Year's Day", "Christmas Day"],
        "holiday_lookup_countries": ["UnitedStates"],
        "holiday_pre_num_days": 2,
        "holiday_post_num_days": 2,
        "holiday_pre_post_num_dict": {"New Year's Day": (7, 3)},
        "daily_event_df_dict": daily_event_df_dict,
        "changepoints_dict": None,
        "yearly_seasonality": "auto",
        "quarterly_seasonality": False,
        "monthly_seasonality": False,
        "weekly_seasonality": 3,
        "daily_seasonality": False,
        "max_daily_seas_interaction_order": None,
        "max_weekly_seas_interaction_order": None,
        "autoreg_dict": autoreg_dict,
        "lagged_regressor_dict": lagged_regressor_dict,
        "min_admissible_value": None,
        "max_admissible_value": None,
        "uncertainty_dict": uncertainty_dict,
        "growth_term": "linear",
        "regressor_cols": None,
        "feature_sets_enabled": None,
        "extra_pred_cols": ["ct1", "regressor1", "regressor2"]
    }
Example #8
def params_components():
    """Parameters for ``forecast_silverkite``"""
    autoreg_dict = {
        "lag_dict": {
            "orders": [7]
        },
        "agg_lag_dict": {
            "orders_list": [[7, 7 * 2, 7 * 3]],
            "interval_list": [(7, 7 * 2)]
        },
        "series_na_fill_func": lambda s: s.bfill().ffill()
    }

    uncertainty_dict = {
        "uncertainty_method": "simple_conditional_residuals",
        "params": {
            "conditional_cols": ["dow"],
            "quantiles": [0.025, 0.975],
            "quantile_estimation_method": "normal_fit",
            "sample_size_thresh": 5,
            "small_sample_size_method": "std_quantiles",
            "small_sample_size_quantile": 0.98
        }
    }

    # generate holidays
    countries = ["US", "India"]
    holidays_to_model_separately = [
        "New Year's Day", "Christmas Day", "Independence Day", "Thanksgiving",
        "Labor Day", "Memorial Day", "Veterans Day"
    ]
    event_df_dict = generate_holiday_events(
        countries=countries,
        holidays_to_model_separately=holidays_to_model_separately,
        year_start=2015,
        year_end=2025,
        pre_num=2,
        post_num=2)
    # constant event effect at daily level
    event_cols = [f"Q('events_{key}')" for key in event_df_dict.keys()]
    interaction_cols = cols_interact(static_col="is_weekend",
                                     fs_name="tow",
                                     fs_order=4,
                                     fs_seas_name="weekly")
    extra_pred_cols = ["ct_sqrt", "dow_hr", "ct1", "ct1:tod", "regressor1", "regressor2"] + \
        event_cols + interaction_cols

    # seasonality terms
    fs_components_df = pd.DataFrame({
        "name": ["tod", "tow", "ct1"],
        "period": [24.0, 7.0, 1.0],
        "order": [12, 4, 5],
        "seas_names": ["daily", "weekly", "yearly"]
    })

    # changepoints
    changepoints_dict = dict(
        method="custom",
        dates=["2018-01-01", "2019-01-02-16", "2019-01-03", "2019-02-01"],
        continuous_time_col="ct2")

    return {
        "coverage": 0.95,
        "origin_for_time_vars": convert_date_to_continuous_time(datetime.datetime(2018, 1, 3)),
        "extra_pred_cols": extra_pred_cols,
        "train_test_thresh": None,
        "training_fraction": None,
        "fit_algorithm": "ridge",
        "daily_event_df_dict": event_df_dict,
        "changepoints_dict": changepoints_dict,
        "fs_components_df": fs_components_df,
        "autoreg_dict": autoreg_dict,
        "min_admissible_value": None,
        "max_admissible_value": None,
        "uncertainty_dict": uncertainty_dict
    }
Example #9
def forecast_similarity_based(df,
                              time_col,
                              value_cols,
                              agg_method,
                              agg_func=None,
                              grid_size_str=None,
                              match_cols=None,
                              origin_for_time_vars=None,
                              recent_k=1):
    """Fits a basic aggregation-based forecast model. For example, for an hourly
    time series we can use the value at the same time of week, averaged over the
    most recent three weeks, as the forecast.
    This works for multiple responses, passed as a list in ``value_cols``.
    The timestamps are not required to be regular; for example, the dataframe may
    have missing timestamps.

    :param df: the dataframe with the training data; must include ``value_cols`` and, if needed, ``match_cols``
    :param time_col: the column of the dataframe that contains the timestamps of the series
    :param value_cols: the response columns for which a forecast is desired
    :param agg_method: a string that specifies the aggregation method. Options are:
        "mean": the mean of the values at timestamps that match the desired time on ``match_cols``
        "median": the median of the values at timestamps that match the desired time on ``match_cols``
        "min": the min of the values at timestamps that match the desired time on ``match_cols``
        "max": the max of the values at timestamps that match the desired time on ``match_cols``
        "most_recent": the mean of the ``recent_k`` most recent values that match the desired time on ``match_cols``
    :param agg_func: a custom aggregation function for aggregating w.r.t. ``match_cols``.
        Only needed if the desired aggregation is not covered by ``agg_method``.
    :param grid_size_str: the expected time increment. If not provided, it is inferred.
    :param match_cols: the variables used for grouping in the aggregation
    :param origin_for_time_vars: the time origin for continuous time variables
    :param recent_k: the number of most recent matching timestamps to aggregate for "most_recent"

    :return: a dictionary with a "model" object and "predict" and "predict_n" functions.
        The "model" object is a dictionary of two dataframes:
        "pred_df", an aggregation of ``value_cols`` w.r.t. ``match_cols``, and
        "pred_df_overall", an aggregation across all data. The overall aggregate is
        the fallback when a level of ``match_cols`` did not appear in the training dataset.
        "predict" performs prediction for new data.
        "predict_n" predicts the future for any given number of steps.
    """
    # avoids a mutable default argument
    if match_cols is None:
        match_cols = []
    # These are only needed for the predict_n function,
    # but computing them once up front is more efficient.
    timeseries_info = describe_timeseries(df, time_col)
    if grid_size_str is None:
        grid_size = timeseries_info["median_delta"]
        grid_size_str = str(int(grid_size / np.timedelta64(1, "s"))) + "s"
    max_timestamp = timeseries_info["max_timestamp"]

    # a dictionary of methods and their corresponding aggregation function
    agg_method_func_dict = {
        "mean": (lambda x: np.mean(x)),
        "median": (lambda x: np.median(x)),
        "min": (lambda x: np.min(x)),
        "max": (lambda x: np.max(x)),
        "most_recent": (lambda x: np.mean(x[-recent_k:]))
    }

    if agg_func is None:
        if agg_method not in agg_method_func_dict:
            raise ValueError(
                "The aggregation method you specified is not implemented. "
                "These are the available methods: mean, median, min, max, most_recent")
        agg_func = agg_method_func_dict[agg_method]

    # sets default origin so that "ct1" feature from "build_time_features_df" starts at 0 on training start date
    if origin_for_time_vars is None:
        origin_for_time_vars = convert_date_to_continuous_time(df[time_col][0])

    # computes time features that may appear in match_cols but are not in df by default
    def add_time_features(df):
        time_df = build_time_features_df(
            dt=df[time_col], conti_year_origin=origin_for_time_vars)
        for col in match_cols:
            if col not in df.columns:
                df[col] = time_df[col].values
        return df

    df = add_time_features(df=df)

    # aggregates each value column w.r.t. the given match_cols
    agg_dict = {value_col: agg_func for value_col in value_cols}
    # creates a coarse prediction by aggregating every value column globally;
    # useful when no matching data is available for a given timestamp to be forecasted
    pred_df_overall = df.groupby([True] * len(df), as_index=False).agg(agg_dict)
    if len(match_cols) == 0:
        pred_df = pred_df_overall
    else:
        pred_df = df.groupby(match_cols, as_index=False).agg(agg_dict)

    model = {"pred_df": pred_df, "pred_df_overall": pred_df_overall}

    def predict(new_df, new_external_regressor_df=None):
        """Predicts for new dataframe (new_df) using the fitted model.
        :param new_df: a dataframe of new data which must include the time_col and match_cols
        :param new_external_regressor_df: a regressor dataframe if needed
        :return: new_df is augmented with predictions for value_cols and returned
        """
        new_df = new_df.copy(deep=True)
        new_df = add_time_features(df=new_df)
        if new_external_regressor_df is not None:
            # joins the regressor columns onto the new data (column-wise concat)
            new_df = pd.concat([new_df, new_external_regressor_df], axis=1)

        # removes response columns from new_df, if present, to prevent column collisions in the merges below
        for col in value_cols:
            if col in new_df.columns:
                warnings.warn(
                    f"{col} is a response column and appeared in new_df. Hence it was removed."
                )
                del new_df[col]

        new_df["temporary_overall_dummy"] = 0
        pred_df_overall["temporary_overall_dummy"] = 0

        new_df_grouped = pd.merge(new_df, pred_df, on=match_cols, how="left")
        new_df_overall = pd.merge(new_df,
                                  pred_df_overall,
                                  on=["temporary_overall_dummy"],
                                  how="left")

        # where the grouped result is missing (a level of match_cols that did not
        # appear in the training dataset), falls back to the overall prediction
        for col in value_cols:
            null_rows = new_df_grouped[col].isnull()
            new_df_grouped.loc[null_rows, col] = new_df_overall.loc[null_rows, col]

        del new_df_grouped["temporary_overall_dummy"]

        return new_df_grouped

    def predict_n(fut_time_num, new_external_regressor_df=None):
        """Forecasts the next ``fut_time_num`` periods after the end of the training data.
        Accepts extra predictors, if needed, in the form of a dataframe: new_external_regressor_df.
        :param fut_time_num: the number of future values needed
        :param new_external_regressor_df: extra predictors, if available
        """
        # creates the future time grid
        date_list = pd.date_range(
            start=max_timestamp + pd.Timedelta(grid_size_str),
            periods=fut_time_num,
            freq=grid_size_str).tolist()

        fut_df = pd.DataFrame({time_col: date_list})
        return predict(fut_df, new_external_regressor_df=new_external_regressor_df)

    return {"model": model, "predict": predict, "predict_n": predict_n}