def add_time_features(df):
    """Fills in the columns listed in ``match_cols`` that ``df`` does not have yet.

    The time features are computed from ``df[time_col]`` with
    ``origin_for_time_vars`` as the continuous-time origin. ``time_col``,
    ``origin_for_time_vars`` and ``match_cols`` are taken from the
    enclosing scope.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Data frame containing ``time_col``. It is modified in place.

    Returns
    -------
    df : `pandas.DataFrame`
        The same object that was passed in, with the missing columns added.
    """
    features = build_time_features_df(
        dt=df[time_col],
        conti_year_origin=origin_for_time_vars)
    for name in match_cols:
        if name in df.columns:
            # Columns that already exist in ``df`` are never overwritten.
            continue
        # ``.values`` makes the assignment positional rather than index-aligned.
        df[name] = features[name].values
    return df
def test_build_time_features_df_leap_years():
    """Checks ``toy`` scaled by 365 around the end of February.

    Three days of hourly data starting on Feb 28 are generated for a
    non-leap year (2019) and for a leap year (2020). In the leap year the
    expected values stay flat at 59.0 for the whole second day and continue
    from 59.0 on the third day.
    """
    hour_fraction = np.arange(24) / 24

    def scaled_toy(first_timestamp):
        # Three days of hourly timestamps starting at ``first_timestamp``.
        timestamps = pd.date_range(
            start=first_timestamp,
            periods=3 * 24,
            freq="H").tolist()
        frame = pd.DataFrame({"ts": timestamps})
        features = build_time_features_df(dt=frame["ts"], conti_year_origin=2019)
        return 365.0 * features["toy"]

    # Non-leap year: each day advances the scaled value by one.
    expected_non_leap = (
        np.repeat([58.0, 59.0, 60.0], 24)
        + np.tile(hour_fraction, 3))
    assert np.allclose(scaled_toy(dt(2019, 2, 28)), expected_non_leap)

    # Leap year: the second day contributes no intra-day increment.
    expected_leap = (
        np.repeat([58.0, 59.0, 59.0], 24)
        + np.concatenate([hour_fraction, np.zeros(24), hour_fraction]))
    assert np.allclose(scaled_toy(dt(2020, 2, 28)), expected_leap)
def test_BuildTimeseriesFeaturesTransformer_1():
    """Verifies the transformer output equals the input joined with ``build_time_features_df``."""
    hourly_index = pd.date_range(
        start=datetime(2019, 1, 1),
        periods=100,
        freq="H")
    frame = pd.DataFrame({"ts": hourly_index.tolist()})

    transformer = BuildTimeseriesFeaturesTransformer(time_col="ts")
    observed = transformer.fit_transform(frame)

    # Reference result: the time column followed by the features computed directly.
    reference = pd.concat(
        [frame, build_time_features_df(dt=frame["ts"], conti_year_origin=2019)],
        axis=1)
    assert observed.equals(reference)
def transform(self, X):
    """Computes the time series features for the time column of ``X``.

    Parameters
    ----------
    X : pd.DataFrame
        Data frame that contains the column ``self.time_col``.

    Returns
    -------
    A new data frame holding the original time points followed by
    the calculated features.

    Raises
    ------
    NotFittedError
        If ``self.origin_for_time_vars`` is None, i.e. ``fit`` has not set it.
    """
    origin = self.origin_for_time_vars
    if origin is None:
        raise NotFittedError(
            "This instance is not fitted yet. Call 'fit' with appropriate arguments "
            "before calling 'transform'.")
    assert isinstance(X, pd.DataFrame)

    time_points = X[self.time_col]
    # NOTE(review): ``pd.concat(axis=1)`` aligns on the index; presumably the
    # features share the index of ``X`` - confirm for non-default indexes.
    return pd.concat(
        [time_points, build_time_features_df(time_points, conti_year_origin=origin)],
        axis=1)
def test_build_time_features_df():
    """Tests ``build_time_features_df``.

    Covers three scenarios:

        - one year of hourly data starting 2019-01-01, checked column by column,
        - a list of date strings around quarter boundaries in 2020 and 2021,
        - a zero-length input, which must raise ``ValueError``.
    """
    # Hourly timestamps: 24 * 365 periods starting 2019-01-01.
    date_list = pd.date_range(
        start=dt(2019, 1, 1),
        periods=24 * 365,
        freq="H").tolist()

    df0 = pd.DataFrame({"ts": date_list})
    time_df = build_time_features_df(dt=df0["ts"], conti_year_origin=2019)
    # Row 0 is 2019-01-01 00:00:00.
    assert time_df["datetime"][0] == datetime.datetime(2019, 1, 1, 0, 0, 0)
    assert time_df["date"][0] == datetime.date(2019, 1, 1)
    assert time_df["year"][0] == 2019
    assert time_df["year_length"][0] == 365
    assert time_df["quarter"][0] == 1
    # Row 24 * k is midnight k days after 2019-01-01:
    # 24 * 89 is 2019-03-31 and 24 * 91 is 2019-04-02.
    assert time_df["quarter_start"][0] == pd.to_datetime("2019-01-01")
    assert time_df["quarter_start"][24 * 89] == pd.to_datetime("2019-01-01")
    assert time_df["quarter_start"][24 * 91] == pd.to_datetime("2019-04-01")
    # "toq" is compared against days since quarter start over a length of 90 or 91.
    assert time_df["toq"][0] == 0
    assert time_df["toq"][24 * 16] == 16.0 / 90.0
    assert time_df["toq"][24 * 10] == 10.0 / 90.0
    assert time_df["toq"][24 * 89] == 89.0 / 90.0
    assert time_df["toq"][24 * 91] == 1.0 / 91.0
    assert time_df["month"][0] == 1
    assert time_df["month_length"][0] == 31
    assert time_df["woy"][0] == 1
    assert time_df["doy"][0] == 1
    assert time_df["dom"][0] == 1
    # 2019-01-01 is expected to be encoded as day of week 2 ("2-Tue").
    assert time_df["dow"][0] == 2
    assert time_df["str_dow"][0] == "2-Tue"
    assert time_df["hour"][0] == 0
    assert time_df["minute"][0] == 0
    assert time_df["second"][0] == 0
    # String-valued combination columns.
    assert time_df["year_month"][0] == "2019-01"
    assert time_df["year_woy"][0] == "2019_01"
    assert time_df["month_dom"][0] == "01/01"
    assert time_df["year_woy_dow"][0] == "2019_01_2"
    assert time_df["dow_hr"][0] == "2_00"
    assert time_df["dow_hr_min"][0] == "2_00_00"
    # Continuous "time of ..." columns at the first row.
    assert time_df["tod"][0] == 0.0
    assert time_df["tow"][0] == 1.0
    assert time_df["tom"][0] == 0.0 / 31
    assert time_df["toy"][0] == 0.0
    assert time_df["conti_year"][0] == 2019.0
    assert not time_df["is_weekend"][0]
    # Rows 0, 24 * 3, 24 * 4, 24 * 5 are the first, fourth, fifth and sixth day.
    assert time_df["dow_grouped"][0] == "1234-MTuWTh"
    assert time_df["dow_grouped"][24 * 3] == "5-Fri"
    assert time_df["dow_grouped"][24 * 4] == "6-Sat"
    assert time_df["dow_grouped"][24 * 5] == "7-Sun"

    # detailed check on dow_hr
    # Every 7th hourly row; the 25th sampled row is 168 hours (one week)
    # after the first, so the expected value repeats.
    assert list(time_df["dow_hr"])[::7][:25] == [
        '2_00', '2_07', '2_14', '2_21', '3_04', '3_11', '3_18', '4_01', '4_08',
        '4_15', '4_22', '5_05', '5_12', '5_19', '6_02', '6_09', '6_16', '6_23',
        '7_06', '7_13', '7_20', '1_03', '1_10', '1_17', '2_00'
    ]  # noqa: E501

    # All continuous-time growth terms are expected to be zero at the first row.
    assert time_df["ct1"][0] == 0.0
    assert time_df["ct2"][0] == 0.0
    assert time_df["ct3"][0] == 0.0
    assert time_df["ct_sqrt"][0] == 0.0
    assert time_df["ct_root3"][0] == 0.0

    # Row 50 is 50 hours in; expressed as a fraction of a 365-day year.
    ct1 = 50.0 / 365 / 24
    assert time_df["ct1"][50] == pytest.approx(ct1, rel=1e-3)
    assert time_df["ct2"][50] == pytest.approx(ct1**2, rel=1e-3)
    assert time_df["ct3"][50] == pytest.approx(ct1**3, rel=1e-3)
    assert time_df["ct_sqrt"][50] == pytest.approx(ct1**0.5, rel=1e-3)
    assert time_df["ct_root3"][50] == pytest.approx(ct1**(1 / 3), rel=1e-3)

    # Quarter features on a plain list of date strings
    # (the last two entries also carry an hour component).
    quarter_dates = [
        "2020-01-01", "2020-03-31",  # Q1 2020 (leap year)
        "2020-04-01", "2020-06-30",  # Q2 2020
        "2020-07-01", "2020-09-30",  # Q3 2020
        "2020-10-01", "2020-12-31",  # Q4 2020
        "2021-01-01", "2021-03-31",  # Q1 2021
        "2021-05-13-12", "2021-08-03-18",  # Q2/3 2021
    ]
    time_df = build_time_features_df(quarter_dates, conti_year_origin=2020.0)
    assert_equal(time_df["quarter_start"], pd.Series(
        pd.to_datetime([
            "2020-01-01", "2020-01-01",
            "2020-04-01", "2020-04-01",
            "2020-07-01", "2020-07-01",
            "2020-10-01", "2020-10-01",
            "2021-01-01", "2021-01-01",
            "2021-04-01", "2021-07-01",
        ])), check_names=False)
    assert_equal(time_df["quarter_length"], pd.Series([
        91, 91,
        91, 91,
        92, 92,
        92, 92,
        90, 90,
        91, 92,
    ]), check_names=False)
    # "doq" is expected to be 1 on the first day of a quarter.
    assert_equal(time_df["doq"], pd.Series([
        1, 91,
        1, 91,
        1, 92,
        1, 92,
        1, 90,
        43, 34,
    ]), check_names=False)
    # "toq" is expected to be 0.0 at quarter start; the last two values
    # include the hour of day as a fraction (12h -> .5, 18h -> .75).
    assert_equal(time_df["toq"], pd.Series([
        0.0, 90.0 / 91.0,
        0.0, 90.0 / 91.0,
        0.0, 91.0 / 92.0,
        0.0, 91.0 / 92.0,
        0.0, 89.0 / 90.0,
        42.5 / 91.0, 33.75 / 92.0,
    ]), check_names=False)

    # Checks for exception
    # An empty slice of the time column must be rejected.
    with pytest.raises(ValueError, match="Length of dt cannot be zero."):
        build_time_features_df(dt=df0.iloc[0:0]["ts"], conti_year_origin=2019)
def test_get_changepoint_values_from_config(hourly_data):
    """Checks ``get_changepoint_values_from_config`` against the direct helpers.

    Invalid configurations must raise; the "uniform" and "custom" methods
    must return the same values as ``get_evenly_spaced_changepoints_values``
    and ``get_custom_changepoints_values`` respectively, with and without
    an explicit ``continuous_time_col``.
    """
    training = hourly_data["train_df"]
    origin = get_default_origin_for_time_vars(training, TIME_COL)
    time_features_df = build_time_features_df(
        dt=training[TIME_COL],
        conti_year_origin=origin)

    def from_config(changepoints_dict):
        # Every call in this test shares the same features and time column.
        return get_changepoint_values_from_config(
            changepoints_dict=changepoints_dict,
            time_features_df=time_features_df,
            time_col="datetime")

    # invalid configurations
    with pytest.raises(Exception, match="changepoint method must be specified"):
        from_config({"n_changepoints": 2})

    with pytest.raises(NotImplementedError, match="changepoint method.*not recognized"):
        from_config({"method": "not implemented"})

    # tests uniform method
    observed = from_config({
        "method": "uniform",
        "n_changepoints": 20
    })
    reference = get_evenly_spaced_changepoints_values(
        df=time_features_df,
        n_changepoints=20)
    assert np.array_equal(observed, reference)

    observed = from_config({
        "method": "uniform",
        "n_changepoints": 20,
        "continuous_time_col": "ct2"
    })
    reference = get_evenly_spaced_changepoints_values(
        df=time_features_df,
        n_changepoints=20,
        continuous_time_col="ct2")
    assert np.array_equal(observed, reference)

    # tests custom method
    dates = ["2018-01-01", "2019-01-02-16", "2019-01-03", "2019-02-01"]
    observed = from_config({
        "method": "custom",
        "dates": dates
    })
    reference = get_custom_changepoints_values(
        df=time_features_df,
        changepoint_dates=dates,
        time_col="datetime")
    assert np.array_equal(observed, reference)

    observed = from_config({
        "method": "custom",
        "dates": dates,
        "continuous_time_col": "ct2"
    })
    reference = get_custom_changepoints_values(
        df=time_features_df,
        changepoint_dates=dates,
        time_col="datetime",
        continuous_time_col="ct2")
    assert np.array_equal(observed, reference)
def generate_df_for_tests(
        freq,
        periods,
        train_start_date=datetime.datetime(2018, 7, 1),
        train_end_date=None,
        train_frac=0.8,
        conti_year_origin=None,
        noise_std=2.0,
        remove_extra_cols=True,
        autoreg_coefs=None,
        fs_coefs=(-1, 3, 4),
        growth_coef=3.0,
        growth_pow=1.1,
        intercept=0.0):
    """Generates dataset for unit tests.

    The random seed is fixed inside the function (``np.random.seed(123)``),
    so repeated calls with the same arguments produce the same noise.

    :param freq: str
        pd.date_range freq parameter, e.g. H or D
    :param periods: int
        number of periods to generate
    :param train_start_date: datetime.datetime
        train start date
    :param train_end_date: Optional[datetime.datetime]
        train end date
    :param train_frac: Optional[float]
        fraction of data to use for training
        only used if train_end_date isn't provided.
        The derived row is clamped to the last row, so ``train_frac=1.0``
        gives an empty test set instead of a failed lookup.
    :param noise_std: float
        standard deviation of gaussian noise
    :param conti_year_origin: float
        the time origin for continuous time variables.
        If None, it is obtained from ``get_default_origin_for_time_vars``.
    :param remove_extra_cols: bool
        whether to remove extra columns besides TIME_COL, VALUE_COL
    :param autoreg_coefs: Optional[List[int]]
        The coefficients for the autoregressive terms.
        If provided the generated series denoted mathematically by Y(t) is
        intended to be converted as follows:
        Y(t) -> Y(t) + c1 Y(t-1) + c2 Y(t-2) + c3 Y(t-3) + ...
        where autoreg_coefs = [c1, c2, c3, ...]
        In this fashion, the obtained series will have autoregressive
        properties not explained by seasonality and growth.
        NOTE(review): the implementation multiplies the i-th coefficient
        (0-based) by ``VALUE_COL.shift(-i)``, i.e. by the current and the
        following observations rather than by lags, and rows near the end
        that have no following observation stay NaN (``bfill`` cannot fill
        trailing gaps). This is left as is because fixture values may be
        pinned by existing tests - confirm before changing.
    :param fs_coefs: Sequence[float]
        The fourier series coefficients used: element 0 multiplies the
        first-order sine term of "tod", element 1 that of "tow" and
        element 2 that of "toy".
    :param growth_coef: float
        Multiplier for growth
    :param growth_pow: float
        Power for growth, as function of continuous time ("ct1")
    :param intercept: float
        Constant term added to Y(t)
    :return: Dict[str, any]
        contains full dataframe ("df"), train dataframe ("train_df"),
        test dataframe ("test_df"), and nrows in test dataframe
        ("fut_time_num")
    """
    np.random.seed(123)

    date_list = pd.date_range(
        start=train_start_date,
        periods=periods,
        freq=freq).tolist()
    df0 = pd.DataFrame({TIME_COL: date_list})
    if conti_year_origin is None:
        conti_year_origin = get_default_origin_for_time_vars(df0, TIME_COL)
    time_df = build_time_features_df(
        dt=df0[TIME_COL],
        conti_year_origin=conti_year_origin)
    df = pd.concat([df0, time_df], axis=1)
    df["growth"] = growth_coef * (df["ct1"]**growth_pow)

    # First-order fourier terms for yearly, weekly and daily seasonality.
    func = fourier_series_multi_fcn(
        col_names=["toy", "tow", "tod"],
        periods=[1.0, 7.0, 24.0],
        orders=[1, 1, 1],
        seas_names=None)
    df_seas = func(df)["df"]
    df = pd.concat([df, df_seas], axis=1)

    df[VALUE_COL] = (
        intercept
        + df["growth"]
        + fs_coefs[0] * df[get_fourier_col_name(1, "tod", function_name="sin")]
        + fs_coefs[1] * df[get_fourier_col_name(1, "tow", function_name="sin")]
        + fs_coefs[2] * df[get_fourier_col_name(1, "toy", function_name="sin")]
        + noise_std * np.random.normal(size=df.shape[0]))

    if autoreg_coefs is not None:
        # Accumulates in a local series instead of a temporary dataframe column.
        # See the NOTE(review) in the docstring about the shift direction.
        new_value = df[VALUE_COL]
        for i, coef in enumerate(autoreg_coefs):
            new_value = (new_value + coef * df[VALUE_COL].shift(-i)).bfill()
        df[VALUE_COL] = new_value

    if train_end_date is None:
        # Clamps to the last row so that train_frac == 1.0 does not index
        # one past the end of the data.
        train_rows = min(
            int(np.floor(train_frac * df.shape[0])),
            df.shape[0] - 1)
        train_end_date = df[TIME_COL][train_rows]

    if remove_extra_cols:
        df = df[[TIME_COL, VALUE_COL]]

    # The row at ``train_end_date`` itself belongs to the training set.
    train_df = df.loc[df[TIME_COL] <= train_end_date]
    test_df = df.loc[df[TIME_COL] > train_end_date]
    fut_time_num = test_df.shape[0]

    return {
        "df": df,
        "train_df": train_df.reset_index(drop=True),
        "test_df": test_df.reset_index(drop=True),
        "fut_time_num": fut_time_num,
    }
# For a full list of such features, see `~greykite.common.features.timeseries_features.build_time_features_df`. # # If a feature is not automatically created by ``SILVERKITE``, we need to create it # beforehand and append it to the data df. # Here we create the "is_football_season" feature. # Note that we also need to provide the customized column for the forecast horizon period as well. # The way we do it is to first create the df with timestamps covering the forecast horizon. # This can be done with the `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.make_future_dataframe` # function within the `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries` class. # Then we create a new column of our customized regressor for this augmented df. # Makes augmented df with forecast horizon 365 days df_full = ts.make_future_dataframe(periods=365) # Builds "df_features" that contains datetime information of the "df" df_features = build_time_features_df( dt=df_full["ts"], conti_year_origin=convert_date_to_continuous_time(df_full["ts"][0])) # Roughly approximates the football season. # "woy" is short for "week of year", created above. # Football season is roughly the first 6 weeks and last 17 weeks in a year. is_football_season = (df_features["woy"] <= 6) | (df_features["woy"] >= 36) # Adds the new feature to the dataframe. df_full["is_football_season"] = is_football_season.astype(int).tolist() df_full.reset_index(drop=True, inplace=True) # Configures regressor column. regressors = {"regressor_cols": ["is_football_season"]} # %% # Interactions
def test_group_silverkite_seas_components():
    """Checks ``group_silverkite_seas_components`` for each seasonality column.

    For every case a time feature from ``build_time_features_df`` is stored
    under a ``*_SEASONALITY`` column name and the grouped result is compared
    with the expected frame.
    """
    diagnostics = SilverkiteDiagnostics()
    time_col = "ts"

    def grouped(start, end, freq, seas_col, feature):
        # Builds a two-column frame (time, seasonality) and groups it.
        timestamps = pd.date_range(start=start, end=end, freq=freq).tolist()
        features = build_time_features_df(timestamps, conti_year_origin=2018)
        frame = pd.DataFrame({
            time_col: features["datetime"],
            seas_col: features[feature]
        })
        return diagnostics.group_silverkite_seas_components(frame)

    # Daily
    observed = grouped("2018-01-01", "2018-01-07", "H", "DAILY_SEASONALITY", "hour")
    assert_frame_equal(observed, pd.DataFrame({
        "Hour of day": np.arange(24.0),
        "daily": np.arange(24.0),
    }))

    # Weekly
    observed = grouped("2018-01-01", "2018-01-20", "D", "WEEKLY_SEASONALITY", "tow")
    assert_frame_equal(observed, pd.DataFrame({
        "Day of week": np.arange(7.0),
        "weekly": np.arange(7.0),
    }))

    # Monthly
    observed = grouped("2018-01-01", "2018-01-31", "D", "MONTHLY_SEASONALITY", "dom")
    assert_frame_equal(observed, pd.DataFrame({
        "Time of month": np.arange(31.0) / 31,
        "monthly": np.arange(1.0, 32.0),
    }))

    # Quarterly (92 day quarters)
    observed = grouped("2018-07-01", "2018-12-31", "D", "QUARTERLY_SEASONALITY", "toq")
    assert_frame_equal(observed, pd.DataFrame({
        "Time of quarter": np.arange(92.0) / 92,
        "quarterly": np.arange(92.0) / 92,
    }))

    # Quarterly (90 day quarter)
    observed = grouped("2018-01-01", "2018-03-31", "D", "QUARTERLY_SEASONALITY", "toq")
    assert_frame_equal(observed, pd.DataFrame({
        "Time of quarter": np.arange(90.0) / 90,
        "quarterly": np.arange(90.0) / 90,
    }))

    # Yearly (non-leap years)
    observed = grouped("2018-01-01", "2019-12-31", "D", "YEARLY_SEASONALITY", "toy")
    assert_frame_equal(observed, pd.DataFrame({
        "Time of year": np.arange(365.0) / 365,
        "yearly": np.arange(365.0) / 365,
    }))
def add_groupby_column(
        df,
        time_col,
        groupby_time_feature=None,
        groupby_sliding_window_size=None,
        groupby_custom_column=None):
    """Extracts a column to group by from ``df``.

    Exactly one of ``groupby_time_feature``, ``groupby_sliding_window_size``,
    ``groupby_custom_column`` must be provided.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Contains the univariate time series / forecast.
        It is not modified; the result is built on a copy.
    time_col : `str`
        The name of the time column of the univariate time series / forecast
    groupby_time_feature : `str` or None, optional
        If provided, groups by a column generated by
        `~greykite.common.features.timeseries_features.build_time_features_df`.
        See that function for valid values.
    groupby_sliding_window_size : `int` or None, optional
        If provided, sequentially partitions data into groups of size
        ``groupby_sliding_window_size``.
    groupby_custom_column : `pandas.Series` or None, optional
        If provided, groups by this column value. Should be same length as the ``df``.
        Its values are assigned by position, so its index is ignored.

    Returns
    -------
    result : `dict`
        Dictionary with two items:

        * ``"df"`` : `pandas.DataFrame`
            ``df`` with a grouping column added.
            The column can be used to group rows together.

        * ``"groupby_col"`` : `str`
            The name of the groupby column added to ``df``.
            The column name depends on the grouping method:

                - ``groupby_time_feature`` for ``groupby_time_feature``
                - ``{time_col}_downsample`` for ``groupby_sliding_window_size``
                - ``groupby_custom_column.name`` for ``groupby_custom_column``
                  (``"groups"`` if the series has no name).

    Raises
    ------
    ValueError
        If none or more than one of the three grouping options is provided.
    """
    # Works on a copy so the caller's data frame is left untouched.
    df = df.copy()
    # Rebuilds the time column on a default integer index, which the
    # positional lookup in the sliding window branch relies on.
    dt = pd.Series(df[time_col].values)

    # Determines the groups
    n_methods = sum(
        option is not None
        for option in (
            groupby_time_feature,
            groupby_sliding_window_size,
            groupby_custom_column))
    if n_methods != 1:
        raise ValueError(
            "Exactly one of (groupby_time_feature, groupby_sliding_window_size, groupby_custom_column) "
            "must be specified")

    if groupby_time_feature is not None:
        # Group by a value derived from the time column
        time_features = build_time_features_df(dt, conti_year_origin=dt.min().year)
        groups = time_features[groupby_time_feature]
        groups.name = groupby_time_feature
    elif groupby_sliding_window_size is not None:
        # Group by sliding window for evaluation over time.
        # ``which_group_complete="last"`` requests that the last group be
        # the complete one (the first group may then be partial).
        index_dates = split_range_into_groups(
            n=df.shape[0],
            group_size=groupby_sliding_window_size,
            which_group_complete="last")
        # Labels each row with a timestamp looked up from its group index
        # (presumably the first date of the group; verify against
        # ``split_range_into_groups``).
        groups = dt[index_dates * groupby_sliding_window_size]
        groups.name = f"{time_col}_downsample"
    else:
        # Group by custom column
        groups = groupby_custom_column

    groups_col_name = groups.name if groups.name is not None else "groups"
    df[groups_col_name] = groups.values
    if df.index.name in df.columns:
        # Removes ambiguity in case the index name is the same as the newly added column,
        # (or an existing column).
        df.index.name = None

    return {"df": df, "groupby_col": groups_col_name}