def test_add_time_features_df():
    """Tests add_time_features_df.

    Two inputs are checked: a small hourly frame whose index is set to its
    own timestamps, and the ``"train_df"`` entry returned by
    ``generate_df_with_reg_for_tests``. For each, the first ``"year"`` value
    and the output row count are verified; the second case also checks the
    first three ``"dow_hr"`` values.
    """
    # create indexed input
    date_list = pd.date_range(
        start=datetime.datetime(2019, 1, 1),
        periods=100,
        freq="H").tolist()
    df0 = pd.DataFrame({TIME_COL: date_list}, index=date_list)
    df = add_time_features_df(
        df=df0,
        time_col=TIME_COL,
        conti_year_origin=2018)
    # The input is indexed by timestamps, so the first row is selected by
    # position with `.iloc`. A plain `series[0]` on a non-integer index
    # depends on pandas' deprecated positional fallback.
    assert df["year"].iloc[0] == 2019
    assert df.shape[0] == df0.shape[0]

    hourly_data = generate_df_with_reg_for_tests(
        freq="H",
        periods=24 * 500,
        train_start_date=datetime.datetime(2018, 7, 1),
        conti_year_origin=2018)
    cols = [TIME_COL, "regressor1", "regressor_bool", "regressor_categ"]
    train_df = hourly_data["train_df"]
    df = add_time_features_df(
        df=train_df[cols],
        time_col=TIME_COL,
        conti_year_origin=2018)
    # Positional access again, so the check does not depend on how
    # `train_df` happens to be indexed.
    assert df["year"].iloc[0] == 2018
    assert (df["dow_hr"].iloc[:3] == ["7_00", "7_01", "7_02"]).all()
    assert df.shape[0] == train_df.shape[0]
def test_add_daily_events():
    """Tests add_daily_events.

    Holiday events for three countries are attached to an hourly frame that
    already carries time features. The test checks two event labels and
    that the second column name of every events frame is the same before
    and after the call.
    """
    country_list = ["US", "India", "UK"]

    def second_col_names(events):
        """Returns the second column name of each country's events frame."""
        return [events[name].columns[1] for name in country_list]

    # generate events dictionary
    holidays = get_holidays(country_list, year_start=2015, year_end=2025)
    names_before = second_col_names(holidays)

    # generate temporal data
    timestamps = pd.date_range(
        start=dt(2019, 1, 1),
        periods=100,
        freq="H").tolist()
    features_df = add_time_features_df(
        pd.DataFrame({"ts": timestamps}),
        time_col="ts",
        conti_year_origin=2018)
    with_events = add_daily_events(
        df=features_df,
        event_df_dict=holidays,
        date_col="date")

    india_col = f"{EVENT_PREFIX}_India"
    assert with_events[india_col].values[0] == "New Year's Day"
    us_col = f"{EVENT_PREFIX}_US"
    assert with_events[us_col].values[25] == ""

    # makes sure the function does not modify the input
    names_after = second_col_names(holidays)
    assert names_before == names_after
def test_get_custom_changepoints():
    """Tests get_custom_changepoints and get_changepoint_features.

    The same four requested changepoint dates are used against a daily
    frame and an hourly frame, passed either as datetimes or as strings.
    A request with no matching date is also checked.
    """
    time_col = "custom_time_col"
    requested = ["2018-01-01", "2019-01-02-16", "2019-01-03", "2019-02-01"]

    def frame_with_features(periods, freq):
        """Builds a frame starting 2019-01-01 and adds time features to it."""
        stamps = pd.date_range(
            start=dt(2019, 1, 1),
            periods=periods,
            freq=freq).tolist()
        return add_time_features_df(
            pd.DataFrame({time_col: stamps}),
            time_col="custom_time_col",
            conti_year_origin=2018)

    df = frame_with_features(periods=20, freq="D")

    # dates as datetime
    result = get_custom_changepoints_values(
        df=df,
        changepoint_dates=pd.to_datetime(requested),
        time_col=time_col,  # pd.Timestamp type
        continuous_time_col=time_col  # makes checking the result easier
    )
    # 2018-01-01 is mapped to 2019-01-01. Duplicates mapped to "2019-01-03" are merged
    # Last requested changepoint is not found
    expected_daily = pd.to_datetime(["2019-01-01", "2019-01-03"])
    assert np.all(result == expected_daily)

    # dates as strings
    result = get_custom_changepoints_values(
        df=df,
        changepoint_dates=requested,
        time_col="date",  # datetime.date type
        continuous_time_col=time_col)
    expected_daily = pd.to_datetime(["2019-01-01", "2019-01-03"])
    assert np.all(result == expected_daily)

    # continuous_time_col different from time_col
    # check using timestamps from last `result`
    changepoints = get_custom_changepoints_values(
        df=df,
        changepoint_dates=requested,
        time_col=time_col,
        continuous_time_col="ct1")
    matched_rows = df[time_col].isin(result)
    assert np.all(changepoints == df.loc[matched_rows, "ct1"].values)

    # generated features, using changepoints from above
    changepoint_df = get_changepoint_features(
        df,
        changepoints,
        continuous_time_col="ct1",
        growth_func=lambda x: x)
    n_rows = df.shape[0]
    assert changepoint_df.shape == (n_rows, len(result))
    assert round(changepoint_df.iloc[15, 0], 3) == 0.041
    assert round(changepoint_df.iloc[16, 1], 3) == 0.038

    # no matching dates
    result = get_custom_changepoints_values(
        df=df,
        changepoint_dates=["2019-02-01"],
        time_col=time_col,
        continuous_time_col=time_col)
    assert result is None

    # hourly data, provided changepoints at daily level
    df = frame_with_features(periods=20 * 24, freq="H")

    # dates as datetime
    result = get_custom_changepoints_values(
        df=df,
        changepoint_dates=pd.to_datetime(requested),
        time_col=time_col,  # pd.Timestamp type
        continuous_time_col=time_col  # makes checking the result easier
    )
    # 2018-01-01 is mapped to 2019-01-01-00. Mapped to -00 if no hour provided
    # Last requested changepoint is not found
    expected_hourly = pd.to_datetime(
        ["2019-01-01-00", "2019-01-02-16", "2019-01-03-00"])
    assert np.all(result == expected_hourly)
def test_get_evenly_spaced_changepoint_values():
    """Tests get_evenly_spaced_changepoints_values with get_changepoint_features.

    A ten-row frame with an integer time column is used to check the exact
    feature values for three evenly spaced changepoints, once with
    ``growth_func=None`` and once with a quadratic growth function. An hourly
    frame with time features is then used to check the output shape and two
    values in the last changepoint column.
    """
    df = pd.DataFrame({
        "time_col": np.arange(1, 11),
        "ts": pd.date_range(start="2020-01-01", periods=10, freq="AS")
    })
    changepoints = get_evenly_spaced_changepoints_values(
        df,
        "time_col",
        n_changepoints=3)
    changepoint_dates = get_changepoint_dates_from_changepoints_dict(
        changepoints_dict={
            "method": "uniform",
            "n_changepoints": 3
        },
        df=df,
        time_col="ts")

    # linear growth
    changepoint_df = get_changepoint_features(
        df,
        changepoints,
        continuous_time_col="time_col",
        growth_func=None,
        changepoint_dates=changepoint_dates)
    expected = pd.DataFrame({
        "changepoint0_2022_01_01_00": [0, 0, 0, 1, 2, 3, 4, 5, 6, 7],
        "changepoint1_2025_01_01_00": [0, 0, 0, 0, 0, 0, 1, 2, 3, 4],
        "changepoint2_2027_01_01_00": [0, 0, 0, 0, 0, 0, 0, 0, 1, 2]
    })
    assert changepoint_df.equals(expected)

    # quadratic growth
    changepoint_df = get_changepoint_features(
        df,
        changepoints,
        continuous_time_col="time_col",
        growth_func=lambda x: x**2,
        changepoint_dates=changepoint_dates)
    expected = pd.DataFrame({
        "changepoint0_2022_01_01_00": [0, 0, 0, 1, 4, 9, 16, 25, 36, 49],
        "changepoint1_2025_01_01_00": [0, 0, 0, 0, 0, 0, 1, 4, 9, 16],
        "changepoint2_2027_01_01_00": [0, 0, 0, 0, 0, 0, 0, 0, 1, 4]
    })
    assert changepoint_df.equals(expected)

    # real example
    n_changepoints = 3
    date_list = pd.date_range(
        start=dt(2019, 1, 1),
        periods=20,
        freq="H").tolist()
    df0 = pd.DataFrame({"ts": date_list})
    df = add_time_features_df(df0, time_col="ts", conti_year_origin=2018)
    changepoints = get_evenly_spaced_changepoints_values(
        df,
        "ct1",
        n_changepoints=n_changepoints)
    changepoint_dates = get_changepoint_dates_from_changepoints_dict(
        changepoints_dict={
            "method": "uniform",
            "n_changepoints": 3
        },
        df=df,
        time_col="ts")
    changepoint_df = get_changepoint_features(
        df,
        changepoints,
        continuous_time_col="ct1",
        growth_func=lambda x: x,
        changepoint_dates=changepoint_dates)
    assert changepoint_df.shape == (df.shape[0], n_changepoints)
    assert changepoint_df.iloc[15, 2] == 0.0
    # This value comes out of floating-point arithmetic, so it is compared
    # with a tight relative tolerance instead of bit-for-bit equality, which
    # can differ in the last digits across platforms and library versions.
    assert np.isclose(
        changepoint_df.iloc[16, 2],
        0.00011415525113989133,
        rtol=1e-8,
        atol=0.0)