def test_gcd_irregular():
    """Checks sort and fill missing dates"""
    # Unsorted input with gaps between the timestamps.
    timestamps = [
        datetime.datetime(2018, 1, 1, 0, 0, second)
        for second in (1, 2, 10, 4)  # 10 is intentionally out of order
    ]
    input_df = pd.DataFrame({TIME_COL: timestamps, VALUE_COL: [1, 2, 3, 4]})

    # After canonicalization the rows are sorted and every second from
    # :01 through :10 is present, with NaN for the added timepoints.
    full_index = pd.date_range(
        start=datetime.datetime(2018, 1, 1, 0, 0, 1),
        end=datetime.datetime(2018, 1, 1, 0, 0, 10),
        freq="S")
    expected = pd.DataFrame({
        TIME_COL: full_index,
        VALUE_COL: [1, 2, np.nan, 4, np.nan, np.nan, np.nan, np.nan, np.nan, 3]
    })
    expected.index = expected[TIME_COL]
    expected.index.name = None

    # The frequency should be provided when there are gaps.
    canonical_data_dict = get_canonical_data(
        df=input_df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="S")
    assert canonical_data_dict["time_stats"]["added_timepoints"] == 6
    assert canonical_data_dict["time_stats"]["dropped_timepoints"] == 0
    assert_equal(canonical_data_dict["df"], expected)
    assert_equal(
        canonical_data_dict["time_stats"]["gaps"],
        find_missing_dates(input_df[TIME_COL]))
def test_load_data_anomaly():
    """Checks anomaly_info parameter"""
    data = DataLoaderTS().load_beijing_pm()
    value_col = "pm"

    # Without anomaly info, no pre-adjustment copy is kept.
    ts = UnivariateTimeSeries()
    ts.load_data(df=data, value_col=value_col)
    assert ts.df_before_adjustment is None

    # Adjusts two columns.
    dim_one = "one"
    dim_two = "two"
    anomaly_df = pd.DataFrame({
        START_DATE_COL: ["2011-04-04-10", "2011-10-10-00", "2012-12-20-10"],
        END_DATE_COL: ["2011-04-05-20", "2011-10-11-23", "2012-12-20-13"],
        ADJUSTMENT_DELTA_COL: [np.nan, 100.0, -100.0],
        METRIC_COL: [dim_one, dim_one, dim_two]
    })
    anomaly_info = [
        {
            "value_col": value_col,
            "anomaly_df": anomaly_df,
            "start_date_col": START_DATE_COL,
            "end_date_col": END_DATE_COL,
            "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
            "filter_by_dict": {METRIC_COL: dim_one},
            "adjustment_method": "add"
        },
        {
            "value_col": "pres",
            "anomaly_df": anomaly_df,
            "start_date_col": START_DATE_COL,
            "end_date_col": END_DATE_COL,
            "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
            "filter_by_dict": {METRIC_COL: dim_two},
            "adjustment_method": "subtract"
        },
    ]
    ts = UnivariateTimeSeries()
    ts.load_data(df=data, value_col=value_col, anomaly_info=anomaly_info)

    # The time series adjustment must agree with `get_canonical_data`.
    canonical = get_canonical_data(
        df=data,
        value_col=value_col,
        anomaly_info=anomaly_info)
    assert_equal(ts.df, canonical["df"])
    assert_equal(ts.df_before_adjustment, canonical["df_before_adjustment"])
def test_gcd_freq():
    """Checks warning for frequency not matching inferred frequency"""
    # Daily data, but hourly frequency is requested.
    daily_df = pd.DataFrame({
        TIME_COL: [datetime.datetime(2018, 1, day, 0, 0, 0) for day in (1, 2, 3)],
        VALUE_COL: [1, 2, 3]
    })
    with pytest.warns(UserWarning) as record:
        canonical_data_dict = get_canonical_data(
            df=daily_df,
            time_col=TIME_COL,
            value_col=VALUE_COL,
            freq="H")
        assert "does not match inferred frequency" in record[0].message.args[0]
    # Two full days are filled at hourly resolution; the two interior
    # daily points already existed, hence 24 * 2 - 2 added timepoints.
    assert canonical_data_dict["time_stats"]["added_timepoints"] == 24 * 2 - 2
    assert canonical_data_dict["time_stats"]["dropped_timepoints"] == 0
def test_gcd_err():
    """Checks errors raised by ``get_canonical_data`` on invalid input"""
    input_df = pd.DataFrame({
        TIME_COL: [datetime.datetime(2018, 1, 1, 0, 0, sec) for sec in (3, 2, 1)],
        VALUE_COL: [1, 2, 3]
    })
    # Too few rows.
    with pytest.raises(ValueError, match="Time series has < 3 observations"):
        get_canonical_data(df=input_df.iloc[:2, ])
    # Time column name does not exist in the data.
    with pytest.raises(ValueError, match="column is not in input data"):
        get_canonical_data(df=input_df, time_col="time")
    # Value column name does not exist in the data.
    with pytest.raises(ValueError, match="column is not in input data"):
        get_canonical_data(df=input_df, value_col="value")
def test_gcd_train_end_date_regressor():
    """Tests train_end_date for data with regressors"""
    # 30 daily points starting 2018-01-01 (last point is 2018-01-30).
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=30,
        train_end_date=None,
        train_start_date=datetime.datetime(2018, 1, 1),
        remove_extra_cols=True,
        mask_test_actuals=True) if False else generate_df_with_reg_for_tests(
        freq="D",
        periods=30,
        train_start_date=datetime.datetime(2018, 1, 1),
        remove_extra_cols=True,
        mask_test_actuals=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + regressor_cols
    df = data["df"][keep_cols].copy()
    # Setting NaN values at the end:
    # regressor1 is null from Jan 29, regressor2 from Jan 27,
    # regressor_categ from Jan 25, and the value column from Jan 23.
    df.loc[df.tail(2).index, "regressor1"] = np.nan
    df.loc[df.tail(4).index, "regressor2"] = np.nan
    df.loc[df.tail(6).index, "regressor_categ"] = np.nan
    df.loc[df.tail(8).index, VALUE_COL] = np.nan

    # last date with a value
    result_train_end_date = datetime.datetime(2018, 1, 22)

    # default train_end_date, default regressor_cols
    with pytest.warns(UserWarning) as record:
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=None,
            regressor_cols=None)
        assert f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." \
               in record[0].message.args[0]
    assert canonical_data_dict["df"].shape == df.shape
    assert canonical_data_dict["fit_df"].shape == (22, 2)
    assert canonical_data_dict["regressor_cols"] == []
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL]
    assert canonical_data_dict["train_end_date"] == result_train_end_date
    assert canonical_data_dict["last_date_for_val"] == result_train_end_date
    assert canonical_data_dict["last_date_for_reg"] is None

    # train_end_date later than last date in df, all available regressor_cols
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 2, 10)
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=train_end_date,
            regressor_cols=regressor_cols)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." \
               in record[0].message.args[0]
    assert canonical_data_dict["fit_df"].shape == (22, 5)
    assert canonical_data_dict["regressor_cols"] == regressor_cols
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL] + regressor_cols
    assert canonical_data_dict["train_end_date"] == result_train_end_date
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
    # Jan 28 is the last date where all requested regressors are non-null
    # (regressor1 becomes null on Jan 29).
    assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(2018, 1, 28)

    # train_end_date in between last date in df and last date before null
    # user passes no regressor_cols
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 25)
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=train_end_date,
            regressor_cols=None)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." \
               in record[0].message.args[0]
    assert canonical_data_dict["fit_df"].shape == (22, 2)
    assert canonical_data_dict["regressor_cols"] == []
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL]
    assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_reg"] is None

    # train end date equal to last date before null
    # user requests a subset of the regressor_cols
    train_end_date = datetime.datetime(2018, 1, 22)
    regressor_cols = ["regressor2"]
    canonical_data_dict = get_canonical_data(
        df=df,
        train_end_date=train_end_date,
        regressor_cols=regressor_cols)
    assert canonical_data_dict["fit_df"].shape == (22, 3)
    assert canonical_data_dict["regressor_cols"] == regressor_cols
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL] + regressor_cols
    assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
    # regressor2 is null from Jan 27, so its last usable date is Jan 26.
    assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(2018, 1, 26)

    # train_end_date smaller than last date before null
    # user requests regressor_cols that does not exist in df
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 20)
        regressor_cols = ["regressor1", "regressor4", "regressor5"]
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=train_end_date,
            regressor_cols=regressor_cols)
        # Only regressor1 exists in df; the unknown columns are dropped.
        assert canonical_data_dict["fit_df"].shape == (20, 3)
        assert canonical_data_dict["regressor_cols"] == ["regressor1"]
        assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL, "regressor1"]
        assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 20)
        assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
        assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(2018, 1, 28)
        assert (f"The following columns are not available to use as "
                f"regressors: ['regressor4', 'regressor5']") in record[0].message.args[0]
def test_gcd_train_end_date():
    """Tests train_end_date for data without regressors"""
    # Hourly data 03:00-07:00; value is null at 04:00, 06:00, 07:00,
    # so the last timestamp with a non-null value is 05:00.
    df = pd.DataFrame({
        TIME_COL: [
            datetime.datetime(2018, 1, 1, 3, 0, 0),
            datetime.datetime(2018, 1, 1, 4, 0, 0),
            datetime.datetime(2018, 1, 1, 5, 0, 0),
            datetime.datetime(2018, 1, 1, 6, 0, 0),
            datetime.datetime(2018, 1, 1, 7, 0, 0)
        ],
        VALUE_COL: [1, np.nan, 3, np.nan, np.nan],
    })
    # no train_end_date
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 1, 5, 0, 0)
        get_canonical_data(df=df)
        assert f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({train_end_date})." in record[0].message.args[0]
    # train_end_date later than last date in df
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 1, 8, 0, 0)
        result_train_end_date = datetime.datetime(2018, 1, 1, 5, 0, 0)
        get_canonical_data(df, train_end_date=train_end_date)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." in record[0].message.args[0]
    # train_end_date in between last date in df and last date before
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 1, 6, 0, 0)
        result_train_end_date = datetime.datetime(2018, 1, 1, 5, 0, 0)
        get_canonical_data(df, train_end_date=train_end_date)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." \
               in record[0].message.args[0]
    # train end date equal to last date before null
    canonical_data_dict = get_canonical_data(
        df,
        train_end_date=datetime.datetime(2018, 1, 1, 5, 0, 0))
    assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 1, 5, 0, 0)
    # train_end_date smaller than last date before null
    canonical_data_dict = get_canonical_data(
        df,
        train_end_date=datetime.datetime(2018, 1, 1, 4, 0, 0))
    # fit_df contains only the rows up to 04:00 (the first two rows).
    assert_equal(canonical_data_dict["fit_df"], canonical_data_dict["df"].iloc[:2])
    assert canonical_data_dict["regressor_cols"] == []
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL]
    assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 1, 4, 0, 0)
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 1, 5, 0, 0)
    assert canonical_data_dict["last_date_for_reg"] is None
def test_gcd_load_data_anomaly():
    """Checks anomaly_info parameter"""
    dl = DataLoader()
    df = dl.load_beijing_pm()
    value_col = "pm"
    # no anomaly adjustment
    canonical_data_dict = get_canonical_data(
        df=df,
        time_col=TIME_COL,
        value_col=value_col)
    assert canonical_data_dict["df_before_adjustment"] is None
    dim_one = "one"
    dim_two = "two"
    # Three anomalies: the first masks values (NaN delta),
    # the others carry additive deltas of +100 and -100.
    anomaly_df = pd.DataFrame({
        START_DATE_COL: ["2011-04-04-10", "2011-10-10-00", "2012-12-20-10"],
        END_DATE_COL: ["2011-04-05-20", "2011-10-11-23", "2012-12-20-13"],
        ADJUSTMENT_DELTA_COL: [np.nan, 100.0, -100.0],
        METRIC_COL: [dim_one, dim_one, dim_two]  # used to filter rows in this df
    })
    # Adjusts one column (value_col)
    anomaly_info = {
        "value_col": value_col,
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {METRIC_COL: dim_one},
        "adjustment_method": "add"
    }
    canonical_data_dict2 = get_canonical_data(
        df=df,
        time_col=TIME_COL,
        value_col=value_col,
        anomaly_info=anomaly_info)
    # The unadjusted data is preserved in "df_before_adjustment".
    assert_equal(canonical_data_dict2["df_before_adjustment"], canonical_data_dict["df"])
    expected_df = canonical_data_dict["df"].copy()
    # first anomaly
    idx = ((expected_df[TIME_COL] >= anomaly_df[START_DATE_COL][0])
           & (expected_df[TIME_COL] <= anomaly_df[END_DATE_COL][0]))
    expected_df.loc[idx, VALUE_COL] = np.nan
    # second anomaly
    idx = ((expected_df[TIME_COL] >= anomaly_df[START_DATE_COL][1])
           & (expected_df[TIME_COL] <= anomaly_df[END_DATE_COL][1]))
    expected_df.loc[idx, VALUE_COL] += 100.0
    assert_equal(canonical_data_dict2["df"], expected_df)
    # Adjusts two columns
    value_col_two = "pres"  # second column to adjust
    anomaly_info = [
        anomaly_info, {
            "value_col": value_col_two,
            "anomaly_df": anomaly_df,
            "start_date_col": START_DATE_COL,
            "end_date_col": END_DATE_COL,
            "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
            "filter_by_dict": {METRIC_COL: dim_two},
            "adjustment_method": "subtract"
        }
    ]
    canonical_data_dict3 = get_canonical_data(
        df=df,
        time_col=TIME_COL,
        value_col=value_col,
        anomaly_info=anomaly_info)
    # third anomaly. The value is subtracted, according to `adjustment_method`.
    # Subtracting the -100.0 delta adds 100 to the pressure column.
    idx = ((expected_df[TIME_COL] >= anomaly_df[START_DATE_COL][2])
           & (expected_df[TIME_COL] <= anomaly_df[END_DATE_COL][2]))
    expected_df.loc[idx, value_col_two] -= -100.0
    assert_equal(canonical_data_dict3["df_before_adjustment"], canonical_data_dict["df"])
    assert_equal(canonical_data_dict3["df"], expected_df)
def test_gcd_dates():
    """Checks if regular data can be properly loaded. Checks time column stats"""
    # Regular secondly data with custom column names.
    input_df = pd.DataFrame({
        "time": [datetime.datetime(2018, 1, 1, 0, 0, sec) for sec in (1, 2, 3)],
        "val": [1, 2, 3]
    })
    result = get_canonical_data(df=input_df, time_col="time", value_col="val")
    assert_equal(result["time_stats"]["gaps"], find_missing_dates(input_df["time"]))
    assert result["time_stats"]["added_timepoints"] == 0
    assert result["time_stats"]["dropped_timepoints"] == 0
    assert result["freq"] == "S"
    assert_equal(result["df"][VALUE_COL].values, input_df["val"].values)
    assert result["df"].index.name is None

    # String timestamps parsed with an explicit date format.
    date_format = "%Y-%m-%d"
    input_df = pd.DataFrame({
        TIME_COL: ["2018-01-01", "2018-01-05", "2018-01-09"],
        VALUE_COL: [1, 2, 3]
    })
    result = get_canonical_data(
        df=input_df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        date_format=date_format,
        freq="4D")
    expected = pd.DataFrame({
        TIME_COL: pd.Series(
            [datetime.datetime(2018, 1, day, 0, 0, 0) for day in (1, 5, 9)],
            name=TIME_COL),
        VALUE_COL: input_df[VALUE_COL]
    })
    expected.index = expected[TIME_COL]
    expected.index.name = None
    assert result["time_stats"]["added_timepoints"] == 0
    assert result["time_stats"]["dropped_timepoints"] == 0
    assert result["freq"] == "4D"
    assert_equal(result["df"], expected)

    # String timestamps with inferred date format.
    result = get_canonical_data(
        df=input_df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="4D")
    assert result["time_stats"]["added_timepoints"] == 0
    assert result["time_stats"]["dropped_timepoints"] == 0
    assert_equal(result["df"], expected)

    # Time zone localization.
    result = get_canonical_data(
        df=input_df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="4D",
        tz="US/Pacific")
    expected = expected.tz_localize("US/Pacific")
    assert result["time_stats"]["added_timepoints"] == 0
    assert result["time_stats"]["dropped_timepoints"] == 0
    assert_equal(result["df"], expected)
def get_forecast_time_properties(
        df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq=None,
        regressor_cols=None,
        lagged_regressor_cols=None,
        train_end_date=None,
        forecast_horizon=None):
    """Returns the number of training points in `df`, the start year,
    and prediction end year

    Parameters
    ----------
    df : `pandas.DataFrame` with columns [``time_col``, ``value_col``]
        Univariate timeseries data to forecast
    time_col : `str`, default ``TIME_COL`` in constants.py
        Name of timestamp column in df
    value_col : `str`, default ``VALUE_COL`` in constants.py
        Name of value column in df (the values to forecast)
    freq : `str` or None, default None
        Frequency of input data. Used to generate future dates for prediction.
        Frequency strings can have multiples, e.g. '5H'. See
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
        for a list of frequency aliases.
        If None, inferred by pd.infer_freq.
        Provide this parameter if ``df`` has missing timepoints.
    regressor_cols : `list` [`str`] or None, optional, default None
        A list of regressor columns used in the training and prediction DataFrames.
        If None, no regressor columns are used.
        Regressor columns that are unavailable in ``df`` are dropped.
    lagged_regressor_cols : `list` [`str`] or None, optional, default None
        A list of lagged regressor columns used in the training and prediction DataFrames.
        If None, no lagged regressor columns are used.
        Lagged regressor columns that are unavailable in ``df`` are dropped.
    train_end_date : `datetime.datetime`, optional, default None
        Last date to use for fitting the model. Forecasts are generated after this date.
        If None, it is set to the last date with a non-null value in
        ``value_col`` of ``df``.
    forecast_horizon : `int` or None, default None
        Number of periods to forecast into the future. Must be > 0
        If None, default is determined from input data frequency

    Returns
    -------
    time_properties : `dict` [`str`, `any`]
        Time properties dictionary with keys:

        ``"period"`` : `int`
            Period of each observation (i.e. minimum time between observations, in seconds).
        ``"simple_freq"`` : `SimpleTimeFrequencyEnum`
            ``SimpleTimeFrequencyEnum`` member corresponding to data frequency.
        ``"num_training_points"`` : `int`
            Number of observations for training.
        ``"num_training_days"`` : `int`
            Number of days for training.
        ``"days_per_observation"``: `float`
            The time frequency in day units.
        ``"forecast_horizon"``: `int`
            The number of time intervals for which forecast is needed.
        ``"forecast_horizon_in_timedelta"``: `datetime.timedelta`
            The forecast horizon length in timedelta units.
        ``"forecast_horizon_in_days"``: `float`
            The forecast horizon length in day units.
        ``"start_year"`` : `int`
            Start year of the training period.
        ``"end_year"`` : `int`
            End year of the forecast period.
        ``"origin_for_time_vars"`` : `float`
            Continuous time representation of the first date in ``df``.
    """
    # Canonicalizes the input and extracts ``fit_df``, the data available
    # for fitting the model (time column in `datetime.datetime` format).
    canonical_data_dict = get_canonical_data(
        df=df,
        time_col=time_col,
        value_col=value_col,
        freq=freq,
        train_end_date=train_end_date,
        regressor_cols=[] if regressor_cols is None else regressor_cols,
        lagged_regressor_cols=lagged_regressor_cols)
    fit_df = canonical_data_dict["fit_df"]

    # Basic time properties of the training data.
    train_start = fit_df[TIME_COL].min()
    train_end = fit_df[TIME_COL].max()
    origin_for_time_vars = get_default_origin_for_time_vars(fit_df, TIME_COL)
    period = min_gap_in_seconds(df=fit_df, time_col=TIME_COL)
    simple_freq = get_simple_time_frequency_from_period(period)
    num_training_points = fit_df.shape[0]
    days_per_observation = period / TimeEnum.ONE_DAY_IN_SECONDS.value

    # Number of (fractional) days spanned by the training set.
    # One extra period is counted so the last observation's duration is included.
    time_delta = train_end - train_start
    num_training_days = time_delta.days + (
        time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value

    # Forecast horizon as a number of periods.
    if forecast_horizon is None:
        # expected to be kept in sync with default value set in ``get_default_time_parameters``
        forecast_horizon = get_default_horizon_from_period(
            period=period,
            num_observations=num_training_points)
    forecast_horizon_in_days = forecast_horizon * days_per_observation
    forecast_horizon_in_timedelta = timedelta(days=forecast_horizon_in_days)

    # End year of the forecast period: training end plus the horizon,
    # rounded up to whole days.
    future_end = train_end + timedelta(
        days=math.ceil(forecast_horizon * days_per_observation))

    return {
        "period": period,
        "simple_freq": simple_freq,
        "num_training_points": num_training_points,
        "num_training_days": num_training_days,
        "days_per_observation": days_per_observation,
        "forecast_horizon": forecast_horizon,
        "forecast_horizon_in_timedelta": forecast_horizon_in_timedelta,
        "forecast_horizon_in_days": forecast_horizon_in_days,
        "start_year": train_start.year,
        "end_year": future_end.year,
        "origin_for_time_vars": origin_for_time_vars
    }