def test_advanced_degrees(self, config, input_df, input_dataset_columns):
    """STL output must react to the seasonal/trend/low-pass degree parameters."""

    def fit_with_degrees(degree_params):
        # Rebuild the whole pipeline for one set of STL degree parameters.
        config["additional_parameters_STL"] = degree_params
        stl_config = STLConfig()
        stl_config.add_parameters(config, input_dataset_columns)
        preparator = TimeseriesPreparator(stl_config)
        prepared_df = preparator.prepare_timeseries_dataframe(input_df)
        return STLDecomposition(stl_config).fit(prepared_df)

    # All three degrees set to 1.
    result_df = fit_with_degrees({"seasonal_deg": "1", "trend_deg": "1", "low_pass_deg": "1"})
    assert result_df.shape == (26, 6)
    expected_trend = np.array([
        547017.8314, 537486.722, 528097.1954, 518846.2605, 509728.8989,
        500744.2034, 491895.324, 483188.5115, 474630.5299, 466256.2782,
        458496.2869, 454985.6935, 453114.0625, 452740.149, 453810.1866,
        456404.7768, 463218.9767, 470913.292, 478947.2522, 487217.229,
        495684.7824, 504325.6079, 513126.1746, 522081.8564, 531195.1428,
        540473.2835,
    ])
    np.testing.assert_array_equal(np.round(result_df.value1_trend.values, 4), expected_trend)
    assert np.mean(result_df["value1_trend"]) == 492101.0195351211
    assert np.mean(result_df["value1_seasonal"]) == 32625.652227975654
    assert np.mean(result_df["value1_residuals"]) == -5345.248686173698

    # Degree-0 (constant) trend with a degree-1 seasonal component.
    result_df = fit_with_degrees({"seasonal_deg": "1", "trend_deg": "0"})
    assert result_df.shape == (26, 6)
    expected_seasonal = np.array([
        334926.5396, 363552.8324, 380642.7497, 151182.772, -168020.8919,
        -209675.4339, -276299.0916, -289677.7104, -278165.1873, -126041.2679,
        -8513.4181, 175315.4394, 425222.6624, 396736.9844, 290811.7923,
        45628.9471, -110941.82, -272356.2149, -303391.2037, -338667.781,
        -295226.877, -106373.2845, 41186.7333, 274657.8578, 516720.1595,
        432742.083,
    ])
    np.testing.assert_array_equal(np.round(result_df.value1_seasonal.values, 4), expected_seasonal)
    assert np.mean(result_df["value1_trend"]) == 470658.0934271346
    assert np.mean(result_df["value1_seasonal"]) == 40229.89887290871
    assert np.mean(result_df["value1_residuals"]) == 8493.430776879803
def test_target_column_preparation(self, time_column_name, timeseries_identifiers_names, basic_config):
    """Target columns are validated during timeseries preparation.

    Numeric targets pass through unchanged, numeric-looking strings are
    coerced to numbers, while non-numeric targets or targets with missing
    values make preparation raise a ValueError.
    """
    df = pd.DataFrame({
        "date": ["2020-12-31", "2021-12-15", "2022-12-01"],
        "id": [1, 1, 1],
        "target": [1, 2, 3],
        "invalid_target": ["a", "b", "c"],
        "missing_target": [1, np.nan, 2],
        "unformatted_target": ["1", "2", "3"],
    })

    def build_preparator(target_column):
        # One fresh config per scenario; only the target column varies.
        dku_config = DecompositionConfig()
        basic_config["target_columns"] = [target_column]
        basic_config["frequency_unit"] = "12M"
        basic_config["season_length_12M"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        return TimeseriesPreparator(dku_config)

    # A numeric target is kept as-is.
    df_prepared = build_preparator("target").prepare_timeseries_dataframe(df)
    assert df_prepared.loc[0, "target"] == 1

    # Numeric-looking strings are coerced to numbers.
    df_prepared_unformatted = build_preparator("unformatted_target").prepare_timeseries_dataframe(df)
    assert df_prepared_unformatted.loc[0, "unformatted_target"] == 1

    # A non-numeric target must raise. Only the call under test lives inside
    # `pytest.raises` so a setup failure cannot be mistaken for the expected error.
    preparator = build_preparator("invalid_target")
    with pytest.raises(ValueError) as err:
        _ = preparator.prepare_timeseries_dataframe(df)
    assert "must be numeric" in str(err.value)

    # A target containing missing values must raise too.
    preparator = build_preparator("missing_target")
    with pytest.raises(ValueError) as err:
        _ = preparator.prepare_timeseries_dataframe(df)
    assert "missing value" in str(err.value)
def test_collision(self, basic_dku_config, input_df):
    """A source column that clashes with a decomposition output name gets suffixed."""
    basic_dku_config.target_columns = ["value1"]
    # Make the input already contain a "value1_trend" column.
    clashing_df = input_df.rename(columns={"value2": "value1_trend"})
    preparator = TimeseriesPreparator(basic_dku_config)
    prepared_df = preparator.prepare_timeseries_dataframe(clashing_df)
    results = MockDecomposition(basic_dku_config).fit(prepared_df)
    # The pre-existing column is renamed with a "_0" suffix while the freshly
    # computed components keep their canonical names and positions.
    assert results.columns[3] == "value1_trend_0"
    assert results.columns[4] == "value1_seasonal"
def test_long_format_multiple_ids(self, basic_dku_config, long_df_multiple_ids):
    """Decomposition handles long-format data keyed by two identifier columns."""
    basic_dku_config.long_format = True
    basic_dku_config.timeseries_identifiers = ["country", "items"]
    preparator = TimeseriesPreparator(basic_dku_config)
    prepared_long_df = preparator.prepare_timeseries_dataframe(long_df_multiple_ids)
    results = MockDecomposition(basic_dku_config).fit(prepared_long_df)
    # Identifier column survives preparation and keeps its row order.
    np.testing.assert_equal(results["country"].values, np.array([0, 0, 1, 1, 1]))
    assert np.mean(results["value1_trend"]) == 2.2
def df_from_freq(dku_config):
    """Build and prepare a short single-variable dataframe at the configured frequency."""
    observations = [315.58, 316.39, 316.79, 312.09, 321.08, 450.08, 298.79]
    raw_df = pd.DataFrame({
        "value1": observations,
        "date": pd.date_range("1-1-1959", periods=len(observations), freq=dku_config.frequency),
    })
    return TimeseriesPreparator(dku_config).prepare_timeseries_dataframe(raw_df)
def add_future_external_features(gluon_train_dataset, external_features_future_df, prediction_length, frequency):
    """Append the future external features to the 'feat_dynamic_real' arrays of each timeseries
    of the ListDataset used for training.

    First check that all timeseries are valid (regular time steps of the chosen frequency
    and they all have the same start date).

    Args:
        gluon_train_dataset (gluonts.dataset.common.ListDataset): ListDataset created with the GluonDataset class.
        external_features_future_df (DataFrame): Dataframe of future (dated after timeseries of
            gluon_train_dataset) external features.
        prediction_length (int): To check that external_features_future_df has the right length.
        frequency (str): To check that the time column has the right frequency and values.

    Raises:
        ValueError: If the length of external_features_future_df is not prediction_length.

    Returns:
        gluonts.dataset.common.ListDataset with future external features.
    """
    # Work on a deep copy so the training dataset passed in is never mutated.
    gluon_dataset = copy.deepcopy(gluon_train_dataset)
    if isinstance(to_offset(frequency), CUSTOMISABLE_FREQUENCIES_OFFSETS):
        # Customisable offsets were normalized at training time; reuse the
        # frequency actually stored in the trained dataset's transformation.
        frequency = gluon_train_dataset.process.trans[0].freq
    # NOTE(review): start_date and periods are initialized but never used below —
    # looks like leftover state from an earlier validation step; confirm before removing.
    start_date, periods = None, None
    for i, timeseries in enumerate(gluon_train_dataset):
        if TIMESERIES_KEYS.IDENTIFIERS in timeseries:
            # filter the dataframe to only get rows with the right identifiers
            timeseries_identifiers = timeseries[TIMESERIES_KEYS.IDENTIFIERS]
            conditions = [external_features_future_df[k] == v for k, v in timeseries_identifiers.items()]
            timeseries_external_features_future_df = apply_filter_conditions(external_features_future_df, conditions)
        else:
            # Single-timeseries dataset: the whole future dataframe applies.
            timeseries_external_features_future_df = external_features_future_df
        feat_dynamic_real_train = timeseries[TIMESERIES_KEYS.FEAT_DYNAMIC_REAL]
        feat_dynamic_real_columns_names = timeseries[TIMESERIES_KEYS.FEAT_DYNAMIC_REAL_COLUMNS_NAMES]
        time_column_name = timeseries[TIMESERIES_KEYS.TIME_COLUMN_NAME]
        # Re-run the same preparation (sorting, frequency checks) on the future rows.
        timeseries_preparator = TimeseriesPreparator(
            time_column_name=time_column_name,
            frequency=frequency,
        )
        timeseries_external_features_future_df = timeseries_preparator.prepare_timeseries_dataframe(timeseries_external_features_future_df)
        # Transpose to the (n_features, n_timesteps) layout used by feat_dynamic_real.
        feat_dynamic_real_future = timeseries_external_features_future_df[feat_dynamic_real_columns_names].values.T
        if feat_dynamic_real_future.shape[1] != prediction_length:
            raise ValueError(f"Please provide {prediction_length} future values of external features, as this was the forecasting horizon used for training")
        # Concatenate training-time and future features along the time axis.
        feat_dynamic_real_appended = np.append(feat_dynamic_real_train, feat_dynamic_real_future, axis=1)
        # Mutate the copy, matching timeseries by index with the source dataset.
        gluon_dataset.list_data[i][TIMESERIES_KEYS.FEAT_DYNAMIC_REAL] = feat_dynamic_real_appended
    return gluon_dataset
def test_multiplicative_model_with_negative_values(self, basic_dku_config, input_df):
    """A multiplicative model must be rejected when the target has negative values."""
    input_df.loc[0, "value1"] = -2
    prepared_df = TimeseriesPreparator(basic_dku_config).prepare_timeseries_dataframe(input_df)
    validator = DecompositionInputValidator(basic_dku_config)
    with pytest.raises(ValueError) as err:
        _ = validator.check(prepared_df)
    message = str(err.value)
    # The error should name the model type, the problem and the offending column.
    for fragment in ("multiplicative", "negative", "value1"):
        assert fragment in message
def test_insufficient_samples_2_ts_identifiers(self, basic_dku_config, long_df):
    """Validation fails when each (country, item) timeseries has too few samples."""
    basic_dku_config.long_format = True
    basic_dku_config.timeseries_identifiers = ["country", "item"]
    preparator = TimeseriesPreparator(basic_dku_config)
    too_short_df = preparator.prepare_timeseries_dataframe(long_df)
    validator = DecompositionInputValidator(basic_dku_config)
    with pytest.raises(ValueError) as err:
        _ = validator.check(too_short_df)
    message = str(err.value)
    # The error should state the requirement, both identifiers and the sizes found.
    for fragment in ("need at least", "country", "item", "[1 1 1 1]"):
        assert fragment in message
def test_single_target(self, basic_dku_config, input_df):
    """Each decomposition component column is filled with the mock's constant values."""
    basic_dku_config.target_columns = ["value1"]
    preparator = TimeseriesPreparator(basic_dku_config)
    prepared_df = preparator.prepare_timeseries_dataframe(input_df)
    results = MockDecomposition(basic_dku_config).fit(prepared_df)
    n_rows = prepared_df.shape[0]
    # MockDecomposition writes constant 1/2/3 into trend/seasonal/residuals.
    np.testing.assert_equal(results["value1_trend"], np.full(n_rows, 1.0))
    np.testing.assert_equal(results["value1_seasonal"], np.full(n_rows, 2.0))
    np.testing.assert_equal(results["value1_residuals"], np.full(n_rows, 3.0))
def test_long_format(self, basic_dku_config, long_df):
    """Long-format decomposition keeps per-identifier component values aligned."""
    basic_dku_config.long_format = True
    basic_dku_config.timeseries_identifiers = ["country"]
    preparator = TimeseriesPreparator(basic_dku_config)
    prepared_long_df = preparator.prepare_timeseries_dataframe(long_df)
    results = MockDecomposition(basic_dku_config).fit(prepared_long_df)
    expected_components = {
        "value1_trend": np.array([1, 1, 3, 3]),
        "value2_trend": np.array([2, 2, 4, 4]),
        "value1_seasonal": np.array([2, 2, 6, 6]),
        "value2_residuals": np.array([6, 6, 12, 12]),
    }
    for column, expected in expected_components.items():
        np.testing.assert_equal(results[column], expected)
def test_missing_values_identifiers():
    """A missing value in a timeseries-identifier column must fail preparation."""
    # Fixtures are built OUTSIDE pytest.raises so a failure here cannot be
    # mistaken for the ValueError under test.
    df = pd.DataFrame(
        {
            "date": ["2018-01-06", "2018-01-07", "2018-01-08", "2018-01-06", "2018-01-07", "2018-01-08"],
            "volume": [2, 4, 2, 5, 2, 5],
            # np.nan (np.NaN was removed in NumPy 2.0) marks the missing identifier.
            "item": [1, 1, np.nan, 2, 2, 2],
        }
    )
    timeseries_preparator = TimeseriesPreparator(
        time_column_name="date",
        frequency="D",
        target_columns_names=["volume"],
        timeseries_identifiers_names=["item"],
    )
    with pytest.raises(ValueError):
        timeseries_preparator.prepare_timeseries_dataframe(df)
def test_STL_additive(self, dku_config, input_df):
    """Additive STL decomposition reproduces the pinned trend values and means."""
    preparator = TimeseriesPreparator(dku_config)
    prepared_df = preparator.prepare_timeseries_dataframe(input_df)
    results = STLDecomposition(dku_config).fit(prepared_df)
    expected_trend = np.array([
        547017.8314, 537486.722, 528097.1954, 518846.2605, 509728.8989,
        500744.2034, 491895.324, 483188.5115, 474630.5299, 466256.2782,
        458496.2869, 454985.6935, 453114.0625, 452740.149, 453810.1866,
        456404.7768, 463218.9767, 470913.292, 478947.2522, 487217.229,
        495684.7824, 504325.6079, 513126.1746, 522081.8564, 531195.1428,
        540473.2835,
    ])
    np.testing.assert_equal(np.round(results["value1_trend"].values, 4), expected_trend)
    assert np.mean(results["value1_trend"]) == 492101.0195351211
    assert np.mean(results["value1_seasonal"]) == 32625.652227975654
    assert np.mean(results["value1_residuals"]) == -5345.248686173698
def test_STL_multiplicative(self, dku_config, input_df):
    """Multiplicative STL decomposition reproduces the pinned seasonal factors."""
    dku_config.model = "multiplicative"
    preparator = TimeseriesPreparator(dku_config)
    prepared_df = preparator.prepare_timeseries_dataframe(input_df)
    results = STLDecomposition(dku_config).fit(prepared_df)
    expected_seasonal = [
        1.87080328, 1.94864198, 1.97546651, 1.47349625, 0.74672304,
        0.6552587, 0.5000725, 0.46825876, 0.49417933, 0.86890043,
        1.16434155, 1.63725892, 2.17084151, 2.106642, 1.95377386,
        1.32400823, 0.92620183, 0.51855162, 0.44493062, 0.35877353,
        0.47054681, 0.94481716, 1.30967762, 1.88240591, 2.51946737,
        2.28270725,
    ]
    np.testing.assert_equal(np.round(results["value1_seasonal"].values, 8), expected_seasonal)
    assert np.mean(results["value1_trend"]) == 409265.35453951
    assert np.mean(results["value1_seasonal"]) == 1.2698748679749627
    assert np.mean(results["value1_residuals"]) == 0.9941032097902623
# Build the per-model hyperparameter dicts from the recipe configuration.
models_parameters = get_models_parameters(
    config, is_training_multivariate=params["is_training_multivariate"])
start = perf_counter()
training_df = params["training_dataset"].get_dataframe()
# Sort/validate/truncate the raw training data before model training.
timeseries_preparator = TimeseriesPreparator(
    time_column_name=params["time_column_name"],
    frequency=params["frequency"],
    target_columns_names=params["target_columns_names"],
    timeseries_identifiers_names=params["timeseries_identifiers_names"],
    external_features_columns_names=params["external_features_columns_names"],
    max_timeseries_length=params["max_timeseries_length"],
)
training_df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
    training_df)
# Assemble the training session from the prepared data and recipe params.
training_session = TrainingSession(
    target_columns_names=params["target_columns_names"],
    time_column_name=params["time_column_name"],
    frequency=params["frequency"],
    epoch=params["epoch"],
    models_parameters=models_parameters,
    prediction_length=params["prediction_length"],
    training_df=training_df_prepared,
    make_forecasts=params["make_forecasts"],
    external_features_columns_names=params["external_features_columns_names"],
    timeseries_identifiers_names=params["timeseries_identifiers_names"],
    batch_size=params["batch_size"],
    user_num_batches_per_epoch=params["num_batches_per_epoch"],
    season_length=params["season_length"],
    # NOTE(review): this TrainingSession(...) call appears truncated in this
    # view — the remaining keyword arguments and closing parenthesis are
    # outside this chunk; confirm against the full file.
def test_empty_input_dataset(self, dku_config, time_column_name):
    """Preparing a dataframe with zero rows must raise, mentioning emptiness."""
    no_rows_df = pd.DataFrame(columns=["value1", "target", time_column_name])
    preparator = TimeseriesPreparator(dku_config)
    with pytest.raises(ValueError) as err:
        _ = preparator.prepare_timeseries_dataframe(no_rows_df)
    assert "empty" in str(err.value)
from time import perf_counter  # fix: perf_counter was used below without being imported

from io_utils import get_input_output, set_column_description
from recipe_config_loading import get_decomposition_params
from safe_logger import SafeLogger
from timeseries_preparation.preparation import TimeseriesPreparator

logger = SafeLogger("Timeseries preparation plugin")

# Recipe I/O: one input dataset, one output dataset.
(input_dataset, output_dataset) = get_input_output()
# NOTE(review): get_recipe_config has no visible import in this file —
# presumably `from dataiku.customrecipe import get_recipe_config`; confirm upstream.
config = get_recipe_config()
input_dataset_columns = [
    column["name"] for column in input_dataset.read_schema()
]
(dku_config, input_validator, decomposition) = get_decomposition_params(config, input_dataset_columns)

# Prepare the input timeseries (types, ordering, frequency) and validate it.
timeseries_preparator = TimeseriesPreparator(dku_config)
input_df = input_dataset.get_dataframe(infer_with_pandas=False)
df_prepared = timeseries_preparator.prepare_timeseries_dataframe(input_df)
input_validator.check(df_prepared)

start = perf_counter()
logger.info("Decomposing time series...")
transformed_df = decomposition.fit(df_prepared)
logger.info(
    "Decomposing time series: Done in {:.2f} seconds".format(perf_counter() - start))

# Write results and attach per-column descriptions to the output schema.
# fix: dropped the unused `transformation_df` binding of the write call's result.
output_dataset.write_with_schema(transformed_df)
set_column_description(output_dataset, decomposition.columns_descriptions, input_dataset)
def test_advanced_smoothers(self, config, input_df, input_dataset_columns):
    """STL output must react to the trend/low-pass smoother length parameters."""
    config["decomposition_model"] = "additive"

    def fit_with_smoothers(smoother_params):
        # Rebuild the whole pipeline for one set of STL smoother parameters.
        config["additional_parameters_STL"] = smoother_params
        stl_config = STLConfig()
        stl_config.add_parameters(config, input_dataset_columns)
        preparator = TimeseriesPreparator(stl_config)
        prepared_df = preparator.prepare_timeseries_dataframe(input_df)
        return STLDecomposition(stl_config).fit(prepared_df)

    # Short trend smoother.
    result_df = fit_with_smoothers({"trend": "35", "low_pass": "******"})
    assert result_df.shape == (26, 6)
    expected_seasonal = np.array([
        329279.6394, 360305.5117, 378691.0343, 151319.491, -166075.4661,
        -206300.4391, -272041.7161, -285356.053, -274969.4078, -125368.4261,
        -10804.3636, 173084.5489, 421640.9531, 393264.9995, 288207.4229,
        42573.3565, -111402.3446, -270267.5348, -299889.3857, -334837.5864,
        -291850.134, -103986.6224, 42205.6726, 274027.7075, 515335.6499,
        429183.6225,
    ])
    np.testing.assert_array_equal(np.round(result_df.value1_seasonal.values, 4), expected_seasonal)
    assert np.mean(result_df["value1_trend"]) == 482542.4367257319
    assert np.mean(result_df["value1_seasonal"]) == 40229.62038767122
    assert np.mean(result_df["value1_residuals"]) == -3390.634036480091

    # Very long trend smoother: the residual mean collapses to ~0.
    result_df = fit_with_smoothers({"trend": "2999999", "low_pass": "******"})
    assert result_df.shape == (26, 6)
    assert np.mean(result_df["value1_trend"]) == 476077.5935197392
    assert np.mean(result_df["value1_seasonal"]) == 43303.82955718398
    assert np.mean(result_df["value1_residuals"]) == -3.134258664571322e-11

    # An empty or "None" trend parameter falls back to the default behaviour.
    for no_trend_params in ({"trend": ""}, {"trend": "None"}):
        result_df = fit_with_smoothers(no_trend_params)
        assert result_df.shape == (26, 6)
        assert np.mean(result_df["value1_trend"]) == 492101.0195351211
        assert np.mean(result_df["value1_seasonal"]) == 32625.652227975654
        assert np.mean(result_df["value1_residuals"]) == -5345.248686173698