    def test_advanced_degrees(self, config, input_df, input_dataset_columns):
        config["additional_parameters_STL"] = {
            "seasonal_deg": "1",
            "trend_deg": "1",
            "low_pass_deg": "1"
        }
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        np.testing.assert_array_equal(
            np.round(result_df.value1_trend.values, 4),
            np.array([
                547017.8314, 537486.722, 528097.1954, 518846.2605, 509728.8989,
                500744.2034, 491895.324, 483188.5115, 474630.5299, 466256.2782,
                458496.2869, 454985.6935, 453114.0625, 452740.149, 453810.1866,
                456404.7768, 463218.9767, 470913.292, 478947.2522, 487217.229,
                495684.7824, 504325.6079, 513126.1746, 522081.8564,
                531195.1428, 540473.2835
            ]))
        assert np.mean(result_df["value1_trend"]) == 492101.0195351211
        assert np.mean(result_df["value1_seasonal"]) == 32625.652227975654
        assert np.mean(result_df["value1_residuals"]) == -5345.248686173698

        config["additional_parameters_STL"] = {
            "seasonal_deg": "1",
            "trend_deg": "0"
        }
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        np.testing.assert_array_equal(
            np.round(result_df.value1_seasonal.values, 4),
            np.array([
                334926.5396, 363552.8324, 380642.7497, 151182.772,
                -168020.8919, -209675.4339, -276299.0916, -289677.7104,
                -278165.1873, -126041.2679, -8513.4181, 175315.4394,
                425222.6624, 396736.9844, 290811.7923, 45628.9471, -110941.82,
                -272356.2149, -303391.2037, -338667.781, -295226.877,
                -106373.2845, 41186.7333, 274657.8578, 516720.1595, 432742.083
            ]))
        assert np.mean(result_df["value1_trend"]) == 470658.0934271346
        assert np.mean(result_df["value1_seasonal"]) == 40229.89887290871
        assert np.mean(result_df["value1_residuals"]) == 8493.430776879803
    def test_target_column_preparation(self, time_column_name,
                                       timeseries_identifiers_names,
                                       basic_config):
        df = pd.DataFrame({
            "date": [
                "2020-12-31",
                "2021-12-15",
                "2022-12-01",
            ],
            "id": [1, 1, 1],
            "target": [1, 2, 3],
            "invalid_target": ["a", "b", "c"],
            "missing_target": [1, np.nan, 2],
            "unformatted_target": ["1", "2", "3"]
        })
        dku_config = DecompositionConfig()
        basic_config["target_columns"] = ["target"]
        basic_config["frequency_unit"] = "12M"
        basic_config["season_length_12M"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        df_prepared = preparator.prepare_timeseries_dataframe(df)
        assert df_prepared.loc[0, "target"] == 1

        dku_config = DecompositionConfig()
        basic_config["target_columns"] = ["unformatted_target"]
        basic_config["frequency_unit"] = "12M"
        basic_config["season_length_12M"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        df_prepared_unformatted = preparator.prepare_timeseries_dataframe(df)
        assert df_prepared_unformatted.loc[0, "unformatted_target"] == 1

        dku_config = DecompositionConfig()
        basic_config["target_columns"] = ["invalid_target"]
        basic_config["frequency_unit"] = "12M"
        basic_config["season_length_12M"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        with pytest.raises(ValueError) as err:
            _ = preparator.prepare_timeseries_dataframe(df)
        assert "must be numeric" in str(err.value)

        dku_config = DecompositionConfig()
        basic_config["target_columns"] = ["missing_target"]
        basic_config["frequency_unit"] = "12M"
        basic_config["season_length_12M"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        with pytest.raises(ValueError) as err:
            _ = preparator.prepare_timeseries_dataframe(df)
        assert "missing value" in str(err.value)

    def test_collision(self, basic_dku_config, input_df):
        basic_dku_config.target_columns = ["value1"]
        input_df = input_df.rename(columns={"value2": "value1_trend"})
        timeseries_preparator = TimeseriesPreparator(basic_dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = MockDecomposition(basic_dku_config)
        df_results = decomposition.fit(df_prepared)
        assert df_results.columns[3] == "value1_trend_0"
        assert df_results.columns[4] == "value1_seasonal"

    def test_long_format_multiple_ids(self, basic_dku_config,
                                      long_df_multiple_ids):
        basic_dku_config.long_format = True
        basic_dku_config.timeseries_identifiers = ["country", "items"]
        timeseries_preparator = TimeseriesPreparator(basic_dku_config)
        df_long_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            long_df_multiple_ids)
        decomposition = MockDecomposition(basic_dku_config)
        df_results = decomposition.fit(df_long_prepared)
        np.testing.assert_equal(df_results["country"].values,
                                np.array([0, 0, 1, 1, 1]))
        assert np.mean(df_results["value1_trend"]) == 2.2

def df_from_freq(dku_config):
    data = [315.58, 316.39, 316.79, 312.09, 321.08, 450.08, 298.79]
    freq = dku_config.frequency
    df = pd.DataFrame.from_dict({
        "value1": data,
        "date": pd.date_range("1-1-1959", periods=len(data), freq=freq)
    })
    timeseries_preparator = TimeseriesPreparator(dku_config)
    df_prepared = timeseries_preparator.prepare_timeseries_dataframe(df)
    return df_prepared
def add_future_external_features(gluon_train_dataset, external_features_future_df, prediction_length, frequency):
    """Append the future external features to the 'feat_dynamic_real' arrays of each timeseries of the ListDataset used for training.
    It first checks that all timeseries are valid (regular time steps at the chosen frequency, all sharing the same start date).

    Args:
        gluon_train_dataset (gluonts.dataset.common.ListDataset): ListDataset created with the GluonDataset class.
        external_features_future_df (DataFrame): Dataframe of future (dated after timeseries of gluon_train_dataset) external features.
        prediction_length (int): To check that external_features_future_df has the right length.
        frequency (str): To check that the time column has the right frequency and values.

    Raises:
        ValueError: If the length of external_features_future_df is not prediction_length.

    Returns:
        gluonts.dataset.common.ListDataset with future external features.
    """
    gluon_dataset = copy.deepcopy(gluon_train_dataset)
    if isinstance(to_offset(frequency), CUSTOMISABLE_FREQUENCIES_OFFSETS):
        frequency = gluon_train_dataset.process.trans[0].freq

    start_date, periods = None, None
    for i, timeseries in enumerate(gluon_train_dataset):
        if TIMESERIES_KEYS.IDENTIFIERS in timeseries:
            # filter the dataframe to only get rows with the right identifiers
            timeseries_identifiers = timeseries[TIMESERIES_KEYS.IDENTIFIERS]
            conditions = [external_features_future_df[k] == v for k, v in timeseries_identifiers.items()]
            timeseries_external_features_future_df = apply_filter_conditions(external_features_future_df, conditions)
        else:
            timeseries_external_features_future_df = external_features_future_df

        feat_dynamic_real_train = timeseries[TIMESERIES_KEYS.FEAT_DYNAMIC_REAL]
        feat_dynamic_real_columns_names = timeseries[TIMESERIES_KEYS.FEAT_DYNAMIC_REAL_COLUMNS_NAMES]
        time_column_name = timeseries[TIMESERIES_KEYS.TIME_COLUMN_NAME]

        timeseries_preparator = TimeseriesPreparator(
            time_column_name=time_column_name,
            frequency=frequency,
        )
        timeseries_external_features_future_df = timeseries_preparator.prepare_timeseries_dataframe(timeseries_external_features_future_df)

        feat_dynamic_real_future = timeseries_external_features_future_df[feat_dynamic_real_columns_names].values.T

        if feat_dynamic_real_future.shape[1] != prediction_length:
            raise ValueError(f"Please provide {prediction_length} future values of external features, as this was the forecasting horizon used for training")

        feat_dynamic_real_appended = np.append(feat_dynamic_real_train, feat_dynamic_real_future, axis=1)

        gluon_dataset.list_data[i][TIMESERIES_KEYS.FEAT_DYNAMIC_REAL] = feat_dynamic_real_appended

    return gluon_dataset
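
A minimal usage sketch (not from the source), assuming gluon_train_dataset is a ListDataset built by the plugin's GluonDataset class with "temperature" as its only external feature and a model trained with prediction_length=7 at daily frequency; the column names and values below are illustrative only.

import pandas as pd

# Hypothetical future covariates: one row per forecast step (prediction_length == 7).
external_features_future_df = pd.DataFrame({
    "date": pd.date_range("2021-02-01", periods=7, freq="D"),
    "temperature": [12.1, 11.8, 13.0, 12.5, 12.9, 13.4, 12.7],
})

# Appends the 7 future "temperature" values to each timeseries' feat_dynamic_real
# array; raises ValueError if the dataframe does not cover exactly 7 future steps.
gluon_dataset_with_future = add_future_external_features(
    gluon_train_dataset,  # assumed to exist: ListDataset produced for training
    external_features_future_df,
    prediction_length=7,
    frequency="D",
)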
    def test_multiplicative_model_with_negative_values(self, basic_dku_config,
                                                       input_df):
        input_df.loc[0, "value1"] = -2

        timeseries_preparator = TimeseriesPreparator(basic_dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)

        input_validator = DecompositionInputValidator(basic_dku_config)
        with pytest.raises(ValueError) as err:
            _ = input_validator.check(df_prepared)
        assert "multiplicative" in str(err.value)
        assert "negative" in str(err.value)
        assert "value1" in str(err.value)

    def test_insufficient_samples_2_ts_identifiers(self, basic_dku_config,
                                                   long_df):
        basic_dku_config.long_format = True
        basic_dku_config.timeseries_identifiers = ["country", "item"]
        timeseries_preparator = TimeseriesPreparator(basic_dku_config)
        df_too_short = timeseries_preparator.prepare_timeseries_dataframe(
            long_df)
        input_validator = DecompositionInputValidator(basic_dku_config)
        with pytest.raises(ValueError) as err:
            _ = input_validator.check(df_too_short)
        assert "need at least" in str(err.value)
        assert "country" in str(err.value)
        assert "item" in str(err.value)
        assert "[1 1 1 1]" in str(err.value)

    def test_single_target(self, basic_dku_config, input_df):
        basic_dku_config.target_columns = ["value1"]
        timeseries_preparator = TimeseriesPreparator(basic_dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = MockDecomposition(basic_dku_config)
        df_results = decomposition.fit(df_prepared)
        size = df_prepared.shape[0]

        np.testing.assert_equal(df_results["value1_trend"], np.ones(size))
        np.testing.assert_equal(df_results["value1_seasonal"],
                                2 * np.ones(size))
        np.testing.assert_equal(df_results["value1_residuals"],
                                3 * np.ones(size))

    def test_long_format(self, basic_dku_config, long_df):
        basic_dku_config.long_format = True
        basic_dku_config.timeseries_identifiers = ["country"]
        timeseries_preparator = TimeseriesPreparator(basic_dku_config)
        df_long_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            long_df)
        decomposition = MockDecomposition(basic_dku_config)
        df_results = decomposition.fit(df_long_prepared)
        np.testing.assert_equal(df_results["value1_trend"],
                                np.array([1, 1, 3, 3]))
        np.testing.assert_equal(df_results["value2_trend"],
                                np.array([2, 2, 4, 4]))
        np.testing.assert_equal(df_results["value1_seasonal"],
                                np.array([2, 2, 6, 6]))
        np.testing.assert_equal(df_results["value2_residuals"],
                                np.array([6, 6, 12, 12]))

def test_missing_values_identifiers():
    with pytest.raises(ValueError):
        df = pd.DataFrame(
            {
                "date": ["2018-01-06", "2018-01-07", "2018-01-08", "2018-01-06", "2018-01-07", "2018-01-08"],
                "volume": [2, 4, 2, 5, 2, 5],
                "item": [1, 1, np.NaN, 2, 2, 2],
            }
        )

        timeseries_preparator = TimeseriesPreparator(
            time_column_name="date",
            frequency="D",
            target_columns_names=["volume"],
            timeseries_identifiers_names=["item"],
        )

        training_df_prepared = timeseries_preparator.prepare_timeseries_dataframe(df)

    def test_STL_additive(self, dku_config, input_df):
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        results = decomposition.fit(df_prepared)
        expected_array = np.array([
            547017.8314, 537486.722, 528097.1954, 518846.2605, 509728.8989,
            500744.2034, 491895.324, 483188.5115, 474630.5299, 466256.2782,
            458496.2869, 454985.6935, 453114.0625, 452740.149, 453810.1866,
            456404.7768, 463218.9767, 470913.292, 478947.2522, 487217.229,
            495684.7824, 504325.6079, 513126.1746, 522081.8564, 531195.1428,
            540473.2835
        ])
        rounded_results = np.round(results["value1_trend"].values, 4)
        np.testing.assert_equal(rounded_results, expected_array)
        assert np.mean(results["value1_trend"]) == 492101.0195351211
        assert np.mean(results["value1_seasonal"]) == 32625.652227975654
        assert np.mean(results["value1_residuals"]) == -5345.248686173698

    def test_STL_multiplicative(self, dku_config, input_df):
        dku_config.model = "multiplicative"
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        results = decomposition.fit(df_prepared)
        expected_array = [
            1.87080328, 1.94864198, 1.97546651, 1.47349625, 0.74672304,
            0.6552587, 0.5000725, 0.46825876, 0.49417933, 0.86890043,
            1.16434155, 1.63725892, 2.17084151, 2.106642, 1.95377386,
            1.32400823, 0.92620183, 0.51855162, 0.44493062, 0.35877353,
            0.47054681, 0.94481716, 1.30967762, 1.88240591, 2.51946737,
            2.28270725
        ]
        rounded_results = np.round(results["value1_seasonal"].values, 8)
        np.testing.assert_equal(rounded_results, expected_array)

        assert np.mean(results["value1_trend"]) == 409265.35453951
        assert np.mean(results["value1_seasonal"]) == 1.2698748679749627
        assert np.mean(results["value1_residuals"]) == 0.9941032097902623
models_parameters = get_models_parameters(
    config, is_training_multivariate=params["is_training_multivariate"])
start = perf_counter()

training_df = params["training_dataset"].get_dataframe()

timeseries_preparator = TimeseriesPreparator(
    time_column_name=params["time_column_name"],
    frequency=params["frequency"],
    target_columns_names=params["target_columns_names"],
    timeseries_identifiers_names=params["timeseries_identifiers_names"],
    external_features_columns_names=params["external_features_columns_names"],
    max_timeseries_length=params["max_timeseries_length"],
)

training_df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
    training_df)

training_session = TrainingSession(
    target_columns_names=params["target_columns_names"],
    time_column_name=params["time_column_name"],
    frequency=params["frequency"],
    epoch=params["epoch"],
    models_parameters=models_parameters,
    prediction_length=params["prediction_length"],
    training_df=training_df_prepared,
    make_forecasts=params["make_forecasts"],
    external_features_columns_names=params["external_features_columns_names"],
    timeseries_identifiers_names=params["timeseries_identifiers_names"],
    batch_size=params["batch_size"],
    user_num_batches_per_epoch=params["num_batches_per_epoch"],
    season_length=params["season_length"],
)

    def test_empty_input_dataset(self, dku_config, time_column_name):
        empty_df = pd.DataFrame(columns=["value1", "target", time_column_name])
        timeseries_preparator = TimeseriesPreparator(dku_config)
        with pytest.raises(ValueError) as err:
            _ = timeseries_preparator.prepare_timeseries_dataframe(empty_df)
        assert "empty" in str(err.value)

from time import perf_counter

from dataiku.customrecipe import get_recipe_config
from io_utils import get_input_output, set_column_description
from recipe_config_loading import get_decomposition_params
from safe_logger import SafeLogger
from timeseries_preparation.preparation import TimeseriesPreparator

logger = SafeLogger("Timeseries preparation plugin")

(input_dataset, output_dataset) = get_input_output()
config = get_recipe_config()
input_dataset_columns = [
    column["name"] for column in input_dataset.read_schema()
]
(dku_config, input_validator,
 decomposition) = get_decomposition_params(config, input_dataset_columns)

timeseries_preparator = TimeseriesPreparator(dku_config)
input_df = input_dataset.get_dataframe(infer_with_pandas=False)
df_prepared = timeseries_preparator.prepare_timeseries_dataframe(input_df)
input_validator.check(df_prepared)

start = perf_counter()
logger.info("Decomposing time series...")
transformed_df = decomposition.fit(df_prepared)
logger.info("Decomposing time series: Done in {:.2f} seconds".format(
    perf_counter() - start))
transformation_df = output_dataset.write_with_schema(transformed_df)
set_column_description(output_dataset, decomposition.columns_descriptions,
                       input_dataset)
    def test_advanced_smoothers(self, config, input_df, input_dataset_columns):
        config["decomposition_model"] = "additive"
        config["additional_parameters_STL"] = {"trend": "35", "low_pass": "******"}
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        np.testing.assert_array_equal(
            np.round(result_df.value1_seasonal.values, 4),
            np.array([
                329279.6394, 360305.5117, 378691.0343, 151319.491,
                -166075.4661, -206300.4391, -272041.7161, -285356.053,
                -274969.4078, -125368.4261, -10804.3636, 173084.5489,
                421640.9531, 393264.9995, 288207.4229, 42573.3565,
                -111402.3446, -270267.5348, -299889.3857, -334837.5864,
                -291850.134, -103986.6224, 42205.6726, 274027.7075,
                515335.6499, 429183.6225
            ]))
        assert np.mean(result_df["value1_trend"]) == 482542.4367257319
        assert np.mean(result_df["value1_seasonal"]) == 40229.62038767122
        assert np.mean(result_df["value1_residuals"]) == -3390.634036480091

        config["additional_parameters_STL"] = {
            "trend": "2999999",
            "low_pass": "******"
        }
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        assert np.mean(result_df["value1_trend"]) == 476077.5935197392
        assert np.mean(result_df["value1_seasonal"]) == 43303.82955718398
        assert np.mean(result_df["value1_residuals"]) == -3.134258664571322e-11

        config["additional_parameters_STL"] = {"trend": ""}
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        assert np.mean(result_df["value1_trend"]) == 492101.0195351211
        assert np.mean(result_df["value1_seasonal"]) == 32625.652227975654
        assert np.mean(result_df["value1_residuals"]) == -5345.248686173698

        config["additional_parameters_STL"] = {"trend": "None"}
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        assert np.mean(result_df["value1_trend"]) == 492101.0195351211
        assert np.mean(result_df["value1_seasonal"]) == 32625.652227975654
        assert np.mean(result_df["value1_residuals"]) == -5345.248686173698