def test_collision(self, basic_dku_config, input_df):
     basic_dku_config.target_columns = ["value1"]
     input_df = input_df.rename(columns={"value2": "value1_trend"})
     timeseries_preparator = TimeseriesPreparator(basic_dku_config)
     df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
         input_df)
     decomposition = MockDecomposition(basic_dku_config)
     df_results = decomposition.fit(df_prepared)
     assert df_results.columns[3] == "value1_trend_0"
     assert df_results.columns[4] == "value1_seasonal"
def df_from_freq(dku_config):
    data = [315.58, 316.39, 316.79, 312.09, 321.08, 450.08, 298.79]
    freq = dku_config.frequency
    df = pd.DataFrame.from_dict({
        "value1": data,
        "date": pd.date_range("1-1-1959", periods=len(data), freq=freq)
    })
    timeseries_preparator = TimeseriesPreparator(dku_config)
    df_prepared = timeseries_preparator.prepare_timeseries_dataframe(df)
    return df_prepared
 def test_long_format_multiple_ids(self, basic_dku_config,
                                   long_df_multiple_ids):
     basic_dku_config.long_format = True
     basic_dku_config.timeseries_identifiers = ["country", "items"]
     timeseries_preparator = TimeseriesPreparator(basic_dku_config)
     df_long_prepared = timeseries_preparator.prepare_timeseries_dataframe(
         long_df_multiple_ids)
     decomposition = MockDecomposition(basic_dku_config)
     df_results = decomposition.fit(df_long_prepared)
     np.testing.assert_equal(df_results["country"].values,
                             np.array([0, 0, 1, 1, 1]))
     assert np.mean(df_results["value1_trend"]) == 2.2
    def test_single_target(self, basic_dku_config, input_df):
        basic_dku_config.target_columns = ["value1"]
        timeseries_preparator = TimeseriesPreparator(basic_dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = MockDecomposition(basic_dku_config)
        df_results = decomposition.fit(df_prepared)
        size = df_prepared.shape[0]

        np.testing.assert_equal(df_results["value1_trend"], np.ones(size))
        np.testing.assert_equal(df_results["value1_seasonal"],
                                2 * np.ones(size))
        np.testing.assert_equal(df_results["value1_residuals"],
                                3 * np.ones(size))
    def test_multiplicative_model_with_negative_values(self, basic_dku_config,
                                                       input_df):
        input_df.loc[0, "value1"] = -2

        timeseries_preparator = TimeseriesPreparator(basic_dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)

        input_validator = DecompositionInputValidator(basic_dku_config)
        with pytest.raises(ValueError) as err:
            _ = input_validator.check(df_prepared)
        assert "multiplicative" in str(err.value)
        assert "negative" in str(err.value)
        assert "value1" in str(err.value)
 def test_insufficient_samples_2_ts_identifiers(self, basic_dku_config,
                                                long_df):
     basic_dku_config.long_format = True
     basic_dku_config.timeseries_identifiers = ["country", "item"]
     timeseries_preparator = TimeseriesPreparator(basic_dku_config)
     df_too_short = timeseries_preparator.prepare_timeseries_dataframe(
         long_df)
     input_validator = DecompositionInputValidator(basic_dku_config)
     with pytest.raises(ValueError) as err:
         _ = input_validator.check(df_too_short)
     assert "need at least" in str(err.value)
     assert "country" in str(err.value)
     assert "item" in str(err.value)
     assert "[1 1 1 1]" in str(err.value)
def add_future_external_features(gluon_train_dataset, external_features_future_df, prediction_length, frequency):
    """Append the future external features to the 'feat_dynamic_real' arrays of each timeseries of the ListDataset used for training.
    First check that all timeseries are valid (regular time steps of the chosen frequency and they all have the same start date).

    Args:
        gluon_train_dataset (gluonts.dataset.common.ListDataset): ListDataset created with the GluonDataset class.
        external_features_future_df (DataFrame): Dataframe of future (dated after timeseries of gluon_train_dataset) external features.
        prediction_length (int): To check that external_features_future_df has the right length.
        frequency (str): To check that the time column has the right frequency and values.

    Raises:
        ValueError: If the length of external_features_future_df is not prediction_length.

    Returns:
        gluonts.dataset.common.ListDataset with future external features.
    """
    gluon_dataset = copy.deepcopy(gluon_train_dataset)
    if isinstance(to_offset(frequency), CUSTOMISABLE_FREQUENCIES_OFFSETS):
        frequency = gluon_train_dataset.process.trans[0].freq

    for i, timeseries in enumerate(gluon_train_dataset):
        if TIMESERIES_KEYS.IDENTIFIERS in timeseries:
            # filter the dataframe to only get rows with the right identifiers
            timeseries_identifiers = timeseries[TIMESERIES_KEYS.IDENTIFIERS]
            conditions = [external_features_future_df[k] == v for k, v in timeseries_identifiers.items()]
            timeseries_external_features_future_df = apply_filter_conditions(external_features_future_df, conditions)
        else:
            timeseries_external_features_future_df = external_features_future_df

        feat_dynamic_real_train = timeseries[TIMESERIES_KEYS.FEAT_DYNAMIC_REAL]
        feat_dynamic_real_columns_names = timeseries[TIMESERIES_KEYS.FEAT_DYNAMIC_REAL_COLUMNS_NAMES]
        time_column_name = timeseries[TIMESERIES_KEYS.TIME_COLUMN_NAME]

        timeseries_preparator = TimeseriesPreparator(
            time_column_name=time_column_name,
            frequency=frequency,
        )
        timeseries_external_features_future_df = timeseries_preparator.prepare_timeseries_dataframe(timeseries_external_features_future_df)

        feat_dynamic_real_future = timeseries_external_features_future_df[feat_dynamic_real_columns_names].values.T

        if feat_dynamic_real_future.shape[1] != prediction_length:
            raise ValueError(f"Please provide {prediction_length} future values of external features, as this was the forecasting horizon used for training")

        feat_dynamic_real_appended = np.append(feat_dynamic_real_train, feat_dynamic_real_future, axis=1)

        gluon_dataset.list_data[i][TIMESERIES_KEYS.FEAT_DYNAMIC_REAL] = feat_dynamic_real_appended

    return gluon_dataset
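
# Illustrative usage sketch (not from the original scripts): how a prediction step
# might append future external features before forecasting. Assumptions: the
# `gluon_train_dataset` below is the ListDataset built by the plugin's GluonDataset
# class during training, and `future_features_df` holds exactly `prediction_length`
# future rows per timeseries; these variable names are hypothetical.
gluon_dataset_with_future = add_future_external_features(
    gluon_train_dataset=gluon_train_dataset,
    external_features_future_df=future_features_df,
    prediction_length=prediction_length,
    frequency=frequency,
)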
 def test_long_format(self, basic_dku_config, long_df):
     basic_dku_config.long_format = True
     basic_dku_config.timeseries_identifiers = ["country"]
     timeseries_preparator = TimeseriesPreparator(basic_dku_config)
     df_long_prepared = timeseries_preparator.prepare_timeseries_dataframe(
         long_df)
     decomposition = MockDecomposition(basic_dku_config)
     df_results = decomposition.fit(df_long_prepared)
     np.testing.assert_equal(df_results["value1_trend"],
                             np.array([1, 1, 3, 3]))
     np.testing.assert_equal(df_results["value2_trend"],
                             np.array([2, 2, 4, 4]))
     np.testing.assert_equal(df_results["value1_seasonal"],
                             np.array([2, 2, 6, 6]))
     np.testing.assert_equal(df_results["value2_residuals"],
                             np.array([6, 6, 12, 12]))
def test_week_sunday_truncation():
    df = pd.DataFrame(
        {
            "date": [
                "2021-01-03 12:12:00",
                "2021-01-05 17:35:00",
                "2021-01-15 14:55:00",
            ],
            "id": [1, 1, 1],
        }
    )
    frequency = "W-SUN"
    time_column_name = "date"
    timeseries_identifiers_names = ["id"]
    df[time_column_name] = pd.to_datetime(df[time_column_name]).dt.tz_localize(tz=None)
    preparator = TimeseriesPreparator(
        time_column_name=time_column_name,
        frequency=frequency,
        timeseries_identifiers_names=timeseries_identifiers_names,
        max_timeseries_length=2,
    )
    dataframe_prepared = preparator._truncate_dates(df)
    dataframe_prepared = preparator._sort(dataframe_prepared)
    preparator._check_regular_frequency(dataframe_prepared)

    dataframe_prepared = preparator._keep_last_dates(dataframe_prepared)
    assert dataframe_prepared[time_column_name][0] == pd.Timestamp("2021-01-10")
    assert dataframe_prepared[time_column_name][1] == pd.Timestamp("2021-01-17")
 def test_hour_truncation(self, time_column_name,
                          timeseries_identifiers_names, basic_config):
     df = pd.DataFrame({
         "date": [
             "2020-01-07 12:12:00",
             "2020-01-07 17:35:00",
             "2020-01-07 14:55:00",
             "2020-01-07 18:06:00",
             "2020-01-08 04:40:00",
             "2020-01-08 06:13:00",
             "2020-01-08 03:23:00",
         ],
         "id": [1, 1, 1, 1, 2, 2, 2],
         "target": [1, 2, 3, 4, 5, 6, 7]
     })
     df[time_column_name] = pd.to_datetime(
         df[time_column_name]).dt.tz_localize(tz=None)
     dku_config = DecompositionConfig()
     basic_config["frequency_step_hours"] = "2"
     basic_config["frequency_unit"] = "H"
     basic_config["season_length_H"] = 12
     basic_config["long_format"] = True
     basic_config["timeseries_identifiers"] = timeseries_identifiers_names
     dku_config.add_parameters(basic_config, list(df.columns))
     preparator = TimeseriesPreparator(dku_config, max_timeseries_length=2)
     dataframe_prepared = preparator._truncate_dates(df)
     dataframe_prepared = preparator._sort(dataframe_prepared)
     preparator._check_regular_frequency(dataframe_prepared)
     dataframe_prepared = preparator._keep_last_dates(dataframe_prepared)
     assert dataframe_prepared[time_column_name][0] == pd.Timestamp(
         "2020-01-07 16:00:00")
     assert dataframe_prepared[time_column_name][3] == pd.Timestamp(
         "2020-01-08 06:00:00")
    def test_week_sunday_truncation(self, time_column_name,
                                    timeseries_identifiers_names,
                                    basic_config):
        df = pd.DataFrame({
            "date": [
                "2021-01-03 12:12:00",
                "2021-01-05 17:35:00",
                "2021-01-15 14:55:00",
            ],
            "id": [1, 1, 1],
            "target": [1, 2, 3]
        })
        dku_config = DecompositionConfig()
        basic_config["frequency_unit"] = "W"
        basic_config["frequency_end_of_week"] = "SUN"
        basic_config["season_length_W"] = 7
        basic_config["long_format"] = True
        basic_config["timeseries_identifiers"] = timeseries_identifiers_names
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config, max_timeseries_length=2)
        df[time_column_name] = pd.to_datetime(
            df[time_column_name]).dt.tz_localize(tz=None)
        dataframe_prepared = preparator._truncate_dates(df)
        dataframe_prepared = preparator._sort(dataframe_prepared)
        preparator._check_regular_frequency(dataframe_prepared)

        dataframe_prepared = preparator._keep_last_dates(dataframe_prepared)
        assert dataframe_prepared[time_column_name][0] == pd.Timestamp(
            "2021-01-10")
        assert dataframe_prepared[time_column_name][1] == pd.Timestamp(
            "2021-01-17")
def test_hour_truncation():
    df = pd.DataFrame(
        {
            "date": [
                "2020-01-07 12:12:00",
                "2020-01-07 17:35:00",
                "2020-01-07 14:55:00",
                "2020-01-07 18:06:00",
                "2020-01-08 04:40:00",
                "2020-01-08 06:13:00",
                "2020-01-08 03:23:00",
            ],
            "id": [1, 1, 1, 1, 2, 2, 2],
        }
    )
    frequency = "2H"
    time_column_name = "date"
    timeseries_identifiers_names = ["id"]
    df[time_column_name] = pd.to_datetime(df[time_column_name]).dt.tz_localize(tz=None)
    preparator = TimeseriesPreparator(
        time_column_name=time_column_name,
        frequency=frequency,
        timeseries_identifiers_names=timeseries_identifiers_names,
        max_timeseries_length=2,
    )
    dataframe_prepared = preparator._truncate_dates(df)
    dataframe_prepared = preparator._sort(dataframe_prepared)
    preparator._check_regular_frequency(dataframe_prepared)
    dataframe_prepared = preparator._keep_last_dates(dataframe_prepared)
    assert dataframe_prepared[time_column_name][0] == pd.Timestamp("2020-01-07 16:00:00")
    assert dataframe_prepared[time_column_name][3] == pd.Timestamp("2020-01-08 06:00:00")
def test_missing_values_identifiers():
    df = pd.DataFrame(
        {
            "date": ["2018-01-06", "2018-01-07", "2018-01-08", "2018-01-06", "2018-01-07", "2018-01-08"],
            "volume": [2, 4, 2, 5, 2, 5],
            "item": [1, 1, np.NaN, 2, 2, 2],
        }
    )

    timeseries_preparator = TimeseriesPreparator(
        time_column_name="date",
        frequency="D",
        target_columns_names=["volume"],
        timeseries_identifiers_names=["item"],
    )

    with pytest.raises(ValueError):
        _ = timeseries_preparator.prepare_timeseries_dataframe(df)
 def test_STL_additive(self, dku_config, input_df):
     timeseries_preparator = TimeseriesPreparator(dku_config)
     df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
         input_df)
     decomposition = STLDecomposition(dku_config)
     results = decomposition.fit(df_prepared)
     expected_array = np.array([
         547017.8314, 537486.722, 528097.1954, 518846.2605, 509728.8989,
         500744.2034, 491895.324, 483188.5115, 474630.5299, 466256.2782,
         458496.2869, 454985.6935, 453114.0625, 452740.149, 453810.1866,
         456404.7768, 463218.9767, 470913.292, 478947.2522, 487217.229,
         495684.7824, 504325.6079, 513126.1746, 522081.8564, 531195.1428,
         540473.2835
     ])
     rounded_results = np.round(results["value1_trend"].values, 4)
     np.testing.assert_equal(rounded_results, expected_array)
     assert np.mean(results["value1_trend"]) == 492101.0195351211
     assert np.mean(results["value1_seasonal"]) == 32625.652227975654
     assert np.mean(results["value1_residuals"]) == -5345.248686173698
 def test_duplicate_dates(self, time_column_name,
                          timeseries_identifiers_names, basic_config):
     df = pd.DataFrame({
         "date": [
             "2021-01-01 12:12:00",
             "2021-01-01 17:35:00",
             "2021-01-02 14:55:00",
         ],
         "id": [1, 1, 1],
         "target": [1, 2, 3]
     })
     dku_config = DecompositionConfig()
     basic_config["frequency"] = "D"
     dku_config.add_parameters(basic_config, list(df.columns))
     df[time_column_name] = pd.to_datetime(
         df[time_column_name]).dt.tz_localize(tz=None)
     preparator = TimeseriesPreparator(dku_config)
     with pytest.raises(ValueError):
         _ = preparator._truncate_dates(df)
    def test_STL_multiplicative(self, dku_config, input_df):
        dku_config.model = "multiplicative"
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        results = decomposition.fit(df_prepared)
        expected_array = [
            1.87080328, 1.94864198, 1.97546651, 1.47349625, 0.74672304,
            0.6552587, 0.5000725, 0.46825876, 0.49417933, 0.86890043,
            1.16434155, 1.63725892, 2.17084151, 2.106642, 1.95377386,
            1.32400823, 0.92620183, 0.51855162, 0.44493062, 0.35877353,
            0.47054681, 0.94481716, 1.30967762, 1.88240591, 2.51946737,
            2.28270725
        ]
        rounded_results = np.round(results["value1_seasonal"].values, 8)
        np.testing.assert_equal(rounded_results, expected_array)

        assert np.mean(results["value1_trend"]) == 409265.35453951
        assert np.mean(results["value1_seasonal"]) == 1.2698748679749627
        assert np.mean(results["value1_residuals"]) == 0.9941032097902623
def test_duplicate_dates():
    df = pd.DataFrame(
        {
            "date": [
                "2021-01-01 12:12:00",
                "2021-01-01 17:35:00",
                "2021-01-02 14:55:00",
            ],
            "id": [1, 1, 1],
        }
    )
    frequency = "D"
    time_column_name = "date"
    timeseries_identifiers_names = ["id"]
    df[time_column_name] = pd.to_datetime(df[time_column_name]).dt.tz_localize(tz=None)
    preparator = TimeseriesPreparator(
        time_column_name=time_column_name,
        frequency=frequency,
    )
    with pytest.raises(ValueError):
        _ = preparator._truncate_dates(df)
def test_day_truncation():
    df = pd.DataFrame(
        {
            "date": [
                "2021-01-01 12:17:42",
                "2021-01-02 00:00:00",
                "2021-01-03 12:46:00",
            ],
            "id": [1, 1, 1],
        }
    )
    frequency = "D"
    time_column_name = "date"
    timeseries_identifiers_names = ["id"]
    df[time_column_name] = pd.to_datetime(df[time_column_name]).dt.tz_localize(tz=None)
    preparator = TimeseriesPreparator(
        time_column_name=time_column_name,
        frequency=frequency,
        timeseries_identifiers_names=timeseries_identifiers_names,
    )
    dataframe_prepared = preparator._truncate_dates(df)
    dataframe_prepared = preparator._sort(dataframe_prepared)
    preparator._check_regular_frequency(dataframe_prepared)

    assert dataframe_prepared[time_column_name][0] == pd.Timestamp("2021-01-01")
    assert dataframe_prepared[time_column_name][2] == pd.Timestamp("2021-01-03")
    def test_business_day_truncation(self, time_column_name,
                                     timeseries_identifiers_names,
                                     basic_config):
        df = pd.DataFrame({
            "date": [
                "2021-01-04 12:17:42",
                "2021-01-05 00:00:00",
                "2021-01-06 12:46:00",
            ],
            "id": [1, 1, 1],
            "target": [1, 2, 3]
        })
        dku_config = DecompositionConfig()
        basic_config["frequency_unit"] = "B"
        basic_config["season_length_B"] = 5
        basic_config["long_format"] = True
        basic_config["timeseries_identifiers"] = timeseries_identifiers_names
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        df[time_column_name] = pd.to_datetime(
            df[time_column_name]).dt.tz_localize(tz=None)
        dataframe_prepared = preparator._truncate_dates(df)
        dataframe_prepared = preparator._sort(dataframe_prepared)
        preparator._check_regular_frequency(dataframe_prepared)

        assert dataframe_prepared[time_column_name][0] == pd.Timestamp(
            "2021-01-04")
        assert dataframe_prepared[time_column_name][2] == pd.Timestamp(
            "2021-01-06")
    def test_year_truncation(self, time_column_name,
                             timeseries_identifiers_names, basic_config):
        df = pd.DataFrame({
            "date": [
                "2020-12-31",
                "2021-12-15",
                "2022-12-01",
            ],
            "id": [1, 1, 1],
            "target": [1, 2, 3]
        })
        dku_config = DecompositionConfig()
        basic_config["frequency_unit"] = "12M"
        basic_config["season_length_12M"] = 4
        basic_config["long_format"] = True
        basic_config["timeseries_identifiers"] = timeseries_identifiers_names
        basic_config["season_length_12M"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        df[time_column_name] = pd.to_datetime(
            df[time_column_name]).dt.tz_localize(tz=None)
        dataframe_prepared = preparator._truncate_dates(df)
        dataframe_prepared = preparator._sort(dataframe_prepared)
        preparator._check_regular_frequency(dataframe_prepared)

        assert dataframe_prepared[time_column_name][0] == pd.Timestamp(
            "2020-12-31")
        assert dataframe_prepared[time_column_name][1] == pd.Timestamp(
            "2021-12-31")
        assert dataframe_prepared[time_column_name][2] == pd.Timestamp(
            "2022-12-31")
    def test_minutes_truncation(self, time_column_name, basic_config):
        df = pd.DataFrame({
            "date": [
                "2021-01-01 12:17:42",
                "2021-01-01 12:30:00",
                "2021-01-01 12:46:00",
            ],
            "id": [1, 1, 1],
            "target": [1, 2, 3]
        })
        dku_config = DecompositionConfig()
        basic_config["frequency_step_minutes"] = "15"
        basic_config["frequency_unit"] = "min"
        basic_config["season_length_min"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        df[time_column_name] = pd.to_datetime(
            df[time_column_name]).dt.tz_localize(tz=None)
        preparator = TimeseriesPreparator(dku_config)
        dataframe_prepared = preparator._truncate_dates(df)
        dataframe_prepared = preparator._sort(dataframe_prepared)
        preparator._check_regular_frequency(dataframe_prepared)

        assert dataframe_prepared[time_column_name][0] == pd.Timestamp(
            "2021-01-01 12:15:00")
        assert dataframe_prepared[time_column_name][2] == pd.Timestamp(
            "2021-01-01 12:45:00")
def test_semester_truncation():
    df = pd.DataFrame(
        {
            "date": [
                "2020-12-15",
                "2021-06-28",
                "2021-12-01",
            ],
            "id": [1, 1, 1],
        }
    )
    frequency = "6M"
    time_column_name = "date"
    timeseries_identifiers_names = ["id"]
    df[time_column_name] = pd.to_datetime(df[time_column_name]).dt.tz_localize(tz=None)
    preparator = TimeseriesPreparator(
        time_column_name=time_column_name,
        frequency=frequency,
        timeseries_identifiers_names=timeseries_identifiers_names,
    )
    dataframe_prepared = preparator._truncate_dates(df)
    dataframe_prepared = preparator._sort(dataframe_prepared)
    preparator._check_regular_frequency(dataframe_prepared)

    assert dataframe_prepared[time_column_name][0] == pd.Timestamp("2020-12-31")
    assert dataframe_prepared[time_column_name][1] == pd.Timestamp("2021-06-30")
    assert dataframe_prepared[time_column_name][2] == pd.Timestamp("2021-12-31")
    def test_advanced_degrees(self, config, input_df, input_dataset_columns):
        config["additional_parameters_STL"] = {
            "seasonal_deg": "1",
            "trend_deg": "1",
            "low_pass_deg": "1"
        }
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        np.testing.assert_array_equal(
            np.round(result_df.value1_trend.values, 4),
            np.array([
                547017.8314, 537486.722, 528097.1954, 518846.2605, 509728.8989,
                500744.2034, 491895.324, 483188.5115, 474630.5299, 466256.2782,
                458496.2869, 454985.6935, 453114.0625, 452740.149, 453810.1866,
                456404.7768, 463218.9767, 470913.292, 478947.2522, 487217.229,
                495684.7824, 504325.6079, 513126.1746, 522081.8564,
                531195.1428, 540473.2835
            ]))
        assert np.mean(result_df["value1_trend"]) == 492101.0195351211
        assert np.mean(result_df["value1_seasonal"]) == 32625.652227975654
        assert np.mean(result_df["value1_residuals"]) == -5345.248686173698

        config["additional_parameters_STL"] = {
            "seasonal_deg": "1",
            "trend_deg": "0"
        }
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        np.testing.assert_array_equal(
            np.round(result_df.value1_seasonal.values, 4),
            np.array([
                334926.5396, 363552.8324, 380642.7497, 151182.772,
                -168020.8919, -209675.4339, -276299.0916, -289677.7104,
                -278165.1873, -126041.2679, -8513.4181, 175315.4394,
                425222.6624, 396736.9844, 290811.7923, 45628.9471, -110941.82,
                -272356.2149, -303391.2037, -338667.781, -295226.877,
                -106373.2845, 41186.7333, 274657.8578, 516720.1595, 432742.083
            ]))
        assert np.mean(result_df["value1_trend"]) == 470658.0934271346
        assert np.mean(result_df["value1_seasonal"]) == 40229.89887290871
        assert np.mean(result_df["value1_residuals"]) == 8493.430776879803
config = get_recipe_config()
params = load_training_config(config)

mxnet_context = set_mxnet_context(params["gpu_devices"])

models_parameters = get_models_parameters(
    config, is_training_multivariate=params["is_training_multivariate"])
start = perf_counter()

training_df = params["training_dataset"].get_dataframe()

timeseries_preparator = TimeseriesPreparator(
    time_column_name=params["time_column_name"],
    frequency=params["frequency"],
    target_columns_names=params["target_columns_names"],
    timeseries_identifiers_names=params["timeseries_identifiers_names"],
    external_features_columns_names=params["external_features_columns_names"],
    max_timeseries_length=params["max_timeseries_length"],
)

training_df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
    training_df)

training_session = TrainingSession(
    target_columns_names=params["target_columns_names"],
    time_column_name=params["time_column_name"],
    frequency=params["frequency"],
    epoch=params["epoch"],
    models_parameters=models_parameters,
    prediction_length=params["prediction_length"],
    training_df=training_df_prepared,
    def test_target_column_preparation(self, time_column_name,
                                       timeseries_identifiers_names,
                                       basic_config):
        df = pd.DataFrame({
            "date": [
                "2020-12-31",
                "2021-12-15",
                "2022-12-01",
            ],
            "id": [1, 1, 1],
            "target": [1, 2, 3],
            "invalid_target": ["a", "b", "c"],
            "missing_target": [1, np.nan, 2],
            "unformatted_target": ["1", "2", "3"]
        })
        dku_config = DecompositionConfig()
        basic_config["target_columns"] = ["target"]
        basic_config["frequency_unit"] = "12M"
        basic_config["season_length_12M"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        df_prepared = preparator.prepare_timeseries_dataframe(df)
        assert df_prepared.loc[0, "target"] == 1

        dku_config = DecompositionConfig()
        basic_config["target_columns"] = ["unformatted_target"]
        basic_config["frequency_unit"] = "12M"
        basic_config["season_length_12M"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        df_prepared_unformatted = preparator.prepare_timeseries_dataframe(df)
        assert df_prepared_unformatted.loc[0, "unformatted_target"] == 1

        dku_config = DecompositionConfig()
        basic_config["target_columns"] = ["invalid_target"]
        basic_config["frequency_unit"] = "12M"
        basic_config["season_length_12M"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        with pytest.raises(ValueError) as err:
            _ = preparator.prepare_timeseries_dataframe(df)
        assert "must be numeric" in str(err.value)

        dku_config = DecompositionConfig()
        basic_config["target_columns"] = ["missing_target"]
        basic_config["frequency_unit"] = "12M"
        basic_config["season_length_12M"] = 4
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        with pytest.raises(ValueError) as err:
            _ = preparator.prepare_timeseries_dataframe(df)
        assert "missing value" in str(err.value)
 def test_empty_input_dataset(self, dku_config, time_column_name):
     empty_df = pd.DataFrame(columns=["value1", "target", time_column_name])
     timeseries_preparator = TimeseriesPreparator(dku_config)
     with pytest.raises(ValueError) as err:
         _ = timeseries_preparator.prepare_timeseries_dataframe(empty_df)
     assert "empty" in str(err.value)
    def test_advanced_smoothers(self, config, input_df, input_dataset_columns):
        config["decomposition_model"] = "additive"
        config["additional_parameters_STL"] = {"trend": "35", "low_pass": "******"}
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        np.testing.assert_array_equal(
            np.round(result_df.value1_seasonal.values, 4),
            np.array([
                329279.6394, 360305.5117, 378691.0343, 151319.491,
                -166075.4661, -206300.4391, -272041.7161, -285356.053,
                -274969.4078, -125368.4261, -10804.3636, 173084.5489,
                421640.9531, 393264.9995, 288207.4229, 42573.3565,
                -111402.3446, -270267.5348, -299889.3857, -334837.5864,
                -291850.134, -103986.6224, 42205.6726, 274027.7075,
                515335.6499, 429183.6225
            ]))
        assert np.mean(result_df["value1_trend"]) == 482542.4367257319
        assert np.mean(result_df["value1_seasonal"]) == 40229.62038767122
        assert np.mean(result_df["value1_residuals"]) == -3390.634036480091

        config["additional_parameters_STL"] = {
            "trend": "2999999",
            "low_pass": "******"
        }
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        assert np.mean(result_df["value1_trend"]) == 476077.5935197392
        assert np.mean(result_df["value1_seasonal"]) == 43303.82955718398
        assert np.mean(result_df["value1_residuals"]) == -3.134258664571322e-11

        config["additional_parameters_STL"] = {"trend": ""}
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        assert np.mean(result_df["value1_trend"]) == 492101.0195351211
        assert np.mean(result_df["value1_seasonal"]) == 32625.652227975654
        assert np.mean(result_df["value1_residuals"]) == -5345.248686173698

        config["additional_parameters_STL"] = {"trend": "None"}
        dku_config = STLConfig()
        dku_config.add_parameters(config, input_dataset_columns)
        timeseries_preparator = TimeseriesPreparator(dku_config)
        df_prepared = timeseries_preparator.prepare_timeseries_dataframe(
            input_df)
        decomposition = STLDecomposition(dku_config)
        result_df = decomposition.fit(df_prepared)
        assert result_df.shape == (26, 6)
        assert np.mean(result_df["value1_trend"]) == 492101.0195351211
        assert np.mean(result_df["value1_seasonal"]) == 32625.652227975654
        assert np.mean(result_df["value1_residuals"]) == -5345.248686173698
from time import perf_counter

from dataiku.customrecipe import get_recipe_config
from io_utils import get_input_output, set_column_description
from recipe_config_loading import get_decomposition_params
from safe_logger import SafeLogger
from timeseries_preparation.preparation import TimeseriesPreparator

logger = SafeLogger("Timeseries preparation plugin")

(input_dataset, output_dataset) = get_input_output()
config = get_recipe_config()
input_dataset_columns = [
    column["name"] for column in input_dataset.read_schema()
]
(dku_config, input_validator,
 decomposition) = get_decomposition_params(config, input_dataset_columns)

timeseries_preparator = TimeseriesPreparator(dku_config)
input_df = input_dataset.get_dataframe(infer_with_pandas=False)
df_prepared = timeseries_preparator.prepare_timeseries_dataframe(input_df)
input_validator.check(df_prepared)

start = perf_counter()
logger.info("Decomposing time series...")
transformed_df = decomposition.fit(df_prepared)
logger.info("Decomposing time series: Done in {:.2f} seconds".format(perf_counter() - start))
output_dataset.write_with_schema(transformed_df)
set_column_description(output_dataset, decomposition.columns_descriptions,
                       input_dataset)