def test_hour_truncation():
    """Run the preparation steps with a 2-hour frequency on two series.

    The steps are truncate -> sort -> regular-frequency check -> keep last
    dates (with ``max_timeseries_length=2``). The test then pins the
    timestamps found at index labels 0 and 3 of the resulting date column.
    """
    date_col = "date"
    raw_timestamps = [
        "2020-01-07 12:12:00",
        "2020-01-07 17:35:00",
        "2020-01-07 14:55:00",
        "2020-01-07 18:06:00",
        "2020-01-08 04:40:00",
        "2020-01-08 06:13:00",
        "2020-01-08 03:23:00",
    ]
    frame = pd.DataFrame({"date": raw_timestamps, "id": [1, 1, 1, 1, 2, 2, 2]})
    # Parse the strings, then localize with tz=None so the column is tz-naive.
    parsed = pd.to_datetime(frame[date_col])
    frame[date_col] = parsed.dt.tz_localize(tz=None)

    preparator = TimeseriesPreparator(
        time_column_name=date_col,
        frequency="2H",
        timeseries_identifiers_names=["id"],
        max_timeseries_length=2,
    )
    prepared = preparator._sort(preparator._truncate_dates(frame))
    preparator._check_regular_frequency(prepared)
    prepared = preparator._keep_last_dates(prepared)

    result_dates = prepared[date_col]
    assert result_dates[0] == pd.Timestamp("2020-01-07 16:00:00")
    assert result_dates[3] == pd.Timestamp("2020-01-08 06:00:00")
def test_day_truncation():
    """Run truncate -> sort -> regular-frequency check with a daily frequency.

    A single series (``id == 1``) with three timestamps on consecutive days
    is prepared. The values at index labels 0 and 2 are expected to be
    midnight of 2021-01-01 and 2021-01-03.
    """
    date_col = "date"
    raw_timestamps = [
        "2021-01-01 12:17:42",
        "2021-01-02 00:00:00",
        "2021-01-03 12:46:00",
    ]
    frame = pd.DataFrame({"date": raw_timestamps, "id": [1, 1, 1]})
    # Parse the strings, then localize with tz=None so the column is tz-naive.
    parsed = pd.to_datetime(frame[date_col])
    frame[date_col] = parsed.dt.tz_localize(tz=None)

    preparator = TimeseriesPreparator(
        time_column_name=date_col,
        frequency="D",
        timeseries_identifiers_names=["id"],
    )
    prepared = preparator._sort(preparator._truncate_dates(frame))
    preparator._check_regular_frequency(prepared)

    result_dates = prepared[date_col]
    assert result_dates[0] == pd.Timestamp("2021-01-01")
    assert result_dates[2] == pd.Timestamp("2021-01-03")
def test_semester_truncation():
    """Run truncate -> sort -> regular-frequency check with a "6M" frequency.

    Three dates roughly six months apart are prepared for a single series.
    The test expects them at index labels 0, 1 and 2 as 2020-12-31,
    2021-06-30 and 2021-12-31 respectively.
    """
    date_col = "date"
    raw_dates = ["2020-12-15", "2021-06-28", "2021-12-01"]
    frame = pd.DataFrame({"date": raw_dates, "id": [1, 1, 1]})
    # Parse the strings, then localize with tz=None so the column is tz-naive.
    parsed = pd.to_datetime(frame[date_col])
    frame[date_col] = parsed.dt.tz_localize(tz=None)

    preparator = TimeseriesPreparator(
        time_column_name=date_col,
        frequency="6M",
        timeseries_identifiers_names=["id"],
    )
    prepared = preparator._sort(preparator._truncate_dates(frame))
    preparator._check_regular_frequency(prepared)

    result_dates = prepared[date_col]
    expected_by_label = {
        0: "2020-12-31",
        1: "2021-06-30",
        2: "2021-12-31",
    }
    for label, expected in expected_by_label.items():
        assert result_dates[label] == pd.Timestamp(expected)
 def test_hour_truncation(self, time_column_name,
                          timeseries_identifiers_names, basic_config):
     """Run the preparation steps with a 2-hour frequency on two series.

     The config is built from ``basic_config`` with a step of "2" and unit
     "H" in long format. The steps are truncate -> sort -> regular-frequency
     check -> keep last dates (``max_timeseries_length=2``), after which the
     timestamps at index labels 0 and 3 are pinned.
     """
     raw_timestamps = [
         "2020-01-07 12:12:00",
         "2020-01-07 17:35:00",
         "2020-01-07 14:55:00",
         "2020-01-07 18:06:00",
         "2020-01-08 04:40:00",
         "2020-01-08 06:13:00",
         "2020-01-08 03:23:00",
     ]
     frame = pd.DataFrame({
         "date": raw_timestamps,
         "id": [1, 1, 1, 1, 2, 2, 2],
         "target": [1, 2, 3, 4, 5, 6, 7],
     })
     # Parse the strings, then localize with tz=None so the column is tz-naive.
     parsed = pd.to_datetime(frame[time_column_name])
     frame[time_column_name] = parsed.dt.tz_localize(tz=None)

     config = DecompositionConfig()
     # Applied in order, one key at a time, on top of the shared fixture.
     overrides = (
         ("frequency_step_hours", "2"),
         ("frequency_unit", "H"),
         ("season_length_H", 12),
         ("long_format", True),
         ("timeseries_identifiers", timeseries_identifiers_names),
     )
     for key, value in overrides:
         basic_config[key] = value
     config.add_parameters(basic_config, list(frame.columns))

     preparator = TimeseriesPreparator(config, max_timeseries_length=2)
     prepared = preparator._sort(preparator._truncate_dates(frame))
     preparator._check_regular_frequency(prepared)
     prepared = preparator._keep_last_dates(prepared)

     result_dates = prepared[time_column_name]
     assert result_dates[0] == pd.Timestamp("2020-01-07 16:00:00")
     assert result_dates[3] == pd.Timestamp("2020-01-08 06:00:00")
def test_week_sunday_truncation():
    """Run the preparation steps with a "W-SUN" weekly frequency.

    The steps are truncate -> sort -> regular-frequency check -> keep last
    dates (``max_timeseries_length=2``). The test expects 2021-01-10 and
    2021-01-17 at index labels 0 and 1 of the resulting date column.
    """
    date_col = "date"
    raw_timestamps = [
        "2021-01-03 12:12:00",
        "2021-01-05 17:35:00",
        "2021-01-15 14:55:00",
    ]
    frame = pd.DataFrame({"date": raw_timestamps, "id": [1, 1, 1]})
    # Parse the strings, then localize with tz=None so the column is tz-naive.
    parsed = pd.to_datetime(frame[date_col])
    frame[date_col] = parsed.dt.tz_localize(tz=None)

    preparator = TimeseriesPreparator(
        time_column_name=date_col,
        frequency="W-SUN",
        timeseries_identifiers_names=["id"],
        max_timeseries_length=2,
    )
    prepared = preparator._sort(preparator._truncate_dates(frame))
    preparator._check_regular_frequency(prepared)
    prepared = preparator._keep_last_dates(prepared)

    result_dates = prepared[date_col]
    assert result_dates[0] == pd.Timestamp("2021-01-10")
    assert result_dates[1] == pd.Timestamp("2021-01-17")
    def test_minutes_truncation(self, time_column_name, basic_config):
        """Run truncate -> sort -> regular-frequency check at 15 minutes.

        The config is built from ``basic_config`` with a step of "15" and
        unit "min". The timestamps at index labels 0 and 2 are expected to
        be 12:15 and 12:45 on 2021-01-01.
        """
        raw_timestamps = [
            "2021-01-01 12:17:42",
            "2021-01-01 12:30:00",
            "2021-01-01 12:46:00",
        ]
        frame = pd.DataFrame({
            "date": raw_timestamps,
            "id": [1, 1, 1],
            "target": [1, 2, 3],
        })

        config = DecompositionConfig()
        # Applied in order, one key at a time, on top of the shared fixture.
        overrides = (
            ("frequency_step_minutes", "15"),
            ("frequency_unit", "min"),
            ("season_length_min", 4),
        )
        for key, value in overrides:
            basic_config[key] = value
        config.add_parameters(basic_config, list(frame.columns))

        # Parse the strings, then localize with tz=None (tz-naive column).
        parsed = pd.to_datetime(frame[time_column_name])
        frame[time_column_name] = parsed.dt.tz_localize(tz=None)

        preparator = TimeseriesPreparator(config)
        prepared = preparator._sort(preparator._truncate_dates(frame))
        preparator._check_regular_frequency(prepared)

        result_dates = prepared[time_column_name]
        assert result_dates[0] == pd.Timestamp("2021-01-01  12:15:00")
        assert result_dates[2] == pd.Timestamp("2021-01-01 12:45:00")
    def test_year_truncation(self, time_column_name,
                             timeseries_identifiers_names, basic_config):
        """Run truncate -> sort -> regular-frequency check with a "12M" unit.

        Three December dates in consecutive years are prepared for a single
        series in long format. The test expects them at index labels 0, 1
        and 2 as 2020-12-31, 2021-12-31 and 2022-12-31 respectively.
        """
        df = pd.DataFrame({
            "date": [
                "2020-12-31",
                "2021-12-15",
                "2022-12-01",
            ],
            "id": [1, 1, 1],
            "target": [1, 2, 3]
        })
        dku_config = DecompositionConfig()
        basic_config["frequency_unit"] = "12M"
        # Set once; a second identical assignment used to follow further down.
        basic_config["season_length_12M"] = 4
        basic_config["long_format"] = True
        basic_config["timeseries_identifiers"] = timeseries_identifiers_names
        dku_config.add_parameters(basic_config, list(df.columns))
        preparator = TimeseriesPreparator(dku_config)
        # Parse the strings, then localize with tz=None (tz-naive column).
        df[time_column_name] = pd.to_datetime(
            df[time_column_name]).dt.tz_localize(tz=None)
        dataframe_prepared = preparator._truncate_dates(df)
        dataframe_prepared = preparator._sort(dataframe_prepared)
        preparator._check_regular_frequency(dataframe_prepared)

        assert dataframe_prepared[time_column_name][0] == pd.Timestamp(
            "2020-12-31")
        assert dataframe_prepared[time_column_name][1] == pd.Timestamp(
            "2021-12-31")
        assert dataframe_prepared[time_column_name][2] == pd.Timestamp(
            "2022-12-31")
    def test_week_sunday_truncation(self, time_column_name,
                                    timeseries_identifiers_names,
                                    basic_config):
        """Run the preparation steps with a weekly unit ending on "SUN".

        The steps are truncate -> sort -> regular-frequency check -> keep
        last dates (``max_timeseries_length=2``). The test expects
        2021-01-10 and 2021-01-17 at index labels 0 and 1.
        """
        raw_timestamps = [
            "2021-01-03 12:12:00",
            "2021-01-05 17:35:00",
            "2021-01-15 14:55:00",
        ]
        frame = pd.DataFrame({
            "date": raw_timestamps,
            "id": [1, 1, 1],
            "target": [1, 2, 3],
        })

        config = DecompositionConfig()
        # Applied in order, one key at a time, on top of the shared fixture.
        overrides = (
            ("frequency_unit", "W"),
            ("frequency_end_of_week", "SUN"),
            ("season_length_W", 7),
            ("long_format", True),
            ("timeseries_identifiers", timeseries_identifiers_names),
        )
        for key, value in overrides:
            basic_config[key] = value
        config.add_parameters(basic_config, list(frame.columns))

        preparator = TimeseriesPreparator(config, max_timeseries_length=2)
        # Parse the strings, then localize with tz=None (tz-naive column).
        parsed = pd.to_datetime(frame[time_column_name])
        frame[time_column_name] = parsed.dt.tz_localize(tz=None)

        prepared = preparator._sort(preparator._truncate_dates(frame))
        preparator._check_regular_frequency(prepared)
        prepared = preparator._keep_last_dates(prepared)

        result_dates = prepared[time_column_name]
        assert result_dates[0] == pd.Timestamp("2021-01-10")
        assert result_dates[1] == pd.Timestamp("2021-01-17")
    def test_business_day_truncation(self, time_column_name,
                                     timeseries_identifiers_names,
                                     basic_config):
        """Run truncate -> sort -> regular-frequency check with unit "B".

        Three timestamps on consecutive dates (2021-01-04 to 2021-01-06) are
        prepared for a single series in long format. The values at index
        labels 0 and 2 are expected to be midnight of the first and last day.
        """
        raw_timestamps = [
            "2021-01-04 12:17:42",
            "2021-01-05 00:00:00",
            "2021-01-06 12:46:00",
        ]
        frame = pd.DataFrame({
            "date": raw_timestamps,
            "id": [1, 1, 1],
            "target": [1, 2, 3],
        })

        config = DecompositionConfig()
        # Applied in order, one key at a time, on top of the shared fixture.
        overrides = (
            ("frequency_unit", "B"),
            ("season_length_B", 5),
            ("long_format", True),
            ("timeseries_identifiers", timeseries_identifiers_names),
        )
        for key, value in overrides:
            basic_config[key] = value
        config.add_parameters(basic_config, list(frame.columns))

        preparator = TimeseriesPreparator(config)
        # Parse the strings, then localize with tz=None (tz-naive column).
        parsed = pd.to_datetime(frame[time_column_name])
        frame[time_column_name] = parsed.dt.tz_localize(tz=None)

        prepared = preparator._sort(preparator._truncate_dates(frame))
        preparator._check_regular_frequency(prepared)

        result_dates = prepared[time_column_name]
        assert result_dates[0] == pd.Timestamp("2021-01-04")
        assert result_dates[2] == pd.Timestamp("2021-01-06")
 def test_duplicate_dates(self, time_column_name,
                          timeseries_identifiers_names, basic_config):
     """Expect ``_truncate_dates`` to raise ``ValueError`` on this input.

     Two of the three input timestamps fall on the same calendar day
     (2021-01-01) while the configured frequency value is "D".
     """
     raw_timestamps = [
         "2021-01-01 12:12:00",
         "2021-01-01 17:35:00",
         "2021-01-02 14:55:00",
     ]
     frame = pd.DataFrame({
         "date": raw_timestamps,
         "id": [1, 1, 1],
         "target": [1, 2, 3],
     })

     config = DecompositionConfig()
     # NOTE(review): this test sets "frequency" where sibling tests set
     # "frequency_unit" -- confirm this is the intended key.
     basic_config["frequency"] = "D"
     config.add_parameters(basic_config, list(frame.columns))

     # Parse the strings, then localize with tz=None so the column is tz-naive.
     parsed = pd.to_datetime(frame[time_column_name])
     frame[time_column_name] = parsed.dt.tz_localize(tz=None)

     preparator = TimeseriesPreparator(config)
     with pytest.raises(ValueError):
         preparator._truncate_dates(frame)
def test_duplicate_dates():
    """Expect ``_truncate_dates`` to raise ``ValueError`` on this input.

    Two of the three input timestamps fall on the same calendar day
    (2021-01-01) while the preparator is configured with a daily frequency.
    No timeseries identifiers are passed to the preparator in this test.
    """
    df = pd.DataFrame(
        {
            "date": [
                "2021-01-01 12:12:00",
                "2021-01-01 17:35:00",
                "2021-01-02 14:55:00",
            ],
            "id": [1, 1, 1],
        }
    )
    frequency = "D"
    time_column_name = "date"
    # Parse the strings, then localize with tz=None so the column is tz-naive.
    df[time_column_name] = pd.to_datetime(df[time_column_name]).dt.tz_localize(tz=None)
    preparator = TimeseriesPreparator(
        time_column_name=time_column_name,
        frequency=frequency,
    )
    # Only the raised exception matters, so the return value is not bound.
    with pytest.raises(ValueError):
        preparator._truncate_dates(df)