Ejemplo n.º 1
0
 def test_weekly_params(self, config):
     """Check the weekly resampling step with and without an end-of-week day.

     With ``time_unit`` set to ``"weeks"`` the resampling step is expected to
     be ``"2W"``; after ``time_unit_end_of_week`` is set to ``"MON"`` the
     parameter is exposed unchanged and the step becomes ``"2W-MON"``.
     """
     # NOTE(review): the "2" presumably comes from the fixture's time_step -- confirm.
     config["time_unit"] = "weeks"
     default_params = get_resampling_params(config)
     assert default_params.resampling_step == "2W"

     # Anchoring the week on Monday must be reflected in the step string.
     config["time_unit_end_of_week"] = "MON"
     monday_params = get_resampling_params(config)
     assert monday_params.time_unit_end_of_week == "MON"
     assert monday_params.resampling_step == "2W-MON"
Ejemplo n.º 2
0
 def test_semi_annual_params(self, config):
     """Check how a semi-annual time unit is turned into a step in months.

     The fixture's time step is expected to yield a step of 12 (``"12M"``),
     and an explicit fractional step of 1.5 a step of 9 (``"9M"``).
     """
     config["time_unit"] = "semi_annual"
     fixture_step_params = get_resampling_params(config)
     assert fixture_step_params.time_step == 12
     assert fixture_step_params.resampling_step == "12M"

     # A fractional number of half-years is expected to map to whole months.
     config["time_step"] = 1.5
     fractional_step_params = get_resampling_params(config)
     assert fractional_step_params.time_step == 9
     assert fractional_step_params.resampling_step == "9M"
Ejemplo n.º 3
0
    def test_invalid_time_step(self, config):
        """Check that a missing or zero ``time_step`` raises ``ValueError``."""
        # Case 1: the key is absent from the configuration.
        config.pop("time_step")
        with pytest.raises(ValueError) as missing_step_error:
            get_resampling_params(config)
        assert "Invalid time step" in str(missing_step_error.value)

        # Case 2: the key is present but zero.
        config["time_step"] = 0
        with pytest.raises(ValueError) as zero_step_error:
            get_resampling_params(config)
        assert "Time step can not be null or negative" in str(zero_step_error.value)
Ejemplo n.º 4
0
    def test_no_categorical_impute(self, df, config, columns):
        """Check that the category column is entirely null without imputation.

        Two configurations are exercised: one with the
        ``category_imputation_method`` key removed, and one where it is
        explicitly set to ``"empty"``.
        """
        # Configuration without any imputation method.
        config.pop("category_imputation_method")
        unset_method_output = Resampler(get_resampling_params(config)).transform(df, "Date")
        assert pd.isnull(unset_method_output[columns.category].values).all()

        # Configuration with the explicit "empty" method.
        config["category_imputation_method"] = "empty"
        empty_method_output = Resampler(get_resampling_params(config)).transform(df, "Date")
        assert pd.isnull(empty_method_output[columns.category].values).all()
Ejemplo n.º 5
0
    def test_missing_categorical(self, missing_row_df, config, columns):
        """Check "clip" and "previous" category imputation with a 12-week step.

        With "clip" every resampled category is expected to be ``"second"``;
        with "previous" the first row is expected to be NaN and all later
        rows ``"second"``.
        """
        config.update({
            "time_unit": "weeks",
            "time_step": 12,
            "category_imputation_method": "clip",
        })
        clipped = Resampler(get_resampling_params(config)).transform(missing_row_df, columns.date)
        assert np.all(clipped.categorical.values == "second")

        # "previous" has nothing to copy from for the very first row.
        config["category_imputation_method"] = "previous"
        back_filled = Resampler(get_resampling_params(config)).transform(missing_row_df, columns.date)
        assert math.isnan(back_filled.loc[0, columns.category])
        assert np.all(back_filled.loc[1:, columns.category].values == "second")
    def test_generate_date_range_month(self, config):
        """Check ``generate_date_range`` with a monthly time unit.

        Covers naive start times on and before a month end (with and without
        a time-of-day component), CET-localized bounds, and a call with
        non-zero clip/shift arguments.
        """
        config["time_unit"] = "months"
        params = get_resampling_params(config)
        # Trailing positional arguments shared by every call below.
        step_args = (params.resampling_step, params.time_step, params.time_unit)

        naive_end = pd.Timestamp('2021-06-20 00:00:00')
        month_ends = pd.DatetimeIndex(['2021-01-31', '2021-03-31', '2021-05-31', '2021-07-31'])

        # All three naive starts are expected to give the same month-end range.
        for raw_start in ('2021-01-31 00:00:00', '2021-01-23 00:00:00', '2021-01-31 10:00:00'):
            produced = generate_date_range(pd.Timestamp(raw_start), naive_end, 0, 0, 0, *step_args)
            np.testing.assert_array_equal(produced, month_ends)

        # Timezone-aware bounds: the expected offsets change from +01:00 to +02:00.
        cet_start = pd.Timestamp('2021-01-31 10:00:00').tz_localize("CET")
        cet_end = pd.Timestamp('2021-06-20 00:00:00').tz_localize("CET")
        cet_range = generate_date_range(cet_start, cet_end, 0, 0, 0, *step_args)
        expected_cet = pd.DatetimeIndex([
            '2021-01-31 00:00:00+01:00',
            '2021-03-31 00:00:00+02:00',
            '2021-05-31 00:00:00+02:00',
            '2021-07-31 00:00:00+02:00',
        ])
        np.testing.assert_array_equal(cet_range, expected_cet)

        # Non-zero third and fifth arguments: the first month end is expected to drop out.
        clipped_range = generate_date_range(
            pd.Timestamp('2021-01-31 10:00:00'), pd.Timestamp('2021-06-20 00:00:00'), 1, 0, 1, *step_args)
        np.testing.assert_array_equal(clipped_range, pd.DatetimeIndex(['2021-03-31', '2021-05-31', '2021-07-31']))
    def test_generate_date_range_b_days(self, config):
        """Check ``generate_date_range`` with a one-business-day step.

        The range 2021-01-02 to 2021-01-10 is generated three times with
        increasing ``clip_start``/``clip_end`` values and a zero shift.
        """
        config.update({"time_unit": "business_days", "time_step": 1})
        params = get_resampling_params(config)
        step_args = (params.resampling_step, params.time_step, params.time_unit)

        range_start = pd.Timestamp('2021-01-02 00:00:00')
        range_end = pd.Timestamp('2021-01-10 00:00:00')
        unclipped_days = ['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-11']

        # Each scenario is ((clip_start, clip_end, shift), expected dates).
        scenarios = (
            ((0, 0, 0), unclipped_days),
            # Clipping by one is expected to leave the range unchanged.
            ((1, 1, 0), unclipped_days),
            # Clipping by two is expected to drop the first and last business day.
            ((2, 2, 0), unclipped_days[1:-1]),
        )
        for (clip_start, clip_end, shift), expected_days in scenarios:
            produced = generate_date_range(range_start, range_end, clip_start, clip_end, shift, *step_args)
            np.testing.assert_array_equal(produced, pd.DatetimeIndex(expected_days))
    def test_generate_date_range_microseconds(self, config):
        """Check ``generate_date_range`` with a one-microsecond step on CET bounds.

        With ``clip_start=5``, ``clip_end=3`` and ``shift=2`` over a
        16-microsecond span, the expected range runs from microsecond 7 to
        microsecond 15 inclusive.
        """
        config.update({"time_unit": "microseconds", "time_step": 1})
        params = get_resampling_params(config)

        span_start = pd.Timestamp('20190131 01:59:00').tz_localize('CET')
        span_end = pd.Timestamp('2019-01-31 01:59:00.000016').tz_localize('CET')
        clip_start, clip_end, shift = 5, 3, 2

        produced = generate_date_range(
            span_start, span_end, clip_start, clip_end, shift,
            params.resampling_step, params.time_step, params.time_unit)

        # Microsecond offsets 7..15, rendered as zero-padded six-digit fractions.
        expected_range = pd.DatetimeIndex(
            [f'2019-01-31 01:59:00.{microsecond:06d}+01:00' for microsecond in range(7, 16)])
        np.testing.assert_array_equal(produced, expected_range)
Ejemplo n.º 9
0
    def test_no_category_values(self, df, config, columns):
        """Check "previous" and "empty" category imputation on ``df``.

        "previous" is expected to give five ``'first'`` values followed by
        three ``'second'`` values; "empty" is expected to leave row 0 as NaN.
        """
        config["category_imputation_method"] = "previous"
        previous_output = Resampler(get_resampling_params(config)).transform(df, columns.date)
        expected_categories = np.array(['first'] * 5 + ['second'] * 3)
        np.testing.assert_array_equal(previous_output.categorical.values, expected_categories)

        config["category_imputation_method"] = "empty"
        empty_output = Resampler(get_resampling_params(config)).transform(df, columns.date)
        assert math.isnan(empty_output.loc[0, columns.category])
Ejemplo n.º 10
0
 def test_df_multiple_dates(self, df_multiple_dates, config, columns):
     """Check that the ``"date2"`` column is null at row 1 after resampling.

     The frame is resampled on ``columns.date`` with a 12-hour step and
     "previous" category imputation.
     """
     # NOTE(review): "date2" is presumably a second date column in the fixture -- confirm.
     config.update({
         "category_imputation_method": "previous",
         "time_unit": "hours",
         "time_step": 12,
     })
     half_day_resampler = Resampler(get_resampling_params(config))
     resampled = half_day_resampler.transform(df_multiple_dates, columns.date)
     assert pd.isnull(resampled.loc[1, "date2"])
Ejemplo n.º 11
0
 def test_mode_filling(self, df3, config, columns):
     """Check that "mode" imputation with a one-week step gives only ``"second"``."""
     config.update({
         "category_imputation_method": "mode",
         "time_unit": "weeks",
         "time_step": 1,
     })
     weekly_resampler = Resampler(get_resampling_params(config))
     resampled = weekly_resampler.transform(df3, columns.date)
     # Every resampled row is expected to carry the same category.
     assert np.all(resampled.categorical.values == "second")
Ejemplo n.º 12
0
 def test_microseconds(self, config, columns):
     """Check resampling of ``get_df_DST("U", ...)`` with a three-microsecond step.

     Two resampled dates are expected, three microseconds apart.
     """
     config.update({"time_unit": "microseconds", "time_step": 3})
     microsecond_resampler = Resampler(get_resampling_params(config))
     source_df = get_df_DST("U", columns)
     resampled = microsecond_resampler.transform(source_df, columns.date)

     expected_dates = pd.DatetimeIndex([
         '2019-01-31T00:59:00.000000000',
         '2019-01-31T00:59:00.000003000',
     ])
     np.testing.assert_array_equal(resampled[columns.date].values, expected_dates)
Ejemplo n.º 13
0
 def test_clip_filling(self, long_df, config, columns):
     """Check "clip" category imputation on data grouped by ``"country"``.

     ``long_df`` is resampled with a one-week step and row 3 of the category
     column is expected to be ``"first"``.
     """
     config.update({
         "category_imputation_method": "clip",
         "time_unit": "weeks",
         "time_step": 1,
     })
     weekly_resampler = Resampler(get_resampling_params(config))
     resampled = weekly_resampler.transform(long_df, columns.date, groupby_columns=["country"])
     assert resampled.loc[3, columns.category] == "first"
Ejemplo n.º 14
0
 def test_next_filling_long_format(self, long_df, config, columns):
     """Check "next" category imputation on data grouped by ``"country"``.

     After weekly resampling, row 4 of the category column is expected to be
     NaN and row 3 to be ``"second"``.
     """
     config.update({
         "category_imputation_method": "next",
         "time_unit": "weeks",
         "time_step": 1,
     })
     weekly_resampler = Resampler(get_resampling_params(config))
     resampled = weekly_resampler.transform(long_df, columns.date, groupby_columns=["country"])
     # NOTE(review): row 4 presumably has no later value in its group -- confirm against the fixture.
     assert math.isnan(resampled.loc[4, columns.category])
     assert resampled.loc[3, columns.category] == "second"
Ejemplo n.º 15
0
 def test_empty_filling(self, df2, config, columns):
     """Check that rows added by 12-hour resampling keep NaN categories.

     Rows 0 and 6 are expected to hold ``"first"`` and ``"second"``; rows 1,
     2 and 7 are expected to be NaN.
     """
     # NOTE(review): the imputation method is whatever the config fixture provides -- confirm.
     config.update({"time_unit": "hours", "time_step": 12})
     resampled = Resampler(get_resampling_params(config)).transform(df2, columns.date)

     # Row label -> expected category; None marks a row expected to be NaN.
     expected_by_row = {0: "first", 1: None, 2: None, 6: "second", 7: None}
     for row, expected_category in expected_by_row.items():
         actual = resampled.loc[row, columns.category]
         if expected_category is None:
             assert math.isnan(actual)
         else:
             assert actual == expected_category
Ejemplo n.º 16
0
    def test_year(self, config, columns):
        """Check yearly resampling of the ``get_df("Y", ...)`` frame.

        The mean of the resampled data column is compared against a reference
        value, and the resampled dates are expected to be the year ends of
        1959, 1961 and 1963.
        """
        config["time_unit"] = "years"
        params = get_resampling_params(config)
        resampler = Resampler(params)
        df = get_df("Y", columns)
        output_df = resampler.transform(df, columns.date)

        # Compare the float mean with a tight relative tolerance instead of
        # ``==``: exact equality on an accumulated float depends on summation
        # order and can differ between numpy/pandas versions.
        np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.19, rtol=1e-9)
        expected_dates = pd.DatetimeIndex(['1959-12-31T00:00:00.000000000', '1961-12-31T00:00:00.000000000',
                                           '1963-12-31T00:00:00.000000000'])
        np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
Ejemplo n.º 17
0
 def test_next_filling(self, df2, config, columns):
     """Check "next" category imputation on ``df2`` with a 12-hour step.

     Rows 0, 1 and 3 are expected to be ``"first"``, row 5 ``"second"`` and
     row 9 ``"third"``.
     """
     config.update({
         "category_imputation_method": "next",
         "time_unit": "hours",
         "time_step": 12,
     })
     resampled = Resampler(get_resampling_params(config)).transform(df2, columns.date)

     # Row label -> expected category, checked in ascending row order.
     expected_by_row = {0: "first", 1: "first", 3: "first", 5: "second", 9: "third"}
     for row, expected_category in expected_by_row.items():
         assert resampled.loc[row, columns.category] == expected_category
Ejemplo n.º 18
0
 def test_hours_DST(self, config, columns):
     """Check hourly resampling of the ``get_df_DST("4H", ...)`` frame.

     The mean of the resampled data column is compared against a reference
     value, and the resampled dates are expected every two hours from 01:00
     to 13:00 on 2019-01-31.
     """
     config["time_unit"] = "hours"
     params = get_resampling_params(config)
     resampler = Resampler(params)
     df_DST = get_df_DST("4H", columns)
     output_df = resampler.transform(df_DST, columns.date)
     # Compare the float mean with a tight relative tolerance instead of
     # ``==``: exact equality on an accumulated float depends on summation
     # order and can differ between numpy/pandas versions.
     np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.33428571428567, rtol=1e-9)
     expected_dates = pd.DatetimeIndex(['2019-01-31T01:00:00.000000000', '2019-01-31T03:00:00.000000000',
                                        '2019-01-31T05:00:00.000000000', '2019-01-31T07:00:00.000000000',
                                        '2019-01-31T09:00:00.000000000', '2019-01-31T11:00:00.000000000',
                                        '2019-01-31T13:00:00.000000000'])
     np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
Ejemplo n.º 19
0
 def test_seconds(self, config, columns):
     """Check 30-second resampling of the ``get_df_DST("min", ...)`` frame.

     The mean of the resampled data column is compared against a reference
     value, and the resampled dates are expected every 30 seconds from
     00:59:00 to 01:02:00 on 2019-01-31.
     """
     config["time_unit"] = "seconds"
     config["time_step"] = 30
     params = get_resampling_params(config)
     resampler = Resampler(params)
     df_DST = get_df_DST("min", columns)
     output_df = resampler.transform(df_DST, columns.date)
     # Compare the float mean with a tight relative tolerance instead of
     # ``==``: exact equality on an accumulated float depends on summation
     # order and can differ between numpy/pandas versions.
     np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.28999999999996, rtol=1e-9)
     expected_dates = pd.DatetimeIndex(['2019-01-31T00:59:00.000000000', '2019-01-31T00:59:30.000000000',
                                        '2019-01-31T01:00:00.000000000', '2019-01-31T01:00:30.000000000',
                                        '2019-01-31T01:01:00.000000000', '2019-01-31T01:01:30.000000000',
                                        '2019-01-31T01:02:00.000000000'])
     np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
Ejemplo n.º 20
0
 def test_mode_filling_long_format(self, long_df_mode, config, columns):
     """Check "mode" category imputation per ``"country"`` group.

     After weekly resampling, every row with ``country == 0`` is expected to
     be ``"first"`` and every row with ``country == 1`` to be ``"fourth"``.
     """
     config.update({
         "category_imputation_method": "mode",
         "time_unit": "weeks",
         "time_step": 1,
     })
     weekly_resampler = Resampler(get_resampling_params(config))
     resampled = weekly_resampler.transform(long_df_mode, columns.date, groupby_columns=["country"])

     # Each group is expected to be filled with its own single category.
     for country_id, expected_category in ((0, "first"), (1, "fourth")):
         in_country = resampled.country == country_id
         assert np.all(resampled.loc[in_country, columns.category].values == expected_category)
Ejemplo n.º 21
0
 def test_bool_column(self, bool_df, config, columns):
     """Check "previous" imputation on a boolean category column.

     With a 12-hour step the resampled column is expected to hold six
     ``True`` values followed by five ``False`` values.
     """
     config.update({
         "category_imputation_method": "previous",
         "time_unit": "hours",
         "time_step": 12,
     })
     resampled = Resampler(get_resampling_params(config)).transform(bool_df, columns.date)
     expected_flags = np.array([True] * 6 + [False] * 5)
     np.testing.assert_array_equal(resampled.categorical.values, expected_flags)
    def test_generate_date_range_half_year(self, config):
        """Check ``generate_date_range`` with a step of one semi-annual unit.

        From 2020-01-01 to 2021-06-18 the expected range is the month ends of
        January and July, from 2020-01-31 up to 2021-07-31.
        """
        config.update({"time_step": 1, "time_unit": "semi_annual"})
        params = get_resampling_params(config)

        span_start = pd.Timestamp('2020-01-01 00:00:00')
        span_end = pd.Timestamp('2021-06-18 00:00:00')

        # No clipping and no shift (third to fifth arguments are all zero).
        produced = generate_date_range(
            span_start, span_end, 0, 0, 0,
            params.resampling_step, params.time_step, params.time_unit)
        expected_half_years = pd.DatetimeIndex(['2020-01-31', '2020-07-31', '2021-01-31', '2021-07-31'])
        np.testing.assert_array_equal(produced, expected_half_years)
Ejemplo n.º 23
0
 def test_weeks_monday_end(self, config, columns):
     """Check weekly resampling of ``get_df("M", ...)`` with weeks ending on Monday.

     The mean of the resampled data column is compared against a reference
     value, and the resampled dates are expected to be Mondays two weeks
     apart, from 1959-02-02 to 1959-05-11.
     """
     config["time_unit"] = "weeks"
     config["time_unit_end_of_week"] = "MON"
     params = get_resampling_params(config)
     resampler = Resampler(params)
     df = get_df("M", columns)
     output_df = resampler.transform(df, columns.date)
     # Compare the float mean with a tight relative tolerance instead of
     # ``==``: exact equality on an accumulated float depends on summation
     # order and can differ between numpy/pandas versions.
     np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.36625000000004, rtol=1e-9)
     expected_dates = pd.DatetimeIndex(['1959-02-02T00:00:00.000000000', '1959-02-16T00:00:00.000000000',
                                        '1959-03-02T00:00:00.000000000', '1959-03-16T00:00:00.000000000',
                                        '1959-03-30T00:00:00.000000000', '1959-04-13T00:00:00.000000000',
                                        '1959-04-27T00:00:00.000000000', '1959-05-11T00:00:00.000000000'])
     np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
Ejemplo n.º 24
0
 def test_nanoseconds(self, config, columns):
     """Check one-nanosecond resampling of the ``get_df_DST("3N", ...)`` frame.

     Ten consecutive nanosecond timestamps are expected, starting at
     2019-01-31T00:59:00.
     """
     config.update({"time_unit": "nanoseconds", "time_step": 1})
     nanosecond_resampler = Resampler(get_resampling_params(config))
     source_df = get_df_DST("3N", columns)
     resampled = nanosecond_resampler.transform(source_df, columns.date)

     # Nanosecond offsets 0..9, rendered as zero-padded nine-digit fractions.
     expected_dates = pd.DatetimeIndex(
         [f'2019-01-31T00:59:00.{nanosecond:09d}' for nanosecond in range(10)])
     np.testing.assert_array_equal(resampled[columns.date].values, expected_dates)
Ejemplo n.º 25
0
 def test_previous_filling(self, df2, config, columns):
     """Check "previous" category imputation on ``df2`` with a 12-hour step.

     The resampled column is expected to hold six ``'first'``, four
     ``'second'`` and one ``'third'`` value, in that order.
     """
     config.update({
         "category_imputation_method": "previous",
         "time_unit": "hours",
         "time_step": 12,
     })
     resampled = Resampler(get_resampling_params(config)).transform(df2, columns.date)
     expected_categories = np.array(['first'] * 6 + ['second'] * 4 + ['third'])
     np.testing.assert_array_equal(resampled.categorical.values, expected_categories)
Ejemplo n.º 26
0
 def test_constant_value_filling(self, df2, config, columns):
     """Check "constant" category imputation with the value ``"myvalue"``.

     With a 12-hour step, rows 0 and 6 are expected to keep ``"first"`` and
     ``"second"``, while rows 1, 2 and 7 are expected to hold the constant.
     """
     config.update({
         "category_imputation_method": "constant",
         "category_constant_value": "myvalue",
         "time_unit": "hours",
         "time_step": 12,
     })
     resampled = Resampler(get_resampling_params(config)).transform(df2, columns.date)

     # Row label -> expected category, checked in ascending row order.
     expected_by_row = {0: "first", 1: "myvalue", 2: "myvalue", 6: "second", 7: "myvalue"}
     for row, expected_category in expected_by_row.items():
         assert resampled.loc[row, columns.category] == expected_category
Ejemplo n.º 27
0
 def test_days(self, config, columns):
     """Check daily resampling of the ``get_df("W-TUE", ...)`` frame.

     The mean of the resampled data column is compared against a reference
     value, and the resampled dates are expected every two days from
     1959-01-06 to 1959-01-26.
     """
     config["time_unit"] = "days"
     params = get_resampling_params(config)
     resampler = Resampler(params)
     df = get_df("W-TUE", columns)
     output_df = resampler.transform(df, columns.date)
     # Compare the float mean with a tight relative tolerance instead of
     # ``==``: exact equality on an accumulated float depends on summation
     # order and can differ between numpy/pandas versions.
     np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.3254545454545, rtol=1e-9)
     expected_dates = pd.DatetimeIndex(['1959-01-06T00:00:00.000000000', '1959-01-08T00:00:00.000000000',
                                        '1959-01-10T00:00:00.000000000', '1959-01-12T00:00:00.000000000',
                                        '1959-01-14T00:00:00.000000000', '1959-01-16T00:00:00.000000000',
                                        '1959-01-18T00:00:00.000000000', '1959-01-20T00:00:00.000000000',
                                        '1959-01-22T00:00:00.000000000', '1959-01-24T00:00:00.000000000',
                                        '1959-01-26T00:00:00.000000000'])
     np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
Ejemplo n.º 28
0
 def test_days_DST(self, config, columns):
     """Check daily resampling of the ``get_df_DST("W-WED", ...)`` frame.

     The mean of the resampled data column is compared against a reference
     value, and the resampled dates are expected every two days at 23:00,
     from 2019-02-05 to 2019-02-25.
     """
     # NOTE(review): the 23:00 values presumably reflect a timezone offset in the
     # frame built by get_df_DST -- confirm against that helper.
     config["time_unit"] = "days"
     params = get_resampling_params(config)
     resampler = Resampler(params)
     df_DST = get_df_DST("W-WED", columns)
     output_df = resampler.transform(df_DST, columns.date)
     # Compare the float mean with a tight relative tolerance instead of
     # ``==``: exact equality on an accumulated float depends on summation
     # order and can differ between numpy/pandas versions.
     np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.3072727272727, rtol=1e-9)
     expected_dates = pd.DatetimeIndex(['2019-02-05T23:00:00.000000000', '2019-02-07T23:00:00.000000000',
                                        '2019-02-09T23:00:00.000000000', '2019-02-11T23:00:00.000000000',
                                        '2019-02-13T23:00:00.000000000', '2019-02-15T23:00:00.000000000',
                                        '2019-02-17T23:00:00.000000000', '2019-02-19T23:00:00.000000000',
                                        '2019-02-21T23:00:00.000000000', '2019-02-23T23:00:00.000000000',
                                        '2019-02-25T23:00:00.000000000'])
     np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
    def test_generate_date_range_nanoseconds(self, config):
        """Check ``generate_date_range`` with a one-nanosecond step.

        With ``clip_start=5``, ``clip_end=3`` and ``shift=2`` over a
        nine-nanosecond span, only nanoseconds 7 and 8 are expected.
        """
        config.update({"time_unit": "nanoseconds", "time_step": 1})
        params = get_resampling_params(config)

        span_start = pd.Timestamp('2019-01-31T00:59:00.000000000')
        span_end = pd.Timestamp('2019-01-31T00:59:00.000000009')
        clip_start, clip_end, shift = 5, 3, 2

        produced = generate_date_range(
            span_start, span_end, clip_start, clip_end, shift,
            params.resampling_step, params.time_step, params.time_unit)
        expected_range = pd.DatetimeIndex([
            '2019-01-31 00:59:00.000000007',
            '2019-01-31 00:59:00.000000008',
        ])
        np.testing.assert_array_equal(produced, expected_range)
Ejemplo n.º 30
0
    def test_month(self, config, columns):
        """Check monthly resampling of the ``get_df("Y", ...)`` frame.

        The mean of the resampled data column is compared against a reference
        value, and the resampled dates are expected to be month ends two
        months apart, from 1959-12-31 to 1963-02-28.
        """
        config["time_unit"] = "months"
        params = get_resampling_params(config)
        resampler = Resampler(params)
        df = get_df("Y", columns)
        output_df = resampler.transform(df, columns.date)

        # Compare the float mean with a tight relative tolerance instead of
        # ``==``: exact equality on an accumulated float depends on summation
        # order and can differ between numpy/pandas versions.
        np.testing.assert_allclose(np.mean(output_df[columns.data]), 316.32550000000003, rtol=1e-9)
        expected_dates = pd.DatetimeIndex(['1959-12-31T00:00:00.000000000', '1960-02-29T00:00:00.000000000',
                                           '1960-04-30T00:00:00.000000000', '1960-06-30T00:00:00.000000000',
                                           '1960-08-31T00:00:00.000000000', '1960-10-31T00:00:00.000000000',
                                           '1960-12-31T00:00:00.000000000', '1961-02-28T00:00:00.000000000',
                                           '1961-04-30T00:00:00.000000000', '1961-06-30T00:00:00.000000000',
                                           '1961-08-31T00:00:00.000000000', '1961-10-31T00:00:00.000000000',
                                           '1961-12-31T00:00:00.000000000', '1962-02-28T00:00:00.000000000',
                                           '1962-04-30T00:00:00.000000000', '1962-06-30T00:00:00.000000000',
                                           '1962-08-31T00:00:00.000000000', '1962-10-31T00:00:00.000000000',
                                           '1962-12-31T00:00:00.000000000', '1963-02-28T00:00:00.000000000'])
        np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)