def test_extrapolation(self, df, config, columns):
    """Check row 7 of the resampled output under several extrapolation setups.

    Four scenarios are exercised in sequence, each mutating ``config``:
    the fixture configuration as given, the same configuration without
    ``category_imputation_method``, ``extrapolation_method="none"`` and
    ``extrapolation_method="interpolation"``.
    """
    # Scenario 1: fixture configuration as provided.
    params = get_resampling_params(config)
    resampler = Resampler(params)
    output_df = resampler.transform(df, columns.date)
    assert output_df.loc[7, columns.data] == 316.2
    assert math.isnan(output_df.loc[7, columns.category])

    # Scenario 2: no explicit category imputation method.
    # The params must be rebuilt after the pop; reusing the previous
    # ``params`` object would simply repeat scenario 1.
    config.pop("category_imputation_method")
    params = get_resampling_params(config)
    resampler = Resampler(params)
    output_df = resampler.transform(df, columns.date)
    assert output_df.loc[7, columns.data] == 316.2
    assert math.isnan(output_df.loc[7, columns.category])

    # Scenario 3: extrapolation disabled -> every category value is NaN.
    config["extrapolation_method"] = "none"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    output_df = resampler.transform(df, columns.date)
    assert math.isnan(output_df.loc[6, columns.category])
    category_results = np.array(output_df[columns.category].values, dtype=np.float64)
    assert np.isnan(category_results).all()

    # Scenario 4: extrapolation by interpolation.
    config["extrapolation_method"] = "interpolation"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    output_df = resampler.transform(df, columns.date)
    assert np.round(output_df.loc[7, columns.data], 3) == 316.003
    assert math.isnan(output_df.loc[7, columns.category])
def test_no_categorical_impute(self, df, config, columns):
    """Category column must be entirely null without a method and with "empty"."""
    # First pass: the imputation method key is removed from the config.
    config.pop("category_imputation_method")
    resampled_without_method = Resampler(get_resampling_params(config)).transform(df, "Date")
    category_without_method = resampled_without_method[columns.category].values
    assert pd.isnull(category_without_method).all()

    # Second pass: the imputation method is explicitly "empty".
    config["category_imputation_method"] = "empty"
    resampled_with_empty = Resampler(get_resampling_params(config)).transform(df, "Date")
    category_with_empty = resampled_with_empty[columns.category].values
    assert pd.isnull(category_with_empty).all()
def test_empty_identifiers(self, df, params, config, datetime_column):
    """Output shape is (8, 4) whether groupby columns are ``[]``, omitted or ``None``."""
    resampler = Resampler(params)
    date_col = config.get('datetime_column')
    expected_shape = (8, 4)

    # Same three calls, in the same order: empty list, default, explicit None.
    groupby_variants = ({"groupby_columns": []}, {}, {"groupby_columns": None})
    for extra_kwargs in groupby_variants:
        resampled = resampler.transform(df, date_col, **extra_kwargs)
        assert resampled.shape == expected_shape
def test_missing_categorical(self, missing_row_df, config, columns):
    """Check "clip" and "previous" category imputation on ``missing_row_df`` (12-week step)."""
    config["time_unit"] = "weeks"
    config["time_step"] = 12

    # "clip": every resampled row carries "second".
    config["category_imputation_method"] = "clip"
    clipped = Resampler(get_resampling_params(config)).transform(missing_row_df, columns.date)
    assert np.all(clipped.categorical.values == "second")

    # "previous": the first row stays NaN, all following rows are "second".
    config["category_imputation_method"] = "previous"
    filled_previous = Resampler(get_resampling_params(config)).transform(missing_row_df, columns.date)
    first_category = filled_previous.loc[0, columns.category]
    assert math.isnan(first_category)
    remaining_categories = filled_previous.loc[1:, columns.category].values
    assert np.all(remaining_categories == "second")
def test_three_identifiers(self, long_df_3, params, config, datetime_column):
    """Grouping on three identifier columns yields the same three dates, four times over."""
    date_col = config.get('datetime_column')
    identifiers = ["country", "item", "store"]
    resampled = Resampler(params).transform(long_df_3, date_col, groupby_columns=identifiers)

    # The three-date pattern is expected to repeat four times in a row.
    dates_per_group = ["1959-02-01", "1959-02-15", "1959-03-01"]
    expected_dates = pd.DatetimeIndex(dates_per_group * 4)
    np.testing.assert_array_equal(resampled[date_col].values, expected_dates)
def test_no_category_values(self, df, config, columns):
    """Check the category column with "previous" imputation, then with "empty"."""
    # "previous": five rows of 'first' followed by three rows of 'second'.
    config["category_imputation_method"] = "previous"
    filled_previous = Resampler(get_resampling_params(config)).transform(df, columns.date)
    expected_categories = np.array(['first'] * 5 + ['second'] * 3)
    np.testing.assert_array_equal(filled_previous.categorical.values, expected_categories)

    # "empty": the first row has no category value.
    config["category_imputation_method"] = "empty"
    left_empty = Resampler(get_resampling_params(config)).transform(df, columns.date)
    first_category = left_empty.loc[0, columns.category]
    assert math.isnan(first_category)
def test_df_multiple_dates(self, df_multiple_dates, config, columns):
    """The "date2" column of row 1 is null after a 12-hour resampling."""
    config["category_imputation_method"] = "previous"
    config["time_unit"] = "hours"
    config["time_step"] = 12

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(df_multiple_dates, columns.date)

    second_row_date2 = resampled.loc[1, "date2"]
    assert pd.isnull(second_row_date2)
def test_mode_filling(self, df3, config, columns):
    """With "mode" imputation and a 1-week step, every category value is "second"."""
    config["category_imputation_method"] = "mode"
    config["time_unit"] = "weeks"
    config["time_step"] = 1

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(df3, columns.date)

    categories = resampled.categorical.values
    assert np.all(categories == "second")
def test_microseconds(self, config, columns):
    """Resampling with a 3-microsecond step yields two timestamps 3 microseconds apart."""
    config["time_unit"] = "microseconds"
    config["time_step"] = 3
    resampler = Resampler(get_resampling_params(config))

    source_df = get_df_DST("U", columns)
    resampled = resampler.transform(source_df, columns.date)

    expected_dates = pd.DatetimeIndex([
        '2019-01-31T00:59:00.000000000',
        '2019-01-31T00:59:00.000003000',
    ])
    actual_dates = resampled[columns.date].values
    np.testing.assert_array_equal(actual_dates, expected_dates)
def test_long_df_different_sizes(self, long_df_different_sizes, params, config, datetime_column):
    """Grouping by "country" on ``long_df_different_sizes`` produces a (12, 4) frame."""
    date_col = config.get('datetime_column')
    resampled = Resampler(params).transform(
        long_df_different_sizes,
        date_col,
        groupby_columns=["country"],
    )
    assert resampled.shape == (12, 4)
def test_clip_filling(self, long_df, config, columns):
    """With "clip" imputation on weekly long-format data, row 3 has category "first"."""
    config["category_imputation_method"] = "clip"
    config["time_unit"] = "weeks"
    config["time_step"] = 1

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(
        long_df,
        columns.date,
        groupby_columns=["country"],
    )

    fourth_row_category = resampled.loc[3, columns.category]
    assert fourth_row_category == "first"
def test_year(self, config, columns):
    """Resample ``get_df("Y", columns)`` with the "years" unit; check mean and dates."""
    config["time_unit"] = "years"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df = get_df("Y", columns)
    output_df = resampler.transform(df, columns.date)

    # A computed float mean can differ in its last bits depending on the
    # summation order, so compare with a tight relative tolerance instead
    # of exact equality.
    assert np.isclose(np.mean(output_df[columns.data]), 316.19, rtol=1e-9, atol=0.0)

    expected_dates = pd.DatetimeIndex([
        '1959-12-31T00:00:00.000000000',
        '1961-12-31T00:00:00.000000000',
        '1963-12-31T00:00:00.000000000',
    ])
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_empty_filling(self, df2, config, columns):
    """With the fixture's imputation method and a 12-hour step, check filled and NaN rows."""
    config["time_unit"] = "hours"
    config["time_step"] = 12

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(df2, columns.date)
    category_col = columns.category

    # Rows 0 and 6 carry a value; rows 1, 2 and 7 are left empty.
    assert resampled.loc[0, category_col] == "first"
    assert math.isnan(resampled.loc[1, category_col])
    assert math.isnan(resampled.loc[2, category_col])
    assert resampled.loc[6, category_col] == "second"
    assert math.isnan(resampled.loc[7, category_col])
def test_next_filling_long_format(self, long_df, config, columns):
    """With "next" imputation on weekly long-format data, row 4 is NaN and row 3 is "second"."""
    config["category_imputation_method"] = "next"
    config["time_unit"] = "weeks"
    config["time_step"] = 1

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(
        long_df,
        columns.date,
        groupby_columns=["country"],
    )

    fifth_row_category = resampled.loc[4, columns.category]
    assert math.isnan(fifth_row_category)
    fourth_row_category = resampled.loc[3, columns.category]
    assert fourth_row_category == "second"
def test_next_filling(self, df2, config, columns):
    """With "next" imputation and a 12-hour step, check the category of selected rows."""
    config["category_imputation_method"] = "next"
    config["time_unit"] = "hours"
    config["time_step"] = 12

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(df2, columns.date)

    # (row label, expected category), checked in the original order.
    expected_by_row = (
        (0, "first"),
        (1, "first"),
        (3, "first"),
        (5, "second"),
        (9, "third"),
    )
    for row_label, expected_category in expected_by_row:
        assert resampled.loc[row_label, columns.category] == expected_category
def test_hours_DST(self, config, columns):
    """Resample ``get_df_DST("4H", columns)`` with the "hours" unit; check mean and dates."""
    config["time_unit"] = "hours"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df_DST = get_df_DST("4H", columns)
    output_df = resampler.transform(df_DST, columns.date)

    # A computed float mean can differ in its last bits depending on the
    # summation order, so compare with a tight relative tolerance instead
    # of exact equality.
    assert np.isclose(np.mean(output_df[columns.data]), 316.33428571428567, rtol=1e-9, atol=0.0)

    expected_dates = pd.DatetimeIndex([
        '2019-01-31T01:00:00.000000000',
        '2019-01-31T03:00:00.000000000',
        '2019-01-31T05:00:00.000000000',
        '2019-01-31T07:00:00.000000000',
        '2019-01-31T09:00:00.000000000',
        '2019-01-31T11:00:00.000000000',
        '2019-01-31T13:00:00.000000000',
    ])
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_constant_value_filling(self, df2, config, columns):
    """With "constant" imputation, selected rows keep their value or receive "myvalue"."""
    config["category_imputation_method"] = "constant"
    config["category_constant_value"] = "myvalue"
    config["time_unit"] = "hours"
    config["time_step"] = 12

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(df2, columns.date)

    # (row label, expected category), checked in the original order.
    expected_by_row = (
        (0, "first"),
        (1, "myvalue"),
        (2, "myvalue"),
        (6, "second"),
        (7, "myvalue"),
    )
    for row_label, expected_category in expected_by_row:
        assert resampled.loc[row_label, columns.category] == expected_category
def test_long_format(self, long_df, params, config, datetime_column):
    """Grouping by "country" yields the same three dates twice in a row."""
    date_col = config.get('datetime_column')
    resampled = Resampler(params).transform(long_df, date_col, groupby_columns=["country"])

    # The three-date pattern is expected to repeat twice in a row.
    dates_per_group = ["1959-02-01", "1959-02-15", "1959-03-01"]
    expected_dates = pd.DatetimeIndex(dates_per_group * 2)
    np.testing.assert_array_equal(resampled[date_col].values, expected_dates)
def test_weeks_monday_end(self, config, columns):
    """Resample ``get_df("M", columns)`` in weeks ending on "MON"; check mean and dates."""
    config["time_unit"] = "weeks"
    config["time_unit_end_of_week"] = "MON"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df = get_df("M", columns)
    output_df = resampler.transform(df, columns.date)

    # A computed float mean can differ in its last bits depending on the
    # summation order, so compare with a tight relative tolerance instead
    # of exact equality.
    assert np.isclose(np.mean(output_df[columns.data]), 316.36625000000004, rtol=1e-9, atol=0.0)

    expected_dates = pd.DatetimeIndex([
        '1959-02-02T00:00:00.000000000',
        '1959-02-16T00:00:00.000000000',
        '1959-03-02T00:00:00.000000000',
        '1959-03-16T00:00:00.000000000',
        '1959-03-30T00:00:00.000000000',
        '1959-04-13T00:00:00.000000000',
        '1959-04-27T00:00:00.000000000',
        '1959-05-11T00:00:00.000000000',
    ])
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_mode_filling_long_format(self, long_df_mode, config, columns):
    """With "mode" imputation per country, country 0 is all "first" and country 1 all "fourth"."""
    config["category_imputation_method"] = "mode"
    config["time_unit"] = "weeks"
    config["time_step"] = 1

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(
        long_df_mode,
        columns.date,
        groupby_columns=["country"],
    )

    country_0_categories = resampled.loc[resampled.country == 0, columns.category].values
    assert np.all(country_0_categories == "first")
    country_1_categories = resampled.loc[resampled.country == 1, columns.category].values
    assert np.all(country_1_categories == "fourth")
def test_nanoseconds(self, config, columns):
    """Resampling ``get_df_DST("3N", columns)`` with a 1-nanosecond step yields ten timestamps."""
    config["time_unit"] = "nanoseconds"
    config["time_step"] = 1
    resampler = Resampler(get_resampling_params(config))

    source_df = get_df_DST("3N", columns)
    resampled = resampler.transform(source_df, columns.date)

    expected_dates = pd.DatetimeIndex([
        '2019-01-31T00:59:00.000000000',
        '2019-01-31T00:59:00.000000001',
        '2019-01-31T00:59:00.000000002',
        '2019-01-31T00:59:00.000000003',
        '2019-01-31T00:59:00.000000004',
        '2019-01-31T00:59:00.000000005',
        '2019-01-31T00:59:00.000000006',
        '2019-01-31T00:59:00.000000007',
        '2019-01-31T00:59:00.000000008',
        '2019-01-31T00:59:00.000000009',
    ])
    actual_dates = resampled[columns.date].values
    np.testing.assert_array_equal(actual_dates, expected_dates)
def test_bool_column(self, bool_df, config, columns):
    """With "previous" imputation on a boolean column: six ``True`` then five ``False``."""
    config["category_imputation_method"] = "previous"
    config["time_unit"] = "hours"
    config["time_step"] = 12

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(bool_df, columns.date)

    expected_flags = np.array([True] * 6 + [False] * 5)
    np.testing.assert_array_equal(resampled.categorical.values, expected_flags)
def test_seconds(self, config, columns):
    """Resample ``get_df_DST("min", columns)`` with a 30-second step; check mean and dates."""
    config["time_unit"] = "seconds"
    config["time_step"] = 30
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df_DST = get_df_DST("min", columns)
    output_df = resampler.transform(df_DST, columns.date)

    # A computed float mean can differ in its last bits depending on the
    # summation order, so compare with a tight relative tolerance instead
    # of exact equality.
    assert np.isclose(np.mean(output_df[columns.data]), 316.28999999999996, rtol=1e-9, atol=0.0)

    expected_dates = pd.DatetimeIndex([
        '2019-01-31T00:59:00.000000000',
        '2019-01-31T00:59:30.000000000',
        '2019-01-31T01:00:00.000000000',
        '2019-01-31T01:00:30.000000000',
        '2019-01-31T01:01:00.000000000',
        '2019-01-31T01:01:30.000000000',
        '2019-01-31T01:02:00.000000000',
    ])
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_previous_filling(self, df2, config, columns):
    """With "previous" imputation and a 12-hour step, check the full category column."""
    config["category_imputation_method"] = "previous"
    config["time_unit"] = "hours"
    config["time_step"] = 12

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(df2, columns.date)

    # Six 'first', four 'second', then a single 'third'.
    expected_categories = np.array(['first'] * 6 + ['second'] * 4 + ['third'])
    np.testing.assert_array_equal(resampled.categorical.values, expected_categories)
def test_days(self, config, columns):
    """Resample ``get_df("W-TUE", columns)`` with the "days" unit; check mean and dates."""
    config["time_unit"] = "days"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df = get_df("W-TUE", columns)
    output_df = resampler.transform(df, columns.date)

    # A computed float mean can differ in its last bits depending on the
    # summation order, so compare with a tight relative tolerance instead
    # of exact equality.
    assert np.isclose(np.mean(output_df[columns.data]), 316.3254545454545, rtol=1e-9, atol=0.0)

    expected_dates = pd.DatetimeIndex([
        '1959-01-06T00:00:00.000000000',
        '1959-01-08T00:00:00.000000000',
        '1959-01-10T00:00:00.000000000',
        '1959-01-12T00:00:00.000000000',
        '1959-01-14T00:00:00.000000000',
        '1959-01-16T00:00:00.000000000',
        '1959-01-18T00:00:00.000000000',
        '1959-01-20T00:00:00.000000000',
        '1959-01-22T00:00:00.000000000',
        '1959-01-24T00:00:00.000000000',
        '1959-01-26T00:00:00.000000000',
    ])
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_days_DST(self, config, columns):
    """Resample ``get_df_DST("W-WED", columns)`` with the "days" unit; check mean and dates."""
    config["time_unit"] = "days"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df_DST = get_df_DST("W-WED", columns)
    output_df = resampler.transform(df_DST, columns.date)

    # A computed float mean can differ in its last bits depending on the
    # summation order, so compare with a tight relative tolerance instead
    # of exact equality.
    assert np.isclose(np.mean(output_df[columns.data]), 316.3072727272727, rtol=1e-9, atol=0.0)

    expected_dates = pd.DatetimeIndex([
        '2019-02-05T23:00:00.000000000',
        '2019-02-07T23:00:00.000000000',
        '2019-02-09T23:00:00.000000000',
        '2019-02-11T23:00:00.000000000',
        '2019-02-13T23:00:00.000000000',
        '2019-02-15T23:00:00.000000000',
        '2019-02-17T23:00:00.000000000',
        '2019-02-19T23:00:00.000000000',
        '2019-02-21T23:00:00.000000000',
        '2019-02-23T23:00:00.000000000',
        '2019-02-25T23:00:00.000000000',
    ])
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_mix_identifiers(self, long_df_4, params, config, datetime_column):
    """Grouping ``long_df_4`` on three identifier columns yields the expected date sequence."""
    date_col = config.get('datetime_column')
    identifiers = ["country", "item", "store"]
    resampled = Resampler(params).transform(long_df_4, date_col, groupby_columns=identifiers)

    expected_dates = pd.DatetimeIndex([
        '2020-02-02T00:00:00.000000000',
        '2020-02-16T00:00:00.000000000',
        '2020-03-01T00:00:00.000000000',
        '2020-02-29T00:00:00.000000000',
        '2020-01-31T00:00:00.000000000',
        '2020-02-02T00:00:00.000000000',
        '2020-02-16T00:00:00.000000000',
        '2020-03-01T00:00:00.000000000',
        '2020-02-02T00:00:00.000000000',
        '2020-02-16T00:00:00.000000000',
        '2020-03-01T00:00:00.000000000',
    ])
    actual_dates = resampled[date_col].values
    np.testing.assert_array_equal(actual_dates, expected_dates)
def test_month(self, config, columns):
    """Resample ``get_df("Y", columns)`` with the "months" unit; check mean and dates."""
    config["time_unit"] = "months"
    params = get_resampling_params(config)
    resampler = Resampler(params)
    df = get_df("Y", columns)
    output_df = resampler.transform(df, columns.date)

    # A computed float mean can differ in its last bits depending on the
    # summation order, so compare with a tight relative tolerance instead
    # of exact equality.
    assert np.isclose(np.mean(output_df[columns.data]), 316.32550000000003, rtol=1e-9, atol=0.0)

    expected_dates = pd.DatetimeIndex([
        '1959-12-31T00:00:00.000000000',
        '1960-02-29T00:00:00.000000000',
        '1960-04-30T00:00:00.000000000',
        '1960-06-30T00:00:00.000000000',
        '1960-08-31T00:00:00.000000000',
        '1960-10-31T00:00:00.000000000',
        '1960-12-31T00:00:00.000000000',
        '1961-02-28T00:00:00.000000000',
        '1961-04-30T00:00:00.000000000',
        '1961-06-30T00:00:00.000000000',
        '1961-08-31T00:00:00.000000000',
        '1961-10-31T00:00:00.000000000',
        '1961-12-31T00:00:00.000000000',
        '1962-02-28T00:00:00.000000000',
        '1962-04-30T00:00:00.000000000',
        '1962-06-30T00:00:00.000000000',
        '1962-08-31T00:00:00.000000000',
        '1962-10-31T00:00:00.000000000',
        '1962-12-31T00:00:00.000000000',
        '1963-02-28T00:00:00.000000000',
    ])
    np.testing.assert_array_equal(output_df[columns.date].values, expected_dates)
def test_previous_filling_long_format(self, long_df, config, columns):
    """With "previous" imputation per country on weekly data, check dates and categories."""
    config["category_imputation_method"] = "previous"
    config["time_unit"] = "weeks"
    config["time_step"] = 1

    resampling_params = get_resampling_params(config)
    resampled = Resampler(resampling_params).transform(
        long_df,
        columns.date,
        groupby_columns=["country"],
    )

    # The same five weekly dates are expected twice in a row.
    weekly_dates = [
        '1959-02-01T00:00:00.000000000',
        '1959-02-08T00:00:00.000000000',
        '1959-02-15T00:00:00.000000000',
        '1959-02-22T00:00:00.000000000',
        '1959-03-01T00:00:00.000000000',
    ]
    expected_dates = pd.DatetimeIndex(weekly_dates * 2)
    np.testing.assert_array_equal(resampled[columns.date].values, expected_dates)

    expected_categorical = np.array([
        'first', 'first', 'first', 'first', 'second',
        'third', 'third', 'third', 'third', 'fourth',
    ])
    np.testing.assert_array_equal(resampled.categorical.values, expected_categorical)
from dataiku.customrecipe import get_recipe_config
from dku_timeseries import Resampler
from io_utils import get_input_output
from recipe_config_loading import (
    check_and_get_groupby_columns,
    check_time_column_parameter,
    check_python_version,
    get_resampling_params,
)

# Recipe entry point: read the input dataset, resample it and write the result.
check_python_version()

# --- Setup
input_dataset, output_dataset = get_input_output()
recipe_config = get_recipe_config()

# Column names of the input schema, used to validate the recipe parameters.
input_dataset_columns = [column["name"] for column in input_dataset.read_schema()]
check_time_column_parameter(recipe_config, input_dataset_columns)
groupby_columns = check_and_get_groupby_columns(recipe_config, input_dataset_columns)
datetime_column = recipe_config.get('datetime_column')
params = get_resampling_params(recipe_config)

# --- Run
df = input_dataset.get_dataframe()
resampler = Resampler(params)
output_df = resampler.transform(
    df,
    datetime_column,
    groupby_columns=groupby_columns,
)

# --- Write output
output_dataset.write_with_schema(output_df)