def test_monthly_causal(self, monthly_df, recipe_config):
    """Causal triangular window on a monthly series keeps all 6 rows and yields 7 columns."""
    recipe_config["causal_window"] = True
    recipe_config["window_type"] = "triang"
    aggregator = WindowAggregator(get_params(recipe_config))
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(monthly_df, date_col)
    assert result.shape == (6, 7)
def test_annual_no_causal(self, annual_df, recipe_config):
    """Bilateral (non-causal) window over a yearly series: both edge rows are NaN.

    Checks the output shape and the centered rolling sum of ``value1``.
    """
    recipe_config["window_unit"] = "years"
    params_no_causal = get_params(recipe_config)
    window_aggregator = WindowAggregator(params_no_causal)
    datetime_column = recipe_config.get('datetime_column')
    output_df = window_aggregator.compute(annual_df, datetime_column)
    assert output_df.shape == (6, 4)
    # Compare on .values (plain ndarray) for consistency with the other tests
    np.testing.assert_array_equal(
        output_df.value1_sum.values,
        np.array([np.nan, 17, 15, 11, 8, np.nan]))
def test_monthly_no_causal(self, monthly_df, recipe_config):
    """Bilateral window on a monthly series: NaN at both edges, sums in the interior."""
    aggregator = WindowAggregator(get_params(recipe_config))
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(monthly_df, date_col)
    assert result.shape == (6, 7)
    expected_sums = np.array([np.nan, 17, 15, 11, 8, np.nan])
    np.testing.assert_array_equal(result.value1_sum.values, expected_sums)
def test_weekly_no_causal(self, weekly_df, recipe_config):
    """Bilateral window on a weekly series: three NaN rows at each edge of the sum column."""
    aggregator = WindowAggregator(get_params(recipe_config))
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(weekly_df, date_col)
    assert result.shape == (9, 4)
    expected_sums = np.array(
        [np.nan, np.nan, np.nan, 27, 26, 21, np.nan, np.nan, np.nan])
    np.testing.assert_array_equal(result.value1_sum.values, expected_sums)
def test_long_format_numerical(self, long_df_numerical, params, recipe_config, columns):
    """A numeric groupby column is preserved and rows stay grouped after windowing."""
    aggregator = WindowAggregator(params)
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(long_df_numerical, date_col, groupby_columns=["country"])
    np.testing.assert_array_equal(result.country.values,
                                  np.array([1, 1, 1, 1, 2, 2, 2, 2]))
def test_annual_causal(self, annual_df, recipe_config):
    """Causal triangular window over a yearly series: leading NaNs until the window fills.

    Checks output shape first (fails fast on a malformed frame), then the
    rolling average of ``value1``.
    """
    recipe_config["causal_window"] = True
    recipe_config["window_type"] = "triang"
    recipe_config["window_unit"] = "years"
    params_causal = get_params(recipe_config)
    window_aggregator = WindowAggregator(params_causal)
    datetime_column = recipe_config.get('datetime_column')
    output_df = window_aggregator.compute(annual_df, datetime_column)
    # Shape first, consistent with the non-causal tests
    assert output_df.shape == (6, 4)
    # Compare on .values (plain ndarray) for consistency with the other tests
    np.testing.assert_array_equal(
        output_df.value1_avg.values,
        np.array([np.nan, np.nan, np.nan, 6.5, 4.75, 3.25]))
def test_empty_identifiers(self, df, params, recipe_config, columns):
    """Empty list, omitted, and None groupby_columns all behave as 'no grouping'."""
    aggregator = WindowAggregator(params)
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(df, date_col, groupby_columns=[])
    assert result.shape == (4, 5)
    result = aggregator.compute(df, date_col)
    assert result.shape == (4, 5)
    result = aggregator.compute(df, date_col, groupby_columns=None)
    assert result.shape == (4, 5)
def test_weekly_causal(self, weekly_df, recipe_config):
    """Causal triangular window over a weekly series: seven leading NaNs, then weighted sums.

    Checks output shape first, then the rolling sum of ``value1``.
    """
    recipe_config["causal_window"] = True
    recipe_config["window_type"] = "triang"
    params_causal = get_params(recipe_config)
    window_aggregator = WindowAggregator(params_causal)
    datetime_column = recipe_config.get('datetime_column')
    output_df = window_aggregator.compute(weekly_df, datetime_column)
    # Shape first, consistent with the non-causal tests
    assert output_df.shape == (9, 4)
    # Compare on .values (plain ndarray) for consistency with the other tests
    np.testing.assert_array_equal(
        output_df.value1_sum.values,
        np.array([
            np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 15.25,
            13.25
        ]))
def test_mix_identifiers(self, long_df_4, params, recipe_config, columns):
    """Three groupby columns: the datetime column keeps per-group ordering in the output."""
    aggregator = WindowAggregator(params)
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(long_df_4, date_col,
                                groupby_columns=["country", "item", "store"])
    expected_dates = pd.DatetimeIndex([
        '2020-01-31T00:00:00.000000000', '2020-02-29T00:00:00.000000000',
        '2020-02-29T00:00:00.000000000', '2020-01-31T00:00:00.000000000',
        '2020-01-31T00:00:00.000000000', '2020-02-29T00:00:00.000000000',
        '2020-01-31T00:00:00.000000000', '2020-02-29T00:00:00.000000000'
    ])
    np.testing.assert_array_equal(result[date_col].values, expected_dates)
def test_weeks(self, recipe_config, columns):
    """Weekly window unit on a DST-spanning weekly frame: dates pass through unchanged."""
    recipe_config["window_unit"] = "weeks"
    aggregator = WindowAggregator(get_params(recipe_config))
    frame = get_df_DST("W", columns)
    result = aggregator.compute(frame, columns.date)
    assert result.shape == (6, 7)
    expected_dates = pd.DatetimeIndex([
        '2019-02-03T00:59:00.000000000', '2019-02-10T00:59:00.000000000',
        '2019-02-17T00:59:00.000000000', '2019-02-24T00:59:00.000000000',
        '2019-03-03T00:59:00.000000000', '2019-03-10T00:59:00.000000000'
    ])
    np.testing.assert_array_equal(result[columns.date].values, expected_dates)
def test_nanoseconds(self, recipe_config, columns):
    """Nanosecond window unit: nanosecond-spaced timestamps pass through unchanged."""
    recipe_config["window_unit"] = "nanoseconds"
    aggregator = WindowAggregator(get_params(recipe_config))
    frame = get_df_DST("N", columns)
    result = aggregator.compute(frame, columns.date)
    assert result.shape == (6, 7)
    expected_dates = pd.DatetimeIndex([
        '2019-01-31T00:59:00.000000000', '2019-01-31T00:59:00.000000001',
        '2019-01-31T00:59:00.000000002', '2019-01-31T00:59:00.000000003',
        '2019-01-31T00:59:00.000000004', '2019-01-31T00:59:00.000000005'
    ])
    np.testing.assert_array_equal(result[columns.date].values, expected_dates)
def test_two_identifiers(self, long_df_2, params, recipe_config, columns):
    """Two groupby columns: dates repeat per (country, item) group in the output."""
    aggregator = WindowAggregator(params)
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(long_df_2, date_col,
                                groupby_columns=["country", "item"])
    expected_dates = pd.DatetimeIndex([
        '1959-01-31T00:00:00.000000000', '1959-02-28T00:00:00.000000000',
        '1959-01-31T00:00:00.000000000', '1959-02-28T00:00:00.000000000',
        '1959-01-31T00:00:00.000000000', '1959-02-28T00:00:00.000000000'
    ])
    np.testing.assert_array_equal(result[date_col].values, expected_dates)
def test_year_start(self, annual_start_df, recipe_config):
    """Year-start-anchored annual series keeps its January-1st dates after windowing."""
    recipe_config["window_unit"] = "years"
    aggregator = WindowAggregator(get_params(recipe_config))
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(annual_start_df, date_col)
    assert result.shape == (6, 4)
    expected_dates = pd.DatetimeIndex([
        '2015-01-01T00:00:00.000000000', '2016-01-01T00:00:00.000000000',
        '2017-01-01T00:00:00.000000000', '2018-01-01T00:00:00.000000000',
        '2019-01-01T00:00:00.000000000', '2020-01-01T00:00:00.000000000'
    ])
    np.testing.assert_array_equal(result.Date.values, expected_dates)
def test_long_format(self, long_df, params, recipe_config, columns):
    """Long-format windowing: per-group aggregates are computed and the group labels kept."""
    aggregator = WindowAggregator(params)
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(long_df, date_col,
                                groupby_columns=[columns.category])
    expected_agg = np.array(
        [np.nan, 315.58, 315.98, 316.25, np.nan, 345., 289.5, 226.33])
    np.testing.assert_array_equal(
        np.round(result[columns.aggregation].values, 2), expected_agg)
    expected_groups = np.array([
        'first', 'first', 'first', 'first', 'second', 'second', 'second',
        'second'
    ])
    np.testing.assert_array_equal(result.country.values, expected_groups)
def test_month_start(self, monthly_start_df, recipe_config):
    """Month-start-anchored series: width-1 window with average+retrieve keeps the dates."""
    recipe_config["window_width"] = 1
    recipe_config["aggregation_types"] = ['average', 'retrieve']
    aggregator = WindowAggregator(get_params(recipe_config))
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(monthly_start_df, date_col)
    assert result.shape == (6, 5)
    expected_dates = pd.DatetimeIndex([
        '2015-01-01T00:00:00.000000000', '2015-02-01T00:00:00.000000000',
        '2015-03-01T00:00:00.000000000', '2015-04-01T00:00:00.000000000',
        '2015-05-01T00:00:00.000000000', '2015-06-01T00:00:00.000000000'
    ])
    np.testing.assert_array_equal(result.Date.values, expected_dates)
def test_long_format_no_causal(self, long_df, params_no_causal, recipe_config, columns):
    """Long-format bilateral windowing: per-group NaN edges and preserved group labels."""
    aggregator = WindowAggregator(params_no_causal)
    date_col = recipe_config.get('datetime_column')
    result = aggregator.compute(long_df, date_col,
                                groupby_columns=["country"])
    expected_agg = np.array(
        [np.nan, 316.25, 316.46, np.nan, np.nan, 226.33, 211., np.nan])
    np.testing.assert_array_equal(
        np.round(result[columns.aggregation].values, 2), expected_agg)
    expected_groups = np.array([
        'first', 'first', 'first', 'first', 'second', 'second', 'second',
        'second'
    ])
    np.testing.assert_array_equal(result.country.values, expected_groups)
def test_invalid_frequencies(self, annual_df, recipe_config):
    """A window narrower than the series frequency must raise — unless window_type is 'none'.

    Three cases on the same annual frame:
    1. default (non-causal) params -> raises;
    2. causal triangular window    -> raises;
    3. window_type 'none'          -> succeeds with an all-NaN sum column.
    """
    params_no_causal = get_params(recipe_config)
    window_aggregator = WindowAggregator(params_no_causal)
    datetime_column = recipe_config.get('datetime_column')
    with pytest.raises(Exception) as err:
        _ = window_aggregator.compute(annual_df, datetime_column)
    assert "smaller than the timeseries frequency" in str(err.value)
    recipe_config["causal_window"] = True
    recipe_config["window_type"] = "triang"
    params_causal = get_params(recipe_config)
    window_aggregator = WindowAggregator(params_causal)
    with pytest.raises(Exception) as err:
        _ = window_aggregator.compute(annual_df, datetime_column)
    assert "smaller than the timeseries frequency" in str(err.value)
    recipe_config["window_type"] = "none"
    params_causal = get_params(recipe_config)
    window_aggregator = WindowAggregator(params_causal)
    output_df = window_aggregator.compute(annual_df, datetime_column)
    # Compare on .values (plain ndarray) for consistency with the other tests
    np.testing.assert_array_equal(output_df.value1_sum.values,
                                  np.nan * np.ones(6))
# Windowing recipe entry point: read the input dataset, apply the configured
# rolling-window aggregation, and write the result with its schema.
# NOTE(review): `logging`, `get_input_output`, `get_recipe_config` and
# `WindowAggregator` are used below but not imported in this chunk — presumably
# imported earlier in the file; confirm against the full source.
from recipe_config_loading import check_time_column_parameter, check_and_get_groupby_columns, check_python_version, get_windowing_params

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='timeseries-preparation plugin %(levelname)s - %(message)s')
# Fails fast if the running Python version is unsupported by the plugin
check_python_version()

# --- Setup
(input_dataset, output_dataset) = get_input_output()
recipe_config = get_recipe_config()
input_dataset_columns = [column["name"] for column in input_dataset.read_schema()]
# Validate the configured datetime column against the input schema before loading data
check_time_column_parameter(recipe_config, input_dataset_columns)
datetime_column = recipe_config.get('datetime_column')
groupby_columns = check_and_get_groupby_columns(recipe_config, input_dataset_columns)
params = get_windowing_params(recipe_config)

# --- Run
df = input_dataset.get_dataframe()
window_aggregator = WindowAggregator(params)
output_df = window_aggregator.compute(df, datetime_column, groupby_columns=groupby_columns)

# --- Write output
output_dataset.write_with_schema(output_df)