def test_grouped_returns_numpy(pipeline, groupby, is_estimator, input_df):
    """Check that the grouped pipeline output is exactly a ``np.ndarray``.

    ``fit_predict`` is used when ``is_estimator`` is truthy, otherwise
    ``fit_transform``. The check is on the exact type, so ndarray
    subclasses are rejected as well.
    """
    grouped = GroupedPipeline(groupby=groupby, pipeline=pipeline)
    result = (
        grouped.fit_predict(input_df)
        if is_estimator
        else grouped.fit_transform(input_df)
    )
    assert type(result) is np.ndarray
def test_grouped_order(pipeline, groupby, is_estimator, input_df):
    """Check that the grouped output equals the target column of the input.

    For estimators the output of ``fit_predict`` is compared with the target
    column taken as a Series; for transformers the output of
    ``fit_transform`` is compared with the target column taken as a
    one-column frame.
    """
    grouped = GroupedPipeline(groupby=groupby, pipeline=pipeline)
    target_col = ini.Columns.target
    if is_estimator:
        result = grouped.fit_predict(input_df)
        reference = input_df[target_col]
    else:
        result = grouped.fit_transform(input_df)
        # Double brackets keep the reference two-dimensional.
        reference = input_df[[target_col]]
    np.testing.assert_array_equal(result.values, reference.values)
def test_iter_groups_non_consecutive_index(index):
    """Check that ``_iter_groups`` yields frames containing only their own key.

    The frame is built on the supplied ``index`` (presumably a
    non-consecutive one, per the test name -- confirm against the fixture):
    the first two rows belong to group 1 and the remaining rows to group 2.
    """
    n_rows = len(index)
    group_labels = [1, 1] + [2] * (n_rows - 2)
    random_values = np.random.random(n_rows)
    # Built row-wise and then transposed, so ``index`` ends up as the row index.
    row_wise = pd.DataFrame(
        [group_labels, random_values],
        index=["group", "value"],
        columns=index,
    )
    frame = row_wise.T
    grouped = GroupedPipeline(groupby=["group"], pipeline=None)
    for group_key, group_frame, _ in grouped._iter_groups(frame):
        belongs_to_key = group_frame["group"] == group_key
        assert belongs_to_key.all()
def test_grouped_with_y(pipeline, groupby, y, input_df, y_mode):
    """Check ``fit_predict`` with an explicit ``y`` in several forms.

    ``y_mode`` selects how the target is passed: ``'series'`` passes the
    column ``input_df[y]``, ``'array'`` passes its ``.values``, and any other
    mode passes ``y`` through unchanged. The prediction must be exactly a
    ``np.ndarray`` with one entry per input row.
    """
    target = y
    if y_mode == 'array':
        target = input_df[y].values
    elif y_mode == 'series':
        target = input_df[y]

    grouped = GroupedPipeline(groupby=groupby, pipeline=pipeline)
    predictions = grouped.fit_predict(input_df, target)

    assert type(predictions) is np.ndarray
    assert len(predictions) == len(input_df)
def test_one_group_missing_return_none(input_df):
    """Check ``errors='return_empty'`` when the transform data has extra ids.

    The first 96 output rows must equal the target column of the transform
    data and every row after that must be NaN.
    NOTE(review): the 96-row split presumably corresponds to the rows of the
    ids present in ``input_df`` -- confirm against ``mock.mock_raw_data``.
    """
    n_known_rows = 96
    transform_df = mock.mock_raw_data(ids=[0, 1, 2])
    grouped = GroupedPipeline(
        groupby=['id'],
        pipeline=Pipeline([val_selector]),
        errors='return_empty',
    )
    grouped.fit(input_df)
    result = grouped.transform(transform_df)

    assert result.shape[1] == 1
    expected_known = transform_df[ini.Columns.target].values[:n_known_rows]
    np.testing.assert_array_equal(result[:n_known_rows, 0], expected_known)
    remainder = result[n_known_rows:]
    assert np.isnan(remainder).all()
def test_grouped_values(mocker, pipeline, groupby, is_estimator, input_df):
    """Check that, per group, the output is the target shifted by one row.

    The pipeline output is stored in a new ``'out'`` column of ``input_df``;
    within each group of ``groupby`` that column must equal the target
    column shifted by one position.
    """
    grouped = GroupedPipeline(groupby=groupby, pipeline=pipeline)
    result = (
        grouped.fit_predict(input_df)
        if is_estimator
        else grouped.fit_transform(input_df)
    )
    input_df['out'] = result

    for _, group_frame in input_df.groupby(groupby):
        shifted_target = group_frame[ini.Columns.target].shift(1)
        np.testing.assert_array_equal(
            shifted_target.values,
            group_frame['out'].values,
        )
def test_one_groups_missing_return_df(input_df):
    """Check ``errors='return_df'`` when the transform data has an unseen id.

    The transformed frame must contain the original columns plus ``'month'``.
    ``'month'`` must be filled for ids 0 and 1 and null for id 2, and the
    original columns must be returned unchanged.
    NOTE(review): this presumes ``input_df`` contains ids 0 and 1 but not 2
    -- confirm against the fixture.
    """
    transform_df = mock.mock_raw_data(ids=[0, 1, 2])
    dt_feat = PandasDateTimeFeaturizer(attributes='month')
    gp = GroupedPipeline(groupby=['id'], pipeline=dt_feat, errors='return_df')
    gp.fit(input_df)
    out = gp.transform(transform_df)

    # The original compared the column sets without ``assert``, so the
    # comparison result was discarded and the check never ran.
    assert set(out.columns) == {
        'id', ini.Columns.datetime, ini.Columns.target, 'month'
    }

    # Groups seen during fit get a month value; the unseen group stays null.
    assert out.loc[out['id'] == 0, 'month'].notnull().all()
    assert out.loc[out['id'] == 1, 'month'].notnull().all()
    assert out.loc[out['id'] == 2, 'month'].isnull().all()

    orig_cols = ['id', ini.Columns.datetime, ini.Columns.target]
    pd.testing.assert_frame_equal(out[orig_cols], transform_df)
def test_all_groups_missing_raises(input_df, errors):
    """Check that ``transform`` raises ``KeyError`` for every ``errors`` mode.

    The transform data only contains ids 2 and 3.
    NOTE(review): presumably none of these ids are present in ``input_df``
    -- confirm against the fixture.
    """
    transform_df = mock.mock_raw_data(ids=[2, 3])
    gp = GroupedPipeline(
        groupby=['id'],
        pipeline=Pipeline([col_selector]),
        errors=errors,
    )
    gp.fit(input_df)
    with pytest.raises(KeyError):
        gp.transform(transform_df)
        # ``pytest.raises(..., message=...)`` was removed in pytest 5.0;
        # an explicit ``pytest.fail`` keeps the same failure message when no
        # exception is raised.
        pytest.fail('All keys missing in fitted pipelines')
test_pipeline1 = Pipeline([('select', PandasDateTimeFeaturizer(column='datetime', attributes='hour')), ('select2', PandasValueSelector(['hour', 'id']))]) test_pipeline2 = Pipeline([('select', PandasDateTimeFeaturizer(column='datetime', attributes='hour')), ('select2', PandasValueSelector(['id']))]) test_pipeline3 = Pipeline([('select', PandasDateTimeFeaturizer(column='datetime', attributes='hour')), ('select2', PandasColumnSelector(['hour', 'id']))]) test_grouped_pipeline1 = GroupedPipeline(groupby='team', pipeline=test_pipeline1) test_grouped_pipeline1_multiple = GroupedPipeline(groupby=['week', 'team'], pipeline=test_pipeline1) test_grouped_pipeline2 = GroupedPipeline(groupby='team', pipeline=test_pipeline2) test_grouped_pipeline3 = GroupedPipeline(groupby='team', pipeline=test_pipeline3) # feature union pipeline1 = Pipeline([('featurize', PandasDateTimeFeaturizer(column='datetime', attributes='hour')), ('select', PandasColumnSelector(columns=['hour']))]) pipeline2 = Pipeline([('select2', PandasColumnSelector('id'))]) pipeline3 = Pipeline([('select3', PandasValueSelector('id'))]) pipeline4 = Pipeline([('featurize',
def test_grouped_returns_pandas(pipeline, groupby, input_df):
    """Check that ``fit_transform`` returns exactly a ``pd.DataFrame``.

    The check is on the exact type, so DataFrame subclasses are rejected.
    """
    grouped = GroupedPipeline(groupby=groupby, pipeline=pipeline)
    result_type = type(grouped.fit_transform(input_df))
    assert result_type is pd.DataFrame
def test_raises_when_missing_key(input_df):
    """Check that ``transform`` raises ``KeyError`` with default error handling.

    The transform data contains ids 0, 1 and 2.
    NOTE(review): presumably id 2 is absent from ``input_df`` -- confirm
    against the fixture.
    """
    transform_df = mock.mock_raw_data(ids=[0, 1, 2])
    gp = GroupedPipeline(groupby=['id'], pipeline=Pipeline([col_selector]))
    gp.fit(input_df)
    with pytest.raises(KeyError):
        gp.transform(transform_df)
        # ``pytest.raises(..., message=...)`` was removed in pytest 5.0;
        # an explicit ``pytest.fail`` keeps the same failure message when no
        # exception is raised.
        pytest.fail("Missing key 2 in fitted pipelines")