def test_grouped_function_transformer():
    """Exercise GroupedFunctionTransformer with and without groupby kwargs.

    With ``groupby_kwargs`` the transformed output is compared, as a frame,
    against ``df.groupby(...).apply(func)``. Without them it is compared, as
    a series, against ``df.pipe(func)``.
    """
    records = {
        'country': ['USA', 'USA', 'USA', 'Canada', 'Fiji'],
        'year': [2001, 2002, 2003, 2001, 2001],
        'length': [1, 2, 3, 4, 5],
        'width': [1.0, 1.0, 7.5, 9.0, 11.0],
    }
    df = pd.DataFrame(data=records)
    df = df.set_index(['country', 'year']).sort_index()

    # Grouped case: the result is expected to be a DataFrame.
    grouped = ballet.eng.GroupedFunctionTransformer(
        np.sum, groupby_kwargs={'level': 'country'})
    grouped.fit(df)
    assert_frame_equal(
        grouped.transform(df),
        df.groupby(level='country').apply(np.sum),
    )

    # Ungrouped case: the result is expected to be a Series.
    ungrouped = ballet.eng.GroupedFunctionTransformer(np.min)
    ungrouped.fit(df)
    assert_series_equal(ungrouped.transform(df), df.pipe(np.min))
def test_groupwise_transformer_ignore_on_transform_error(sample_data):
    """With ``handle_error='ignore'``, a failing transform returns its input.

    The wrapped transformer always raises from ``transform``; the test
    asserts that the groupwise output equals the input frame for both the
    training and the test split.
    """
    X_tr, X_te = sample_data

    class TransformErrorTransformer(ballet.eng.BaseTransformer):
        """Transformer whose ``transform`` unconditionally raises."""

        def transform(self, X, **transform_kwargs):
            raise Exception

    transformer = ballet.eng.GroupwiseTransformer(
        TransformErrorTransformer(),
        groupby_kwargs={'level': 'name'},
        handle_error='ignore',
    )
    transformer.fit(X_tr)

    # Same check for each split: output must equal the untouched input.
    for frame in (X_tr, X_te):
        assert_frame_equal(transformer.transform(frame), frame)
def test_groupwise_transformer_ignore_on_new_group(sample_data,
                                                   individual_transformer,
                                                   groupby_kwargs):
    """With ``handle_unknown='ignore'``, transforming an unseen group works.

    The first row of the test split is relabelled to a group name ('Z') that
    was not present when fitting, and the transformed output is compared to
    a hand-written expected frame.
    """
    X_tr, X_te = sample_data

    transformer = ballet.eng.GroupwiseTransformer(
        individual_transformer,
        groupby_kwargs=groupby_kwargs,
        handle_unknown='ignore',
    )
    transformer.fit(X_tr)

    # Relabel row 0 so that it belongs to a group unseen during fit.
    flat = X_te.copy().reset_index()
    flat.loc[0, 'name'] = 'Z'
    X_te = flat.set_index(['name', 'year'])

    actual = transformer.transform(X_te)

    # The first row belongs to the new group 'Z'; its expected 'value' is
    # NaN, while its expected 'size' is 4.0.
    expected = X_te.assign(**{
        'value': np.array([np.nan, 1.5, 5.0]),
        'size': np.array([4.0, 1.0, 4.0]),
    })
    assert_frame_equal(actual, expected)
def test_single_lagger():
    """Check SingleLagger on a plain series and on a grouped frame.

    First, lagging a time series by 1 must equal ``Series.shift(1)``.
    Second, lagging a frame with ``groupby_kwargs={'level': 'city'}`` must
    shift ``width`` within each city, leaving NaN in each city's first row.
    """
    # Simple case. The series is built explicitly instead of calling
    # ``pd.util.testing.makeTimeSeries()``: that module was deprecated in
    # pandas 1.0 and removed in pandas 2.0, and it produced random values.
    data = pd.Series(
        np.arange(30, dtype=float),
        index=pd.date_range('2000-01-01', periods=30, freq='B'),
    )
    trans = ballet.eng.ts.SingleLagger(1)
    result = trans.fit_transform(data)
    expected_result = data.shift(1)
    assert_series_equal(result, expected_result)

    def make_frame(width):
        # Input and expected frames share index columns; only 'width' varies.
        return pd.DataFrame(
            data={
                'city': ['LA', 'LA', 'LA', 'NYC', 'BOS', 'BOS', 'BOS'],
                'year': [2001, 2002, 2003, 2002, 2003, 2004, 2005],
                'width': width,
            }).set_index(['city', 'year']).sort_index()

    # Grouped case: lag within each city.
    data = make_frame([1, 2, 3, 4, 5, 6, 7])
    trans = ballet.eng.ts.SingleLagger(1, groupby_kwargs={'level': 'city'})
    result = trans.fit_transform(data)
    expected_result = make_frame([np.nan, 1, 2, np.nan, np.nan, 5, 6])
    assert_frame_equal(result, expected_result)
def test_subset_transformer_identity(sample_data):
    """Passing a column through unchanged leaves the whole frame unchanged.

    A SubsetTransformer is built for the 'value' column with ``None`` as its
    second argument; the output must equal the input for both splits.
    """
    X_tr, X_te = sample_data
    passthrough = ballet.eng.SubsetTransformer('value', None)

    out_tr = passthrough.fit_transform(X_tr)
    out_te = passthrough.transform(X_te)

    for actual, original in ((out_tr, X_tr), (out_te, X_te)):
        assert_frame_equal(actual, original)
def test_groupwise_transformer_can_transform(sample_data,
                                             groupwise_transformer):
    """A fitted groupwise transformer yields the expected frames.

    For each split the expected frame is the input with 'value' replaced by
    hand-written numbers and the 'size' column dropped.
    """
    X_tr, X_te = sample_data
    groupwise_transformer.fit(X_tr)

    # Training split.
    actual_tr = groupwise_transformer.transform(X_tr)
    wanted_tr = (
        X_tr
        .assign(value=np.array([1, 2, 1.5, 4, 4, 5, 5]))
        .drop('size', axis=1)
    )
    assert_frame_equal(actual_tr, wanted_tr)

    # Test split.
    actual_te = groupwise_transformer.transform(X_te)
    wanted_te = (
        X_te
        .assign(value=np.array([1.5, 1.5, 5]))
        .drop('size', axis=1)
    )
    assert_frame_equal(actual_te, wanted_te)
def test_subset_transformer_mutate(sample_data):
    """Modifying one column changes it and leaves the complement untouched.

    The 'size' column is passed through ``lambda x: x + 1``; every other
    column must come out equal to the input, for both splits.
    """
    X_tr, X_te = sample_data
    target = 'size'
    mutator = ballet.eng.SubsetTransformer(target, lambda x: x + 1)

    out_tr = mutator.fit_transform(X_tr)
    out_te = mutator.transform(X_te)

    # The targeted column differs from the input.
    assert_series_not_equal(out_tr[target], X_tr[target])
    assert_series_not_equal(out_te[target], X_te[target])

    # All remaining columns are unchanged.
    others = [name for name in X_tr.columns if name != target]
    assert_frame_equal(out_tr[others], X_tr[others])
    assert_frame_equal(out_te[others], X_te[others])
def test_assert_frame_equal():
    """assert_frame_equal accepts a copy and rejects several mismatches.

    The rejected cases are: shifted values, a frame of different dimensions,
    a time series in place of a frame, and a Series compared to a DataFrame.
    """
    def expect_mismatch(left, right):
        # The comparison must raise AssertionError for these operands.
        with pytest.raises(AssertionError):
            assert_frame_equal(left, right)

    base = pdt.makeCustomDataframe(
        10, 7, data_gen_f=lambda row, col: row * col)

    # A copy compares equal.
    duplicate = base.copy()
    assert_frame_equal(base, duplicate)

    # Same labels, different values.
    shifted = base + 1
    expect_mismatch(base, shifted)

    # Different dimensions.
    bigger = pdt.makeCustomDataframe(11, 9)
    expect_mismatch(base, bigger)

    # A time series rather than a frame.
    series = pdt.makeTimeSeries()
    expect_mismatch(base, series)

    # Same numbers, but DataFrame versus Series.
    as_frame = pd.DataFrame([1, 2, 3, 4])
    as_series = pd.Series([1, 2, 3, 4])
    expect_mismatch(as_frame, as_series)