class NewFeatureTransformer(BaseEstimator, TransformerMixin):
    """Derive loan-affordability features from a loans DataFrame.

    Caps the ``annual_inc`` column with a :class:`ColumnCapper` (fitted in
    :meth:`fit`) and adds three derived columns in :meth:`transform`:

    - ``total_loan_amount``: installment compounded by the interest rate over
      the loan term (``term`` presumably in months — TODO confirm with caller).
    - ``installment_to_income``: yearly installments as a percentage of
      annual income.
    - ``total_loan_to_income``: total loan amount as a percentage of income
      earned over the loan term.

    Parameters
    ----------
    quantile_range : tuple or list, optional
        Forwarded to ``ColumnCapper``. When ``None``, the capper's default
        quantile range is used.

    NOTE(review): building ``self.cc`` in ``__init__`` instead of storing
    ``quantile_range`` deviates from the scikit-learn estimator convention
    (``get_params``/``clone`` will not see the hyper-parameter) — kept as-is
    to preserve the existing interface.
    """

    def __init__(self, quantile_range=None):
        super().__init__()
        # Fix: compare against None explicitly. The previous truthiness test
        # (`if not quantile_range:`) silently fell back to the default for
        # falsy-but-provided values such as an empty tuple instead of letting
        # ColumnCapper validate/reject them.
        if quantile_range is None:
            self.cc = ColumnCapper()
        else:
            self.cc = ColumnCapper(quantile_range=quantile_range)

    def fit(self, X, y=None):
        """Fit the internal capper on the ``annual_inc`` column of ``X``."""
        self.cc.fit(X["annual_inc"].values.reshape(-1, 1))
        return self

    def transform(self, X):
        """Return a copy of ``X`` with capped income and derived columns."""

        def total_loan_amount(df):
            # Installment compounded over the term (term/12 years).
            return df["installment"] * (1 + df["int_rate"] / 100) ** (df["term"] / 12)

        def installment_to_income(df):
            # Yearly installments as a percentage of annual income.
            return (df["installment"] * 12) / df["annual_inc"] * 100

        def total_loan_to_income(df):
            # Total loan as a percentage of income earned over the term.
            return df["total_loan_amount"] / (df["annual_inc"] * (df["term"] / 12)) * 100

        X = X.assign(
            annual_inc=lambda df: self.cc.transform(
                df["annual_inc"].values.reshape(-1, 1)
            ),
            total_loan_amount=total_loan_amount,
            installment_to_income=installment_to_income,
            total_loan_to_income=total_loan_to_income,
        )
        return X
def test_X_types_and_transformed_shapes(valid_df):
    """ColumnCapper rejects invalid inputs and preserves the shape of valid ones."""

    def assert_raises_value_error(fit_data, transform_data=None):
        # Fit on one input, transform another (or the same) — must raise.
        transform_data = fit_data if transform_data is None else transform_data
        with pytest.raises(ValueError):
            ColumnCapper().fit(fit_data).transform(transform_data)

    # Fitted and transformed arrays must have the same number of columns.
    assert_raises_value_error(valid_df, valid_df[["a", "b"]])

    bad_frames = [
        pd.DataFrame({"a": [np.nan] * 3, "b": [11, 12, 13]}),
        pd.DataFrame({"a": [np.inf] * 3, "b": [11, 12, 13]}),
    ]
    for frame in bad_frames:
        assert_raises_value_error(frame)       # contains an invalid column ('a')
        assert_raises_value_error(frame["b"])  # 1d arrays must be reshaped first
        # Reshaped 2d views of the same valid data are accepted:
        ColumnCapper().fit_transform(frame["b"].values.reshape(-1, 1))
        ColumnCapper().fit_transform(frame["b"].values.reshape(1, -1))

    capper = ColumnCapper()
    for data in (valid_df, valid_df.values):
        assert capper.fit_transform(data).shape == data.shape
def test_interpolation():
    """Only the five numpy quantile-interpolation modes are accepted."""
    accepted = ["linear", "lower", "higher", "midpoint", "nearest"]
    rejected = ["test", 42, None, [], {}, set(), 0.42]

    # Valid modes construct without error.
    for mode in accepted:
        ColumnCapper(interpolation=mode)

    # Anything else is rejected with a ValueError.
    for mode in rejected:
        with pytest.raises(ValueError):
            ColumnCapper(interpolation=mode)
def test_interpolation():
    """Valid interpolation strategies construct; invalid ones raise ValueError.

    NOTE(review): this re-defines ``test_interpolation`` from earlier in the
    module; only the last definition is collected by pytest — confirm whether
    one copy should be removed or renamed.
    """
    cases = (
        ("linear", True),
        ("lower", True),
        ("higher", True),
        ("midpoint", True),
        ("nearest", True),
        ("test", False),
        (42, False),
        (None, False),
        ([], False),
        ({}, False),
        (set(), False),
        (0.42, False),
    )
    for candidate, should_construct in cases:
        if should_construct:
            ColumnCapper(interpolation=candidate)
        else:
            with pytest.raises(ValueError):
                ColumnCapper(interpolation=candidate)
def test_nan_inf(valid_df):
    """Infs never survive capping; NaN count depends on discard_infs."""
    # discard_infs=False: infs are capped in place, the single NaN remains.
    keep_capper = ColumnCapper(discard_infs=False)
    assert not (keep_capper.fit_transform(valid_df) == np.inf).any().any()
    assert np.isnan(keep_capper.fit_transform(valid_df)).sum() == 1

    # discard_infs=True: the inf becomes NaN, so one extra NaN appears.
    drop_capper = ColumnCapper(discard_infs=True)
    assert not (drop_capper.fit_transform(valid_df) == np.inf).any().any()
    assert np.isnan(drop_capper.fit_transform(valid_df)).sum() == 2
def expect_value_error(X, X_transform=None):
    """Assert that fitting on ``X`` then transforming raises ValueError.

    When ``X_transform`` is omitted, ``X`` itself is transformed.
    """
    target = X if X_transform is None else X_transform
    with pytest.raises(ValueError):
        ColumnCapper().fit(X).transform(target)
def expect_value_error(quantile_range):
    """Assert that this quantile range makes ColumnCapper raise ValueError."""
    with pytest.raises(ValueError):
        ColumnCapper(quantile_range)
def expect_type_error(quantile_range):
    """Assert that this quantile range makes ColumnCapper raise TypeError."""
    with pytest.raises(TypeError):
        ColumnCapper(quantile_range)
def test_estimator_checks(test_fn):
    """Run the shared scikit-learn estimator checks against ColumnCapper."""
    estimator = ColumnCapper()
    test_fn(ColumnCapper.__name__, estimator)
def test_dtype_classification(random_xy_dataset_clf):
    """Capped classification features keep a floating-point dtype."""
    X, y = random_xy_dataset_clf
    transformed = ColumnCapper().fit(X, y).transform(X)
    assert transformed.dtype in FLOAT_DTYPES
def test_dtype_regression(random_xy_dataset_regr):
    """Capped regression features keep a floating-point dtype."""
    X, y = random_xy_dataset_regr
    transformed = ColumnCapper().fit(X, y).transform(X)
    assert transformed.dtype in FLOAT_DTYPES
def __init__(self, quantile_range=None):
    """Create the transformer's internal ColumnCapper.

    Parameters
    ----------
    quantile_range : tuple or list, optional
        Forwarded to ``ColumnCapper``; ``None`` selects its default range.
    """
    super().__init__()
    # Fix: test for None explicitly. The previous truthiness test
    # (`if not quantile_range:`) also swallowed falsy-but-provided values
    # (e.g. an empty tuple) instead of letting ColumnCapper validate them.
    if quantile_range is None:
        self.cc = ColumnCapper()
    else:
        self.cc = ColumnCapper(quantile_range=quantile_range)