def check_raises_error_when_input_not_a_df(estimator):
    """
    Check that fit() and transform() raise TypeError for non-dataframe input.

    A string, a plain list and a pandas Series are each passed to fit() and,
    after fitting on a valid dataset, to transform(). Every one of those calls
    is expected to raise TypeError. Functionality is provided by
    `is_dataframe`.
    """
    # valid data, needed so that transform() can be exercised on a fitted
    # transformer
    X, y = test_df(categorical=True, datetime=True)

    invalid_inputs = (
        "not_a_df",
        [1, 2, 3, "some_data"],
        pd.Series([-2, 1.5, 8.94], name="not_a_df"),
    )

    transformer = clone(estimator)

    for bad_input in invalid_inputs:
        # fit() must reject the input
        with pytest.raises(TypeError):
            transformer.fit(bad_input)

        # transform() must reject the input once the transformer is fitted
        transformer.fit(X, y)
        with pytest.raises(TypeError):
            transformer.transform(bad_input)
def test_raises_error_when_input_not_a_df(estimator):
    """
    Check that fit() and the predict method raise TypeError for invalid input.

    A string, a plain list and a pandas Series are passed first to fit() and
    then, after fitting on a valid dataset, to `_predict` when the estimator
    has that attribute, or to `predict` otherwise.
    """
    invalid_inputs = (
        "not_a_df",
        [1, 2, 3, "some_data"],
        pd.Series([-2, 1.5, 8.94], name="not_a_df"),
    )

    transformer = clone(estimator)

    # Errors raised by fit().
    # NOTE(review): fit() is called with a single argument. If the estimator's
    # fit() requires y, the TypeError may come from the missing argument
    # rather than from the input validation - confirm.
    for bad_input in invalid_inputs:
        with pytest.raises(TypeError):
            transformer.fit(bad_input)

    # Errors raised at prediction time: fit on valid data first.
    X, y = test_df(categorical=True, datetime=True)
    if transformer.__class__.__name__ == "TargetMeanRegressor":
        # y needs to be continuous
        y = X["var_1"]
        X.drop(["var_1"], axis=1, inplace=True)
    transformer.fit(X, y)

    method_name = "_predict" if hasattr(transformer, "_predict") else "predict"
    for bad_input in invalid_inputs:
        with pytest.raises(TypeError):
            getattr(transformer, method_name)(bad_input)
def test_raises_error_when_df_has_nan(df_enc, df_na, estimator):
    """
    Check that fitting and predicting raise ValueError when X contains NaN.

    A copy of the dataset with a NaN in row 0 of "var_1" is passed to fit()
    and, once the estimator is fitted on the clean data, to its prediction
    methods. For "TargetMeanRegressor" the target is replaced by the "var_10"
    column and only `predict` is checked; for any other estimator
    `_predict` (or `predict` when `_predict` is absent), `predict_proba` and
    `predict_log_proba` are checked.

    The `df_enc` and `df_na` arguments are requested but not used in the body.
    """
    transformer = clone(estimator)

    X, y = test_df(categorical=True)
    X_na = X.copy()
    X_na.loc[0, "var_1"] = np.nan

    is_regressor = transformer.__class__.__name__ == "TargetMeanRegressor"
    if is_regressor:
        y = X["var_10"]

    # fit() must reject the dataset with NaN
    with pytest.raises(ValueError):
        transformer.fit(X_na, y)

    transformer.fit(X, y)

    if is_regressor:
        method_names = ("predict",)
    else:
        first = "_predict" if hasattr(transformer, "_predict") else "predict"
        method_names = (first, "predict_proba", "predict_log_proba")

    # every prediction method must reject the dataset with NaN
    for method_name in method_names:
        with pytest.raises(ValueError):
            getattr(transformer, method_name)(X_na)
def test_attributes_upon_fitting(_strategy, _bins, estimator):
    """
    Check init parameters and internal pipeline steps after fit().

    `_bins` and `_strategy` are set on a clone of `estimator` through
    set_params(). After fitting, the test asserts that both parameters are
    unchanged and that the steps of `_pipeline` have the expected types:
    "discretiser" is an EqualWidthDiscretiser when `_strategy` is
    "equal_width" and an EqualFrequencyDiscretiser otherwise, and both
    "encoder_num" and "encoder_cat" are MeanEncoder.
    """
    transformer = clone(estimator)
    transformer.set_params(bins=_bins, strategy=_strategy)

    X, y = test_df(categorical=True)
    if transformer.__class__.__name__ == "TargetMeanRegressor":
        # y needs to be continuous
        y = X["var_10"]

    transformer.fit(X, y)

    # fit() must not alter the init parameters
    assert transformer.bins == _bins
    assert transformer.strategy == _strategy

    steps = transformer._pipeline.named_steps

    if _strategy == "equal_width":
        expected_discretiser = EqualWidthDiscretiser
    else:
        expected_discretiser = EqualFrequencyDiscretiser

    # Exact type checks: identity comparison is the idiomatic spelling of
    # "is exactly this class" (comparing types with == is flagged as E721).
    assert type(steps["discretiser"]) is expected_discretiser
    assert type(steps["encoder_num"]) is MeanEncoder
    assert type(steps["encoder_cat"]) is MeanEncoder
def check_error_if_y_not_passed(estimator):
    """
    Check that fit() raises TypeError when it is called without y.

    Functionality is provided by Python, when making a parameter mandatory.
    For this test to run, the tag 'requires_y' needs to be added to the
    transformer.
    """
    X, _ = test_df()
    transformer = clone(estimator)

    # y is deliberately left out
    with pytest.raises(TypeError):
        transformer.fit(X)
def check_raises_non_fitted_error(estimator):
    """
    Check that transform() raises NotFittedError when called before fit().

    The functionality is provided by sklearn's `check_is_fitted` function.
    """
    X, _ = test_df()
    unfitted = clone(estimator)

    # fit() is never called on the clone
    with pytest.raises(NotFittedError):
        unfitted.transform(X)
def check_feature_names_in(estimator):
    """
    Check the `feature_names_in_` and `n_features_in_` attributes after fit().

    The transformer should learn the names of the variables in the train set,
    in order, as well as their number. Should be applied to all transformers.
    """
    X, y = test_df(categorical=True, datetime=True)
    expected_names = list(X.columns)

    transformer = clone(estimator)
    transformer.fit(X, y)

    assert transformer.feature_names_in_ == expected_names
    assert transformer.n_features_in_ == len(expected_names)
def test_feature_names_in(estimator):
    """
    Check `feature_names_in_` and `n_features_in_` after fitting.

    For "TargetMeanRegressor" the target is replaced by the "var_10" column
    before fitting. The expected names are the columns of X, in order.
    """
    X, y = test_df(categorical=True)
    transformer = clone(estimator)

    if transformer.__class__.__name__ == "TargetMeanRegressor":
        # y needs to be continuous
        y = X["var_10"]

    expected_names = list(X.columns)
    transformer.fit(X, y)

    assert transformer.feature_names_in_ == expected_names
    assert transformer.n_features_in_ == len(expected_names)
def check_confirm_variables(estimator):
    """
    Check the behaviour of the init parameter `confirm_variables`.

    Only for transformers with a parameter `confirm_variables` in init. At the
    moment, this applies to variable selection transformers. When the
    parameter is True, the variables entered by the user that are not present
    in the dataframe are left out of `variables_`; when it is False, fitting
    on a dataframe that lacks some of them raises KeyError.
    """
    X, y = test_df()
    # same data without the last two variables
    X_reduced = X.drop(labels=["var_10", "var_11"], axis=1)

    # original variables in X, and those that remain in X_reduced
    all_vars = ["var_{}".format(i) for i in range(12)]
    remaining_vars = ["var_{}".format(i) for i in range(10)]

    estimator = clone(estimator)

    # No confirmation, all the variables are in the data.
    sel = estimator.set_params(variables=all_vars, confirm_variables=False)
    sel.fit(X, y)
    assert sel.variables_ == all_vars

    # Confirmation on: the variables missing from the data are left out.
    sel = estimator.set_params(variables=all_vars, confirm_variables=True)
    sel.fit(X_reduced, y)
    assert sel.variables_ == remaining_vars

    # No confirmation, variables missing from the data: fit() fails.
    sel = estimator.set_params(variables=all_vars, confirm_variables=False)
    with pytest.raises(KeyError):
        sel.fit(X_reduced, y)

    # When variables is None.
    sel = estimator.set_params(variables=None, confirm_variables=True)
    sel.fit(X, y)
    assert sel.variables_ == all_vars

    sel.fit(X_reduced, y)
    assert sel.variables_ == remaining_vars
def check_drop_original_variables(estimator):
    """
    Check the behaviour of the init parameter `drop_original`.

    Only for transformers with a parameter `drop_original` in init.

    If True, the original variables, that is, those stored in the attribute
    `variables_` (or, when that attribute is absent, in `variables` plus
    `reference` if present) must not appear in the transformed dataframe.
    If False, every original variable must appear in the transformed
    dataframe. In both cases, the variables seen in fit() that are not
    original variables must be returned.
    """
    # Test df
    X, y = test_df(categorical=True, datetime=True)

    # when drop_original is True
    estimator = clone(estimator)
    estimator.set_params(drop_original=True)
    X_tr = estimator.fit_transform(X, y)

    if hasattr(estimator, "variables_"):
        original_vars = estimator.variables_
    elif hasattr(estimator, "reference"):
        original_vars = estimator.variables + estimator.reference
    else:
        original_vars = estimator.variables

    # Check that original variables are not in transformed dataframe
    assert set(original_vars).isdisjoint(set(X_tr.columns))
    # Check that remaining variables are in transformed dataframe
    remaining = [
        f for f in estimator.feature_names_in_ if f not in original_vars
    ]
    assert all(f in X_tr.columns for f in remaining)

    # when drop_original is False
    estimator = clone(estimator)
    estimator.set_params(drop_original=False)
    X_tr = estimator.fit_transform(X, y)

    if hasattr(estimator, "variables_"):
        original_vars = estimator.variables_
    else:
        original_vars = estimator.variables

    # Check that original variables are in transformed dataframe.
    # The previous assertion took len() of a list of booleans, so it only
    # verified that there was at least one original variable. That check is
    # kept, and the membership of every variable is now verified as well.
    assert len(original_vars) > 0
    assert all(f in X_tr.columns for f in original_vars)
    # Check that remaining variables are in transformed dataframe
    remaining = [
        f for f in estimator.feature_names_in_ if f not in original_vars
    ]
    assert all(f in X_tr.columns for f in remaining)
def check_raises_error_if_only_1_variable(estimator):
    """
    Check that fit() fails when there is a single variable to select from.

    For feature selection transformers. The transformer is set up with
    `variables=["var_1"]` and `confirm_variables=False`; fit() is expected to
    raise ValueError with the exact message asserted below.
    """
    X, y = test_df()

    sel = clone(estimator).set_params(
        variables=["var_1"],
        confirm_variables=False,
    )

    expected_msg = (
        "The selector needs at least 2 or more variables to select from. "
        "Got only 1 variable: ['var_1']."
    )

    with pytest.raises(ValueError) as excinfo:
        sel.fit(X, y)

    assert str(excinfo.value) == expected_msg
def check_get_feature_names_out(estimator):
    """
    Check the names returned by the method get_feature_names_out().

    In most transformers the names of the transformed dataframe are the names
    of the variables in the train set used in fit(), stored in
    `feature_names_in_`. Transformers that return additional variables need
    specific tests, based on the transformer functionality, and are skipped
    here.

    The same assertions are run on the fitted transformer and on a Pipeline
    that wraps a clone of it.
    """
    skipped = (
        "OneHotEncoder",
        "AddMissingIndicator",
        "LagFeatures",
        "WindowFeatures",
        "ExpandingWindowFeatures",
        "MathFeatures",
        "CyclicalFeatures",
        "RelativeFeatures",
        "DatetimeFeatures",
    )
    estimator_name = estimator.__class__.__name__
    if estimator_name in skipped:
        return

    # train set
    X, y = test_df(categorical=True, datetime=True)

    # train transformer
    estimator = clone(estimator)
    estimator.fit(X, y)

    # train pipeline with transformer
    pipe = Pipeline([("transformer", clone(estimator))])
    pipe.fit(X, y)

    # feature names in train set
    feature_names = list(X.columns)

    is_selector = (
        hasattr(estimator, "confirm_variables")
        or estimator_name == "DropFeatures"
    )
    if is_selector:
        # selection transformers do not return the dropped features
        dropped = estimator.features_to_drop_
        feature_names = [f for f in feature_names if f not in dropped]

    if is_selector or estimator_name == "MatchVariables":
        # take a few as input features (these transformers ignore this
        # parameter)
        input_features = [feature_names[0:3]]
        expected_with_input = feature_names
    else:
        # the input features are expected to be returned as they were passed
        input_features = estimator.variables_
        expected_with_input = input_features

    # test transformer
    assert estimator.get_feature_names_out() == feature_names
    assert estimator.get_feature_names_out(input_features) == expected_with_input
    assert estimator.transform(X).shape[1] == len(feature_names)

    # test transformer within pipeline
    assert pipe.get_feature_names_out() == feature_names
    assert pipe.get_feature_names_out(input_features) == expected_with_input
def check_takes_cv_constructor(estimator):
    """
    Check that the transformer works with sklearn cross-validation objects.

    Only for transformers with a parameter `cv` in init. The transformer is
    fitted with `cv` set to KFold(n_splits=3), StratifiedKFold(n_splits=3) and
    None in turn. After each fit, the attributes that the transformer exposes
    are checked to corroborate that they were indeed learned:

    - `initial_model_performance_` is an int or a float.
    - `features_to_drop_` is a list of columns of X, shorter than the number
      of columns of X; the transformed dataframe is not empty and contains
      none of those features.
    - `performance_drifts_`, `feature_performance_` and `scores_dict_` are
      dictionaries whose keys are columns of X and whose values are ints or
      floats.
    """
    from sklearn.model_selection import KFold, StratifiedKFold

    X, y = test_df(categorical=True)
    estimator = clone(estimator)

    cv_constructor_ls = [KFold(n_splits=3), StratifiedKFold(n_splits=3), None]
    score_dict_attrs = (
        "performance_drifts_",
        "feature_performance_",
        "scores_dict_",
    )

    for cv_constructor in cv_constructor_ls:
        sel = estimator.set_params(cv=cv_constructor)
        sel.fit(X, y)
        Xtransformed = sel.transform(X)

        # test fit attrs
        if hasattr(sel, "initial_model_performance_"):
            assert isinstance(sel.initial_model_performance_, (int, float))

        if hasattr(sel, "features_to_drop_"):
            dropped = sel.features_to_drop_
            assert isinstance(dropped, list)
            # The membership tests used to be written as the filter of a list
            # comprehension, so all() only saw non-empty names and could
            # never fail. They are now the asserted expression itself.
            assert all(x in X.columns for x in dropped)
            assert len(dropped) < X.shape[1]
            assert not Xtransformed.empty
            assert all(x not in dropped for x in Xtransformed.columns)

        for attr_name in score_dict_attrs:
            if not hasattr(sel, attr_name):
                continue
            scores = getattr(sel, attr_name)
            assert isinstance(scores, dict)
            # Same fix as above. The check asserts that every key is a column
            # of X, not that every column is a key.
            # NOTE(review): presumably not every column of X is evaluated by
            # every selector (X includes categorical variables) - confirm.
            assert all(var in X.columns for var in scores)
            assert all(
                isinstance(score, (int, float)) for score in scores.values()
            )
def test_variable_selection(estimator):
    """
    Check how the init parameter `variables` is handled by fit().

    For each case the test asserts that `variables` is left untouched by
    set_params() and fit(), and that the variables are split as expected
    between `variables_categorical_` and `variables_numerical_`. It also
    checks that fit() raises TypeError when "date2" is among the variables
    and ValueError when `variables` is an empty list.

    For "TargetMeanRegressor" the "var_10" column is used as target and
    removed from X.
    """
    transformer = clone(estimator)

    X, y = test_df(categorical=True, datetime=True)
    if transformer.__class__.__name__ == "TargetMeanRegressor":
        # y needs to be continuous
        y = X["var_10"]
        X.drop(["var_10"], axis=1, inplace=True)

    # cast one variable as category
    X[["cat_var2"]] = X[["cat_var2"]].astype("category")

    # cast datetime as object
    X[["date1"]] = X[["date1"]].astype("O")

    # (variables, expected variables_categorical_, expected variables_numerical_)
    cases = [
        # Case 1: numerical variable as string
        ("var_1", [], ["var_1"]),
        # Case 2: numerical variable as list
        (["var_1"], [], ["var_1"]),
        # Case 3: categorical variable as string
        ("cat_var1", ["cat_var1"], []),
        # Case 4: categorical variable as list
        (["cat_var1"], ["cat_var1"], []),
        # Case 5: numerical and categorical variables
        (
            ["var_1", "var_2", "cat_var1", "cat_var2", "date1"],
            ["cat_var1", "cat_var2", "date1"],
            ["var_1", "var_2"],
        ),
    ]

    for variables, expected_categorical, expected_numerical in cases:
        # The transformer gets its own list, so that the comparisons below
        # are made against an object that it cannot have modified.
        given = list(variables) if isinstance(variables, list) else variables

        transformer.set_params(variables=given)
        assert transformer.variables == variables

        transformer.fit(X, y)
        assert transformer.variables == variables
        assert transformer.variables_categorical_ == expected_categorical
        assert transformer.variables_numerical_ == expected_numerical

    # Case 6: automatically select variables
    X_c = X[["var_1", "var_2", "cat_var1", "cat_var2", "date1", "date2"]].copy()

    transformer.set_params(variables=None)
    assert transformer.variables is None

    transformer.fit(X_c, y)
    assert transformer.variables is None
    assert transformer.variables_categorical_ == ["cat_var1", "cat_var2"]
    assert transformer.variables_numerical_ == ["var_1", "var_2"]

    # Case 7: user passes "date2" among the variables
    transformer.set_params(variables=["var_1", "cat_var1", "date2"])
    with pytest.raises(TypeError):
        transformer.fit(X, y)

    # Case 8: user passes empty list
    transformer.set_params(variables=[])
    with pytest.raises(ValueError):
        transformer.fit(X, y)