def test_cardinality_2_correlated_groups(df_double):
    """Cardinality-based selection drops the int-cast member of each correlated group."""
    X, y = df_double

    # casting to int lowers these columns' cardinality relative to their group peers
    int_cols = ["var_0", "var_6", "var_7", "var_9"]
    X[int_cols] = X[int_cols].astype(int)

    transformer = SmartCorrelatedSelection(
        variables=None,
        method="pearson",
        threshold=0.8,
        missing_values="raise",
        selection_method="cardinality",
        estimator=None,
    )

    Xt = transformer.fit_transform(X, y)

    # expected result
    kept_cols = [
        "var_1", "var_2", "var_3", "var_4",
        "var_5", "var_8", "var_10", "var_11",
    ]
    expected = X[kept_cols].copy()

    assert transformer.features_to_drop_ == int_cols

    # test transform output
    pd.testing.assert_frame_equal(Xt, expected)
def test_model_performance_2_correlated_groups(df_double):
    """Model-performance selection keeps the best-performing member of each group."""
    X, y = df_double

    sel = SmartCorrelatedSelection(
        variables=None,
        method="pearson",
        threshold=0.8,
        missing_values="raise",
        selection_method="model_performance",
        estimator=RandomForestClassifier(n_estimators=10, random_state=1),
        scoring="roc_auc",
        cv=3,
    )
    Xt = sel.fit_transform(X, y)

    # expected result
    kept_cols = [
        "var_0", "var_1", "var_2", "var_3",
        "var_5", "var_7", "var_10", "var_11",
    ]
    expected = X[kept_cols].copy()

    # test fit attrs
    assert sel.correlated_feature_sets_ == [
        {"var_0", "var_8"},
        {"var_4", "var_6", "var_7", "var_9"},
    ]
    assert sel.features_to_drop_ == ["var_4", "var_6", "var_8", "var_9"]

    # test transform output
    pd.testing.assert_frame_equal(Xt, expected)
def test_error_if_select_model_performance_and_y_is_none_default_params(df_single):
    """fit(X) without y must raise ValueError when selection needs a target.

    NOTE(review): renamed — this test previously shared its name with another
    test in this module, so only the later definition was collected by pytest
    and this one never ran.
    """
    X, y = df_single
    transformer = SmartCorrelatedSelection(
        selection_method="model_performance",
        estimator=RandomForestClassifier(n_estimators=10, random_state=1),
        scoring="roc_auc",
    )
    # model_performance selection requires y to score candidate features
    with pytest.raises(ValueError):
        transformer.fit(X)
def test_raises_param_errors():
    """Invalid constructor parameters raise ValueError at instantiation time."""
    invalid_params = [
        {"threshold": None},
        {"missing_values": None},
        {"selection_method": "random"},
        # missing_values selection is incompatible with missing_values="raise"
        {"selection_method": "missing_values", "missing_values": "raise"},
    ]
    for kwargs in invalid_params:
        with pytest.raises(ValueError):
            SmartCorrelatedSelection(**kwargs)
def test_callable_method_with_default_params(df_test, random_uniform_method):
    """A user-supplied callable correlation method works with default settings.

    NOTE(review): renamed — this test previously shared its name with another
    test in this module, so only the later definition was collected by pytest
    and this one never ran.
    """
    X, _ = df_test

    transformer = SmartCorrelatedSelection(method=random_uniform_method)

    Xt = transformer.fit_transform(X)

    # test no empty dataframe
    assert not Xt.empty

    # test fit attrs
    assert len(transformer.correlated_feature_sets_) > 0
    assert len(transformer.features_to_drop_) > 0
    assert len(transformer.variables_) > 0
    assert transformer.n_features_in_ == len(X.columns)
def test_error_if_select_model_performance_and_y_is_none(df_single):
    """fit() must fail with ValueError when model_performance selection gets no y."""
    X, _ = df_single

    sel = SmartCorrelatedSelection(
        variables=None,
        method="pearson",
        threshold=0.8,
        missing_values="raise",
        selection_method="model_performance",
        estimator=RandomForestClassifier(n_estimators=10, random_state=1),
        scoring="roc_auc",
        cv=3,
    )

    with pytest.raises(ValueError):
        sel.fit(X)
def test_error_method_supplied_with_default_params(df_test):
    """An unsupported correlation method string raises a descriptive ValueError.

    NOTE(review): renamed — this test previously shared its name with another
    test in this module, so only the later definition was collected by pytest
    and this one never ran.
    """
    X, _ = df_test
    method = "hola"

    transformer = SmartCorrelatedSelection(method=method)

    with pytest.raises(ValueError) as errmsg:
        _ = transformer.fit_transform(X)

    exceptionmsg = errmsg.value.args[0]
    assert (
        exceptionmsg
        == "method must be either 'pearson', 'spearman', 'kendall', or a callable,"
        + f" '{method}' was supplied"
    )
def test_automatic_variable_selection(df_double):
    """Non-numerical columns pass through the selector untouched."""
    X, y = df_double

    int_cols = ["var_0", "var_6", "var_7", "var_9"]
    X[int_cols] = X[int_cols].astype(int)

    # add 2 additional categorical variables, these should not be evaluated by
    # the selector
    X["cat_1"] = "cat1"
    X["cat_2"] = "cat2"

    sel = SmartCorrelatedSelection(
        variables=None,
        method="pearson",
        threshold=0.8,
        missing_values="raise",
        selection_method="cardinality",
        estimator=None,
    )
    Xt = sel.fit_transform(X, y)

    # expected result: categoricals are retained alongside the kept numericals
    expected = X[
        [
            "var_1", "var_2", "var_3", "var_4", "var_5",
            "var_8", "var_10", "var_11", "cat_1", "cat_2",
        ]
    ].copy()

    assert sel.features_to_drop_ == int_cols

    # test transform output
    pd.testing.assert_frame_equal(Xt, expected)
def test_callable_method(df_test, random_uniform_method):
    """A callable correlation method is accepted together with variance selection."""
    X, _ = df_test

    sel = SmartCorrelatedSelection(
        variables=None,
        method=random_uniform_method,
        threshold=0.8,
        missing_values="raise",
        selection_method="variance",
    )
    Xt = sel.fit_transform(X)

    # test no empty dataframe
    assert not Xt.empty

    # test fit attrs: every fitted attribute must be populated
    for fitted_attr in (
        sel.correlated_feature_sets_,
        sel.features_to_drop_,
        sel.variables_,
    ):
        assert len(fitted_attr) > 0
    assert sel.n_features_in_ == len(X.columns)
def test_error_method_supplied(df_test):
    """An unsupported correlation method name raises a descriptive ValueError."""
    X, _ = df_test
    method = "hola"

    sel = SmartCorrelatedSelection(
        variables=None,
        method=method,
        threshold=0.8,
        missing_values="raise",
        selection_method="variance",
    )

    with pytest.raises(ValueError) as errmsg:
        _ = sel.fit_transform(X)

    expected_msg = (
        "method must be either 'pearson', 'spearman', 'kendall', or a callable,"
        f" '{method}' was supplied"
    )
    assert errmsg.value.args[0] == expected_msg
def test_non_fitted_error_with_default_params(df_single):
    """transform() before fit() must raise NotFittedError.

    NOTE(review): renamed — this test previously shared its name with another
    test in this module, so only the later definition was collected by pytest.
    Also removed a trailing ``SmartCorrelatedSelection(...)`` instantiation
    that was dead code: the object was created after the assertion and never
    used.
    """
    X, y = df_single
    # when fit is not called prior to transform
    with pytest.raises(NotFittedError):
        transformer = SmartCorrelatedSelection()
        transformer.transform(X)
def test_non_fitted_error(df_single):
    """Calling transform() on an unfitted selector raises NotFittedError."""
    X, _ = df_single

    # when fit is not called prior to transform
    transformer = SmartCorrelatedSelection()
    with pytest.raises(NotFittedError):
        transformer.transform(X)
def test_error_if_fit_input_not_dataframe():
    """fit() rejects inputs that are not pandas DataFrames with TypeError."""
    not_a_dataframe = {"Name": [1]}
    with pytest.raises(TypeError):
        SmartCorrelatedSelection().fit(not_a_dataframe)
# NOTE(review): the names up to the closing ")" below are the tail of an
# import statement whose opening line is outside this chunk.
RecursiveFeatureElimination,
SelectByShuffling,
SelectBySingleFeaturePerformance,
SelectByTargetMeanPerformance,
SmartCorrelatedSelection,
)

# Deliberately weak, fast classifier (tiny C, 2 iterations) shared by the
# estimator lists below so the generic checks run quickly.
_logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1)

# All selection transformers exercised by the generic estimator checks.
_estimators = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(bins=5),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    SelectByTargetMeanPerformance(bins=3, regression=False),
    SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

# Subset of the above used by checks that involve multiple features at once.
_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]
def test_KFold_generators(df_test):
    """model_performance selection works with KFold, StratifiedKFold and cv=None.

    NOTE(review): the original assertions of the form
    ``all([x for x in seq if cond])`` were effectively vacuous — ``all`` over a
    list of non-empty strings is always True — so they are rewritten here as
    real membership checks. The three copy-pasted cv variants are folded into
    a loop with a shared assertion helper.
    """
    X, y = df_test

    for cv in (KFold(n_splits=3), StratifiedKFold(n_splits=3), None):
        sel = SmartCorrelatedSelection(
            variables=None,
            method="pearson",
            threshold=0.8,
            missing_values="raise",
            selection_method="model_performance",
            estimator=RandomForestClassifier(n_estimators=10, random_state=1),
            scoring="roc_auc",
            cv=cv,
        )
        sel.fit(X, y)
        Xtransformed = sel.transform(X)
        _check_fitted_selector(sel, X, Xtransformed)


def _check_fitted_selector(sel, X, Xtransformed):
    """Common fit/transform assertions shared by every cv variant above."""
    # test fit attrs
    assert isinstance(sel.features_to_drop_, list)
    # every dropped feature must come from the input columns
    assert all(x in X.columns for x in sel.features_to_drop_)
    assert len(sel.features_to_drop_) < X.shape[1]
    assert not Xtransformed.empty
    # no dropped feature may survive into the transformed output
    assert all(x not in sel.features_to_drop_ for x in Xtransformed.columns)