def test_regression_cv_2_and_mse(load_diabetes_dataset):
    """Regression case: decision tree, 2-fold CV, neg_mean_squared_error.

    The threshold is set to 10 because the metric is an (unnormalised)
    squared error rather than a bounded score.
    """
    X, y = load_diabetes_dataset

    selector = RecursiveFeatureAddition(
        estimator=DecisionTreeRegressor(random_state=0),
        scoring="neg_mean_squared_error",
        cv=2,
        threshold=10,
    )
    selector.fit(X, y)

    # frame expected back from transform()
    expected_frame = X[[1, 2, 7]].copy()

    # expected key order of performance_drifts_: features ordered by
    # importance, from most important to least important
    expected_order = [2, 8, 5, 7, 3, 9, 6, 4, 0, 1]

    # init params are stored untouched
    assert selector.cv == 2
    assert selector.variables is None
    assert selector.scoring == "neg_mean_squared_error"
    assert selector.threshold == 10

    # attributes learned during fit
    assert selector.variables_ == list(X.columns)
    assert np.round(selector.initial_model_performance_, 0) == -5836.0
    assert selector.features_to_drop_ == [0, 3, 4, 5, 6, 8, 9]
    assert list(selector.performance_drifts_.keys()) == expected_order

    # transform output
    pd.testing.assert_frame_equal(selector.transform(X), expected_frame)
def test_regression_cv_3_and_r2(load_diabetes_dataset):
    """Regression case: linear regression, 3-fold CV, r2 as the metric."""
    X, y = load_diabetes_dataset

    selector = RecursiveFeatureAddition(
        estimator=LinearRegression(),
        scoring="r2",
        cv=3,
    )
    selector.fit(X, y)

    # frame expected back from transform()
    expected_frame = X[[2, 3, 4, 8]].copy()

    # expected key order of performance_drifts_: features ordered by
    # importance, from most important to least important
    expected_order = [4, 8, 2, 5, 3, 1, 7, 6, 9, 0]

    # init params; threshold was not passed, so this pins the value the
    # selector falls back to
    assert selector.cv == 3
    assert selector.variables is None
    assert selector.scoring == "r2"
    assert selector.threshold == 0.01

    # attributes learned during fit
    assert selector.variables_ == list(X.columns)
    assert np.round(selector.initial_model_performance_, 3) == 0.489
    assert selector.features_to_drop_ == [0, 1, 5, 6, 7, 9]
    assert list(selector.performance_drifts_.keys()) == expected_order

    # transform output
    pd.testing.assert_frame_equal(selector.transform(X), expected_frame)
def test_classification(
    estimator, cv, threshold, scoring, dropped_features, performances, df_test
):
    """Fit RecursiveFeatureAddition on ``df_test`` with the supplied settings.

    Compares the dropped features, the rounded performance drifts and the
    transformed frame against the expected values passed in.
    """
    X, y = df_test

    selector = RecursiveFeatureAddition(
        estimator=estimator,
        cv=cv,
        threshold=threshold,
        scoring=scoring,
    )
    selector.fit(X, y)

    # expected transform output: the input minus the dropped columns
    expected_frame = X.drop(columns=dropped_features)

    # fit attributes
    drifts = selector.performance_drifts_
    assert selector.features_to_drop_ == dropped_features
    assert len(drifts) == len(X.columns)
    assert all(column in drifts for column in X.columns)

    # drifts are compared after rounding to 4 decimals
    rounded_drifts = {name: round(value, 4) for name, value in drifts.items()}
    assert rounded_drifts == performances

    # transform output
    pd.testing.assert_frame_equal(selector.transform(X), expected_frame)
def test_regression(
    estimator,
    cv,
    threshold,
    scoring,
    dropped_features,
    performances,
    load_diabetes_dataset,
):
    """Fit RecursiveFeatureAddition on the diabetes data with the given settings.

    Compares the dropped features, the rounded performance drifts and the
    transformed frame against the expected values passed in.
    """
    X, y = load_diabetes_dataset

    selector = RecursiveFeatureAddition(
        estimator=estimator,
        cv=cv,
        threshold=threshold,
        scoring=scoring,
    )
    selector.fit(X, y)

    # expected transform output: the input minus the dropped columns
    expected_frame = X.drop(columns=dropped_features)

    # fit attributes
    drifts = selector.performance_drifts_
    assert selector.features_to_drop_ == dropped_features
    assert len(drifts) == len(X.columns)
    assert all(column in drifts for column in X.columns)

    # drifts are compared after rounding to 4 decimals
    rounded_drifts = {name: round(value, 4) for name, value in drifts.items()}
    assert rounded_drifts == performances

    # transform output
    pd.testing.assert_frame_equal(selector.transform(X), expected_frame)
def test_feature_importances(_estimator, _importance, df_test):
    """Check the order of ``feature_importances_`` for both recursive selectors.

    RecursiveFeatureAddition is expected to expose the importances in
    descending order and RecursiveFeatureElimination in ascending order.

    ``_importance`` is the list of expected importances (rounded to 4
    decimals). It is sorted into new lists rather than in place, so the
    caller-supplied list is not modified by this test.
    """
    X, y = df_test

    # threshold=-100 as in the selector calls below; importances are the
    # only attribute under test here
    sel = RecursiveFeatureAddition(_estimator, threshold=-100).fit(X, y)
    expected_descending = sorted(_importance, reverse=True)
    assert list(np.round(sel.feature_importances_.values, 4)) == expected_descending

    sel = RecursiveFeatureElimination(_estimator, threshold=-100).fit(X, y)
    expected_ascending = sorted(_importance)
    assert list(np.round(sel.feature_importances_.values, 4)) == expected_ascending
def test_classification_threshold_parameters(df_test):
    """Random forest classifier with an explicit threshold of 0.001."""
    X, y = df_test

    selector = RecursiveFeatureAddition(
        RandomForestClassifier(random_state=1),
        threshold=0.001,
    )
    selector.fit(X, y)

    # frame expected back from transform()
    expected_frame = pd.DataFrame(X[["var_7", "var_10"]].copy())

    # expected key order of performance_drifts_: features ordered by
    # importance, from most important to least important
    expected_order = [
        "var_7",
        "var_4",
        "var_6",
        "var_9",
        "var_0",
        "var_8",
        "var_1",
        "var_10",
        "var_5",
        "var_11",
        "var_2",
        "var_3",
    ]

    all_variables = [
        "var_0",
        "var_1",
        "var_2",
        "var_3",
        "var_4",
        "var_5",
        "var_6",
        "var_7",
        "var_8",
        "var_9",
        "var_10",
        "var_11",
    ]

    # init params; cv and scoring were not passed, so these pin the values
    # the selector falls back to
    assert selector.variables == all_variables
    assert selector.threshold == 0.001
    assert selector.cv == 3
    assert selector.scoring == "roc_auc"

    # attributes learned during fit
    assert np.round(selector.initial_model_performance_, 3) == 0.997
    assert selector.selected_features_ == ["var_7", "var_10"]
    assert list(selector.performance_drifts_.keys()) == expected_order

    # transform output
    pd.testing.assert_frame_equal(selector.transform(X), expected_frame)
def test_KFold_generators(df_test):
    """Fit and transform with different ``cv`` specifications.

    The same set of checks runs for a ``KFold`` splitter, a
    ``StratifiedKFold`` splitter and ``cv=None``.
    """
    X, y = df_test

    for cv in (KFold(n_splits=3), StratifiedKFold(n_splits=3), None):
        sel = RecursiveFeatureAddition(
            RandomForestClassifier(random_state=1),
            threshold=0.001,
            cv=cv,
        )
        sel.fit(X, y)
        Xtransformed = sel.transform(X)

        # test fit attrs
        assert sel.initial_model_performance_ > 0.995
        assert isinstance(sel.features_to_drop_, list)

        # Membership is asserted per element. Filtering inside the
        # comprehension (``[x for x in a if x in b]``) would only check the
        # truthiness of the elements that matched and could never fail on a
        # missing one.
        assert all(x in X.columns for x in sel.features_to_drop_)
        assert len(sel.features_to_drop_) < X.shape[1]

        assert not Xtransformed.empty
        assert all(x not in sel.features_to_drop_ for x in Xtransformed.columns)

        assert isinstance(sel.performance_drifts_, dict)
        assert all(x in sel.performance_drifts_ for x in X.columns)
        assert all(
            isinstance(drift, (int, float))
            for drift in sel.performance_drifts_.values()
        )
def test_automatic_variable_selection(df_test):
    """Check which variables the selector picks when ``variables`` is not set.

    Two string-valued columns are added to the data; the assertions below
    expect them to be absent from ``variables_`` and present, untouched, in
    the transformed frame.
    """
    X, y = df_test

    # Work on a copy so the extra columns are not added to the DataFrame
    # object handed over by the fixture.
    X = X.copy()

    # add 2 additional categorical variables, these should not be evaluated by
    # the selector
    X["cat_1"] = "cat1"
    X["cat_2"] = "cat2"

    sel = RecursiveFeatureAddition(
        RandomForestClassifier(random_state=1), threshold=0.001
    )
    sel.fit(X, y)

    # expected result
    Xtransformed = X[["var_7", "var_10", "cat_1", "cat_2"]].copy()

    # expected ordered features by importance, from most important
    # to least important
    ordered_features = [
        "var_7",
        "var_4",
        "var_6",
        "var_9",
        "var_0",
        "var_8",
        "var_1",
        "var_10",
        "var_5",
        "var_11",
        "var_2",
        "var_3",
    ]

    # test init params
    assert sel.variables is None
    assert sel.threshold == 0.001
    assert sel.cv == 3
    assert sel.scoring == "roc_auc"

    # test fit attrs
    assert sel.variables_ == [
        "var_0",
        "var_1",
        "var_2",
        "var_3",
        "var_4",
        "var_5",
        "var_6",
        "var_7",
        "var_8",
        "var_9",
        "var_10",
        "var_11",
    ]
    assert np.round(sel.initial_model_performance_, 3) == 0.997
    assert sel.features_to_drop_ == [
        "var_0",
        "var_1",
        "var_2",
        "var_3",
        "var_4",
        "var_5",
        "var_6",
        "var_8",
        "var_9",
        "var_11",
    ]
    assert list(sel.performance_drifts_.keys()) == ordered_features

    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
def test_raises_threshold_error():
    """Constructing the selector with ``threshold=None`` must raise ValueError."""
    classifier = RandomForestClassifier(random_state=1)
    with pytest.raises(ValueError):
        RecursiveFeatureAddition(classifier, threshold=None)
def test_non_fitted_error(df_test):
    """Calling ``transform`` before ``fit`` must raise NotFittedError."""
    # df_test yields an (X, y) pair; only the feature frame goes to transform
    X, _ = df_test

    # Build the selector outside the ``raises`` block so that only the
    # transform call can satisfy the expectation.
    sel = RecursiveFeatureAddition(RandomForestClassifier(random_state=1))

    # when fit is not called prior to transform
    with pytest.raises(NotFittedError):
        sel.transform(X)
def test_raises_threshold_error():
    """Constructing the selector with ``threshold=None`` must raise ValueError."""
    invalid_params = {"threshold": None}
    with pytest.raises(ValueError):
        RecursiveFeatureAddition(**invalid_params)
def test_raises_cv_error():
    """Constructing the selector with ``cv=0`` must raise ValueError."""
    invalid_params = {"cv": 0}
    with pytest.raises(ValueError):
        RecursiveFeatureAddition(**invalid_params)
def test_non_fitted_error(df_test):
    """Calling ``transform`` before ``fit`` must raise NotFittedError."""
    # df_test yields an (X, y) pair; only the feature frame goes to transform
    X, _ = df_test

    # Build the selector outside the ``raises`` block so that only the
    # transform call can satisfy the expectation.
    sel = RecursiveFeatureAddition()

    # when fit is not called prior to transform
    with pytest.raises(NotFittedError):
        sel.transform(X)
SmartCorrelatedSelection, ) _logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1) _estimators = [ DropFeatures(features_to_drop=["0"]), DropConstantFeatures(missing_values="ignore"), DropDuplicateFeatures(), DropCorrelatedFeatures(), DropHighPSIFeatures(bins=5), SmartCorrelatedSelection(), SelectByShuffling(estimator=_logreg, scoring="accuracy"), SelectByTargetMeanPerformance(bins=3, regression=False), SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"), RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"), RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100), ] _multivariate_estimators = [ DropDuplicateFeatures(), DropCorrelatedFeatures(), SmartCorrelatedSelection(), SelectByShuffling(estimator=_logreg, scoring="accuracy"), RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"), RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100), ] _univariate_estimators = [ DropFeatures(features_to_drop=["var_1"]), DropConstantFeatures(missing_values="ignore"),
# selectors
_SELECTORS_UNDER_CHECK = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    DropHighPSIFeatures(bins=5),
    SelectByShuffling(
        LogisticRegression(max_iter=2, random_state=1),
        scoring="accuracy",
    ),
    SelectBySingleFeaturePerformance(
        LogisticRegression(max_iter=2, random_state=1),
        scoring="accuracy",
    ),
    RecursiveFeatureAddition(
        LogisticRegression(max_iter=2, random_state=1),
        scoring="accuracy",
    ),
    RecursiveFeatureElimination(
        LogisticRegression(max_iter=2, random_state=1),
        scoring="accuracy",
        threshold=-100,
    ),
    SelectByTargetMeanPerformance(scoring="roc_auc", bins=3, regression=False),
]


@parametrize_with_checks(_SELECTORS_UNDER_CHECK)
def test_sklearn_compatible_selectors(estimator, check):
    """Run each check supplied by the decorator against each selector."""
    check(estimator)


# wrappers
_WRAPPERS_UNDER_CHECK = [SklearnTransformerWrapper(SimpleImputer())]


@parametrize_with_checks(_WRAPPERS_UNDER_CHECK)
def test_sklearn_compatible_wrapper(estimator, check):
    """Run each check supplied by the decorator against the wrapper."""
    check(estimator)
RecursiveFeatureAddition, RecursiveFeatureElimination, SelectByShuffling, SelectBySingleFeaturePerformance, SelectByTargetMeanPerformance, SmartCorrelatedSelection, )


@pytest.mark.parametrize(
    "Estimator",
    [
        DropFeatures(features_to_drop=["0"]),
        DropConstantFeatures(),
        DropDuplicateFeatures(),
        DropCorrelatedFeatures(),
        SmartCorrelatedSelection(),
        SelectByShuffling(RandomForestClassifier(random_state=1), scoring="accuracy"),
        SelectBySingleFeaturePerformance(
            RandomForestClassifier(random_state=1), scoring="accuracy"),
        RecursiveFeatureAddition(RandomForestClassifier(random_state=1), scoring="accuracy"),
        RecursiveFeatureElimination(RandomForestClassifier(random_state=1), scoring="accuracy"),
        SelectByTargetMeanPerformance(scoring="r2_score", bins=3),
    ],
)
def test_all_transformers(Estimator):
    """Pass each parametrised selector instance to ``check_estimator``."""
    # NOTE(review): the result of check_estimator is returned from the test;
    # confirm whether any caller relies on that return value.
    return check_estimator(Estimator)