def test_default_params(df_correlated_single):
    """Fit with pearson correlation at threshold 0.8 and verify the outcome.

    Checks the stored parameters, the attributes read after fitting, and
    that the transformed frame equals the input without ``var_2``.
    """
    selector = DropCorrelatedFeatures(
        variables=None, method="pearson", threshold=0.8
    )
    transformed = selector.fit_transform(df_correlated_single)

    # parameters as read back from the transformer
    # NOTE(review): other tests in this file read `variables_` and
    # `features_to_drop_` instead of `variables` / `correlated_features_`;
    # confirm which attribute names the library version under test exposes.
    expected_variables = [
        "var_0",
        "var_1",
        "var_2",
        "var_3",
        "var_4",
        "var_5",
    ]
    assert selector.method == "pearson"
    assert selector.threshold == 0.8
    assert selector.variables == expected_variables

    # attributes available after fit
    pd.testing.assert_frame_equal(
        selector.correlated_matrix_, df_correlated_single.corr()
    )
    assert selector.correlated_features_ == {"var_2"}
    assert selector.correlated_feature_sets_ == [{"var_1", "var_2"}]

    # transform output: the input frame minus the dropped column
    expected = df_correlated_single.drop(columns="var_2")
    pd.testing.assert_frame_equal(transformed, expected)
def test_lower_threshold(df_correlated_single):
    """Fit with pearson correlation at threshold 0.6 and verify the outcome.

    With the lower threshold the test expects ``var_2`` and ``var_4`` to be
    dropped, both grouped with ``var_1`` in a single correlated set.
    """
    selector = DropCorrelatedFeatures(
        variables=None, method="pearson", threshold=0.6
    )
    transformed = selector.fit_transform(df_correlated_single)

    # constructor arguments are read back exactly as passed
    assert selector.method == "pearson"
    assert selector.threshold == 0.6
    assert selector.variables is None

    # attributes available after fit
    expected_variables = [
        "var_0",
        "var_1",
        "var_2",
        "var_3",
        "var_4",
        "var_5",
    ]
    assert selector.variables_ == expected_variables
    assert selector.features_to_drop_ == {"var_2", "var_4"}
    assert selector.correlated_feature_sets_ == [{"var_1", "var_2", "var_4"}]

    # transform output: the input frame minus the dropped columns
    expected = df_correlated_single.drop(columns=["var_2", "var_4"])
    pd.testing.assert_frame_equal(transformed, expected)
def clean_data(X):
    """Clean, encode, resample and prune the features of a raw data frame.

    The frame passed in is not modified; every step works on new objects.

    Steps, in order:

    1. Drop rows whose ``target`` is missing, remove the ``ID`` column and
       split ``target`` off.
    2. Map ``v22`` through ``az_to_int``.
    3. Impute numeric columns with ``SimpleImputer(strategy='median')``.
    4. Run object columns through ``CategoricalImputer``,
       ``RareLabelEncoder`` and ``CountFrequencyEncoder``.
    5. Apply ``Winsorizer`` to the whole frame.
    6. Resample with ``RandomUnderSampler(sampling_strategy=0.7)``.
    7. Apply ``DropConstantFeatures``, ``DropDuplicateFeatures`` and
       ``DropCorrelatedFeatures``, printing what each one removes.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw data; must contain the columns ``target``, ``ID`` and ``v22``.

    Returns
    -------
    pandas.DataFrame
        The processed features with the resampled ``target`` column
        attached again.
    """
    # Non-inplace calls return new frames, so the caller's data stays intact.
    X = X.dropna(subset=['target']).drop(columns='ID')
    y = X.pop('target')

    X['v22'] = X['v22'].apply(az_to_int)

    # Column groups are computed after the v22 conversion on purpose, so v22
    # is classified by its converted dtype.
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()

    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(
        tol=0.01, n_categories=10, replace_with='__OTHER__'
    )
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")

    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])

    # more cleaning
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)

    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y = undersampler.fit_resample(X, y)

    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")

    # Remove duplicated features
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")

    drop_corr = DropCorrelatedFeatures(
        method="pearson", threshold=0.95, missing_values="ignore"
    )
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlared features {drop_corr.features_to_drop_}")

    # Re-attach the resampled target to the selected features.
    X['target'] = y
    return X
def test_callable_method(df_correlated_double, random_uniform_method):
    """Fit with the ``random_uniform_method`` fixture passed as ``method``.

    The fixture is presumably a callable correlation function (its
    definition is not visible here), so the test only checks that the fit
    attributes are populated rather than pinning exact values.
    """
    data = df_correlated_double
    selector = DropCorrelatedFeatures(
        variables=None, method=random_uniform_method, threshold=0.6
    )
    reduced = selector.fit_transform(data)

    # the transformed frame must not be empty
    assert not reduced.empty

    # every learned attribute holds at least one entry
    assert len(selector.correlated_feature_sets_) > 0
    assert len(selector.features_to_drop_) > 0
    assert len(selector.variables_) > 0

    # the recorded feature count matches the input frame
    assert selector.n_features_in_ == len(data.columns)
def test_error_method_supplied(df_correlated_double):
    """An unsupported ``method`` string must raise ValueError on fit.

    The first argument of the raised exception is compared for exact
    equality with the expected message.
    """
    method = "hola"
    expected_message = (
        "method must be either 'pearson', 'spearman', 'kendall', or a callable,"
        f" '{method}' was supplied"
    )
    selector = DropCorrelatedFeatures(
        variables=None, method=method, threshold=0.8
    )

    with pytest.raises(ValueError) as excinfo:
        selector.fit_transform(df_correlated_double)

    assert excinfo.value.args[0] == expected_message
def test_more_than_1_correlated_group(df_correlated_double):
    """Fit on data where two separate groups of correlated features exist.

    Expects two correlated sets and the removal of ``var_6`` to ``var_9``.
    """
    selector = DropCorrelatedFeatures(
        variables=None, method="pearson", threshold=0.6
    )
    transformed = selector.fit_transform(df_correlated_double)

    dropped_columns = ["var_6", "var_7", "var_8", "var_9"]

    # attributes available after fit
    assert selector.features_to_drop_ == set(dropped_columns)
    assert selector.correlated_feature_sets_ == [
        {"var_0", "var_8"},
        {"var_4", "var_6", "var_7", "var_9"},
    ]

    # transform output: the input frame minus the dropped columns
    expected = df_correlated_double.drop(columns=dropped_columns)
    pd.testing.assert_frame_equal(transformed, expected)
def test_non_fitted_error(df_correlated_single):
    """Calling ``transform`` before ``fit`` must raise ``NotFittedError``."""
    transformer = DropCorrelatedFeatures()

    # Only the transform call sits inside the raises block, so an error
    # raised while constructing the transformer cannot satisfy the test.
    with pytest.raises(NotFittedError):
        transformer.transform(df_correlated_single)
def test_error_if_fit_input_not_dataframe():
    """Calling ``fit`` with a plain dict must raise ``TypeError``."""
    transformer = DropCorrelatedFeatures()

    # Only the fit call sits inside the raises block, so a TypeError raised
    # while constructing the transformer cannot satisfy the test.
    # NOTE(review): the original author flagged this fit call as needing
    # review; the reason was not recorded.
    with pytest.raises(TypeError):
        transformer.fit({"Name": [1]})
DropHighPSIFeatures, RecursiveFeatureAddition, RecursiveFeatureElimination, SelectByShuffling, SelectBySingleFeaturePerformance, SelectByTargetMeanPerformance, SmartCorrelatedSelection, ) _logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1) _estimators = [ DropFeatures(features_to_drop=["0"]), DropConstantFeatures(missing_values="ignore"), DropDuplicateFeatures(), DropCorrelatedFeatures(), DropHighPSIFeatures(bins=5), SmartCorrelatedSelection(), SelectByShuffling(estimator=_logreg, scoring="accuracy"), SelectByTargetMeanPerformance(bins=3, regression=False), SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"), RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"), RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100), ] _multivariate_estimators = [ DropDuplicateFeatures(), DropCorrelatedFeatures(), SmartCorrelatedSelection(), SelectByShuffling(estimator=_logreg, scoring="accuracy"), RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),