def clean_data(X):
    """Clean, encode, rebalance and prune a raw feature table.

    The steps run in this order: drop rows whose ``target`` is missing,
    split off the target, drop the ``ID`` column, convert ``v22`` with
    ``az_to_int``, impute and encode the columns, winsorize, undersample,
    and finally drop quasi-constant, duplicated and highly correlated
    features. The features that each selection step drops are printed.

    The caller's frame is never modified: all work happens on a copy.

    Parameters
    ----------
    X : DataFrame-like
        Input table. The code reads the ``target``, ``ID`` and ``v22``
        columns, so all three must be present.

    Returns
    -------
    The transformed feature table with the resampled target re-attached
    as the ``target`` column.
    """
    # Work on a copy. The steps below mutate the frame in place and the name
    # is rebound further down, so without the copy the caller would be left
    # holding a half-processed frame.
    X = X.copy()

    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)
    X['v22'] = X['v22'].apply(az_to_int)

    # Column groups are computed after the v22 conversion so that v22 is
    # classified by its converted dtype.
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()

    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")

    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    # Categorical columns: fill missing -> group rare labels -> frequency encode.
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])

    # more cleaning
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)

    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y_resampled = undersampler.fit_resample(X, y)

    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")

    # Remove duplicated features
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")

    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlared features {drop_corr.features_to_drop_}")

    # Re-attach the (resampled) target so the result is a single table.
    X['target'] = y_resampled
    return X
def test_fit_attributes(df_duplicate_features):
    """Check the attributes that ``fit`` sets on the duplicate-features fixture."""
    expected_to_drop = {"dob", "dob3", "City2", "Age2"}
    expected_groups = [
        {"dob", "dob2", "dob3"},
        {"City", "City2"},
        {"Age", "Age2"},
    ]

    selector = DropDuplicateFeatures()
    selector.fit(df_duplicate_features)

    assert selector.features_to_drop_ == expected_to_drop
    assert selector.duplicated_feature_sets_ == expected_groups
def test_drop_duplicates_features(df_duplicate_features):
    """``fit_transform`` must return exactly the expected reduced frame."""
    transformer = DropDuplicateFeatures()
    X = transformer.fit_transform(df_duplicate_features)

    # expected result
    # "min" is the minute-frequency alias; the single-letter "T" spelling is
    # deprecated in recent pandas releases and yields the same timestamps.
    df = pd.DataFrame(
        {
            "Name": ["tom", "nick", "krish", "jack"],
            "dob2": pd.date_range("2020-02-24", periods=4, freq="min"),
            "City": ["London", "Manchester", "Liverpool", "Bristol"],
            "Age": [20, 21, 19, 18],
            "Marks": [0.9, 0.8, 0.7, 0.6],
        }
    )
    pd.testing.assert_frame_equal(X, df)
def test_with_df_with_na(df_duplicate_features_with_na):
    """Check the transformed frame and fitted attributes on the NaN fixture."""
    transformer = DropDuplicateFeatures()
    X = transformer.fit_transform(df_duplicate_features_with_na)

    # expected result
    # "min" is the minute-frequency alias; the single-letter "T" spelling is
    # deprecated in recent pandas releases and yields the same timestamps.
    df = pd.DataFrame(
        {
            "Name": ["tom", "nick", "krish", "jack", np.nan],
            "dob2": pd.date_range("2020-02-24", periods=5, freq="min"),
            "City": ["London", "Manchester", "Liverpool", "Bristol", np.nan],
            "Age": [20, 21, np.nan, 18, 34],
            "Marks": [0.9, 0.8, 0.7, 0.6, 0.5],
        }
    )
    pd.testing.assert_frame_equal(X, df)

    assert transformer.duplicated_features_ == {"dob", "dob3", "City2", "Age2"}
    assert transformer.duplicated_feature_sets_ == [
        {"dob", "dob2", "dob3"},
        {"City", "City2"},
        {"Age", "Age2"},
    ]
    assert transformer.input_shape_ == (5, 9)
def test_non_fitted_error(df_duplicate_features):
    """Calling ``transform`` before ``fit`` must raise ``NotFittedError``."""
    # test case 3: when fit is not called prior to transform
    # The transformer is built outside the ``raises`` block so that only the
    # ``transform`` call can satisfy the expectation; an error raised while
    # constructing the object would otherwise make the test pass wrongly.
    transformer = DropDuplicateFeatures()
    with pytest.raises(NotFittedError):
        transformer.transform(df_duplicate_features)
def test_error_if_fit_input_not_dataframe():
    """Fitting on a plain dict is expected to raise ``TypeError``."""
    not_a_frame = {"Name": ["Karthik"]}
    with pytest.raises(TypeError):
        DropDuplicateFeatures().fit(not_a_frame)
def test_variables_assigned_correctly(df_duplicate_features):
    """``variables`` is None before fit and equals all input columns after."""
    selector = DropDuplicateFeatures()
    assert selector.variables is None

    selector.fit(df_duplicate_features)
    all_columns = list(df_duplicate_features.columns)
    assert selector.variables == all_columns
def fake_columns(var_list, df):
    """Return the columns of ``df[var_list]`` flagged as duplicates.

    A ``DropDuplicateFeatures`` selector is fitted on the selected columns
    and its ``features_to_drop_`` attribute is returned as a list.
    """
    subset = df[var_list]
    fitted_selector = DropDuplicateFeatures().fit(subset)
    return list(fitted_selector.features_to_drop_)
DropFeatures, DropHighPSIFeatures, RecursiveFeatureAddition, RecursiveFeatureElimination, SelectByShuffling, SelectBySingleFeaturePerformance, SelectByTargetMeanPerformance, SmartCorrelatedSelection, ) _logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1) _estimators = [ DropFeatures(features_to_drop=["0"]), DropConstantFeatures(missing_values="ignore"), DropDuplicateFeatures(), DropCorrelatedFeatures(), DropHighPSIFeatures(bins=5), SmartCorrelatedSelection(), SelectByShuffling(estimator=_logreg, scoring="accuracy"), SelectByTargetMeanPerformance(bins=3, regression=False), SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"), RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"), RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100), ] _multivariate_estimators = [ DropDuplicateFeatures(), DropCorrelatedFeatures(), SmartCorrelatedSelection(), SelectByShuffling(estimator=_logreg, scoring="accuracy"),