Ejemplo n.º 1
0
def test_default_params(df_correlated_single):
    """Fit with pearson correlation and a 0.8 threshold on the single-group fixture.

    Checks the init parameters, the attributes populated by ``fit`` and that
    ``var_2`` is the only column missing from the transformed frame.
    """
    selector = DropCorrelatedFeatures(
        variables=None,
        method="pearson",
        threshold=0.8,
    )
    transformed = selector.fit_transform(df_correlated_single)

    # Expected output: the input frame without "var_2".
    expected = df_correlated_single.drop("var_2", axis=1)
    all_variables = [f"var_{idx}" for idx in range(6)]

    # Init parameters.
    assert selector.method == "pearson"
    assert selector.threshold == 0.8
    assert selector.variables == all_variables

    # Attributes set during fit.
    pd.testing.assert_frame_equal(
        selector.correlated_matrix_,
        df_correlated_single.corr(),
    )
    assert selector.correlated_features_ == {"var_2"}
    assert selector.correlated_feature_sets_ == [{"var_1", "var_2"}]

    # Transform output.
    pd.testing.assert_frame_equal(transformed, expected)
Ejemplo n.º 2
0
def test_lower_threshold(df_correlated_single):
    """Fit with a 0.6 threshold, which should also flag ``var_4`` as correlated.

    Verifies that ``variables`` stays ``None`` after fitting, that the fitted
    attributes list all six columns and the single correlated group, and that
    ``var_2`` and ``var_4`` are removed from the output.
    """
    selector = DropCorrelatedFeatures(
        variables=None,
        method="pearson",
        threshold=0.6,
    )
    transformed = selector.fit_transform(df_correlated_single)

    # Expected output: the input frame without the two dropped columns.
    dropped = ["var_2", "var_4"]
    expected = df_correlated_single.drop(dropped, axis=1)

    # Init parameters are left untouched by fit.
    assert selector.method == "pearson"
    assert selector.threshold == 0.6
    assert selector.variables is None

    # Attributes set during fit.
    assert selector.variables_ == [f"var_{idx}" for idx in range(6)]
    assert selector.features_to_drop_ == {"var_2", "var_4"}
    assert selector.correlated_feature_sets_ == [{"var_1", "var_2", "var_4"}]

    # Transform output.
    pd.testing.assert_frame_equal(transformed, expected)
Ejemplo n.º 3
0
def clean_data(X):
    """Clean, rebalance and prune the features of a raw training frame.

    Steps, in order:

    1. Drop rows with a missing ``'target'``, split the target off and drop
       the ``'ID'`` column.
    2. Convert ``'v22'`` with ``az_to_int``.
    3. Impute numeric columns with the median and categorical columns with
       ``'__MISS__'``; group rare categories under ``'__OTHER__'`` and
       frequency-encode the categorical columns.
    4. Winsorize both tails at the 0.005 quantile.
    5. Randomly undersample (``sampling_strategy=0.7``, fixed seed).
    6. Drop quasi-constant, duplicated and highly correlated
       (pearson, threshold 0.95) features, printing what is removed.
    7. Re-attach the resampled target as the ``'target'`` column.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw data; must contain the columns ``'target'``, ``'ID'`` and
        ``'v22'``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned feature columns plus a ``'target'`` column.
    """
    # Work on a private copy. The original mutated the caller's frame in place
    # (dropna/pop/drop/column assignment) and then returned a different object,
    # which left the caller holding a half-cleaned frame without its target.
    X = X.copy()

    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)
    X['v22'] = X['v22'].apply(az_to_int)

    # Column groups are computed after the 'v22' conversion on purpose, so the
    # converted column is classified by its new dtype.
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()

    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")

    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])

    # Cap extreme values in both tails.
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)

    # Rebalance the classes; the seed is fixed so the sample is reproducible.
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y_resampled = undersampler.fit_resample(X, y)

    # Remove quasi-constant features.
    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")

    # Remove duplicated features.
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")

    # Remove highly correlated features.
    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlared features {drop_corr.features_to_drop_}")

    X['target'] = y_resampled
    return X
Ejemplo n.º 4
0
def test_callable_method(df_correlated_double, random_uniform_method):
    """Fit using a callable correlation method instead of a named one.

    Only sanity checks are made: the output is not empty, the fitted
    attributes are populated and ``n_features_in_`` matches the input width.
    """
    data = df_correlated_double
    selector = DropCorrelatedFeatures(
        variables=None,
        method=random_uniform_method,
        threshold=0.6,
    )
    reduced = selector.fit_transform(data)

    # The transformer must not have dropped everything.
    assert not reduced.empty

    # Each fitted collection must hold at least one entry.
    for attr_name in ("correlated_feature_sets_", "features_to_drop_", "variables_"):
        assert len(getattr(selector, attr_name)) > 0

    assert selector.n_features_in_ == len(data.columns)
Ejemplo n.º 5
0
def test_error_method_supplied(df_correlated_double):
    """An unsupported ``method`` string must raise ``ValueError`` with an exact message."""
    method = "hola"
    selector = DropCorrelatedFeatures(
        variables=None,
        method=method,
        threshold=0.8,
    )

    with pytest.raises(ValueError) as excinfo:
        selector.fit_transform(df_correlated_double)

    # The message is compared verbatim, including the offending value.
    expected_message = (
        "method must be either 'pearson', 'spearman', 'kendall', or a callable,"
        f" '{method}' was supplied"
    )
    assert excinfo.value.args[0] == expected_message
Ejemplo n.º 6
0
def test_more_than_1_correlated_group(df_correlated_double):
    """Fit on a fixture where two separate groups of correlated features are expected.

    Checks the dropped features, the two reported groups and the transformed
    frame.
    """
    selector = DropCorrelatedFeatures(
        variables=None,
        method="pearson",
        threshold=0.6,
    )
    transformed = selector.fit_transform(df_correlated_double)

    # Expected output: the input frame without the four dropped columns.
    dropped = ["var_6", "var_7", "var_8", "var_9"]
    expected = df_correlated_double.drop(dropped, axis=1)

    # Attributes set during fit.
    assert selector.features_to_drop_ == {"var_6", "var_7", "var_8", "var_9"}
    expected_groups = [
        {"var_0", "var_8"},
        {"var_4", "var_6", "var_7", "var_9"},
    ]
    assert selector.correlated_feature_sets_ == expected_groups

    # Transform output.
    pd.testing.assert_frame_equal(transformed, expected)
Ejemplo n.º 7
0
def test_non_fitted_error(df_correlated_single):
    """``transform`` must raise ``NotFittedError`` when ``fit`` was never called."""
    # Build the transformer outside the ``raises`` block so that only the
    # ``transform`` call can satisfy the expected exception; an error raised
    # by the constructor would otherwise make the test pass for the wrong reason.
    transformer = DropCorrelatedFeatures()
    with pytest.raises(NotFittedError):
        transformer.transform(df_correlated_single)
Ejemplo n.º 8
0
def test_error_if_fit_input_not_dataframe():
    """``fit`` is expected to raise ``TypeError`` when handed a plain dict."""
    not_a_dataframe = {"Name": [1]}
    with pytest.raises(TypeError):
        # NOTE(review): the original author flagged this call for review.
        DropCorrelatedFeatures().fit(not_a_dataframe)
Ejemplo n.º 9
0
    DropHighPSIFeatures,
    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)

# Classifier instance shared by every model-based selector listed below.
# NOTE(review): C and max_iter look deliberately small, presumably to keep
# fitting cheap in tests -- confirm.
_logreg = LogisticRegression(
    C=0.0001,
    max_iter=2,
    random_state=1,
)

# One configured instance of each selector.
# NOTE(review): presumably consumed by parametrized checks elsewhere in the
# file -- verify against the code that uses this list.
_estimators = [
    DropFeatures(
        features_to_drop=["0"],
    ),
    DropConstantFeatures(
        missing_values="ignore",
    ),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(
        bins=5,
    ),
    SmartCorrelatedSelection(),
    SelectByShuffling(
        estimator=_logreg,
        scoring="accuracy",
    ),
    SelectByTargetMeanPerformance(
        bins=3,
        regression=False,
    ),
    SelectBySingleFeaturePerformance(
        estimator=_logreg,
        scoring="accuracy",
    ),
    RecursiveFeatureAddition(
        estimator=_logreg,
        scoring="accuracy",
    ),
    RecursiveFeatureElimination(
        estimator=_logreg,
        scoring="accuracy",
        threshold=-100,
    ),
]

_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),