import numpy as np
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError

from feature_engine.selection import SelectByTargetMeanPerformance

# NOTE: df_classification, df_regression and the df_test / df_test_num_cat
# fixtures used below are assumed to be provided by the test suite's
# conftest / helper module; they are not defined in this file.


def test_classification():

    X, y = df_classification()

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="accuracy",
        threshold=None,
        bins=2,
        strategy="equal_width",
        cv=2,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X[["cat_var_A", "num_var_A"]]

    performance_dict = {
        "cat_var_A": 1.0,
        "cat_var_B": 0.8,
        "num_var_A": 1.0,
        "num_var_B": 0.8,
    }
    features_to_drop = ["cat_var_B", "num_var_B"]

    assert sel.features_to_drop_ == features_to_drop
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_A", "cat_var_B", "num_var_A", "num_var_B"],
        scoring="roc_auc",
        threshold=0.9,
        bins=2,
        strategy="equal_frequency",
        cv=2,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X[["cat_var_A", "cat_var_B", "num_var_A"]]

    performance_dict = {
        "cat_var_A": 1.0,
        "cat_var_B": 0.92,
        "num_var_A": 1.0,
        "num_var_B": 0.8,
    }
    features_to_drop = ["num_var_B"]

    assert sel.features_to_drop_ == features_to_drop
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
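

# --- Illustrative sketch (not part of the library's own test suite) ---------
# The assertions above rest on the idea behind SelectByTargetMeanPerformance:
# each feature is turned into a prediction by replacing every category (or bin)
# with the mean of the target observed for it, and that prediction is scored
# against the target. The toy data below is made up purely for illustration.
def test_sketch_target_mean_encoding():
    from sklearn.metrics import roc_auc_score

    toy_cat = pd.Series(["a", "a", "b", "b", "b", "a"], name="cat")
    toy_y = pd.Series([1, 1, 0, 0, 0, 1])

    # mean of the target per category, used as the per-row "prediction"
    category_means = toy_y.groupby(toy_cat).mean()
    predictions = toy_cat.map(category_means)

    # a category that separates the classes perfectly scores a roc-auc of 1.0
    assert roc_auc_score(toy_y, predictions) == 1.0

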
def test_regression():

    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=None,
        bins=2,
        scoring="r2",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=None,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X[["cat_var_A", "cat_var_B", "num_var_A"]]
    performance_dict = {
        "cat_var_A": 1.0,
        "cat_var_B": 0.8533333333333333,
        "num_var_A": 0.8,
        "num_var_B": 0.512,
    }

    assert sel.features_to_drop_ == ["num_var_B"]
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)

    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_A", "cat_var_B", "num_var_A", "num_var_B"],
        bins=2,
        scoring="neg_root_mean_squared_error",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=-0.2,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X["cat_var_A"].to_frame()
    performance_dict = {
        "cat_var_A": 0.0,
        "cat_var_B": -0.42817441928883765,
        "num_var_A": -0.5,
        "num_var_B": -0.7810249675906654,
    }

    assert sel.features_to_drop_ == ["cat_var_B", "num_var_A", "num_var_B"]
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
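

# --- Illustrative sketch (not part of the library's own test suite) ---------
# For the regression case above, a numerical feature is first discretised
# (here with pd.cut into two equal-width intervals, mirroring strategy
# "equal_width" and bins=2), the per-bin target mean becomes the prediction,
# and r2 scores it. Toy data only; the exact values are not from the library.
def test_sketch_binned_target_mean_r2():
    from sklearn.metrics import r2_score

    num = pd.Series([1.0, 2.0, 3.0, 10.0, 11.0, 12.0], name="num")
    target = pd.Series([1.0, 1.2, 0.9, 5.0, 5.1, 4.9])

    bins = pd.cut(num, bins=2)                             # two equal-width intervals
    predictions = target.groupby(bins).transform("mean")   # per-bin target mean

    # a feature whose bins track the target well yields a high r2
    assert r2_score(target, predictions) > 0.9

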
def test_categorical_variables_roc_auc(df_test_num_cat):
    X, y = df_test_num_cat
    X = X[["var_A", "var_B"]]

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="roc_auc_score",
        threshold=0.78,
        cv=2,
        random_state=1,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X["var_A"].to_frame()
    # performance_dict = {"var_A": 0.841, "var_B": 0.776}

    # test init params
    assert sel.variables == list(X.columns)
    assert sel.scoring == "roc_auc_score"
    assert sel.threshold == 0.78
    assert sel.cv == 2
    assert sel.random_state == 1

    # test fit attrs
    assert sel.variables_categorical_ == list(X.columns)
    assert sel.variables_numerical_ == []
    assert sel.features_to_drop_ == ["var_B"]
    # assert all(
    #     np.round(sel.feature_performance_[f], 3) == performance_dict[f]
    #     for f in sel.feature_performance_.keys()
    # )
    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
def test_numerical_variables_roc_auc(df_test):
    X, y = df_test

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="roc_auc_score",
        threshold=0.6,
        bins=5,
        strategy="equal_width",
        cv=3,
        random_state=1,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X[["var_0", "var_4", "var_6", "var_7", "var_9"]]
    # performance_dict = {
    #     "var_0": 0.628,
    #     "var_1": 0.548,
    #     "var_2": 0.513,
    #     "var_3": 0.474,
    #     "var_4": 0.973,
    #     "var_5": 0.496,
    #     "var_6": 0.97,
    #     "var_7": 0.992,
    #     "var_8": 0.536,
    #     "var_9": 0.931,
    #     "var_10": 0.466,
    #     "var_11": 0.517,
    # }

    # test init params
    assert sel.variables is None
    assert sel.scoring == "roc_auc_score"
    assert sel.threshold == 0.6
    assert sel.bins == 5
    assert sel.strategy == "equal_width"
    assert sel.cv == 3
    assert sel.random_state == 1

    # test fit attrs
    assert sel.variables_ == list(X.columns)
    assert sel.variables_categorical_ == []
    assert sel.variables_numerical_ == list(X.columns)
    assert sel.features_to_drop_ == [
        "var_1",
        "var_2",
        "var_3",
        "var_5",
        "var_8",
        "var_10",
        "var_11",
    ]
    # assert all(
    #     np.round(sel.feature_performance_[f], 3) == performance_dict[f]
    #     for f in sel.feature_performance_.keys()
    # )
    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
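

# --- Illustrative sketch (not part of the library's own test suite) ---------
# The strategy parameter used above controls how numerical variables are
# discretised: "equal_width" splits the range into intervals of equal length
# (pd.cut-like), whereas "equal_frequency" aims for the same number of
# observations per bin (pd.qcut-like). Toy data, for illustration only.
def test_sketch_binning_strategies():
    skewed = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 100.0])

    equal_width = pd.cut(skewed, bins=2)    # intervals of equal length
    equal_frequency = pd.qcut(skewed, q=2)  # intervals with equal counts

    # the outlier pushes almost everything into the first equal-width interval
    assert equal_width.value_counts().max() == 7
    # equal-frequency binning balances the counts instead
    assert sorted(equal_frequency.value_counts().tolist()) == [4, 4]

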
def test_selector_with_one_variable():

    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_A"],
        bins=2,
        scoring="neg_root_mean_squared_error",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=-0.2,
    )

    sel.fit(X, y)

    # expected result
    performance_dict = {"cat_var_A": 0.0}

    assert sel.features_to_drop_ == []
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), X)

    X, y = df_regression()

    sel = SelectByTargetMeanPerformance(
        variables=["cat_var_B"],
        bins=2,
        scoring="neg_root_mean_squared_error",
        regression=True,
        cv=2,
        strategy="equal_width",
        threshold=-0.2,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X.drop(columns=["cat_var_B"])
    performance_dict = {"cat_var_B": -0.42817441928883765}

    assert sel.features_to_drop_ == ["cat_var_B"]
    assert sel.feature_performance_ == performance_dict
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
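

# --- Illustrative sketch (not part of the library's own test suite) ---------
# A note on the sign convention behind threshold=-0.2 above: sklearn's
# "neg_root_mean_squared_error" is the RMSE multiplied by -1, so larger values
# are better and a threshold of -0.2 keeps features whose RMSE is below 0.2.
def test_sketch_neg_rmse_sign_convention():
    y_true = np.array([0.0, 1.0, 2.0])
    y_pred = np.array([0.1, 1.1, 2.1])

    neg_rmse = -np.sqrt(np.mean((y_true - y_pred) ** 2))

    # an error of 0.1 on every observation gives an RMSE of 0.1, i.e. a score
    # of -0.1, which clears the -0.2 threshold used in the tests above
    assert np.isclose(neg_rmse, -0.1)
    assert neg_rmse > -0.2

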
def test_df_cat_and_num_variables_r2(df_test_num_cat):
    X, y = df_test_num_cat

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="r2_score",
        threshold=0.1,
        bins=3,
        strategy="equal_frequency",
        cv=2,
        random_state=1,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X[["var_A", "var_B"]]
    # performance_dict = {
    #     "var_A": 0.392,
    #     "var_B": 0.250,
    #     "var_C": -0.004,
    #     "var_D": -0.052,
    # }

    # test init params
    assert sel.variables is None
    assert sel.scoring == "r2_score"
    assert sel.threshold == 0.1
    assert sel.cv == 2
    assert sel.bins == 3
    assert sel.strategy == "equal_frequency"
    assert sel.random_state == 1

    # test fit attrs
    assert sel.variables_ == list(X.columns)
    assert sel.variables_categorical_ == ["var_A", "var_B"]
    assert sel.variables_numerical_ == ["var_C", "var_D"]
    assert sel.features_to_drop_ == ["var_C", "var_D"]
    # assert all(
    #     np.round(sel.feature_performance_[f], 3) == performance_dict[f]
    #     for f in sel.feature_performance_.keys()
    # )
    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
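

# --- Illustrative sketch (not part of the library's own test suite) ---------
# Intuition for the threshold=0.1 assertions above: a grouping that carries no
# information about the target predicts roughly the global target mean for
# every row, so its r2 sits near zero and the feature is dropped, while an
# informative grouping clears the threshold. Toy data for illustration only.
def test_sketch_r2_threshold_intuition():
    from sklearn.metrics import r2_score

    target = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    informative = pd.Series(["low", "low", "low", "high", "high", "high"])
    uninformative = pd.Series(["x", "y", "y", "x", "x", "y"])

    def group_mean_r2(groups):
        # per-group target mean used as the prediction for every row
        predictions = target.groupby(groups).transform("mean")
        return r2_score(target, predictions)

    assert group_mean_r2(informative) > 0.1     # would be kept
    assert group_mean_r2(uninformative) < 0.1   # would be dropped

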
def test_df_cat_and_num_variables_roc_auc(df_test_num_cat):
    X, y = df_test_num_cat

    sel = SelectByTargetMeanPerformance(
        variables=None,
        scoring="roc_auc_score",
        threshold=0.6,
        bins=3,
        strategy="equal_width",
        cv=2,
        random_state=1,
    )

    sel.fit(X, y)

    # expected result
    Xtransformed = X[["var_A", "var_B"]]
    performance_dict = {
        "var_A": 0.841,
        "var_B": 0.776,
        "var_C": 0.481,
        "var_D": 0.496,
    }

    # test init params
    assert sel.variables == list(X.columns)
    assert sel.scoring == "roc_auc_score"
    assert sel.threshold == 0.60
    assert sel.cv == 2
    assert sel.random_state == 1

    # test fit attrs
    assert sel.variables_categorical_ == ["var_A", "var_B"]
    assert sel.variables_numerical_ == ["var_C", "var_D"]
    assert sel.features_to_drop_ == ["var_C", "var_D"]
    assert all(
        np.round(sel.feature_performance_[f], 3) == performance_dict[f]
        for f in sel.feature_performance_.keys()
    )
    # test transform output
    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
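

# --- Illustrative sketch (not part of the library's own test suite) ---------
# What the cv parameter above implies: the per-category target means are
# learned on the training folds and scored on the held-out fold, so the
# reported performance is cross-validated rather than in-sample. The manual
# KFold loop and the toy data below are illustrative only.
def test_sketch_cross_validated_target_mean():
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import KFold

    cat = pd.Series(["a", "b"] * 10)           # alternating categories
    target = pd.Series([1, 0] * 9 + [0, 1])    # follows the category, two flips

    scores = []
    for train_idx, test_idx in KFold(n_splits=2).split(cat):
        # target means learned on the training fold only
        fold_means = target.iloc[train_idx].groupby(cat.iloc[train_idx]).mean()
        # applied as predictions on the held-out fold
        predictions = cat.iloc[test_idx].map(fold_means)
        scores.append(roc_auc_score(target.iloc[test_idx], predictions))

    # a feature that tracks the target closely scores well out of fold
    assert np.mean(scores) > 0.7

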
def test_not_fitted_error(df_test):
    with pytest.raises(NotFittedError):
        transformer = SelectByTargetMeanPerformance()
        transformer.transform(df_test)
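

# --- Illustrative sketch (not part of the library's own test suite) ---------
# The test above relies on the standard scikit-learn contract: calling
# transform() before fit() raises NotFittedError. The ToyTransformer below is
# hypothetical and only demonstrates that contract with check_is_fitted.
def test_sketch_not_fitted_contract():
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.utils.validation import check_is_fitted

    class ToyTransformer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            self.fitted_ = True  # trailing underscore marks a learned attribute
            return self

        def transform(self, X):
            check_is_fitted(self)  # raises NotFittedError if fit() was skipped
            return X

    with pytest.raises(NotFittedError):
        ToyTransformer().transform(pd.DataFrame({"a": [1, 2, 3]}))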