Example #1
0
def test_target_leakage_types():
    """Leakage is reported for string, numeric, datetime and inverted features.

    Columns 'a' through 'd' are each expected to produce a TARGET_LEAKAGE
    warning at the 0.8 threshold; the constant column 'e' is expected to
    produce none.
    """
    target = pd.Series([1, 0, 1, 1])
    years = ("2015", "2016", "2015", "2015")

    # Every feature is derived from the integer target *before* the target
    # is cast to bool, so `~target` is the bitwise inverse of the ints.
    feature_columns = (
        ("a", ["a", "b", "a", "a"]),
        ("b", target - 1),
        ("c", [datetime.strptime(year, "%Y") for year in years]),
        ("d", ~target),
        ("e", [0] * 4),
    )
    X = pd.DataFrame()
    for name, values in feature_columns:
        X[name] = values
    y = target.astype(bool)

    warning_text = "Column '{}' is 80.0% or more correlated with the target"
    expected_messages = {
        "warnings": [
            DataCheckWarning(
                message=warning_text.format(name),
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": name},
            ).to_dict()
            for name in ("a", "b", "c", "d")
        ],
        "errors": [],
    }

    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8)
    assert leakage_check.validate(X, y) == expected_messages
def test_target_leakage_data_check_empty(data_type, make_data_type):
    """Empty features and an empty target yield an empty validation report.

    `data_type` and `make_data_type` are fixtures supplied by the test suite;
    they are used here to wrap the empty pandas objects before validation.
    """
    empty_features = make_data_type(data_type, pd.DataFrame())
    empty_target = make_data_type(data_type, pd.Series())

    check = TargetLeakageDataCheck(method='mutual', pct_corr_threshold=0.8)
    report = check.validate(empty_features, empty_target)

    assert report == {"warnings": [], "errors": [], "actions": []}
Example #3
0
def test_target_leakage_multi():
    """Leakage detection against a three-class integer target.

    Columns 'a', 'b' and 'c' are arithmetic transforms of the target and are
    expected to be flagged; the constant column 'd' and the string column 'e'
    are expected not to be.
    """
    check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    # Empty DataFrame with an empty Series: nothing to report.
    nothing_found = {"warnings": [], "errors": [], "actions": []}
    assert check.validate(pd.DataFrame(), pd.Series()) == nothing_found

    y = pd.Series([1, 0, 2, 1, 2, 0])
    X = pd.DataFrame()
    for name, values in (
        ("a", y * 3),
        ("b", y - 1),
        ("c", y / 10),
        ("d", [0] * 6),
        ("e", ["a", "b", "c", "a", "b", "c"]),
    ):
        X[name] = values

    warning_text = "Column '{}' is 80.0% or more correlated with the target"
    expected_messages = {
        "warnings": [
            DataCheckWarning(
                message=warning_text.format(name),
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": name},
            ).to_dict()
            for name in ("a", "b", "c")
        ],
        "errors": [],
        "actions": [],
    }

    # Features as ww.DataTable, target as ww.DataColumn.
    wrapped_result = check.validate(ww.DataTable(X), ww.DataColumn(y))
    assert wrapped_result == expected_messages

    # Features as a DataFrame, target passed as y.values (not a Series).
    assert check.validate(X, y.values) == expected_messages
def test_target_leakage_none():
    """Constant feature columns produce no warnings, errors or actions."""
    X = pd.DataFrame()
    for name, constant in (("a", 1), ("b", 0)):
        X[name] = [constant] * 4
    y = pd.Series([1, 0, 1, 1]).astype(bool)

    check = TargetLeakageDataCheck(pct_corr_threshold=0.8)
    report = check.validate(X, y)

    assert report == {"warnings": [], "errors": [], "actions": []}
def test_target_leakage_data_check_init():
    """Constructor validation for `pct_corr_threshold` and `method`.

    Covers the default threshold (0.95), accepted thresholds at both ends and
    the middle of [0, 1], rejected thresholds just outside that range, and
    rejected method names.
    """
    # Default threshold when none is supplied.
    assert TargetLeakageDataCheck().pct_corr_threshold == 0.95

    # Accepted thresholds are stored unchanged (boundaries included).
    for threshold in (0.0, 0.5, 1.0):
        check = TargetLeakageDataCheck(pct_corr_threshold=threshold)
        assert check.pct_corr_threshold == threshold

    # Thresholds outside [0, 1] are rejected with the same message.
    range_error = (
        "pct_corr_threshold must be a float between 0 and 1, inclusive.")
    for out_of_range in (-0.1, 1.1):
        with pytest.raises(ValueError, match=range_error):
            TargetLeakageDataCheck(pct_corr_threshold=out_of_range)

    # Method names are expected to be rejected when upper-cased or misspelled.
    for bad_method in ('MUTUAL', 'person'):
        expected_error = "Method '{}' not in".format(bad_method)
        with pytest.raises(ValueError, match=expected_error):
            TargetLeakageDataCheck(method=bad_method)
Example #6
0
def test_target_leakage_data_check_warnings():
    """At a 0.5 threshold, columns 'a' through 'd' each produce a warning.

    The warning message embeds the threshold as "50.0%". The constant column
    'e' is expected to produce no warning, and no actions are expected.
    """
    target = pd.Series([1, 0, 1, 1])

    # All features are computed from the integer target before it is cast
    # to bool for validation.
    X = pd.DataFrame()
    for name, values in (
        ("a", target * 3),
        ("b", target - 1),
        ("c", target / 10),
        ("d", ~target),
        ("e", [0] * 4),
    ):
        X[name] = values
    y = target.astype(bool)

    warning_text = "Column '{}' is 50.0% or more correlated with the target"
    expected = {
        "warnings": [
            DataCheckWarning(
                message=warning_text.format(name),
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": name},
            ).to_dict()
            for name in ("a", "b", "c", "d")
        ],
        "errors": [],
        "actions": [],
    }

    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5)
    assert leakage_check.validate(X, y) == expected
def test_target_leakage_data_check_input_formats_pearson():
    """The 'pearson' method gives matching reports across input containers.

    The same data is validated as a raw array (`X.values`), as
    ww.DataTable / ww.DataColumn, and as a DataFrame with `y.values`. Each
    flagged column is expected to carry both a TARGET_LEAKAGE warning and a
    DROP_COL action; the constant column 'e' is expected to carry neither.
    """
    warning_text = "Column '{}' is 80.0% or more correlated with the target"

    def leakage_report(columns):
        # Build the expected report: one warning and one drop action per
        # flagged column, in the order given.
        return {
            "warnings": [
                DataCheckWarning(
                    message=warning_text.format(column),
                    data_check_name=target_leakage_data_check_name,
                    message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                    details={"column": column},
                ).to_dict()
                for column in columns
            ],
            "errors": [],
            "actions": [
                DataCheckAction(
                    DataCheckActionCode.DROP_COL,
                    metadata={"column": column},
                ).to_dict()
                for column in columns
            ],
        }

    check = TargetLeakageDataCheck(method='pearson', pct_corr_threshold=0.8)

    # Empty DataFrame with an empty Series: nothing to report.
    nothing_found = {"warnings": [], "errors": [], "actions": []}
    assert check.validate(pd.DataFrame(), pd.Series()) == nothing_found

    target = pd.Series([1, 0, 1, 1])
    X = pd.DataFrame()
    for name, values in (
        ("a", target * 3),
        ("b", target - 1),
        ("c", target / 10),
        ("d", ~target),
        ("e", [0] * 4),
    ):
        X[name] = values
    y = target.astype(bool)

    # Features as a raw array: columns are reported by integer position.
    assert check.validate(X.values, y) == leakage_report(range(4))

    expected = leakage_report(("a", "b", "c", "d"))

    # Features as ww.DataTable, target as ww.DataColumn.
    assert check.validate(ww.DataTable(X), ww.DataColumn(y)) == expected

    # Features as a DataFrame, target passed as y.values (not a Series).
    assert check.validate(X, y.values) == expected
def test_target_leakage_regression():
    """Leakage detection against a continuous (float) target.

    Columns 'a', 'b' and 'c' are arithmetic transforms of the target, and 'e'
    holds a distinct string per row; all four are expected to be flagged with
    a warning and a DROP_COL action. The constant column 'd' is expected not
    to be.
    """
    check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    # Empty DataFrame with an empty Series: nothing to report.
    nothing_found = {"warnings": [], "errors": [], "actions": []}
    assert check.validate(pd.DataFrame(), pd.Series()) == nothing_found

    y = pd.Series([
        0.4, 0.1, 2.3, 4.3, 2.2,
        1.8, 3.7, 3.6, 2.4, 0.9,
        3.1, 2.8, 4.1, 1.6, 1.2,
    ])
    X = pd.DataFrame()
    for name, values in (
        ("a", y * 3),
        ("b", y - 1),
        ("c", y / 10),
        ("d", [0] * 15),
        ("e", list("abcdefghijklmno")),  # one unique letter per row
    ):
        X[name] = values

    flagged = ("a", "b", "c", "e")
    warning_text = "Column '{}' is 80.0% or more correlated with the target"
    expected = {
        "warnings": [
            DataCheckWarning(
                message=warning_text.format(name),
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": name},
            ).to_dict()
            for name in flagged
        ],
        "errors": [],
        "actions": [
            DataCheckAction(
                DataCheckActionCode.DROP_COL,
                metadata={"column": name},
            ).to_dict()
            for name in flagged
        ],
    }

    # Features as ww.DataTable, target as ww.DataColumn.
    assert check.validate(ww.DataTable(X), ww.DataColumn(y)) == expected

    # Features as a DataFrame, target passed as y.values (not a Series).
    assert check.validate(X, y.values) == expected
def test_target_leakage_data_check_input_formats():
    """The default method gives matching reports across input containers.

    The same data is validated as ww.DataTable / ww.DataColumn, as a
    DataFrame with `y.values`, and as a float array. Columns 'a' through 'd'
    are expected to be flagged; the constant column 'e' is not. The expected
    reports here contain only "warnings" and "errors" keys.
    """
    warning_text = "Column '{}' is 80.0% or more correlated with the target"

    def warnings_only_report(columns):
        # Build the expected report: one warning per flagged column, in the
        # order given.
        return {
            "warnings": [
                DataCheckWarning(
                    message=warning_text.format(column),
                    data_check_name=target_leakage_data_check_name,
                    message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                    details={"column": column},
                ).to_dict()
                for column in columns
            ],
            "errors": [],
        }

    check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    target = pd.Series([1, 0, 1, 1])
    X = pd.DataFrame()
    for name, values in (
        ("a", target * 3),
        ("b", target - 1),
        ("c", target / 10),
        ("d", ~target),
        ("e", [0] * 4),
    ):
        X[name] = values
    y = target.astype(bool)

    expected_messages = warnings_only_report(("a", "b", "c", "d"))

    # Features as ww.DataTable, target as ww.DataColumn.
    wrapped_result = check.validate(ww.DataTable(X), ww.DataColumn(y))
    assert wrapped_result == expected_messages

    # Features as a DataFrame, target passed as y.values (not a Series).
    assert check.validate(X, y.values) == expected_messages

    # Features as a float array: columns are reported by integer position.
    as_float_array = X.to_numpy().astype(float)
    assert check.validate(as_float_array, y) == warnings_only_report(range(4))