Example 1
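The excerpts below come from EvalML's data-check test suite and share module-level setup that the listing does not show. The sketch that follows is a best guess at that setup: the import paths match the evalml and woodwork APIs the tests actually call (ww.DataTable/ww.DataColumn only exist in pre-0.1 woodwork releases), while the `messages`, `expected_actions`, and `numeric_and_boolean_ww` objects referenced by several tests, and the `input_type`/`problem_type`/`pd_type` arguments supplied by pytest parametrization, are defined elsewhere in the original module and are not reproduced here.

# Assumed module-level setup for the test excerpts in this listing.
import numpy as np
import pandas as pd
import pytest
import woodwork as ww

from evalml.automl import get_default_primary_search_objective
from evalml.data_checks import (
    DataCheckAction,
    DataCheckActionCode,
    DataCheckError,
    DataCheckMessageCode,
    DataChecks,
    DataCheckWarning,
    DefaultDataChecks,
    InvalidTargetDataCheck,
)
from evalml.problem_types import is_binary, is_multiclass, is_regression

# Name string compared against in the expected messages.
invalid_targets_data_check_name = InvalidTargetDataCheck.name

# `messages`, `expected_actions`, and `numeric_and_boolean_ww` are shared
# objects built earlier in the original test module; they are assumed here
# and not reconstructed.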
def test_default_data_checks_regression(input_type):
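    # Relies on the module-level `messages` fixture (built elsewhere in the
    # original test file) holding the expected warnings and errors for this frame.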
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100]
    })
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    null_leakage = [
        DataCheckWarning(
            message=
            "Column 'lots_of_null' is 95.0% or more correlated with the target",
            data_check_name="TargetLeakageDataCheck",
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={
                "column": "lots_of_null"
            }).to_dict()
    ]
    data_checks = DefaultDataChecks(
        "regression", get_default_primary_search_objective("regression"))
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:]
    }

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings":
        messages[:3] + null_leakage,
        "errors":
        messages[4:] + [
            DataCheckError(message="Y has 1 unique value.",
                           data_check_name="NoVarianceDataCheck",
                           message_code=DataCheckMessageCode.NO_VARIANCE,
                           details={
                               "column": "Y"
                           }).to_dict()
        ]
    }

    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, {
            "InvalidTargetDataCheck": {
                "problem_type": "regression",
                "objective": get_default_primary_search_objective("regression")
            }
        })
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:]
    }
Example 2
def test_default_data_checks_regression(input_type):
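    # Variant of the regression default-checks test that adds natural-language and
    # datetime columns and also asserts the suggested actions; relies on the shared
    # `messages` and `expected_actions` fixtures.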
    X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"],
                      'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None],
                      'no_null': [1, 2, 3, 5, 5],
                      'id': [0, 1, 2, 3, 4],
                      'has_label_leakage': [100, 200, 100, 200, 100],
                      'natural_language_nan': [None,
                                               "string_that_is_long_enough_for_natural_language_1",
                                               "string_that_is_long_enough_for_natural_language_2",
                                               "string_that_is_long_enough_for_natural_language_3",
                                               "string_that_is_long_enough_for_natural_language_4"],
                      'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))})
    X['nan_dt_col'][0] = None
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    id_leakage_warning = [DataCheckWarning(message="Column 'id' is 95.0% or more correlated with the target",
                                           data_check_name="TargetLeakageDataCheck",
                                           message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                           details={"column": "id"}).to_dict()]
    nan_dt_leakage_warning = [DataCheckWarning(message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
                                               data_check_name="TargetLeakageDataCheck",
                                               message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                               details={"column": "nan_dt_col"}).to_dict()]

    impute_action = DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, 'impute_strategy': 'mean'}).to_dict()
    nan_dt_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'nan_dt_col'}).to_dict()
    expected_actions_with_drop_and_impute = expected_actions[:3] + [nan_dt_action, impute_action] + expected_actions[4:]
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [DataCheckError(message="Y has 1 unique value.",
                                                  data_check_name="NoVarianceDataCheck",
                                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                                  details={"column": "Y"}).to_dict()] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:]
    }

    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}
Example 3
def test_default_data_checks_time_series_regression():
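    # The default data check classes for "time series regression" should be the
    # same as for plain "regression".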
    regression_data_check_classes = [
        check.__class__ for check in DefaultDataChecks(
            "regression", get_default_primary_search_objective(
                "regression")).data_checks
    ]
    ts_regression_data_check_classes = [
        check.__class__ for check in DefaultDataChecks(
            "time series regression",
            get_default_primary_search_objective(
                "time series regression")).data_checks
    ]
    assert regression_data_check_classes == ts_regression_data_check_classes
def test_invalid_target_data_check_numeric_binary_classification_error():
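    # Numeric binary targets must be encoded as [0, 1]; other encodings produce a
    # warning, and more than two unique values is an error.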
    y = pd.Series([1, 5, 1, 5, 1, 1])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary"))
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [
            DataCheckWarning(
                message=
                "Numerical binary classification target classes must be [0, 1], got [1, 5] instead",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_BINARY_INVALID_VALUES,
                details={
                    "target_values": [1, 5]
                }).to_dict()
        ],
        "errors": []
    }

    y = pd.Series([0, 5, np.nan, np.nan])
    X = pd.DataFrame({"col": range(len(y))})
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [
            DataCheckWarning(
                message=
                "Numerical binary classification target classes must be [0, 1], got [5.0, 0.0] instead",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_BINARY_INVALID_VALUES,
                details={
                    "target_values": [5.0, 0.0]
                }).to_dict()
        ],
        "errors": [
            DataCheckError(
                message="2 row(s) (50.0%) of target values are null",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                details={
                    "num_null_rows": 2,
                    "pct_null_rows": 50
                }).to_dict()
        ]
    }

    y = pd.Series([0, 1, 1, 0, 1, 2])
    X = pd.DataFrame({"col": range(len(y))})
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [
            DataCheckError(
                message=
                "Binary class targets require exactly two unique values.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                details={
                    "target_values": [1, 0, 2]
                }).to_dict()
        ]
    }
def test_invalid_target_data_check_multiclass_problem_almostcontinuous_data():
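    # A high unique-class-to-sample ratio should produce a warning that the target
    # may really be continuous (a regression problem).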
    invalid_targets_check = InvalidTargetDataCheck("multiclass", get_default_primary_search_objective("multiclass"))
    y_multiclass_high_classes = pd.Series(list(range(0, 100)) * 3)  # 100 classes, 300 samples, .33 class/sample ratio
    X = pd.DataFrame({"col": range(len(y_multiclass_high_classes))})
    data_check_error = DataCheckWarning(
        message=f"Target has a large number of unique values, could be regression type problem.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
        details={"class_to_value_ratio": 1 / 3}).to_dict()
    assert invalid_targets_check.validate(X, y=y_multiclass_high_classes) == {"warnings": [data_check_error],
                                                                              "errors": []}

    y_multiclass_med_classes = pd.Series(list(range(0, 5)) * 20)  # 5 classes, 100 samples, .05 class/sample ratio
    X = pd.DataFrame({"col": range(len(y_multiclass_med_classes))})
    data_check_error = DataCheckWarning(
        message=f"Target has a large number of unique values, could be regression type problem.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
        details={"class_to_value_ratio": .05}).to_dict()
    assert invalid_targets_check.validate(X, y=y_multiclass_med_classes) == {"warnings": [data_check_error],
                                                                             "errors": []}

    y_multiclass_low_classes = pd.Series(list(range(0, 3)) * 100)  # 3 classes, 300 samples, .01 class/sample ratio
    X = pd.DataFrame({"col": range(len(y_multiclass_low_classes))})
    assert invalid_targets_check.validate(X, y=y_multiclass_low_classes) == {"warnings": [], "errors": []}
def test_invalid_target_data_check_invalid_pandas_data_types_error(pd_type):
    y = pd.Series([0, 1, 0, 0, 1, 0, 1, 0])
    y = y.astype(pd_type)
    X = pd.DataFrame({"col": range(len(y))})

    invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary"))

    assert invalid_targets_check.validate(X, y) == {"warnings": [], "errors": [], "actions": []}

    y = pd.Series(pd.date_range('2000-02-03', periods=5, freq='W'))
    X = pd.DataFrame({"col": range(len(y))})

    unique_values = y.value_counts().index.tolist()
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [DataCheckError(message="Target is unsupported {} type. Valid Woodwork logical types include: {}"
                                  .format("Datetime",
                                          ", ".join([ltype.type_string for ltype in numeric_and_boolean_ww])),
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                                  details={"unsupported_type": "datetime"}).to_dict(),
                   DataCheckError(message="Binary class targets require exactly two unique values.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                  details={"target_values": unique_values}).to_dict()],
        "actions": []
    }
def test_invalid_target_data_check_different_lengths():
    X = pd.DataFrame({"col": [1, 2, 3]})
    y_diff_len = pd.Series([0, 1])
    invalid_targets_check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary"))
    assert invalid_targets_check.validate(X, y_diff_len) == {
        "warnings": [
            DataCheckWarning(
                message="Input target and features have different lengths",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.MISMATCHED_LENGTHS,
                details={
                    "features_length": len(X.index),
                    "target_length": len(y_diff_len.index)
                }).to_dict(),
            DataCheckWarning(
                message="Input target and features have mismatched indices",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                details={
                    "indices_not_in_features": [],
                    "indices_not_in_target": [2]
                }).to_dict()
        ],
        "errors": [],
        "actions": []
    }
def test_invalid_target_data_check_multiclass_two_examples_per_class():
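    # Multiclass targets need at least two rows per class; classes with fewer are
    # reported in the error details.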
    y = pd.Series([0] + [1] * 19 + [2] * 80)
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck("multiclass", get_default_primary_search_objective("binary"))
    expected_message = "Target does not have at least two instances per class which is required for multiclass classification"

    # with 1 class not having min 2 instances
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [DataCheckError(message=expected_message,
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                                  details={"least_populated_class_labels": [0]}).to_dict()],
        "actions": []
    }

    y = pd.Series([0] + [1] + [2] * 98)
    X = pd.DataFrame({"col": range(len(y))})
    # with 2 classes not having min 2 instances
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [DataCheckError(message=expected_message,
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                                  details={"least_populated_class_labels": [0, 1]}).to_dict()],
        "actions": []
    }
def test_invalid_target_data_check_multiclass_problem_binary_data():
    y_multiclass = pd.Series([1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3] * 25)
    y_binary = pd.Series([0, 1, 1, 1, 0, 0] * 25)

    data_check_error = DataCheckError(
        message=
        f"Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
        details={
            "num_classes": len(set(y_binary))
        }).to_dict()

    invalid_targets_check = InvalidTargetDataCheck(
        "multiclass", get_default_primary_search_objective("multiclass"))
    assert invalid_targets_check.validate(
        X=pd.DataFrame({"col": range(len(y_multiclass))}),
        y=y_multiclass) == {"warnings": [], "errors": [], "actions": []}
    assert invalid_targets_check.validate(
        X=pd.DataFrame({"col": range(len(y_binary))}),
        y=y_binary) == {"warnings": [], "errors": [data_check_error], "actions": []}
def test_invalid_target_data_action_for_data_with_null(problem_type):
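    # Null target values should produce a TARGET_HAS_NULL error plus an impute
    # action whose strategy depends on the problem type.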
    y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type))
    impute_strategy = "mean" if is_regression(problem_type) else "most_frequent"

    expected = {
        "warnings": [],
        "errors": [DataCheckError(message="3 row(s) (30.0%) of target values are null",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 3, "pct_null_rows": 30.0}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": impute_strategy}).to_dict()]
    }
    if is_binary(problem_type):
        expected["errors"].append(DataCheckError(message="Binary class targets require exactly two unique values.",
                                                 data_check_name=invalid_targets_data_check_name,
                                                 message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                                 details={"target_values": [0]}).to_dict())
    elif is_multiclass(problem_type):
        expected["errors"].append(DataCheckError(message=f"Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                                                 data_check_name=invalid_targets_data_check_name,
                                                 message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                                                 details={"num_classes": 1}).to_dict())
        expected["warnings"].append(DataCheckWarning(message=f"Target has a large number of unique values, could be regression type problem.",
                                                     data_check_name=invalid_targets_data_check_name,
                                                     message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                                                     details={"class_to_value_ratio": 0.1}).to_dict())

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected
def test_invalid_target_data_check_nan_error():
    X = pd.DataFrame({"col": [1, 2, 3]})
    invalid_targets_check = InvalidTargetDataCheck(
        "regression", get_default_primary_search_objective("regression"))

    assert invalid_targets_check.validate(X, y=pd.Series([1, 2, 3])) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }
    assert invalid_targets_check.validate(
        X, y=pd.Series([np.nan, np.nan, np.nan])) == {
            "warnings": [],
            "errors": [
                DataCheckError(
                    message="3 row(s) (100.0%) of target values are null",
                    data_check_name=invalid_targets_data_check_name,
                    message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                    details={
                        "num_null_rows": 3,
                        "pct_null_rows": 100
                    }).to_dict()
            ],
            "actions": []
        }
def test_invalid_target_data_check_invalid_n_unique():
    with pytest.raises(
            ValueError,
            match="`n_unique` must be a non-negative integer value."):
        InvalidTargetDataCheck(
            "regression",
            get_default_primary_search_objective("regression"),
            n_unique=-1)
def test_invalid_target_data_check_n_unique(problem_type):
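    # The unique values reported in the TARGET_BINARY_NOT_TWO_UNIQUE_VALUES details
    # are capped at n_unique (default 100); n_unique=None removes the cap.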
    y = pd.Series(list(range(100, 200)) + list(range(200)))
    unique_values = y.value_counts().index.tolist()[:100]  # n_unique defaults to 100
    X = pd.DataFrame({"col": range(len(y))})

    invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type))
    # Test default value of n_unique
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [DataCheckError(message="Binary class targets require exactly two unique values.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                  details={"target_values": unique_values}).to_dict()],
        "actions": []
    }

    # Test number of unique values < n_unique
    y = pd.Series(range(20))
    X = pd.DataFrame({"col": range(len(y))})

    unique_values = y.value_counts().index.tolist()
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [DataCheckError(message="Binary class targets require exactly two unique values.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                  details={"target_values": unique_values}).to_dict()],
        "actions": []
    }

    # Test n_unique is None
    invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary"),
                                                   n_unique=None)
    y = pd.Series(range(150))
    X = pd.DataFrame({"col": range(len(y))})

    unique_values = y.value_counts().index.tolist()
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [DataCheckError(message="Binary class targets require exactly two unique values.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                  details={"target_values": unique_values}).to_dict()],
        "actions": []
    }
def test_invalid_target_data_check_numeric_binary_does_not_return_warnings():
    y = pd.Series([1, 5, 1, 5, 1, 1])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary"))
    assert invalid_targets_check.validate(X, y) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }
def test_invalid_target_data_check_numeric_binary_classification_valid_float():
    X = pd.DataFrame()
    invalid_targets_check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary"))
    assert invalid_targets_check.validate(X, y=pd.Series([0.0, 1.0, 0.0, 1.0])) == {
        "warnings": [],
        "errors": []
    }
def test_invalid_target_y_none():
    invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary"))
    assert invalid_targets_check.validate(pd.DataFrame(), y=None) == {
        "warnings": [],
        "errors": [DataCheckError(message="Target is None",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_IS_NONE,
                                  details={}).to_dict()],
        "actions": []
    }
def test_invalid_target_data_check_regression_problem_nonnumeric_data(
        problem_type):
    y_categorical = pd.Series(["Peace", "Is", "A", "Lie"] * 100)
    y_mixed_cat_numeric = pd.Series(["Peace", 2, "A", 4] * 100)
    y_integer = pd.Series([1, 2, 3, 4])
    y_float = pd.Series([1.1, 2.2, 3.3, 4.4])
    y_numeric = pd.Series([1, 2.2, 3, 4.4])

    data_check_error = DataCheckError(
        message=
        f"Target data type should be numeric for regression type problems.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
        details={}).to_dict()

    invalid_targets_check = InvalidTargetDataCheck(
        problem_type, get_default_primary_search_objective(problem_type))
    assert invalid_targets_check.validate(
        X=pd.DataFrame({"col": range(len(y_categorical))}),
        y=y_categorical) == {"warnings": [], "errors": [data_check_error], "actions": []}
    assert invalid_targets_check.validate(
        X=pd.DataFrame({"col": range(len(y_mixed_cat_numeric))}),
        y=y_mixed_cat_numeric) == {"warnings": [], "errors": [data_check_error], "actions": []}
    assert invalid_targets_check.validate(
        X=pd.DataFrame({"col": range(len(y_integer))}),
        y=y_integer) == {"warnings": [], "errors": [], "actions": []}
    assert invalid_targets_check.validate(
        X=pd.DataFrame({"col": range(len(y_float))}),
        y=y_float) == {"warnings": [], "errors": [], "actions": []}
    assert invalid_targets_check.validate(
        X=pd.DataFrame({"col": range(len(y_numeric))}),
        y=y_numeric) == {"warnings": [], "errors": [], "actions": []}
def test_invalid_target_data_check_nan_error():
    X = pd.DataFrame({"col": [1, 2, 3]})
    invalid_targets_check = InvalidTargetDataCheck("regression", get_default_primary_search_objective("regression"))

    assert invalid_targets_check.validate(X, y=pd.Series([1, 2, 3])) == {"warnings": [], "errors": [], "actions": []}
    assert invalid_targets_check.validate(X, y=pd.Series([np.nan, np.nan, np.nan])) == {
        "warnings": [],
        "errors": [DataCheckError(message="Target is either empty or fully null.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
                                  details={}).to_dict()],
        "actions": []
    }
def test_invalid_target_data_action_for_all_null(problem_type):
    invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type))

    y_all_null = pd.Series([None, None, None])
    X = pd.DataFrame({"col": range(len(y_all_null))})

    expected = {
        "warnings": [],
        "errors": [DataCheckError(message="Target is either empty or fully null.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
                                  details={}).to_dict()],
        "actions": []
    }
    messages = invalid_targets_check.validate(X, y_all_null)
    assert messages == expected
def test_invalid_target_data_input_formats():
    invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary"))

    # test empty pd.Series
    X = pd.DataFrame()
    messages = invalid_targets_check.validate(X, pd.Series())
    assert messages == {
        "warnings": [],
        "errors": [DataCheckError(message="Target is either empty or fully null.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
                                  details={}).to_dict()],
        "actions": []
    }

    expected = {
        "warnings": [],
        "errors": [DataCheckError(message="3 row(s) (75.0%) of target values are null",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 3, "pct_null_rows": 75}).to_dict(),
                   DataCheckError(message="Binary class targets require exactly two unique values.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                  details={"target_values": [0]}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": "most_frequent"}).to_dict()]
    }
    # test pd.Series with nulls
    y = pd.Series([None, None, None, 0])
    X = pd.DataFrame({"col": range(len(y))})
    messages = invalid_targets_check.validate(X, y)
    assert messages == expected

    #  test list
    y = [None, None, None, 0]
    X = pd.DataFrame({"col": range(len(y))})

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected

    # test np.array
    y = np.array([None, None, None, 0])
    X = pd.DataFrame({"col": range(len(y))})

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected
Example 21
def test_default_data_checks_null_rows():
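    # SeriesWrap lets the expected dict compare equal with ==, since the
    # "pct_null_cols" detail holds a pd.Series whose own == is elementwise.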
    class SeriesWrap():
        def __init__(self, series):
            self.series = series

        def __eq__(self, series_2):
            return all(self.series.eq(series_2.series))

    X = pd.DataFrame({'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None]})
    y = pd.Series([0, 1, np.nan, 1, 0])
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    highly_null_rows = SeriesWrap(pd.Series([1.0, 1.0, 1.0, 1.0, 1.0]))
    expected = {
        "warnings": [DataCheckWarning(message="5 out of 5 rows are more than 95.0% null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
                                      details={"pct_null_cols": highly_null_rows}).to_dict(),
                     DataCheckWarning(message="Column 'all_null' is 95.0% or more null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                      details={"column": 'all_null', "pct_null_rows": 1.0}).to_dict(),
                     DataCheckWarning(message="Column 'also_all_null' is 95.0% or more null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                      details={"column": 'also_all_null', "pct_null_rows": 1.0}).to_dict()],
        "errors": [DataCheckError(message="1 row(s) (20.0%) of target values are null",
                                  data_check_name="InvalidTargetDataCheck",
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 1, "pct_null_rows": 20.0}).to_dict(),
                   DataCheckError(message="all_null has 0 unique value.",
                                  data_check_name="NoVarianceDataCheck",
                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                  details={"column": "all_null"}).to_dict(),
                   DataCheckError(message="also_all_null has 0 unique value.",
                                  data_check_name="NoVarianceDataCheck",
                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                  details={"column": "also_all_null"}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'also_all_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": "mean"}).to_dict()]}
    validation_results = data_checks.validate(X, y)
    validation_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validation_results['warnings'][0]['details']['pct_null_cols'])
    assert validation_results == expected
def test_invalid_target_data_check_mismatched_indices():
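    # Mismatched or reordered indices between X and y should surface as warnings;
    # at most ten offending indices are kept in the details.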
    X = pd.DataFrame({"col": [1, 2, 3]})
    y_same_index = pd.Series([1, 0, 1])
    y_diff_index = pd.Series([0, 1, 0], index=[1, 5, 10])
    y_diff_index_order = pd.Series([0, 1, 0], index=[0, 2, 1])

    invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary"))
    assert invalid_targets_check.validate(X=None, y=y_same_index) == {"warnings": [], "errors": [], "actions": []}
    assert invalid_targets_check.validate(X, y_same_index) == {"warnings": [], "errors": [], "actions": []}

    X_index_missing = list(set(y_diff_index.index) - set(X.index))
    y_index_missing = list(set(X.index) - set(y_diff_index.index))
    assert invalid_targets_check.validate(X, y_diff_index) == {
        "warnings": [DataCheckWarning(message="Input target and features have mismatched indices",
                                      data_check_name=invalid_targets_data_check_name,
                                      message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                                      details={"indices_not_in_features": X_index_missing,
                                               "indices_not_in_target": y_index_missing}).to_dict()],
        "errors": [],
        "actions": []
    }
    assert invalid_targets_check.validate(X, y_diff_index_order) == {
        "warnings": [DataCheckWarning(message="Input target and features have mismatched indices order",
                                      data_check_name=invalid_targets_data_check_name,
                                      message_code=DataCheckMessageCode.MISMATCHED_INDICES_ORDER,
                                      details={}).to_dict()],
        "errors": [],
        "actions": []
    }

    # Test that we only store ten mismatches when there are more than 10 differences in indices found
    X_large = pd.DataFrame({"col": range(20)})
    y_more_than_ten_diff_indices = pd.Series([0, 1] * 10, index=range(20, 40))
    X_index_missing = list(set(y_more_than_ten_diff_indices.index) - set(X_large.index))
    y_index_missing = list(set(X_large.index) - set(y_more_than_ten_diff_indices.index))
    assert invalid_targets_check.validate(X_large, y_more_than_ten_diff_indices) == {
        "warnings": [DataCheckWarning(message="Input target and features have mismatched indices",
                                      data_check_name=invalid_targets_data_check_name,
                                      message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                                      details={"indices_not_in_features": X_index_missing[:10],
                                               "indices_not_in_target": y_index_missing[:10]}).to_dict()],
        "errors": [],
        "actions": []
    }
def test_invalid_target_data_input_formats():
    invalid_targets_check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary"))
    X = pd.DataFrame()

    # test empty pd.Series
    messages = invalid_targets_check.validate(X, pd.Series())
    assert messages == {
        "warnings": [],
        "errors": [
            DataCheckError(
                message=
                "Binary class targets require exactly two unique values.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                details={
                    "target_values": []
                }).to_dict()
        ]
    }
    # test pd.Series with nulls
    messages = invalid_targets_check.validate(X,
                                              pd.Series([None, None, None, 0]))
    assert messages == {
        "warnings": [],
        "errors": [
            DataCheckError(
                message="3 row(s) (75.0%) of target values are null",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                details={
                    "num_null_rows": 3,
                    "pct_null_rows": 75
                }).to_dict(),
            DataCheckError(
                message=
                "Binary class targets require exactly two unique values.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                details={
                    "target_values": [0]
                }).to_dict()
        ]
    }

    #  test list
    messages = invalid_targets_check.validate(X, [None, None, None, 0])
    assert messages == {
        "warnings": [],
        "errors": [
            DataCheckError(
                message="3 row(s) (75.0%) of target values are null",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                details={
                    "num_null_rows": 3,
                    "pct_null_rows": 75
                }).to_dict(),
            DataCheckError(
                message=
                "Binary class targets require exactly two unique values.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                details={
                    "target_values": [0]
                }).to_dict()
        ]
    }

    # test np.array
    messages = invalid_targets_check.validate(X,
                                              np.array([None, None, None, 0]))
    assert messages == {
        "warnings": [],
        "errors": [
            DataCheckError(
                message="3 row(s) (75.0%) of target values are null",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                details={
                    "num_null_rows": 3,
                    "pct_null_rows": 75
                }).to_dict(),
            DataCheckError(
                message=
                "Binary class targets require exactly two unique values.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                details={
                    "target_values": [0]
                }).to_dict()
        ]
    }
def test_invalid_target_y_none():
    invalid_targets_check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary"))
    with pytest.raises(ValueError, match="y cannot be None"):
        invalid_targets_check.validate(pd.DataFrame(), y=None)
Example 25
def test_default_data_checks_classification(input_type):
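    # Runs the default checks for binary and multiclass targets, including the
    # class-imbalance errors; relies on the shared `messages` and
    # `expected_actions` fixtures.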
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 4, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100],
        'natural_language_nan': [
            None, "string_that_is_long_enough_for_natural_language_1",
            "string_that_is_long_enough_for_natural_language_2",
            "string_that_is_long_enough_for_natural_language_3",
            "string_that_is_long_enough_for_natural_language_4"
        ],
        'nan_dt_col':
        pd.Series(pd.date_range('20200101', periods=5))
    })
    X['nan_dt_col'][0] = None

    y = pd.Series([0, 1, np.nan, 1, 0])
    y_multiclass = pd.Series([0, 1, np.nan, 2, 0])
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_multiclass = ww.DataColumn(y_multiclass)

    data_checks = DefaultDataChecks(
        "binary", get_default_primary_search_objective("binary"))
    imbalance = [
        DataCheckError(
            message=
            "The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 1.0]",
            data_check_name="ClassImbalanceDataCheck",
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={
                "target_values": [0.0, 1.0]
            }).to_dict()
    ]

    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:] + imbalance,
        "actions": expected_actions
    }

    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, {
            "InvalidTargetDataCheck": {
                "problem_type": "binary",
                "objective": get_default_primary_search_objective("binary")
            }
        })
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:],
        "actions": expected_actions
    }

    # multiclass
    imbalance = [
        DataCheckError(
            message=
            "The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 2.0, 1.0]",
            data_check_name="ClassImbalanceDataCheck",
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={
                "target_values": [0.0, 2.0, 1.0]
            }).to_dict()
    ]
    min_2_class_count = [
        DataCheckError(
            message=
            "Target does not have at least two instances per class which is required for multiclass classification",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
            details={
                "least_populated_class_labels": [2.0, 1.0]
            }).to_dict()
    ]
    high_class_to_sample_ratio = [
        DataCheckWarning(
            message=
            "Target has a large number of unique values, could be regression type problem.",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
            details={
                'class_to_value_ratio': 0.6
            }).to_dict()
    ]
    # multiclass
    data_checks = DefaultDataChecks(
        "multiclass", get_default_primary_search_objective("multiclass"))
    assert data_checks.validate(X, y_multiclass) == {
        "warnings": messages[:3] + high_class_to_sample_ratio,
        "errors": [messages[3]] + min_2_class_count + messages[4:] + imbalance,
        "actions": expected_actions
    }

    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, {
            "InvalidTargetDataCheck": {
                "problem_type": "multiclass",
                "objective": get_default_primary_search_objective("multiclass")
            }
        })
    assert data_checks.validate(X, y_multiclass) == {
        "warnings": messages[:3] + high_class_to_sample_ratio,
        "errors": [messages[3]] + min_2_class_count + messages[4:],
        "actions": expected_actions
    }