def test_make_component_list_from_actions():
    assert _make_component_list_from_actions([]) == []

    actions = [
        DataCheckAction(DataCheckActionCode.DROP_COL,
                        {"columns": ['some col']})
    ]
    assert _make_component_list_from_actions(actions) == [
        DropColumns(columns=['some col'])
    ]

    actions = [
        DataCheckAction(DataCheckActionCode.DROP_COL,
                        metadata={"columns": ['some col']}),
        DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                        metadata={
                            "column": None,
                            "is_target": True,
                            "impute_strategy": "most_frequent"
                        })
    ]
    assert _make_component_list_from_actions(actions) == [
        DropColumns(columns=['some col']),
        TargetImputer(impute_strategy="most_frequent")
    ]
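Example 2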
def test_highly_null_data_check_input_formats():
    highly_null_check = HighlyNullDataCheck(pct_null_threshold=0.8)

    # test empty pd.DataFrame
    assert highly_null_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []}

    expected = {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning(message="Column '1' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 1}).to_dict(),
                     DataCheckWarning(message="Column '2' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 2}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 0}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 1}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 2}).to_dict()]
    }
    #  test Woodwork
    ww_input = ww.DataTable(pd.DataFrame([[None, None, None, None, 0], [None, None, None, "hi", 5]]))
    assert highly_null_check.validate(ww_input) == expected

    #  test 2D list
    assert highly_null_check.validate([[None, None, None, None, 0], [None, None, None, "hi", 5]]) == expected

    # test np.array
    assert highly_null_check.validate(np.array([[None, None, None, None, 0], [None, None, None, "hi", 5]])) == expected
Example 3
def test_uniqueness_data_check_warnings():
    data = pd.DataFrame({'regression_unique_enough': [float(x) for x in range(100)],
                         'regression_not_unique_enough': [float(1) for x in range(100)]})

    uniqueness_check = UniquenessDataCheck(problem_type="regression")
    assert uniqueness_check.validate(data) == {
        "warnings": [DataCheckWarning(
            message="Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",
            data_check_name=uniqueness_data_check_name,
            message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
            details={"column": "regression_not_unique_enough",
                     'uniqueness_score': 0.0}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 'regression_not_unique_enough'}).to_dict()]
    }

    data = pd.DataFrame({'multiclass_too_unique': ["Cats", "Are", "Absolutely", "The", "Best"] * 20,
                         'multiclass_not_too_unique': ["Cats", "Cats", "Best", "Best", "Best"] * 20})
    uniqueness_check = UniquenessDataCheck(problem_type="multiclass")
    assert uniqueness_check.validate(data) == {
        "warnings": [DataCheckWarning(
            message="Input columns (multiclass_too_unique) for multiclass problem type are too unique.",
            data_check_name=uniqueness_data_check_name,
            message_code=DataCheckMessageCode.TOO_UNIQUE,
            details={"column": "multiclass_too_unique",
                     'uniqueness_score': 0.7999999999999999}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 'multiclass_too_unique'}).to_dict()]
    }
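The expected scores above pin down the metric: a sketch that reproduces them is the Gini-Simpson index, 1 minus the sum of squared value frequencies (an assumption inferred from the test expectations, not UniquenessDataCheck's shown source). A constant column scores 0.0, and five balanced classes score 1 - 5 * 0.2^2 = 0.8, stored as 0.7999999999999999 in floating point.

import pandas as pd

def uniqueness_score_sketch(col: pd.Series) -> float:
    # Gini-Simpson index: probability that two randomly drawn rows differ.
    norm_counts = col.value_counts() / col.count()
    return 1 - (norm_counts ** 2).sum()

assert uniqueness_score_sketch(pd.Series([1.0] * 100)) == 0.0
assert abs(uniqueness_score_sketch(pd.Series(["Cats", "Are", "Absolutely", "The", "Best"] * 20)) - 0.8) < 1e-9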
Example 4
def test_sparsity_data_check_warnings():
    data = pd.DataFrame({
        'most_sparse': [float(x) for x in range(10)],  # [0,1,2,3,4,5,6,7,8,9]
        'more_sparse': [x % 5 for x in range(10)],  # [0,1,2,3,4,0,1,2,3,4]
        'sparse': [x % 3 for x in range(10)],  # [0,1,2,0,1,2,0,1,2,0]
        'less_sparse': [x % 2 for x in range(10)],  # [0,1,0,1,0,1,0,1,0,1]
        'not_sparse': [float(1) for x in range(10)]
    })  # [1,1,1,1,1,1,1,1,1,1]

    sparsity_check = SparsityDataCheck(problem_type="multiclass",
                                       threshold=.4,
                                       unique_count_threshold=3)

    assert sparsity_check.validate(data) == {
        "warnings": [DataCheckWarning(message="Input columns (most_sparse) for multiclass problem type are too sparse.",
                                      data_check_name=sparsity_data_check_name,
                                      message_code=DataCheckMessageCode.TOO_SPARSE,
                                      details={"column": "most_sparse", 'sparsity_score': 0}).to_dict(),
                     DataCheckWarning(message="Input columns (more_sparse) for multiclass problem type are too sparse.",
                                      data_check_name=sparsity_data_check_name,
                                      message_code=DataCheckMessageCode.TOO_SPARSE,
                                      details={"column": "more_sparse", 'sparsity_score': 0}).to_dict(),
                     DataCheckWarning(message="Input columns (sparse) for multiclass problem type are too sparse.",
                                      data_check_name=sparsity_data_check_name,
                                      message_code=DataCheckMessageCode.TOO_SPARSE,
                                      details={"column": "sparse", 'sparsity_score': 0.3333333333333333}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 'most_sparse'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 'more_sparse'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 'sparse'}).to_dict()]
    }
Example 5
def test_data_check_action_inequality():
    data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL)
    data_check_action_diff = DataCheckAction(
        DataCheckActionCode.DROP_COL,
        metadata={"metadata": ["this is different"]})

    assert data_check_action != data_check_action_diff
    assert data_check_action_diff != data_check_action
Example 6
def test_default_data_checks_regression(input_type):
    X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"],
                      'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None],
                      'no_null': [1, 2, 3, 5, 5],
                      'id': [0, 1, 2, 3, 4],
                      'has_label_leakage': [100, 200, 100, 200, 100],
                      'natural_language_nan': [None,
                                               "string_that_is_long_enough_for_natural_language_1",
                                               "string_that_is_long_enough_for_natural_language_2",
                                               "string_that_is_long_enough_for_natural_language_3",
                                               "string_that_is_long_enough_for_natural_language_4"],
                      'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))})
    X.loc[0, 'nan_dt_col'] = None
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    id_leakage_warning = [DataCheckWarning(message="Column 'id' is 95.0% or more correlated with the target",
                                           data_check_name="TargetLeakageDataCheck",
                                           message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                           details={"column": "id"}).to_dict()]
    nan_dt_leakage_warning = [DataCheckWarning(message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
                                               data_check_name="TargetLeakageDataCheck",
                                               message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                               details={"column": "nan_dt_col"}).to_dict()]

    impute_action = DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, 'impute_strategy': 'mean'}).to_dict()
    nan_dt_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'nan_dt_col'}).to_dict()
    expected_actions_with_drop_and_impute = expected_actions[:3] + [nan_dt_action, impute_action] + expected_actions[4:]
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [DataCheckError(message="Y has 1 unique value.",
                                                  data_check_name="NoVarianceDataCheck",
                                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                                  details={"column": "Y"}).to_dict()] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:]
    }

    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}
Example 7
def test_make_component_list_from_actions():
    assert _make_component_list_from_actions([]) == []

    actions = [DataCheckAction(DataCheckActionCode.DROP_COL, {"columns": ['some col']})]
    assert _make_component_list_from_actions(actions) == [DropColumns(columns=['some col'])]

    actions_same_code = [DataCheckAction(DataCheckActionCode.DROP_COL, {"columns": ['some col']}),
                         DataCheckAction(DataCheckActionCode.DROP_COL, {"columns": ['some other col']})]
    assert _make_component_list_from_actions(actions_same_code) == [DropColumns(columns=['some col']),
                                                                    DropColumns(columns=['some other col'])]
Example 8
def test_data_check_action_to_dict():
    data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL)
    data_check_action_empty_details = DataCheckAction(DataCheckActionCode.DROP_COL, details={})
    data_check_action_with_details = DataCheckAction(DataCheckActionCode.DROP_COL, details={"some detail": ["this is different"]})

    assert data_check_action.to_dict() == {"code": DataCheckActionCode.DROP_COL.name, "details": {}}
    assert data_check_action_empty_details.to_dict() == {"code": DataCheckActionCode.DROP_COL.name, "details": {}}
    assert data_check_action_with_details.to_dict() == {"code": DataCheckActionCode.DROP_COL.name, "details": {"some detail": ["this is different"]}}
Example 9
def test_data_check_action_attributes():
    data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL)
    assert data_check_action.action_code == DataCheckActionCode.DROP_COL
    assert data_check_action.details == {}

    data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL, {})
    assert data_check_action.action_code == DataCheckActionCode.DROP_COL
    assert data_check_action.details == {}

    data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL, details={"columns": [1, 2]})
    assert data_check_action.action_code == DataCheckActionCode.DROP_COL
    assert data_check_action.details == {"columns": [1, 2]}
Example 10
def test_data_check_action_equality():
    data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL)
    data_check_action_eq = DataCheckAction(DataCheckActionCode.DROP_COL)

    assert data_check_action == data_check_action
    assert data_check_action == data_check_action_eq
    assert data_check_action_eq == data_check_action

    data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL, details={'same detail': 'same same same'})
    data_check_action_eq = DataCheckAction(DataCheckActionCode.DROP_COL, details={'same detail': 'same same same'})

    assert data_check_action == data_check_action
    assert data_check_action == data_check_action_eq
    assert data_check_action_eq == data_check_action
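Examples 8-10 exercise DataCheckAction's full surface (to_dict, attributes, equality), but the class itself never appears in this listing. A minimal sketch consistent with those three tests (this version takes a `details` keyword; other examples here use `metadata`):

class DataCheckActionSketch:
    """Hypothetical stand-in inferred from the tests above; not the library's source."""

    def __init__(self, action_code, details=None):
        self.action_code = action_code
        self.details = details or {}

    def __eq__(self, other):
        # Two actions are equal when both the code and the details match,
        # which also yields the inequality behavior shown in Example 5.
        return self.action_code == other.action_code and self.details == other.details

    def to_dict(self):
        return {"code": self.action_code.name, "details": self.details}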
Example 11
def test_data_checks_do_not_duplicate_actions(X_y_binary):
    X, y = X_y_binary

    class MockDataCheck(DataCheck):
        def validate(self, X, y):
            return {
                "warnings": [],
                "errors": [],
                "actions": [
                    DataCheckAction(DataCheckActionCode.DROP_COL,
                                    metadata={
                                        "column": 'col_to_drop'
                                    }).to_dict()
                ]
            }

    class MockDataCheckWithSameAction(DataCheck):
        def validate(self, X, y):
            return {
                "warnings": [],
                "errors": [],
                "actions": [
                    DataCheckAction(DataCheckActionCode.DROP_COL,
                                    metadata={
                                        "column": 'col_to_drop'
                                    }).to_dict()
                ]
            }

    data_checks_list = [MockDataCheck, MockDataCheckWithSameAction]
    data_checks = DataChecks(data_checks=data_checks_list)

    # Check duplicate actions are returned once
    assert data_checks.validate(X, y) == {
        "warnings": [],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={
                                "column": 'col_to_drop'
                            }).to_dict()
        ]
    }
Example 12
    def validate(self, X, y):
        """Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation.

        If `method='mutual'`, all target and feature types are supported. If `method='pearson'`, only binary targets with numeric and boolean dtypes are supported.
        Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1].

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
            y (ww.DataColumn, pd.Series, np.ndarray): The target data

        Returns:
            dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected.

        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame({
            ...    'leak': [10, 42, 31, 51, 61],
            ...    'x': [42, 54, 12, 64, 12],
            ...    'y': [13, 5, 13, 74, 24],
            ... })
            >>> y = pd.Series([10, 42, 31, 51, 40])
            >>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.95)
            >>> assert target_leakage_check.validate(X, y) == {"warnings": [{"message": "Column 'leak' is 95.0% or more correlated with the target",\
                                                                             "data_check_name": "TargetLeakageDataCheck",\
                                                                             "level": "warning",\
                                                                             "code": "TARGET_LEAKAGE",\
                                                                             "details": {"column": "leak"}}],\
                                                               "errors": [],\
                                                               "actions": [{"code": "DROP_COL",\
                                                                            "metadata": {"column": "leak"}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        y = infer_feature_types(y)

        if self.method == 'pearson':
            highly_corr_cols = self._calculate_pearson(X, y)
        else:
            X = _convert_woodwork_types_wrapper(X.to_dataframe())
            y = _convert_woodwork_types_wrapper(y.to_series())
            highly_corr_cols = self._calculate_mutual_information(X, y)

        warning_msg = "Column '{}' is {}% or more correlated with the target"
        results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.pct_corr_threshold * 100),
                                                     data_check_name=self.name,
                                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                                     details={"column": col_name}).to_dict()
                                    for col_name in highly_corr_cols])
        results["actions"].extend([DataCheckAction(DataCheckActionCode.DROP_COL,
                                                   metadata={"column": col_name}).to_dict()
                                   for col_name in highly_corr_cols])
        return results
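_calculate_pearson is referenced above but not shown. A minimal sketch of that branch, assuming it keeps numeric/boolean columns whose absolute Pearson correlation with the target meets the threshold (an assumption about its shape, not the method's actual source):

import pandas as pd

def _calculate_pearson_sketch(X: pd.DataFrame, y: pd.Series, pct_corr_threshold=0.95):
    highly_corr_cols = []
    X_num = X.select_dtypes(include=["number", "bool"])
    for col in X_num.columns:
        corr = y.astype(float).corr(X_num[col].astype(float))  # Pearson, in [-1, 1]
        if pd.notnull(corr) and abs(corr) >= pct_corr_threshold:
            highly_corr_cols.append(col)
    return highly_corr_cols

Example 13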
def test_invalid_target_data_action_for_data_with_null(problem_type):
    y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type))
    impute_strategy = "mean" if is_regression(problem_type) else "most_frequent"

    expected = {
        "warnings": [],
        "errors": [DataCheckError(message="3 row(s) (30.0%) of target values are null",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 3, "pct_null_rows": 30.0}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": impute_strategy}).to_dict()]
    }
    if is_binary(problem_type):
        expected["errors"].append(DataCheckError(message="Binary class targets require exactly two unique values.",
                                                 data_check_name=invalid_targets_data_check_name,
                                                 message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                                 details={"target_values": [0]}).to_dict())
    elif is_multiclass(problem_type):
        expected["errors"].append(DataCheckError(message=f"Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                                                 data_check_name=invalid_targets_data_check_name,
                                                 message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                                                 details={"num_classes": 1}).to_dict())
        expected["warnings"].append(DataCheckWarning(message=f"Target has a large number of unique values, could be regression type problem.",
                                                     data_check_name=invalid_targets_data_check_name,
                                                     message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                                                     details={"class_to_value_ratio": 0.1}).to_dict())

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected
Example 14
    def validate(self, X, y=None):
        """Checks if there are any highly-null columns in the input.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

        Returns:
            dict: dict with a DataCheckWarning if there are any highly-null columns.

        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...    'lots_of_null': [None, None, None, None, 5],
            ...    'no_null': [1, 2, 3, 4, 5]
            ... })
            >>> null_check = HighlyNullDataCheck(pct_null_threshold=0.8)
            >>> assert null_check.validate(df) == {"errors": [],\
                                                   "warnings": [{"message": "Column 'lots_of_null' is 80.0% or more null",\
                                                                 "data_check_name": "HighlyNullDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "HIGHLY_NULL",\
                                                                 "details": {"column": "lots_of_null"}}],\
                                                    "actions": [{"code": "DROP_COL",\
                                                                 "metadata": {"column": "lots_of_null"}}]}
        """
        results = {
            "warnings": [],
            "errors": [],
            "actions": []
        }

        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        percent_null = (X.isnull().mean()).to_dict()
        highly_null_cols = []
        if self.pct_null_threshold == 0.0:
            highly_null_cols = {key: value for key, value in percent_null.items() if value > 0.0}
            warning_msg = "Column '{}' is more than 0% null"
            results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name),
                                                         data_check_name=self.name,
                                                         message_code=DataCheckMessageCode.HIGHLY_NULL,
                                                         details={"column": col_name}).to_dict()
                                        for col_name in highly_null_cols])
        else:
            highly_null_cols = {key: value for key, value in percent_null.items() if value >= self.pct_null_threshold}
            warning_msg = "Column '{}' is {}% or more null"
            results["warnings"].extend([DataCheckWarning(message=warning_msg.format(col_name, self.pct_null_threshold * 100),
                                                         data_check_name=self.name,
                                                         message_code=DataCheckMessageCode.HIGHLY_NULL,
                                                         details={"column": col_name}).to_dict()
                                        for col_name in highly_null_cols])

        results["actions"].extend([DataCheckAction(DataCheckActionCode.DROP_COL,
                                                   metadata={"column": col_name}).to_dict()
                                   for col_name in highly_null_cols])
        return results
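Note the asymmetry between the two branches above: pct_null_threshold=0.0 flags columns that are strictly more than 0% null, while any positive threshold uses a >= comparison. A quick illustration with a hypothetical frame:

import pandas as pd

df = pd.DataFrame({"one_null": [None, 1, 2, 3, 4], "no_null": [1, 2, 3, 4, 5]})
result = HighlyNullDataCheck(pct_null_threshold=0.0).validate(df)
# Only 'one_null' is reported; 'no_null' is 0% null and the 0.0 branch is strict.
assert [w["details"]["column"] for w in result["warnings"]] == ["one_null"]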
Example 15
def test_default_data_checks_null_rows():
    class SeriesWrap():
        def __init__(self, series):
            self.series = series

        def __eq__(self, series_2):
            return all(self.series.eq(series_2.series))

    X = pd.DataFrame({'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None]})
    y = pd.Series([0, 1, np.nan, 1, 0])
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    highly_null_rows = SeriesWrap(pd.Series([1.0, 1.0, 1.0, 1.0, 1.0]))
    expected = {
        "warnings": [DataCheckWarning(message="5 out of 5 rows are more than 95.0% null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
                                      details={"pct_null_cols": highly_null_rows}).to_dict(),
                     DataCheckWarning(message="Column 'all_null' is 95.0% or more null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                      details={"column": 'all_null', "pct_null_rows": 1.0}).to_dict(),
                     DataCheckWarning(message="Column 'also_all_null' is 95.0% or more null",
                                      data_check_name="HighlyNullDataCheck",
                                      message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                                      details={"column": 'also_all_null', "pct_null_rows": 1.0}).to_dict()],
        "errors": [DataCheckError(message="1 row(s) (20.0%) of target values are null",
                                  data_check_name="InvalidTargetDataCheck",
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 1, "pct_null_rows": 20.0}).to_dict(),
                   DataCheckError(message="all_null has 0 unique value.",
                                  data_check_name="NoVarianceDataCheck",
                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                  details={"column": "all_null"}).to_dict(),
                   DataCheckError(message="also_all_null has 0 unique value.",
                                  data_check_name="NoVarianceDataCheck",
                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                  details={"column": "also_all_null"}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'also_all_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": "mean"}).to_dict()]}
    validation_results = data_checks.validate(X, y)
    validation_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validation_results['warnings'][0]['details']['pct_null_cols'])
    assert validation_results == expected
Example 16
def test_highly_null_data_check_warnings():
    data = pd.DataFrame({'lots_of_null': [None, None, None, None, 5],
                         'all_null': [None, None, None, None, None],
                         'no_null': [1, 2, 3, 4, 5]})
    no_null_check = HighlyNullDataCheck(pct_null_threshold=0.0)
    assert no_null_check.validate(data) == {
        "warnings": [DataCheckWarning(message="Column 'lots_of_null' is more than 0% null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": "lots_of_null"}).to_dict(),
                     DataCheckWarning(message="Column 'all_null' is more than 0% null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": "all_null"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()]
    }

    some_null_check = HighlyNullDataCheck(pct_null_threshold=0.5)
    assert some_null_check.validate(data) == {
        "warnings": [DataCheckWarning(message="Column 'lots_of_null' is 50.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": "lots_of_null"}).to_dict(),
                     DataCheckWarning(message="Column 'all_null' is 50.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": "all_null"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'lots_of_null'}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()]

    }

    all_null_check = HighlyNullDataCheck(pct_null_threshold=1.0)
    assert all_null_check.validate(data) == {
        "warnings": [DataCheckWarning(message="Column 'all_null' is 100.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": "all_null"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict()]
    }
Example 17
    def validate(self, X, y):
        return {
            "warnings": [],
            "errors": [],
            "actions": [DataCheckAction(DataCheckActionCode.DROP_COL,
                                        metadata={"column": 'col_to_drop'}).to_dict()]
        }
Example 18
    def validate(self, X, y=None):
        """Calculates what percentage of each column's unique values exceed the count threshold and compare
        that percentage to the sparsity threshold stored in the class instance.
        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.
        Returns:
            dict: dict with a DataCheckWarning if there are any sparse columns.
        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...    'sparse': [float(x) for x in range(100)],
            ...    'not_sparse': [float(1) for x in range(100)]
            ... })
            >>> sparsity_check = SparsityDataCheck(problem_type="multiclass", threshold=0.5, unique_count_threshold=10)
            >>> assert sparsity_check.validate(df) == {"errors": [],\
                                                       "warnings": [{"message": "Input columns (sparse) for multiclass problem type are too sparse.",\
                                                            "data_check_name": "SparsityDataCheck",\
                                                            "level": "warning",\
                                                            "code": "TOO_SPARSE",\
                                                            "details": {"column": "sparse", 'sparsity_score': 0.0}}],\
                                                       "actions": [{"code": "DROP_COL",\
                                                                 "metadata": {"column": "sparse"}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        res = X.apply(SparsityDataCheck.sparsity_score,
                      count_threshold=self.unique_count_threshold)
        too_sparse_cols = [col for col in res.index[res < self.threshold]]
        results["warnings"].extend([
            DataCheckWarning(message=warning_too_unique.format(
                col_name, self.problem_type),
                             data_check_name=self.name,
                             message_code=DataCheckMessageCode.TOO_SPARSE,
                             details={
                                 "column": col_name,
                                 "sparsity_score": res.loc[col_name]
                             }).to_dict() for col_name in too_sparse_cols
        ])
        results["actions"].extend([
            DataCheckAction(action_code=DataCheckActionCode.DROP_COL,
                            metadata={
                                "column": col_name
                            }).to_dict() for col_name in too_sparse_cols
        ])
        return results
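SparsityDataCheck.sparsity_score is applied above but never shown. A sketch consistent with the expected scores in the sparsity test earlier (Example 4): score a column by the share of its distinct values that appear strictly more than count_threshold times, so ten singletons score 0 and value counts of [4, 3, 3] against a threshold of 3 score 1/3 (an assumption inferred from those expectations):

import pandas as pd

def sparsity_score_sketch(col: pd.Series, count_threshold: int = 10) -> float:
    counts = col.value_counts()
    # Share of distinct values that are "dense" (appear more than count_threshold times).
    return (counts > count_threshold).sum() / counts.size

assert sparsity_score_sketch(pd.Series([x % 3 for x in range(10)]), count_threshold=3) == 1 / 3

Example 19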
def test_invalid_target_data_input_formats():
    invalid_targets_check = InvalidTargetDataCheck("binary", get_default_primary_search_objective("binary"))

    # test empty pd.Series
    X = pd.DataFrame()
    messages = invalid_targets_check.validate(X, pd.Series())
    assert messages == {
        "warnings": [],
        "errors": [DataCheckError(message="Target is either empty or fully null.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
                                  details={}).to_dict()],
        "actions": []
    }

    expected = {
        "warnings": [],
        "errors": [DataCheckError(message="3 row(s) (75.0%) of target values are null",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 3, "pct_null_rows": 75}).to_dict(),
                   DataCheckError(message="Binary class targets require exactly two unique values.",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                  details={"target_values": [0]}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": "most_frequent"}).to_dict()]
    }
    #  test pd.Series
    y = pd.Series([None, None, None, 0])
    X = pd.DataFrame({"col": range(len(y))})
    messages = invalid_targets_check.validate(X, y)
    assert messages == expected

    #  test list
    y = [None, None, None, 0]
    X = pd.DataFrame({"col": range(len(y))})

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected

    # test np.array
    y = np.array([None, None, None, 0])
    X = pd.DataFrame({"col": range(len(y))})

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected
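Example 20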
def test_id_cols_data_check_input_formats():
    id_cols_check = IDColumnsDataCheck(id_threshold=0.8)

    # test empty pd.DataFrame
    assert id_cols_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []}

    #  test Woodwork
    ww_input = ww.DataTable(np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]]))
    assert id_cols_check.validate(ww_input) == {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning(message="Column '1' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 1}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 0}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 1}).to_dict()]
    }

    #  test 2D list
    assert id_cols_check.validate([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]]) == {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning("Column '1' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 1}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 0}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 1}).to_dict()]
    }

    # test np.array
    assert id_cols_check.validate(np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]])) == {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning(message="Column '1' is 80.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": 1}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 0}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": 1}).to_dict()]
    }
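Example 21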
def test_id_columns_warning():
    X_dict = {'col_1_id': [0, 1, 2, 3],
              'col_2': [2, 3, 4, 5],
              'col_3_id': [1, 1, 2, 3],
              'Id': [3, 1, 2, 0],
              'col_5': [0, 0, 1, 2],
              'col_6': [0.1, 0.2, 0.3, 0.4]
              }
    X = pd.DataFrame.from_dict(X_dict)
    id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
    assert id_cols_check.validate(X) == {
        "warnings": [DataCheckWarning(message="Column 'Id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "Id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_1_id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_1_id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_2' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_2"}).to_dict(),
                     DataCheckWarning(message="Column 'col_3_id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_3_id"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "Id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_1_id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_2"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_3_id"}).to_dict()]
    }

    X = pd.DataFrame.from_dict(X_dict)
    id_cols_check = IDColumnsDataCheck(id_threshold=1.0)
    assert id_cols_check.validate(X) == {
        "warnings": [DataCheckWarning(message="Column 'Id' is 100.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "Id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_1_id' is 100.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_1_id"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "Id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_1_id"}).to_dict()]
    }
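Example 22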
def test_id_columns_strings():
    X_dict = {'col_1_id': ["a", "b", "c", "d"],
              'col_2': ["w", "x", "y", "z"],
              'col_3_id': ["123456789012345", "234567890123456", "3456789012345678", "45678901234567"],
              'Id': ["z", "y", "x", "a"],
              'col_5': ["0", "0", "1", "2"],
              'col_6': [0.1, 0.2, 0.3, 0.4]
              }
    X = pd.DataFrame.from_dict(X_dict)
    id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
    assert id_cols_check.validate(X) == {
        "warnings": [DataCheckWarning(message="Column 'Id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "Id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_1_id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_1_id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_2' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_2"}).to_dict(),
                     DataCheckWarning(message="Column 'col_3_id' is 95.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_3_id"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "Id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_1_id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_2"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_3_id"}).to_dict()]
    }

    id_cols_check = IDColumnsDataCheck(id_threshold=1.0)
    assert id_cols_check.validate(X) == {
        "warnings": [DataCheckWarning(message="Column 'Id' is 100.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "Id"}).to_dict(),
                     DataCheckWarning(message="Column 'col_1_id' is 100.0% or more likely to be an ID column",
                                      data_check_name=id_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                      details={"column": "col_1_id"}).to_dict()],
        "errors": [],
        "actions": [DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "Id"}).to_dict(),
                    DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": "col_1_id"}).to_dict()]
    }
Example 23
    def validate(self, X, y):
        """Check if the target or any of the features have no variance (1 unique value).

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features.
            y (ww.DataColumn, pd.Series, np.ndarray): The target data.

        Returns:
            dict: dict of warnings/errors corresponding to features or target with no variance.
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        unique_counts = X.nunique(dropna=self._dropnan).to_dict()
        any_nulls = (X.isnull().any()).to_dict()
        for col_name in unique_counts:
            message = self._check_for_errors(col_name, unique_counts[col_name],
                                             any_nulls[col_name])
            if not message:
                continue
            DataCheck._add_message(message, results)
            results["actions"].append(
                DataCheckAction(DataCheckActionCode.DROP_COL,
                                details={
                                    "column": col_name
                                }).to_dict())
        y_name = getattr(y, "name")
        if not y_name:
            y_name = "Y"
        target_message = self._check_for_errors(
            y_name, y.nunique(dropna=self._dropnan),
            y.isnull().any())
        if target_message:
            DataCheck._add_message(target_message, results)
        return results
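Example 24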
        "column": "feature"
    }).to_dict()
labels_0_unique = DataCheckError(message="Y has 0 unique value.",
                                 data_check_name=no_variance_data_check_name,
                                 message_code=DataCheckMessageCode.NO_VARIANCE,
                                 details={
                                     "column": "Y"
                                 }).to_dict()
labels_1_unique = DataCheckError(message="Y has 1 unique value.",
                                 data_check_name=no_variance_data_check_name,
                                 message_code=DataCheckMessageCode.NO_VARIANCE,
                                 details={
                                     "column": "Y"
                                 }).to_dict()
drop_feature_action = DataCheckAction(DataCheckActionCode.DROP_COL,
                                      details={
                                          "column": "feature"
                                      }).to_dict()

cases = [
    (all_distinct_X, all_distinct_y, True, {
        "warnings": [],
        "errors": [],
        "actions": []
    }),
    ([[1], [2], [3], [4]], [1, 2, 3, 2], False, {
        "warnings": [],
        "errors": [],
        "actions": []
    }),
    (np.arange(12).reshape(4, 3), [1, 2, 3], True, {
        "warnings": [],
        "errors": [],
        "actions": []
    }),
]
Example 25
    def validate(self, X, y):
        """Checks if the target data contains missing or invalid values.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
            y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

        Returns:
            dict (DataCheckError): dict with DataCheckErrors if any invalid values are found in the target data.

        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
            >>> y = pd.Series([0, 1, None, None])
            >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
            >>> assert target_check.validate(X, y) == {"errors": [{"message": "2 row(s) (50.0%) of target values are null",\
                                                                   "data_check_name": "InvalidTargetDataCheck",\
                                                                   "level": "error",\
                                                                   "code": "TARGET_HAS_NULL",\
                                                                   "details": {"num_null_rows": 2, "pct_null_rows": 50}}],\
                                                       "warnings": [],\
                                                       "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        if y is None:
            results["errors"].append(DataCheckError(message="Target is None",
                                                    data_check_name=self.name,
                                                    message_code=DataCheckMessageCode.TARGET_IS_NONE,
                                                    details={}).to_dict())
            return results

        y = infer_feature_types(y)
        is_supported_type = y.logical_type in numeric_and_boolean_ww + [ww.logical_types.Categorical]
        if not is_supported_type:
            results["errors"].append(DataCheckError(message="Target is unsupported {} type. Valid Woodwork logical types include: {}".format(
                                                        y.logical_type,
                                                        ", ".join([ltype.type_string for ltype in numeric_and_boolean_ww])),
                                                    data_check_name=self.name,
                                                    message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                                                    details={"unsupported_type": y.logical_type.type_string}).to_dict())
        y_df = _convert_woodwork_types_wrapper(y.to_series())
        null_rows = y_df.isnull()
        if null_rows.all():
            results["errors"].append(DataCheckError(message="Target is either empty or fully null.",
                                                    data_check_name=self.name,
                                                    message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
                                                    details={}).to_dict())
            return results
        elif null_rows.any():
            num_null_rows = null_rows.sum()
            pct_null_rows = null_rows.mean() * 100
            results["errors"].append(DataCheckError(message="{} row(s) ({}%) of target values are null".format(num_null_rows, pct_null_rows),
                                                    data_check_name=self.name,
                                                    message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                                    details={"num_null_rows": num_null_rows,
                                                             "pct_null_rows": pct_null_rows}).to_dict())
            impute_strategy = "mean" if is_regression(self.problem_type) else "most_frequent"
            results["actions"].append(DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                                      metadata={"column": None,
                                                                "is_target": True,
                                                                "impute_strategy": impute_strategy}).to_dict())

        value_counts = y_df.value_counts()
        unique_values = value_counts.index.tolist()

        if is_binary(self.problem_type) and len(value_counts) != 2:
            if self.n_unique is None:
                details = {"target_values": unique_values}
            else:
                details = {"target_values": unique_values[:min(self.n_unique, len(unique_values))]}
            results["errors"].append(DataCheckError(message="Binary class targets require exactly two unique values.",
                                                    data_check_name=self.name,
                                                    message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                                    details=details).to_dict())

        if self.problem_type == ProblemTypes.REGRESSION and "numeric" not in y.semantic_tags:
            results["errors"].append(DataCheckError(message="Target data type should be numeric for regression type problems.",
                                                    data_check_name=self.name,
                                                    message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                                                    details={}).to_dict())

        if is_multiclass(self.problem_type):
            if value_counts.min() <= 1:
                least_populated = value_counts[value_counts <= 1]
                details = {
                    "least_populated_class_labels":
                    least_populated.index.tolist()
                }
                results["errors"].append(
                    DataCheckError(
                        message=
                        "Target does not have at least two instances per class which is required for multiclass classification",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                        details=details).to_dict())
            if len(unique_values) <= 2:
                details = {"num_classes": len(unique_values)}
                results["errors"].append(
                    DataCheckError(
                        message=
                        "Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                        details=details).to_dict())

            num_class_to_num_value_ratio = len(unique_values) / len(y)
            if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold:
                details = {
                    "class_to_value_ratio": num_class_to_num_value_ratio
                }
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Target has a large number of unique values, could be regression type problem.",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                        details=details).to_dict())

        numeric_types = [ww.logical_types.Integer, ww.logical_types.Double]
        any_neg = not (y_df > 0).all() if y.logical_type in numeric_types else None
        if any_neg and self.objective.positive_only:
            details = {
                "Count of offending values": sum(val <= 0 for val in y_df.values.flatten())
            }
            results["errors"].append(
                DataCheckError(
                    message=f"Target has non-positive values, which is not supported for {self.objective.name}",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE,
                    details=details).to_dict())

        if X is not None:
            X = infer_feature_types(X)
            X_index = list(X.to_dataframe().index)
            y_index = list(y_df.index)
            X_length = len(X_index)
            y_length = len(y_index)
            if X_length != y_length:
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Input target and features have different lengths",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.MISMATCHED_LENGTHS,
                        details={
                            "features_length": X_length,
                            "target_length": y_length
                        }).to_dict())

            if X_index != y_index:
                if set(X_index) == set(y_index):
                    results["warnings"].append(
                        DataCheckWarning(
                            message=
                            "Input target and features have mismatched indices order",
                            data_check_name=self.name,
                            message_code=DataCheckMessageCode.
                            MISMATCHED_INDICES_ORDER,
                            details={}).to_dict())
                else:
                    index_diff_not_in_X = list(set(y_index) - set(X_index))[:10]
                    index_diff_not_in_y = list(set(X_index) - set(y_index))[:10]
                    results["warnings"].append(
                        DataCheckWarning(
                            message="Input target and features have mismatched indices",
                            data_check_name=self.name,
                            message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                            details={
                                "indices_not_in_features": index_diff_not_in_X,
                                "indices_not_in_target": index_diff_not_in_y
                            }).to_dict())

        return results
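
# Usage sketch (added; not from the original source). The validate method
# above appears to come from evalml's InvalidTargetDataCheck; the class name,
# import path, and constructor arguments below are assumptions made for
# illustration.
import pandas as pd
from evalml.data_checks import InvalidTargetDataCheck

target_check = InvalidTargetDataCheck(problem_type="binary",
                                      objective="Log Loss Binary")
results = target_check.validate(X=pd.DataFrame({"feature": [1, 2, 3, 4]}),
                                y=pd.Series([0, 1, None, 1]))
# A null target row should produce a TARGET_HAS_NULL warning plus an
# IMPUTE_COL action with impute_strategy="most_frequent", since binary is
# not a regression problem type.
for action in results["actions"]:
    print(action["code"], action["metadata"])
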
    def validate(self, X, y=None):
        """Checks if there are any columns in the input that are too unique in the case of classification
        problems or not unique enough in the case of regression problems.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.  Defaults to None.

        Returns:
            dict: dict with a DataCheckWarning if there are any too unique or not
                unique enough columns.

        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...    'regression_unique_enough': [float(x) for x in range(100)],
            ...    'regression_not_unique_enough': [float(1) for x in range(100)]
            ... })
            >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8)
            >>> assert uniqueness_check.validate(df) == {"errors": [],\
                                                         "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",\
                                                                 "data_check_name": "UniquenessDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "NOT_UNIQUE_ENOUGH",\
                                                                 "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}],\
                                                         "actions": [{"code": "DROP_COL",\
                                                                      "metadata": {"column": "regression_not_unique_enough"}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        res = X.apply(UniquenessDataCheck.uniqueness_score)

        if is_regression(self.problem_type):
            not_unique_enough_cols = list(res.index[res < self.threshold])
            results["warnings"].extend([
                DataCheckWarning(
                    message=warning_not_unique_enough.format(
                        col_name, self.problem_type),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
                    details={
                        "column": col_name,
                        "uniqueness_score": res.loc[col_name]
                    }).to_dict() for col_name in not_unique_enough_cols
            ])
            results["actions"].extend([
                DataCheckAction(action_code=DataCheckActionCode.DROP_COL,
                                metadata={
                                    "column": col_name
                                }).to_dict()
                for col_name in not_unique_enough_cols
            ])
        elif is_multiclass(self.problem_type):
            too_unique_cols = list(res.index[res > self.threshold])
            results["warnings"].extend([
                DataCheckWarning(message=warning_too_unique.format(
                    col_name, self.problem_type),
                                 data_check_name=self.name,
                                 message_code=DataCheckMessageCode.TOO_UNIQUE,
                                 details={
                                     "column": col_name,
                                     "uniqueness_score": res.loc[col_name]
                                 }).to_dict() for col_name in too_unique_cols
            ])
            results["actions"].extend([
                DataCheckAction(action_code=DataCheckActionCode.DROP_COL,
                                metadata={
                                    "column": col_name
                                }).to_dict() for col_name in too_unique_cols
            ])
        return results
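
# Sketch of the multiclass branch (added; not from the original source).
# For multiclass problems the threshold flips: columns scoring *above* the
# threshold are flagged as too unique. The import path is an assumption.
import pandas as pd
from evalml.data_checks import UniquenessDataCheck

df = pd.DataFrame({
    "id_like": [float(x) for x in range(100)],  # ~100 unique values
    "category": [x % 3 for x in range(100)]     # only three distinct values
})
uniqueness_check = UniquenessDataCheck(problem_type="multiclass", threshold=0.8)
results = uniqueness_check.validate(df)
# Expect a TOO_UNIQUE warning and a DROP_COL action for "id_like" only.
print([w["details"]["column"] for w in results["warnings"]])
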
def test_target_leakage_data_check_input_formats_pearson():
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8,
                                           method='pearson')

    # test empty pd.DataFrame, empty pd.Series
    assert leakage_check.validate(pd.DataFrame(), pd.Series()) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }

    y = pd.Series([1, 0, 1, 1])
    X = pd.DataFrame()
    X["a"] = y * 3
    X["b"] = y - 1
    X["c"] = y / 10
    X["d"] = ~y
    X["e"] = [0, 0, 0, 0]
    y = y.astype(bool)

    expected = {
        "warnings": [
            DataCheckWarning(
                message="Column 'a' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "a"}).to_dict(),
            DataCheckWarning(
                message="Column 'b' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "b"}).to_dict(),
            DataCheckWarning(
                message="Column 'c' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "c"}).to_dict(),
            DataCheckWarning(
                message="Column 'd' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "d"}).to_dict()
        ],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "a"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "b"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "c"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "d"}).to_dict()
        ]
    }

    # test X as np.ndarray
    assert leakage_check.validate(X.values, y) == {
        "warnings": [
            DataCheckWarning(
                message="Column '0' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": 0}).to_dict(),
            DataCheckWarning(
                message="Column '1' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": 1}).to_dict(),
            DataCheckWarning(
                message="Column '2' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": 2}).to_dict(),
            DataCheckWarning(
                message="Column '3' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": 3}).to_dict()
        ],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": 0}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": 1}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": 2}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": 3}).to_dict()
        ]
    }

    # test X as ww.DataTable, y as ww.DataColumn
    assert leakage_check.validate(ww.DataTable(X),
                                  ww.DataColumn(y)) == expected

    # test y as np.ndarray
    assert leakage_check.validate(X, y.values) == expected
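
# Why columns 'a' through 'd' are flagged above (added note): each is an
# exact linear function of y, so its absolute Pearson correlation is 1.0,
# while the constant column 'e' has undefined (NaN) correlation and never
# crosses the threshold. A standalone illustration with plain pandas:
import pandas as pd

y = pd.Series([1, 0, 1, 1])
X = pd.DataFrame({"a": y * 3, "b": y - 1, "c": y / 10, "d": ~y, "e": [0, 0, 0, 0]})
correlations = X.corrwith(y).abs()
print(correlations[correlations >= 0.8].index.tolist())  # ['a', 'b', 'c', 'd']
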
def test_target_leakage_data_check_warnings_pearson():
    y = pd.Series([1, 0, 1, 1])
    X = pd.DataFrame()
    X["a"] = y * 3
    X["b"] = y - 1
    X["c"] = y / 10
    X["d"] = ~y
    X["e"] = [0, 0, 0, 0]
    y = y.astype(bool)

    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5,
                                           method='pearson')
    assert leakage_check.validate(X, y) == {
        "warnings": [
            DataCheckWarning(
                message="Column 'a' is 50.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "a"}).to_dict(),
            DataCheckWarning(
                message="Column 'b' is 50.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "b"}).to_dict(),
            DataCheckWarning(
                message="Column 'c' is 50.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "c"}).to_dict(),
            DataCheckWarning(
                message="Column 'd' is 50.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "d"}).to_dict()
        ],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "a"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "b"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "c"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "d"}).to_dict()
        ]
    }

    y = ["a", "b", "a", "a"]
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5,
                                           method='pearson')
    assert leakage_check.validate(X, y) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }
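
# Note (added): with method='pearson' and a non-numeric target, the check
# evidently returns empty results rather than raising, since Pearson
# correlation is only defined for numeric data. A quick way to see why:
import pandas as pd

print(pd.api.types.is_numeric_dtype(pd.Series(["a", "b", "a", "a"])))  # False
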
def test_target_leakage_regression():
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    # test empty pd.DataFrame, empty pd.Series
    assert leakage_check.validate(pd.DataFrame(), pd.Series()) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }

    y = pd.Series([
        0.4, 0.1, 2.3, 4.3, 2.2, 1.8, 3.7, 3.6, 2.4, 0.9, 3.1, 2.8, 4.1, 1.6,
        1.2
    ])
    X = pd.DataFrame()
    X["a"] = y * 3
    X["b"] = y - 1
    X["c"] = y / 10
    X["d"] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    X["e"] = [
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
        "o"
    ]

    expected = {
        "warnings": [
            DataCheckWarning(
                message="Column 'a' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "a"}).to_dict(),
            DataCheckWarning(
                message="Column 'b' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "b"}).to_dict(),
            DataCheckWarning(
                message="Column 'c' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "c"}).to_dict(),
            DataCheckWarning(
                message="Column 'e' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "e"}).to_dict()
        ],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "a"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "b"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "c"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "e"}).to_dict()
        ]
    }

    # test X as ww.DataTable, y as ww.DataColumn
    assert leakage_check.validate(ww.DataTable(X),
                                  ww.DataColumn(y)) == expected

    # test y as np.ndarray
    assert leakage_check.validate(X, y.values) == expected
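
# Note (added): Pearson correlation alone could not flag the string column
# 'e', yet the test above expects it to be flagged -- so with no method
# specified, the check evidently falls back to a dependence measure that
# handles non-numeric data (mutual information, treated here as an
# assumption). The purely numeric relationships are easy to verify:
import pandas as pd

y = pd.Series([0.4, 0.1, 2.3, 4.3, 2.2, 1.8, 3.7, 3.6, 2.4, 0.9, 3.1, 2.8,
               4.1, 1.6, 1.2])
X_num = pd.DataFrame({"a": y * 3, "b": y - 1, "c": y / 10})
print(X_num.corrwith(y).abs().round(2))  # a, b, and c are all exactly 1.0
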
def test_target_leakage_types():
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    y = pd.Series([1, 0, 1, 1])
    X = pd.DataFrame()
    X["a"] = ["a", "b", "a", "a"]
    X["b"] = y - 1
    X["c"] = [
        datetime.strptime("2015", "%Y"),
        datetime.strptime("2016", "%Y"),
        datetime.strptime("2015", "%Y"),
        datetime.strptime("2015", "%Y")
    ]
    X["d"] = ~y
    X["e"] = [0, 0, 0, 0]
    y = y.astype(bool)

    expected = {
        "warnings": [
            DataCheckWarning(
                message="Column 'a' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "a"}).to_dict(),
            DataCheckWarning(
                message="Column 'b' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "b"}).to_dict(),
            DataCheckWarning(
                message="Column 'c' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "c"}).to_dict(),
            DataCheckWarning(
                message="Column 'd' is 80.0% or more correlated with the target",
                data_check_name=target_leakage_data_check_name,
                message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                details={"column": "d"}).to_dict()
        ],
        "errors": [],
        "actions": [
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "a"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "b"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "c"}).to_dict(),
            DataCheckAction(DataCheckActionCode.DROP_COL,
                            metadata={"column": "d"}).to_dict()
        ]
    }

    assert leakage_check.validate(X, y) == expected
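
# Note (added): the datetime column 'c' is flagged above because timestamps
# are numeric under the hood (nanoseconds since the epoch), so a correlation
# measure applies once they are viewed as integers. This sketch illustrates
# the idea; it is not necessarily how the check performs the conversion.
import pandas as pd
from datetime import datetime

y = pd.Series([1, 0, 1, 1])
c = pd.Series([datetime.strptime(year, "%Y")
               for year in ["2015", "2016", "2015", "2015"]])
print(c.astype("int64").corr(y))  # -1.0: the lone '2016' aligns with the lone 0
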