Ejemplo n.º 1
0
def test_default_data_checks_time_series_regression():
    """Check that both regression problem types get the same default data checks.

    ``DefaultDataChecks`` is built once for "regression" and once for
    "time series regression", each with the default primary search objective
    of that problem type. The classes of the resulting checks are compared
    as ordered lists.
    """
    def _default_check_classes(problem_type):
        # Classes (not instances) are compared.
        checks = DefaultDataChecks(
            problem_type,
            get_default_primary_search_objective(problem_type)).data_checks
        return [type_of_check.__class__ for type_of_check in checks]

    regression_classes = _default_check_classes("regression")
    ts_regression_classes = _default_check_classes("time series regression")
    assert regression_classes == ts_regression_classes
Ejemplo n.º 2
0
    def _validate_data_checks(self, data_checks):
        """Normalize the data_checks parameter into a DataChecks instance.

        Arguments:
            data_checks (DataChecks, list(DataCheck), str, None): Input to validate.
                A DataChecks instance is returned as-is, a list is wrapped in
                AutoMLDataChecks, "auto" builds DefaultDataChecks from this
                object's problem type, objective and number of splits, and
                "disabled" or None yields EmptyDataChecks. Any other value is
                passed to the DataChecks constructor.

        Returns:
            An instance of DataChecks used to perform checks before search.

        Raises:
            ValueError: If data_checks is a string other than 'auto' or 'disabled'.
        """
        # Non-string inputs are handled first, each with an early return.
        if isinstance(data_checks, DataChecks):
            return data_checks
        if isinstance(data_checks, list):
            return AutoMLDataChecks(data_checks)
        if data_checks is None:
            return EmptyDataChecks()
        if not isinstance(data_checks, str):
            return DataChecks(data_checks)

        # From here on data_checks is a string keyword.
        if data_checks == "disabled":
            return EmptyDataChecks()
        if data_checks != "auto":
            raise ValueError(
                "If data_checks is a string, it must be either 'auto' or 'disabled'. "
                f"Received '{data_checks}'.")
        return DefaultDataChecks(
            problem_type=self.problem_type,
            objective=self.objective,
            n_splits=self.data_splitter.get_n_splits())
Ejemplo n.º 3
0
def test_default_data_checks_null_rows():
    """Validate two all-null feature columns with the default regression data checks.

    The features are two columns of five ``None`` values, and the target has
    one NaN. The complete validation result (warnings, errors and actions) is
    compared against a hand-built expected dictionary.
    """
    class SeriesWrap():
        # Wraps a Series so that ``==`` returns one bool: element-wise
        # ``eq`` between the two wrapped series, reduced with ``all``.
        def __init__(self, series):
            self.series = series

        def __eq__(self, series_2):
            return all(self.series.eq(series_2.series))

    X = pd.DataFrame({
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
    })
    y = pd.Series([0, 1, np.nan, 1, 0])
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    highly_null_rows = SeriesWrap(pd.Series([1.0, 1.0, 1.0, 1.0, 1.0]))

    expected_warnings = [
        DataCheckWarning(message="5 out of 5 rows are more than 95.0% null",
                         data_check_name="HighlyNullDataCheck",
                         message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
                         details={"pct_null_cols": highly_null_rows}).to_dict(),
        DataCheckWarning(message="Column 'all_null' is 95.0% or more null",
                         data_check_name="HighlyNullDataCheck",
                         message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                         details={"column": 'all_null', "pct_null_rows": 1.0}).to_dict(),
        DataCheckWarning(message="Column 'also_all_null' is 95.0% or more null",
                         data_check_name="HighlyNullDataCheck",
                         message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                         details={"column": 'also_all_null', "pct_null_rows": 1.0}).to_dict(),
    ]
    expected_errors = [
        DataCheckError(message="1 row(s) (20.0%) of target values are null",
                       data_check_name="InvalidTargetDataCheck",
                       message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                       details={"num_null_rows": 1, "pct_null_rows": 20.0}).to_dict(),
        DataCheckError(message="all_null has 0 unique value.",
                       data_check_name="NoVarianceDataCheck",
                       message_code=DataCheckMessageCode.NO_VARIANCE,
                       details={"column": "all_null"}).to_dict(),
        DataCheckError(message="also_all_null has 0 unique value.",
                       data_check_name="NoVarianceDataCheck",
                       message_code=DataCheckMessageCode.NO_VARIANCE,
                       details={"column": "also_all_null"}).to_dict(),
    ]
    expected_actions = [
        DataCheckAction(DataCheckActionCode.DROP_ROWS, metadata={"rows": [0, 1, 2, 3, 4]}).to_dict(),
        DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'all_null'}).to_dict(),
        DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'also_all_null'}).to_dict(),
        DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                        metadata={"column": None, "is_target": True, "impute_strategy": "mean"}).to_dict(),
    ]
    expected = {
        "warnings": expected_warnings,
        "errors": expected_errors,
        "actions": expected_actions,
    }

    validation_results = data_checks.validate(X, y)
    # Wrap the returned Series in place so the dictionary comparison below
    # goes through SeriesWrap.__eq__ and produces a single bool.
    first_warning_details = validation_results['warnings'][0]['details']
    first_warning_details['pct_null_cols'] = SeriesWrap(first_warning_details['pct_null_cols'])
    assert validation_results == expected
Ejemplo n.º 4
0
def test_default_data_checks_regression(input_type):
    """Run the default regression data checks and compare the full result.

    Arguments:
        input_type (str): When equal to "ww", the features and targets are
            converted to woodwork ``DataTable`` / ``DataColumn`` objects
            before validation; otherwise the pandas objects are used directly.

    Note:
        ``messages`` is not defined in this function; it is read from the
        enclosing module scope.
    """
    feature_columns = {
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100]
    }
    X = pd.DataFrame(feature_columns)
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X, y, y_no_variance = ww.DataTable(X), ww.DataColumn(y), ww.DataColumn(y_no_variance)

    null_leakage = [
        DataCheckWarning(
            message="Column 'lots_of_null' is 95.0% or more correlated with the target",
            data_check_name="TargetLeakageDataCheck",
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={"column": "lots_of_null"}).to_dict()
    ]

    # Expected outcome for the target containing a NaN; used for both ways of
    # constructing the checks below.
    expected_with_nan_target = {"warnings": messages[:3], "errors": messages[3:]}

    data_checks = DefaultDataChecks(
        "regression", get_default_primary_search_objective("regression"))
    assert data_checks.validate(X, y) == expected_with_nan_target

    # Skip Invalid Target
    target_no_variance_error = DataCheckError(
        message="Y has 1 unique value.",
        data_check_name="NoVarianceDataCheck",
        message_code=DataCheckMessageCode.NO_VARIANCE,
        details={"column": "Y"}).to_dict()
    expected_constant_target = {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:] + [target_no_variance_error],
    }
    assert data_checks.validate(X, y_no_variance) == expected_constant_target

    # Same check classes, assembled through DataChecks with explicit
    # parameters for InvalidTargetDataCheck only.
    invalid_target_params = {
        "problem_type": "regression",
        "objective": get_default_primary_search_objective("regression")
    }
    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
        {"InvalidTargetDataCheck": invalid_target_params})
    assert data_checks.validate(X, y) == expected_with_nan_target
Ejemplo n.º 5
0
def test_default_data_checks_regression(input_type):
    """Run the default regression data checks and compare warnings, errors and actions.

    Arguments:
        input_type (str): When equal to "ww", the features and targets are
            converted to woodwork ``DataTable`` / ``DataColumn`` objects
            before validation; otherwise the pandas objects are used directly.

    Note:
        ``messages`` and ``expected_actions`` are not defined in this
        function; they are read from the enclosing module scope.
    """
    X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"],
                      'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None],
                      'no_null': [1, 2, 3, 5, 5],
                      'id': [0, 1, 2, 3, 4],
                      'has_label_leakage': [100, 200, 100, 200, 100],
                      'natural_language_nan': [None,
                                               "string_that_is_long_enough_for_natural_language_1",
                                               "string_that_is_long_enough_for_natural_language_2",
                                               "string_that_is_long_enough_for_natural_language_3",
                                               "string_that_is_long_enough_for_natural_language_4"],
                      'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))})
    # Set the first datetime to missing with a single .loc assignment.
    # Chained indexing (X[col][0] = ...) may write to a temporary copy and
    # does nothing under pandas Copy-on-Write.
    X.loc[0, 'nan_dt_col'] = None
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    id_leakage_warning = [DataCheckWarning(message="Column 'id' is 95.0% or more correlated with the target",
                                           data_check_name="TargetLeakageDataCheck",
                                           message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                           details={"column": "id"}).to_dict()]
    nan_dt_leakage_warning = [DataCheckWarning(message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
                                               data_check_name="TargetLeakageDataCheck",
                                               message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                               details={"column": "nan_dt_col"}).to_dict()]

    impute_action = DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, 'impute_strategy': 'mean'}).to_dict()
    nan_dt_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'nan_dt_col'}).to_dict()
    # Replace the entry at index 3 of the shared expected_actions with the
    # nan_dt_col drop and the target imputation expected for this target.
    expected_actions_with_drop_and_impute = expected_actions[:3] + [nan_dt_action, impute_action] + expected_actions[4:]
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [DataCheckError(message="Y has 1 unique value.",
                                                  data_check_name="NoVarianceDataCheck",
                                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                                  details={"column": "Y"}).to_dict()] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:]
    }

    # Same check classes, assembled through DataChecks with explicit
    # parameters for InvalidTargetDataCheck only; the same result is expected.
    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}
Ejemplo n.º 6
0
def test_errors_warnings_in_invalid_target_data_check(objective, ts_data):
    """Check the InvalidTargetDataCheck error for a non-positive target value.

    Arguments:
        objective: Objective passed to ``DefaultDataChecks`` and interpolated
            into the expected error message.
        ts_data: Pair ``(X, y)``; the first target entry is overwritten with
            -1 before ``y`` is converted to a pandas Series.
    """
    X, y = ts_data
    y[0] = -1
    y = pd.Series(y)

    offending_count = sum(val <= 0 for val in y.values.flatten())
    expected_error = DataCheckError(
        message=f"Target has non-positive values which is not supported for {objective}",
        data_check_name="InvalidTargetDataCheck",
        message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE,
        details={"Count of offending values": offending_count}).to_dict()
    expected_result = {"warnings": [], "errors": [expected_error], "actions": []}

    all_default_checks = DefaultDataChecks(problem_type="time series regression",
                                           objective=objective).data_checks
    for candidate in all_default_checks:
        # Only the invalid-target check is exercised; others are skipped.
        if candidate.name != "InvalidTargetDataCheck":
            continue
        assert candidate.validate(X, y) == expected_result
Ejemplo n.º 7
0
def test_default_data_checks_classification(input_type):
    """Run the default binary and multiclass data checks and compare the full result.

    Arguments:
        input_type (str): When equal to "ww", the features and targets are
            converted to woodwork ``DataTable`` / ``DataColumn`` objects
            before validation; otherwise the pandas objects are used directly.

    Note:
        ``messages`` and ``expected_actions`` are not defined in this
        function; they are read from the enclosing module scope.
    """
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 4, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100],
        'natural_language_nan': [
            None, "string_that_is_long_enough_for_natural_language_1",
            "string_that_is_long_enough_for_natural_language_2",
            "string_that_is_long_enough_for_natural_language_3",
            "string_that_is_long_enough_for_natural_language_4"
        ],
        'nan_dt_col':
        pd.Series(pd.date_range('20200101', periods=5))
    })
    # Set the first datetime to missing with a single .loc assignment.
    # Chained indexing (X[col][0] = ...) may write to a temporary copy and
    # does nothing under pandas Copy-on-Write.
    X.loc[0, 'nan_dt_col'] = None

    y = pd.Series([0, 1, np.nan, 1, 0])
    y_multiclass = pd.Series([0, 1, np.nan, 2, 0])
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_multiclass = ww.DataColumn(y_multiclass)

    # binary
    data_checks = DefaultDataChecks(
        "binary", get_default_primary_search_objective("binary"))
    imbalance = [
        DataCheckError(
            message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 1.0]",
            data_check_name="ClassImbalanceDataCheck",
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={
                "target_values": [0.0, 1.0]
            }).to_dict()
    ]

    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:] + imbalance,
        "actions": expected_actions
    }

    # Same check classes assembled through DataChecks with explicit
    # parameters for InvalidTargetDataCheck only; the expected errors here
    # do not include the class imbalance error.
    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, {
            "InvalidTargetDataCheck": {
                "problem_type": "binary",
                "objective": get_default_primary_search_objective("binary")
            }
        })
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:],
        "actions": expected_actions
    }

    # multiclass
    imbalance = [
        DataCheckError(
            message="The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 2.0, 1.0]",
            data_check_name="ClassImbalanceDataCheck",
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={
                "target_values": [0.0, 2.0, 1.0]
            }).to_dict()
    ]
    min_2_class_count = [
        DataCheckError(
            message="Target does not have at least two instances per class which is required for multiclass classification",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
            details={
                "least_populated_class_labels": [2.0, 1.0]
            }).to_dict()
    ]
    high_class_to_sample_ratio = [
        DataCheckWarning(
            message="Target has a large number of unique values, could be regression type problem.",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
            details={
                'class_to_value_ratio': 0.6
            }).to_dict()
    ]
    data_checks = DefaultDataChecks(
        "multiclass", get_default_primary_search_objective("multiclass"))
    # The per-class count error is expected right after messages[3] and
    # before the remaining shared errors.
    assert data_checks.validate(X, y_multiclass) == {
        "warnings": messages[:3] + high_class_to_sample_ratio,
        "errors": [messages[3]] + min_2_class_count + messages[4:] + imbalance,
        "actions": expected_actions
    }

    # As in the binary case, the explicitly parameterized DataChecks is
    # expected to report no class imbalance error.
    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, {
            "InvalidTargetDataCheck": {
                "problem_type": "multiclass",
                "objective": get_default_primary_search_objective("multiclass")
            }
        })
    assert data_checks.validate(X, y_multiclass) == {
        "warnings": messages[:3] + high_class_to_sample_ratio,
        "errors": [messages[3]] + min_2_class_count + messages[4:],
        "actions": expected_actions
    }