def test_invalid_target_data_check_valid_labels_for_nonnegative_objectives(objective):
    """Strictly positive targets produce no messages for a positive-only objective."""
    features = pd.DataFrame({'column_one': [100, 100, 200, 300, 100, 200, 100] * 25})
    target = pd.Series([2, 2, 3, 3, 1, 1, 1] * 25)
    check_params = {"InvalidTargetDataCheck": {"problem_type": "multiclass", "objective": objective}}
    checks = DataChecks([InvalidTargetDataCheck], check_params)
    # No warnings, errors, or actions expected when every label is > 0.
    assert checks.validate(features, target) == {"warnings": [], "errors": [], "actions": []}
def test_data_checks_do_not_duplicate_actions(X_y_binary):
    """Identical actions reported by multiple checks appear only once in the result."""
    X, y = X_y_binary
    # Build the action dict once; it serves both as mock output and as the expectation.
    drop_action = DataCheckAction(DataCheckActionCode.DROP_COL,
                                  metadata={"column": 'col_to_drop'}).to_dict()

    class MockDataCheck(DataCheck):
        def validate(self, X, y):
            return {"warnings": [], "errors": [], "actions": [drop_action]}

    class MockDataCheckWithSameAction(DataCheck):
        def validate(self, X, y):
            return {"warnings": [], "errors": [], "actions": []}

    checks = DataChecks(data_checks=[MockDataCheck, MockDataCheckWithSameAction])
    # Check duplicate actions are returned once.
    assert checks.validate(X, y) == {"warnings": [], "errors": [], "actions": [drop_action]}
def test_invalid_target_data_check_invalid_labels_for_nonnegative_objective_names(objective):
    """Non-positive target values must raise an error for positive-only objectives."""

    def expected_result(target):
        # One error counting how many target values are <= 0; no warnings or actions.
        return {
            "warnings": [],
            "errors": [DataCheckError(
                message=f"Target has non-positive values which is not supported for {objective}",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE,
                details={"Count of offending values": sum(val <= 0 for val in target.values.flatten())}).to_dict()],
            "actions": [],
        }

    # Multiclass path, exercised via the DataChecks collection.
    features = pd.DataFrame({'column_one': [100, 200, 100, 200, 200, 100, 200, 100] * 25})
    target = pd.Series([2, 2, 3, 3, -1, -1, 1, 1] * 25)
    checks = DataChecks([InvalidTargetDataCheck],
                        {"InvalidTargetDataCheck": {"problem_type": "multiclass",
                                                    "objective": objective}})
    assert checks.validate(features, target) == expected_result(target)

    # Regression path, exercised via a standalone check instance.
    features = pd.DataFrame({'column_one': [100, 200, 100, 200, 100]})
    target = pd.Series([2, 3, 0, 1, 1])
    standalone_check = InvalidTargetDataCheck(problem_type="regression", objective=objective)
    assert standalone_check.validate(features, target) == expected_result(target)
def test_invalid_target_data_check_invalid_labels_for_objectives(time_series_core_objectives):
    """Objectives without a positive-only restriction accept negative/zero targets."""
    # Only objectives that allow non-positive values are exercised here.
    permissive_objectives = [obj for obj in time_series_core_objectives if not obj.positive_only]

    features = pd.DataFrame({'column_one': [100, 200, 100, 200, 200, 100, 200, 100] * 25})
    target = pd.Series([2, 2, 3, 3, -1, -1, 1, 1] * 25)
    for objective in permissive_objectives:
        checks = DataChecks([InvalidTargetDataCheck],
                            {"InvalidTargetDataCheck": {"problem_type": "multiclass",
                                                        "objective": objective}})
        # NOTE(review): unlike sibling tests in this file, the expected dict has no
        # "actions" key — confirm against DataChecks.validate's return shape here.
        assert checks.validate(features, target) == {"warnings": [], "errors": []}

    features = pd.DataFrame({'column_one': [100, 200, 100, 200, 100]})
    target = pd.Series([2, 3, 0, 1, 1])
    for objective in permissive_objectives:
        standalone_check = InvalidTargetDataCheck(problem_type="regression",
                                                  objective=objective)
        assert standalone_check.validate(features, target) == {"warnings": [], "errors": []}
def test_data_checks(X_y_binary):
    """Results from several data checks are merged in registration order."""
    X, y = X_y_binary

    class MockDataCheck(DataCheck):
        def validate(self, X, y):
            return {"warnings": [], "errors": [], "actions": []}

    class MockDataCheckWarning(DataCheck):
        def validate(self, X, y):
            warning = DataCheckWarning(message="warning one", data_check_name=self.name,
                                       message_code=None).to_dict()
            return {"warnings": [warning], "errors": [], "actions": []}

    class MockDataCheckError(DataCheck):
        def validate(self, X, y):
            error = DataCheckError(message="error one", data_check_name=self.name,
                                   message_code=None).to_dict()
            return {"warnings": [], "errors": [error], "actions": []}

    class MockDataCheckErrorAndWarning(DataCheck):
        def validate(self, X, y):
            warning = DataCheckWarning(message="warning two", data_check_name=self.name,
                                       message_code=None).to_dict()
            error = DataCheckError(message="error two", data_check_name=self.name,
                                   message_code=None).to_dict()
            return {"warnings": [warning], "errors": [error], "actions": []}

    checks = DataChecks(data_checks=[MockDataCheck, MockDataCheckWarning,
                                     MockDataCheckError, MockDataCheckErrorAndWarning])
    # Warnings and errors accumulate in the order the checks were registered.
    expected = {
        "warnings": [DataCheckWarning(message="warning one",
                                      data_check_name="MockDataCheckWarning").to_dict(),
                     DataCheckWarning(message="warning two",
                                      data_check_name="MockDataCheckErrorAndWarning").to_dict()],
        "errors": [DataCheckError(message="error one",
                                  data_check_name="MockDataCheckError").to_dict(),
                   DataCheckError(message="error two",
                                  data_check_name="MockDataCheckErrorAndWarning").to_dict()],
        "actions": [],
    }
    assert checks.validate(X, y) == expected
def _validate_data_checks(self, data_checks):
    """Normalize the data_checks argument into a DataChecks instance.

    Arguments:
        data_checks (DataChecks, list(DataCheck), str, None): Input to validate.
            If not of the right type, raise an exception.

    Returns:
        An instance of DataChecks used to perform checks before search.
    """
    # Guard-clause dispatch on the accepted input forms.
    if isinstance(data_checks, DataChecks):
        return data_checks
    if isinstance(data_checks, list):
        return AutoMLDataChecks(data_checks)
    if isinstance(data_checks, str):
        if data_checks == "auto":
            return DefaultDataChecks(problem_type=self.problem_type,
                                     objective=self.objective,
                                     n_splits=self.data_splitter.get_n_splits())
        if data_checks == "disabled":
            return EmptyDataChecks()
        raise ValueError(
            "If data_checks is a string, it must be either 'auto' or 'disabled'. "
            f"Received '{data_checks}'.")
    if data_checks is None:
        return EmptyDataChecks()
    return DataChecks(data_checks)
# NOTE(review): this function has the same name as a later, more complete
# test_default_data_checks_regression in this module, so this definition is
# shadowed at import time and pytest never runs it. It also asserts validate()
# results without an "actions" key, unlike the later version — it looks like a
# stale copy; confirm which version is current and rename or delete this one.
def test_default_data_checks_regression(input_type):
    """Exercise DefaultDataChecks on a small regression dataset (pandas and ww inputs)."""
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100]
    })
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)
    # Optionally wrap inputs in woodwork containers to test both input types.
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    # Extra leakage warning expected only for the constant-target case below.
    null_leakage = [
        DataCheckWarning(
            message=
            "Column 'lots_of_null' is 95.0% or more correlated with the target",
            data_check_name="TargetLeakageDataCheck",
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={
                "column": "lots_of_null"
            }).to_dict()
    ]
    data_checks = DefaultDataChecks(
        "regression", get_default_primary_search_objective("regression"))
    # `messages` is a module-level fixture of expected messages (defined elsewhere).
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:]
    }
    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:] + [
            DataCheckError(message="Y has 1 unique value.",
                           data_check_name="NoVarianceDataCheck",
                           message_code=DataCheckMessageCode.NO_VARIANCE,
                           details={
                               "column": "Y"
                           }).to_dict()
        ]
    }
    # Building the same checks explicitly should give identical results.
    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, {
            "InvalidTargetDataCheck": {
                "problem_type": "regression",
                "objective": get_default_primary_search_objective("regression")
            }
        })
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:]
    }
def test_default_data_checks_regression(input_type):
    """Exercise DefaultDataChecks on a regression dataset, including leakage warnings
    for the id and NaN-datetime columns and the expected impute/drop actions."""
    X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"],
                      'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None],
                      'no_null': [1, 2, 3, 5, 5],
                      'id': [0, 1, 2, 3, 4],
                      'has_label_leakage': [100, 200, 100, 200, 100],
                      'natural_language_nan': [None,
                                               "string_that_is_long_enough_for_natural_language_1",
                                               "string_that_is_long_enough_for_natural_language_2",
                                               "string_that_is_long_enough_for_natural_language_3",
                                               "string_that_is_long_enough_for_natural_language_4"],
                      'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))})
    # Use .loc instead of chained indexing (X['nan_dt_col'][0] = ...) so the
    # write is guaranteed to hit the frame rather than a possible copy.
    X.loc[0, 'nan_dt_col'] = None
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)
    # Optionally wrap inputs in woodwork containers to test both input types.
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    # Extra leakage warning expected only for the constant-target case below.
    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    id_leakage_warning = [DataCheckWarning(message="Column 'id' is 95.0% or more correlated with the target",
                                           data_check_name="TargetLeakageDataCheck",
                                           message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                           details={"column": "id"}).to_dict()]
    nan_dt_leakage_warning = [DataCheckWarning(message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
                                               data_check_name="TargetLeakageDataCheck",
                                               message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                               details={"column": "nan_dt_col"}).to_dict()]
    impute_action = DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                    metadata={"column": None, "is_target": True,
                                              'impute_strategy': 'mean'}).to_dict()
    nan_dt_action = DataCheckAction(DataCheckActionCode.DROP_COL,
                                    metadata={"column": 'nan_dt_col'}).to_dict()
    # BUG FIX: the original ended this line with a bare trailing "+" outside any
    # bracket, which is a SyntaxError; wrap the expression in parentheses so the
    # continuation is legal.
    expected_actions_with_drop_and_impute = (expected_actions[:3]
                                             + [nan_dt_action, impute_action]
                                             + expected_actions[4:])
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
        "errors": messages[3:],
        "actions": expected_actions_with_drop_and_impute}
    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [DataCheckError(message="Y has 1 unique value.",
                                                  data_check_name="NoVarianceDataCheck",
                                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                                  details={"column": "Y"}).to_dict()] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:]
    }
    # Building the same checks explicitly should give identical results.
    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
        "errors": messages[3:],
        "actions": expected_actions_with_drop_and_impute}
def test_invalid_target_data_check_initialize_with_none_objective():
    """Passing objective=None to InvalidTargetDataCheck fails at DataChecks init."""
    check_params = {"InvalidTargetDataCheck": {"problem_type": "multiclass",
                                               "objective": None}}
    with pytest.raises(DataCheckInitError, match="Encountered the following error"):
        DataChecks([InvalidTargetDataCheck], check_params)
def test_data_checks_drop_index(X_y_binary):
    """The ww.DataTable index column is not passed through to individual checks."""
    X, y = X_y_binary
    frame = pd.DataFrame(X)
    frame['index_col'] = pd.Series(range(len(frame)))
    table = ww.DataTable(frame).set_index('index_col')

    class MockDataCheck(DataCheck):
        def validate(self, X, y):
            return {"warnings": [], "errors": [], "actions": []}

    # Sanity check: the mock runs against the indexed table directly.
    assert MockDataCheck().validate(table, y)
    MockDataCheck.validate = MagicMock()
    DataChecks([MockDataCheck, MockDataCheck, MockDataCheck]).validate(table, y)
    for call in MockDataCheck.validate.call_args_list:
        # First positional argument is the X each check actually received.
        assert 'index_col' not in call[0][0].columns
def test_data_checks_init_from_classes():
    """DataChecks instantiates check classes using per-check parameter dicts."""

    def make_mock_data_check(check_name):
        # Factory so each mock class carries its own `name` for param lookup.
        class MockCheck(DataCheck):
            name = check_name

            def __init__(self, foo, bar, baz=3):
                self.foo = foo
                self.bar = bar
                self.baz = baz

            def validate(self, X, y=None):
                """Mock validate."""
        return MockCheck

    checks = DataChecks(
        [make_mock_data_check("check_1"), make_mock_data_check("check_2")],
        data_check_params={"check_1": {"foo": 1, "bar": 2},
                           "check_2": {"foo": 3, "bar": 1, "baz": 4}})
    first, second = checks.data_checks
    # check_1 uses the default baz=3; check_2 overrides every parameter.
    assert (first.foo, first.bar, first.baz) == (1, 2, 3)
    assert (second.foo, second.bar, second.baz) == (3, 1, 4)
def test_data_checks_raises_value_errors_on_init(classes, params, expected_exception,
                                                 expected_message):
    """Invalid (classes, params) combinations raise the parametrized exception."""
    with pytest.raises(expected_exception, match=expected_message):
        DataChecks(classes, params)
def test_default_data_checks_classification(input_type):
    """Exercise DefaultDataChecks on binary and multiclass targets (pandas and ww inputs)."""
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 4, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100],
        'natural_language_nan': [
            None, "string_that_is_long_enough_for_natural_language_1",
            "string_that_is_long_enough_for_natural_language_2",
            "string_that_is_long_enough_for_natural_language_3",
            "string_that_is_long_enough_for_natural_language_4"
        ],
        'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))
    })
    # NOTE(review): chained-indexing write — works here but .loc would be safer.
    X['nan_dt_col'][0] = None
    y = pd.Series([0, 1, np.nan, 1, 0])
    y_multiclass = pd.Series([0, 1, np.nan, 2, 0])
    # Optionally wrap inputs in woodwork containers to test both input types.
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_multiclass = ww.DataColumn(y_multiclass)
    data_checks = DefaultDataChecks(
        "binary", get_default_primary_search_objective("binary"))
    # Expected class-imbalance error for the binary target.
    imbalance = [
        DataCheckError(
            message=
            "The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 1.0]",
            data_check_name="ClassImbalanceDataCheck",
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={
                "target_values": [0.0, 1.0]
            }).to_dict()
    ]
    # `messages` / `expected_actions` are module-level fixtures defined elsewhere.
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:] + imbalance,
        "actions": expected_actions
    }
    # Building the same checks explicitly skips the class-imbalance check.
    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, {
            "InvalidTargetDataCheck": {
                "problem_type": "binary",
                "objective": get_default_primary_search_objective("binary")
            }
        })
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:],
        "actions": expected_actions
    }
    # multiclass
    imbalance = [
        DataCheckError(
            message=
            "The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0.0, 2.0, 1.0]",
            data_check_name="ClassImbalanceDataCheck",
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={
                "target_values": [0.0, 2.0, 1.0]
            }).to_dict()
    ]
    # Classes 2.0 and 1.0 appear fewer than twice in y_multiclass.
    min_2_class_count = [
        DataCheckError(
            message=
            "Target does not have at least two instances per class which is required for multiclass classification",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.
            TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
            details={
                "least_populated_class_labels": [2.0, 1.0]
            }).to_dict()
    ]
    # 3 unique classes over 5 rows trips the high class-to-sample-ratio warning.
    high_class_to_sample_ratio = [
        DataCheckWarning(
            message=
            "Target has a large number of unique values, could be regression type problem.",
            data_check_name="InvalidTargetDataCheck",
            message_code=DataCheckMessageCode.
            TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
            details={
                'class_to_value_ratio': 0.6
            }).to_dict()
    ]
    # multiclass
    data_checks = DefaultDataChecks(
        "multiclass", get_default_primary_search_objective("multiclass"))
    assert data_checks.validate(X, y_multiclass) == {
        "warnings": messages[:3] + high_class_to_sample_ratio,
        "errors": [messages[3]] + min_2_class_count + messages[4:] + imbalance,
        "actions": expected_actions
    }
    # Explicitly-built checks again skip the class-imbalance error.
    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, {
            "InvalidTargetDataCheck": {
                "problem_type": "multiclass",
                "objective": get_default_primary_search_objective("multiclass")
            }
        })
    assert data_checks.validate(X, y_multiclass) == {
        "warnings": messages[:3] + high_class_to_sample_ratio,
        "errors": [messages[3]] + min_2_class_count + messages[4:],
        "actions": expected_actions
    }
def test_data_checks_not_list_error(X_y_binary):
    """A non-list data_checks argument raises a descriptive ValueError."""
    with pytest.raises(ValueError, match="Parameter data_checks must be a list."):
        DataChecks(data_checks=1)