def test_invalid_target_data_check_valid_labels_for_nonnegative_objectives(objective):
    """Strictly positive multiclass labels yield an empty validation result.

    Args:
        objective: Objective forwarded to ``InvalidTargetDataCheck`` (supplied
            by the test harness; presumably parametrized outside this block).
    """
    feature_values = [100, 100, 200, 300, 100, 200, 100] * 25
    target_values = [2, 2, 3, 3, 1, 1, 1] * 25  # every label is > 0
    X = pd.DataFrame({'column_one': feature_values})
    y = pd.Series(target_values)

    check_params = {
        "InvalidTargetDataCheck": {
            "problem_type": "multiclass",
            "objective": objective,
        },
    }
    data_checks = DataChecks([InvalidTargetDataCheck], check_params)

    results = data_checks.validate(X, y)
    assert results == {"warnings": [], "errors": [], "actions": []}
def test_invalid_target_data_check_invalid_labels_for_nonnegative_objective_names(objective):
    """Targets containing values <= 0 produce a TARGET_INCOMPATIBLE_OBJECTIVE error.

    Two scenarios are exercised: a multiclass target with negative labels run
    through ``DataChecks``, and a regression target containing a zero run
    through ``InvalidTargetDataCheck`` directly.

    Args:
        objective: Objective forwarded to the data check and interpolated into
            the expected error message (supplied by the test harness).
    """

    def expected_result(target):
        # The check is expected to report how many target values are <= 0.
        offending_count = sum(val <= 0 for val in target.values.flatten())
        error = DataCheckError(
            message=f"Target has non-positive values which is not supported for {objective}",
            data_check_name=invalid_targets_data_check_name,
            message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE,
            details={"Count of offending values": offending_count},
        )
        return {"warnings": [], "errors": [error.to_dict()], "actions": []}

    # Scenario 1: multiclass labels that include -1, validated via DataChecks.
    X = pd.DataFrame({'column_one': [100, 200, 100, 200, 200, 100, 200, 100] * 25})
    y = pd.Series([2, 2, 3, 3, -1, -1, 1, 1] * 25)
    check_params = {
        "InvalidTargetDataCheck": {
            "problem_type": "multiclass",
            "objective": objective,
        },
    }
    data_checks = DataChecks([InvalidTargetDataCheck], check_params)
    assert data_checks.validate(X, y) == expected_result(y)

    # Scenario 2: regression target that includes 0, validated on the check itself.
    X = pd.DataFrame({'column_one': [100, 200, 100, 200, 100]})
    y = pd.Series([2, 3, 0, 1, 1])
    invalid_targets_check = InvalidTargetDataCheck(problem_type="regression",
                                                   objective=objective)
    assert invalid_targets_check.validate(X, y) == expected_result(y)
def test_invalid_target_data_check_invalid_labels_for_objectives(
        time_series_core_objectives):
    """Objectives that are not positive-only accept non-positive targets.

    For every objective whose ``positive_only`` flag is falsy, a target that
    contains negative or zero values must validate cleanly, both through
    ``DataChecks`` (multiclass) and through ``InvalidTargetDataCheck`` directly
    (regression).

    Args:
        time_series_core_objectives: Iterable of objective objects exposing a
            ``positive_only`` attribute (supplied by the test harness).
    """
    # A clean validate() result carries an "actions" list alongside "warnings"
    # and "errors"; leaving it out of the expectation makes the equality fail.
    clean_result = {"warnings": [], "errors": [], "actions": []}

    # Multiclass target with negative labels, validated via DataChecks.
    X = pd.DataFrame(
        {'column_one': [100, 200, 100, 200, 200, 100, 200, 100] * 25})
    y = pd.Series([2, 2, 3, 3, -1, -1, 1, 1] * 25)
    for objective in time_series_core_objectives:
        if not objective.positive_only:
            data_checks = DataChecks(
                [InvalidTargetDataCheck], {
                    "InvalidTargetDataCheck": {
                        "problem_type": "multiclass",
                        "objective": objective
                    }
                })
            assert data_checks.validate(X, y) == clean_result

    # Regression target containing a zero, validated on the check itself.
    X = pd.DataFrame({'column_one': [100, 200, 100, 200, 100]})
    y = pd.Series([2, 3, 0, 1, 1])
    for objective in time_series_core_objectives:
        if not objective.positive_only:
            invalid_targets_check = InvalidTargetDataCheck(
                problem_type="regression", objective=objective)
            assert invalid_targets_check.validate(X, y) == clean_result
def test_data_checks_do_not_duplicate_actions(X_y_binary):
    """Identical actions emitted by different checks are reported only once.

    Two mock checks each recommend dropping the same column; the aggregated
    ``DataChecks.validate`` result is expected to list that action a single
    time.

    Args:
        X_y_binary: ``(X, y)`` pair supplied by the test harness.
    """
    X, y = X_y_binary

    def make_drop_col_action():
        # Build a fresh dict per call so de-duplication has to rely on
        # equality rather than on object identity.
        return DataCheckAction(DataCheckActionCode.DROP_COL,
                               metadata={"column": 'col_to_drop'}).to_dict()

    class MockDataCheck(DataCheck):
        def validate(self, X, y):
            return {
                "warnings": [],
                "errors": [],
                "actions": [make_drop_col_action()]
            }

    class MockDataCheckWithSameAction(DataCheck):
        def validate(self, X, y):
            # Must emit the same action as MockDataCheck; with an empty list
            # here no duplicate would ever be produced and the test would
            # pass without exercising de-duplication.
            return {
                "warnings": [],
                "errors": [],
                "actions": [make_drop_col_action()]
            }

    data_checks_list = [MockDataCheck, MockDataCheckWithSameAction]
    data_checks = DataChecks(data_checks=data_checks_list)

    # Check duplicate actions are returned once
    assert data_checks.validate(X, y) == {
        "warnings": [],
        "errors": [],
        "actions": [make_drop_col_action()]
    }
def test_data_checks(X_y_binary):
    """Warnings and errors from several checks are aggregated in check order.

    Four mock checks are registered: one silent, one emitting a warning, one
    emitting an error, and one emitting both. The combined result is expected
    to contain both warnings and both errors, each tagged with the name of the
    check that produced it, and no actions.

    Args:
        X_y_binary: ``(X, y)`` pair supplied by the test harness.
    """
    X, y = X_y_binary

    class MockDataCheck(DataCheck):
        def validate(self, X, y):
            return {"warnings": [], "errors": [], "actions": []}

    class MockDataCheckWarning(DataCheck):
        def validate(self, X, y):
            warning = DataCheckWarning(message="warning one",
                                       data_check_name=self.name,
                                       message_code=None)
            return {"warnings": [warning.to_dict()], "errors": [], "actions": []}

    class MockDataCheckError(DataCheck):
        def validate(self, X, y):
            error = DataCheckError(message="error one",
                                   data_check_name=self.name,
                                   message_code=None)
            return {"warnings": [], "errors": [error.to_dict()], "actions": []}

    class MockDataCheckErrorAndWarning(DataCheck):
        def validate(self, X, y):
            warning = DataCheckWarning(message="warning two",
                                       data_check_name=self.name,
                                       message_code=None)
            error = DataCheckError(message="error two",
                                   data_check_name=self.name,
                                   message_code=None)
            return {
                "warnings": [warning.to_dict()],
                "errors": [error.to_dict()],
                "actions": [],
            }

    data_checks_list = [
        MockDataCheck,
        MockDataCheckWarning,
        MockDataCheckError,
        MockDataCheckErrorAndWarning,
    ]
    data_checks = DataChecks(data_checks=data_checks_list)
    results = data_checks.validate(X, y)

    # (message, name of the check that produced it), in registration order.
    warning_specs = (
        ("warning one", "MockDataCheckWarning"),
        ("warning two", "MockDataCheckErrorAndWarning"),
    )
    error_specs = (
        ("error one", "MockDataCheckError"),
        ("error two", "MockDataCheckErrorAndWarning"),
    )
    expected = {
        "warnings": [
            DataCheckWarning(message=text, data_check_name=check_name).to_dict()
            for text, check_name in warning_specs
        ],
        "errors": [
            DataCheckError(message=text, data_check_name=check_name).to_dict()
            for text, check_name in error_specs
        ],
        "actions": [],
    }
    assert results == expected
def test_data_checks_drop_index(X_y_binary):
    """The index column is not among the columns handed to each check.

    A DataTable indexed on ``index_col`` is validated by three mock checks;
    the first positional argument recorded for every ``validate`` call must
    not list ``index_col`` in its columns.

    Args:
        X_y_binary: ``(X, y)`` pair supplied by the test harness.
    """
    features, y = X_y_binary
    frame = pd.DataFrame(features)
    frame['index_col'] = pd.Series(range(len(frame)))
    X = ww.DataTable(frame).set_index('index_col')

    class MockDataCheck(DataCheck):
        def validate(self, X, y):
            return {"warnings": [], "errors": [], "actions": []}

    # Sanity check on the real implementation before it is replaced below.
    assert MockDataCheck().validate(X, y)

    # Swap in a mock so the arguments of every validate() call are recorded.
    MockDataCheck.validate = MagicMock()
    checks = DataChecks([MockDataCheck, MockDataCheck, MockDataCheck])
    checks.validate(X, y)

    for recorded_call in MockDataCheck.validate.call_args_list:
        positional_args = recorded_call[0]
        data_passed = positional_args[0]
        assert 'index_col' not in data_passed.columns