def test_sameness_check_strings_different_values_no_allowed_error(self) -> None:
    """STRINGS check on one row whose three columns all differ, no error margin given.

    The expected result details report 1 error row out of 1 total row with a
    max_allowed_error of 0.0 and a single (empty-key) partition.
    """
    # Arrange: the mocked query yields a single row of mismatching strings.
    mismatched_row = {"a": "a", "b": "b", "c": "c"}
    self.mock_client.run_query_async.return_value = [mismatched_row]

    view_builder = SimpleBigQueryViewBuilder(
        dataset_id="my_dataset",
        view_id="test_view",
        description="test_view description",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_category=ValidationCategory.EXTERNAL_INDIVIDUAL,
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        partition_columns=[],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view_builder=view_builder,
    )
    job = DataValidationJob(region_code="US_XX", validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected_details = SamenessStringsValidationResultDetails(
        num_error_rows=1,
        total_num_rows=1,
        max_allowed_error=0.0,
        non_null_counts_per_column_per_partition=[
            (tuple(), {"a": 1, "b": 1, "c": 1}),
        ],
    )
    expected = DataValidationJobResult(
        validation_job=job, result_details=expected_details
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_different_values_above_margin(self) -> None:
    """STRINGS check where the bad-row rate (5/100) exceeds the allowed margin (4/100).

    Expects an unsuccessful result whose failure description quotes the number
    of bad rows, the allowed margin and the observed error rate.
    """
    # Arrange
    num_bad_rows = 5
    # Below the number of bad rows
    max_allowed_error = (num_bad_rows - 1) / 100
    mocked_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
    self.mock_client.run_query_async.return_value = mocked_rows

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        max_allowed_error=max_allowed_error,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    actual_expected_error = num_bad_rows / 100
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description=f"{num_bad_rows} out of 100 row(s) did not contain matching strings. "
        f"The acceptable margin of error is only {max_allowed_error}, but the "
        f"validation returned an error rate of {actual_expected_error}.",
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_strings_values_all_none(self) -> None:
    """STRINGS check on a single row where every compared column is None.

    Expects a successful result with no failure description.
    """
    # Arrange
    all_none_row = {"a": None, "b": None, "c": None}
    self.mock_client.run_query_async.return_value = [all_none_row]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected = DataValidationJobResult(
        validation_job=job, was_successful=True, failure_description=None
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_numbers_values_all_none(self) -> None:
    """NUMBERS check on a single row where every compared column is None.

    Expects run_check to raise a ValueError whose message names column [a]
    and the view id.
    """
    # Arrange
    all_none_row = {"a": None, "b": None, "c": None}
    self.mock_client.run_query_async.return_value = [all_none_row]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    # Act
    with self.assertRaises(ValueError) as raised:
        SamenessValidationChecker.run_check(job)

    # Assert: the message is inspected only after the context manager exits.
    message = str(raised.exception)
    self.assertEqual(
        message,
        "Unexpected None value for column [a] in validation [test_view].",
    )
def test_string_sameness_check_different_values_handle_non_string_type(
    self,
) -> None:
    """STRINGS check on a row that contains an int value in column "c".

    Expects run_check to raise a ValueError whose message reports the
    offending type, value and view id.
    """
    # Arrange
    row_with_int = {"a": "same", "b": "same", "c": 1245}
    self.mock_client.run_query_async.return_value = [row_with_int]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    # Act
    with self.assertRaises(ValueError) as raised:
        SamenessValidationChecker.run_check(job)

    # Assert
    message = str(raised.exception)
    self.assertEqual(
        message,
        "Unexpected type [<class 'int'>] for value [1245] in STRING validation [test_view].",
    )
def test_sameness_check_numbers_multiple_rows_above_margin(self) -> None:
    """NUMBERS check on two rows with a max_allowed_error of 0.02.

    Expects an unsuccessful result whose failure description reports 2
    failing rows and errors as high as 0.3333.
    """
    # Arrange
    first_row = {"a": 97, "b": 100, "c": 99}
    second_row = {"a": 14, "b": 21, "c": 14}
    self.mock_client.run_query_async.return_value = [first_row, second_row]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        max_allowed_error=0.02,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description="2 row(s) had unacceptable margins of error. The acceptable margin "
        "of error is only 0.02, but the validation returned rows with "
        "errors as high as 0.3333.",
    )
    self.assertEqual(actual, expected)
def test_sameness_check_numbers_multiple_rows_above_margin(self):
    """NUMBERS check on two rows with a max_allowed_error of 0.02.

    Expects an unsuccessful result whose failure description reports 2
    failing rows and errors as high as 0.3333.
    """
    # Arrange
    first_row = {'a': 97, 'b': 100, 'c': 99}
    second_row = {'a': 14, 'b': 21, 'c': 14}
    self.mock_client.run_query_async.return_value = [first_row, second_row]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        max_allowed_error=0.02,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description=
        '2 row(s) had unacceptable margins of error. The acceptable margin '
        'of error is only 0.02, but the validation returned rows with '
        'errors as high as 0.3333.',
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_different_values_within_margin(self) -> None:
    """STRINGS check where the allowed margin equals the bad-row rate (2/100).

    Expects a successful result with no failure description.
    """
    # Arrange
    num_bad_rows = 2
    max_allowed_error = num_bad_rows / 100
    mocked_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
    self.mock_client.run_query_async.return_value = mocked_rows

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        max_allowed_error=max_allowed_error,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected = DataValidationJobResult(
        validation_job=job, was_successful=True, failure_description=None
    )
    self.assertEqual(actual, expected)
def test_sameness_check_numbers_different_values_within_margin(self) -> None:
    """NUMBERS check on one row (98, 100, 99) with a max_allowed_error of 0.02.

    Expects result details with no failed rows and the same max_allowed_error.
    """
    # Arrange
    close_row = {"a": 98, "b": 100, "c": 99}
    self.mock_client.run_query_async.return_value = [close_row]

    view_builder = SimpleBigQueryViewBuilder(
        dataset_id="my_dataset",
        view_id="test_view",
        description="test_view description",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        max_allowed_error=0.02,
        view_builder=view_builder,
    )
    job = DataValidationJob(region_code="US_XX", validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected_details = SamenessNumbersValidationResultDetails(
        failed_rows=[], max_allowed_error=0.02
    )
    expected = DataValidationJobResult(
        validation_job=job, result_details=expected_details
    )
    self.assertEqual(actual, expected)
def test_sameness_check_numbers_one_none(self) -> None:
    """NUMBERS check on a row where only column "c" is None.

    Expects run_check to raise a ValueError whose message names column [c]
    and the view id.
    """
    # Arrange
    row_with_none = {"a": 3, "b": 3, "c": None}
    self.mock_client.run_query_async.return_value = [row_with_none]

    view_builder = SimpleBigQueryViewBuilder(
        dataset_id="my_dataset",
        view_id="test_view",
        description="test_view description",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        view_builder=view_builder,
    )
    job = DataValidationJob(region_code="US_XX", validation=check)

    # Act
    with self.assertRaises(ValueError) as raised:
        SamenessValidationChecker.run_check(job)

    # Assert
    message = str(raised.exception)
    self.assertEqual(
        message,
        "Unexpected None value for column [c] in validation [test_view].",
    )
def test_string_sameness_check_different_values_above_margin(self):
    """STRINGS check where the bad-row rate (5/100) exceeds the allowed margin (4/100).

    Expects an unsuccessful result whose failure description quotes the number
    of bad rows, the allowed margin and the observed error rate.
    """
    # Arrange
    num_bad_rows = 5
    # Below the number of bad rows
    max_allowed_error = ((num_bad_rows - 1) / 100)
    mocked_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
    self.mock_client.run_query_async.return_value = mocked_rows

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        max_allowed_error=max_allowed_error,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    actual_expected_error = (num_bad_rows / 100)
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description=
        f'{num_bad_rows} out of 100 row(s) did not contain matching strings. '
        f'The acceptable margin of error is only {max_allowed_error}, but the '
        f'validation returned an error rate of {actual_expected_error}.',
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_different_values_handle_non_string_type(
        self):
    """STRINGS check on a row that contains an int value in column 'c'.

    Expects run_check to raise a ValueError whose message reports the
    offending type, value and view id.
    """
    # Arrange
    row_with_int = {'a': 'same', 'b': 'same', 'c': 1245}
    self.mock_client.run_query_async.return_value = [row_with_int]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    # Act
    with self.assertRaises(ValueError) as raised:
        SamenessValidationChecker.run_check(job)

    # Assert
    message = str(raised.exception)
    self.assertEqual(
        message,
        'Unexpected type [<class \'int\'>] for value [1245] in STRING validation [test_view].'
    )
def test_string_sameness_check_different_values_handle_empty_string(self):
    """STRINGS check on a row where two columns match and the third is None.

    Despite the test name, the missing value supplied here is None rather
    than an empty string. Expects an unsuccessful result reporting 1 of 1
    rows mismatched against a 0.0 allowed margin.
    """
    # Arrange
    row_with_none = {'a': 'same', 'b': 'same', 'c': None}
    self.mock_client.run_query_async.return_value = [row_with_none]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description=
        '1 out of 1 row(s) did not contain matching strings. '
        'The acceptable margin of error is only 0.0, but the '
        'validation returned an error rate of 1.0.',
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_different_values_handle_empty_string(self) -> None:
    """STRINGS check on a row where two columns match and the third is None.

    Despite the test name, the missing value supplied here is None rather
    than an empty string. Expects an unsuccessful result reporting 1 of 1
    rows mismatched against a 0.0 allowed margin.
    """
    # Arrange
    row_with_none = {"a": "same", "b": "same", "c": None}
    self.mock_client.run_query_async.return_value = [row_with_none]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description="1 out of 1 row(s) did not contain matching strings. "
        "The acceptable margin of error is only 0.0, but the "
        "validation returned an error rate of 1.0.",
    )
    self.assertEqual(actual, expected)
def test_sameness_check_numbers_different_values_within_margin(self):
    """NUMBERS check on one row (98, 100, 99) with a max_allowed_error of 0.02.

    Expects a successful result with no failure description.
    """
    # Arrange
    close_row = {'a': 98, 'b': 100, 'c': 99}
    self.mock_client.run_query_async.return_value = [close_row]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        max_allowed_error=0.02,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected = DataValidationJobResult(validation_job=job,
                                       was_successful=True,
                                       failure_description=None)
    self.assertEqual(actual, expected)
def test_sameness_check_strings_multiple_dates(self) -> None:
    """STRINGS check over rows partitioned by ("region", "date").

    Seven rows across two dates are returned by the mocked query. The
    expected result details report 3 error rows out of 7 and list non-null
    counts per compared column for each (region, date) partition.
    """
    # Arrange
    january_2021_rows = [
        # January 2021
        {"region": "US_XX", "date": "2021-01-31", "a": "00", "b": "00"},
        {"region": "US_XX", "date": "2021-01-31", "a": "01", "b": None},
        {"region": "US_XX", "date": "2021-01-31", "a": "02", "b": "02"},
        {"region": "US_XX", "date": "2021-01-31", "a": None, "b": "03"},
    ]
    december_2020_rows = [
        # December 2020
        {"region": "US_XX", "date": "2020-12-31", "a": "00", "b": "00"},
        {"region": "US_XX", "date": "2020-12-31", "a": "02", "b": "02"},
        {"region": "US_XX", "date": "2020-12-31", "a": None, "b": "04"},
    ]
    # Row order is preserved: January rows first, then December rows.
    self.mock_client.run_query_async.return_value = (
        january_2021_rows + december_2020_rows
    )

    view_builder = SimpleBigQueryViewBuilder(
        dataset_id="my_dataset",
        view_id="test_view",
        description="test_view description",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_category=ValidationCategory.EXTERNAL_INDIVIDUAL,
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b"],
        partition_columns=["region", "date"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        max_allowed_error=0.0,
        view_builder=view_builder,
    )
    job = DataValidationJob(region_code="US_XX", validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected_details = SamenessStringsValidationResultDetails(
        num_error_rows=3,
        total_num_rows=7,
        max_allowed_error=0.0,
        non_null_counts_per_column_per_partition=[
            (("US_XX", "2021-01-31"), {"a": 3, "b": 3}),
            (("US_XX", "2020-12-31"), {"a": 2, "b": 3}),
        ],
    )
    expected = DataValidationJobResult(
        validation_job=job, result_details=expected_details
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_numbers_one_none(self):
    """NUMBERS check on a row where only column 'c' is None.

    Expects run_check to raise a ValueError whose message names column [c]
    and the view id.
    """
    # Arrange
    row_with_none = {'a': 3, 'b': 3, 'c': None}
    self.mock_client.run_query_async.return_value = [row_with_none]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    # Act
    with self.assertRaises(ValueError) as raised:
        SamenessValidationChecker.run_check(job)

    # Assert
    message = str(raised.exception)
    self.assertEqual(
        message,
        'Unexpected None value for column [c] in validation [test_view].')
def test_string_sameness_check_strings_values_all_none(self):
    """STRINGS check on a single row where every compared column is None.

    Expects a successful result with no failure description.
    """
    # Arrange
    all_none_row = {'a': None, 'b': None, 'c': None}
    self.mock_client.run_query_async.return_value = [all_none_row]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    # Act
    actual = SamenessValidationChecker.run_check(job)

    # Assert
    expected = DataValidationJobResult(validation_job=job,
                                       was_successful=True,
                                       failure_description=None)
    self.assertEqual(actual, expected)
# along with this program. If not, see <https://www.gnu.org/licenses/>. # ============================================================================= """Utilities for resolving the appropriate validation checker to use.""" from typing import Dict from recidiviz.validation.checks.existence_check import ExistenceValidationChecker from recidiviz.validation.checks.sameness_check import SamenessValidationChecker from recidiviz.validation.checks.validation_checker import ValidationChecker from recidiviz.validation.validation_models import ( ValidationCheckType, DataValidationJob, ) _CHECKER_FOR_TYPE: Dict[ValidationCheckType, ValidationChecker] = { ValidationCheckType.EXISTENCE: ExistenceValidationChecker(), ValidationCheckType.SAMENESS: SamenessValidationChecker(), } def _checker_for_type(check_type: ValidationCheckType) -> ValidationChecker: checker = _CHECKER_FOR_TYPE.get(check_type, None) if checker: return checker raise ValueError( f"No checker implementation enabled for check type {check_type}") def checker_for_validation( validation_job: DataValidationJob) -> ValidationChecker: """Retrieves the checker type associated with the given validation table.