コード例 #1
0
    def test_sameness_check_strings_different_values_no_allowed_error(self) -> None:
        """A single STRINGS row with three distinct values yields one error row.

        No max_allowed_error is passed to the check; the expected result
        details carry max_allowed_error=0.0.
        """
        self.mock_client.run_query_async.return_value = [{"a": "a", "b": "b", "c": "c"}]

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_INDIVIDUAL,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            partition_columns=[],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=check)

        result = SamenessValidationChecker.run_check(job)

        # With no partition columns the only partition key is the empty tuple.
        expected_details = SamenessStringsValidationResultDetails(
            num_error_rows=1,
            total_num_rows=1,
            max_allowed_error=0.0,
            non_null_counts_per_column_per_partition=[
                ((), {"a": 1, "b": 1, "c": 1}),
            ],
        )
        self.assertEqual(
            result,
            DataValidationJobResult(
                validation_job=job, result_details=expected_details
            ),
        )
コード例 #2
0
    def test_string_sameness_check_different_values_above_margin(self) -> None:
        """A STRINGS check fails when the error rate exceeds max_allowed_error."""
        num_bad_rows = 5
        # One hundredth below the error rate asserted further down.
        allowed_error = (num_bad_rows - 1) / 100

        query_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
        self.mock_client.run_query_async.return_value = query_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=allowed_error,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        result = SamenessValidationChecker.run_check(job)

        observed_error_rate = num_bad_rows / 100
        expected_failure = (
            f"{num_bad_rows} out of 100 row(s) did not contain matching strings. "
            f"The acceptable margin of error is only {allowed_error}, but the "
            f"validation returned an error rate of {observed_error_rate}."
        )
        self.assertEqual(
            result,
            DataValidationJobResult(
                validation_job=job,
                was_successful=False,
                failure_description=expected_failure,
            ),
        )
コード例 #3
0
    def test_string_sameness_check_strings_values_all_none(self) -> None:
        """A STRINGS row whose compared columns are all None is reported as a success."""
        self.mock_client.run_query_async.return_value = [
            {"a": None, "b": None, "c": None}
        ]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        result = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job, was_successful=True, failure_description=None
        )
        self.assertEqual(result, expected)
コード例 #4
0
    def test_string_sameness_check_numbers_values_all_none(self) -> None:
        """A NUMBERS check raises ValueError when every compared value is None.

        The asserted error message names column [a].
        """
        self.mock_client.run_query_async.return_value = [
            {"a": None, "b": None, "c": None}
        ]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        actual_message = str(raised.exception)
        self.assertEqual(
            actual_message,
            "Unexpected None value for column [a] in validation [test_view].",
        )
コード例 #5
0
    def test_string_sameness_check_different_values_handle_non_string_type(
        self,
    ) -> None:
        """A STRINGS check raises ValueError when a compared value is an int."""
        self.mock_client.run_query_async.return_value = [
            {"a": "same", "b": "same", "c": 1245}
        ]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        actual_message = str(raised.exception)
        self.assertEqual(
            actual_message,
            "Unexpected type [<class 'int'>] for value [1245] in STRING validation [test_view].",
        )
コード例 #6
0
    def test_sameness_check_numbers_multiple_rows_above_margin(self) -> None:
        """A NUMBERS check fails when two rows both exceed the 0.02 margin."""
        query_rows = [
            {"a": 97, "b": 100, "c": 99},
            {"a": 14, "b": 21, "c": 14},
        ]
        self.mock_client.run_query_async.return_value = query_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        result = SamenessValidationChecker.run_check(job)

        expected_failure = (
            "2 row(s) had unacceptable margins of error. The acceptable margin "
            "of error is only 0.02, but the validation returned rows with "
            "errors as high as 0.3333."
        )
        self.assertEqual(
            result,
            DataValidationJobResult(
                validation_job=job,
                was_successful=False,
                failure_description=expected_failure,
            ),
        )
コード例 #7
0
    def test_sameness_check_numbers_multiple_rows_above_margin(self):
        """A NUMBERS check fails when two rows both exceed the 0.02 margin."""
        query_rows = [
            {'a': 97, 'b': 100, 'c': 99},
            {'a': 14, 'b': 21, 'c': 14},
        ]
        self.mock_client.run_query_async.return_value = query_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        result = SamenessValidationChecker.run_check(job)

        expected_failure = (
            '2 row(s) had unacceptable margins of error. The acceptable margin '
            'of error is only 0.02, but the validation returned rows with '
            'errors as high as 0.3333.')
        self.assertEqual(
            result,
            DataValidationJobResult(validation_job=job,
                                    was_successful=False,
                                    failure_description=expected_failure))
コード例 #8
0
    def test_string_sameness_check_different_values_within_margin(self) -> None:
        """A STRINGS check succeeds when max_allowed_error is num_bad_rows / 100."""
        num_bad_rows = 2
        allowed_error = num_bad_rows / 100

        query_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
        self.mock_client.run_query_async.return_value = query_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=allowed_error,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        result = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job, was_successful=True, failure_description=None
        )
        self.assertEqual(result, expected)
コード例 #9
0
    def test_sameness_check_numbers_different_values_within_margin(self) -> None:
        """A NUMBERS row of 98/100/99 produces no failed rows at a 0.02 margin."""
        self.mock_client.run_query_async.return_value = [{"a": 98, "b": 100, "c": 99}]

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=check)

        result = SamenessValidationChecker.run_check(job)

        expected_details = SamenessNumbersValidationResultDetails(
            failed_rows=[], max_allowed_error=0.02
        )
        self.assertEqual(
            result,
            DataValidationJobResult(
                validation_job=job, result_details=expected_details
            ),
        )
コード例 #10
0
    def test_sameness_check_numbers_one_none(self) -> None:
        """A NUMBERS check raises ValueError when one compared value is None.

        The asserted error message names the offending column, [c].
        """
        self.mock_client.run_query_async.return_value = [{"a": 3, "b": 3, "c": None}]

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=check)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        actual_message = str(raised.exception)
        self.assertEqual(
            actual_message,
            "Unexpected None value for column [c] in validation [test_view].",
        )
コード例 #11
0
    def test_string_sameness_check_different_values_above_margin(self):
        """A STRINGS check fails when the error rate exceeds max_allowed_error."""
        num_bad_rows = 5
        # One hundredth below the error rate asserted further down.
        allowed_error = (num_bad_rows - 1) / 100

        query_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
        self.mock_client.run_query_async.return_value = query_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=allowed_error,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        result = SamenessValidationChecker.run_check(job)

        observed_error_rate = num_bad_rows / 100
        expected_failure = (
            f'{num_bad_rows} out of 100 row(s) did not contain matching strings. '
            f'The acceptable margin of error is only {allowed_error}, but the '
            f'validation returned an error rate of {observed_error_rate}.')
        self.assertEqual(
            result,
            DataValidationJobResult(validation_job=job,
                                    was_successful=False,
                                    failure_description=expected_failure))
コード例 #12
0
    def test_string_sameness_check_different_values_handle_non_string_type(
            self):
        """A STRINGS check raises ValueError when a compared value is an int."""
        self.mock_client.run_query_async.return_value = [
            {'a': 'same', 'b': 'same', 'c': 1245},
        ]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        actual_message = str(raised.exception)
        self.assertEqual(
            actual_message,
            'Unexpected type [<class \'int\'>] for value [1245] in STRING validation [test_view].'
        )
コード例 #13
0
    def test_string_sameness_check_different_values_handle_empty_string(self):
        """A STRINGS row with one None among matching values counts as a mismatch."""
        self.mock_client.run_query_async.return_value = [
            {'a': 'same', 'b': 'same', 'c': None},
        ]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        result = SamenessValidationChecker.run_check(job)

        expected_failure = (
            '1 out of 1 row(s) did not contain matching strings. '
            'The acceptable margin of error is only 0.0, but the '
            'validation returned an error rate of 1.0.')
        self.assertEqual(
            result,
            DataValidationJobResult(validation_job=job,
                                    was_successful=False,
                                    failure_description=expected_failure))
コード例 #14
0
    def test_string_sameness_check_different_values_handle_empty_string(self) -> None:
        """A STRINGS row with one None among matching values counts as a mismatch."""
        self.mock_client.run_query_async.return_value = [
            {"a": "same", "b": "same", "c": None}
        ]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        result = SamenessValidationChecker.run_check(job)

        expected_failure = (
            "1 out of 1 row(s) did not contain matching strings. "
            "The acceptable margin of error is only 0.0, but the "
            "validation returned an error rate of 1.0."
        )
        self.assertEqual(
            result,
            DataValidationJobResult(
                validation_job=job,
                was_successful=False,
                failure_description=expected_failure,
            ),
        )
コード例 #15
0
    def test_sameness_check_numbers_different_values_within_margin(self):
        """A NUMBERS row of 98/100/99 passes with a 0.02 max_allowed_error."""
        self.mock_client.run_query_async.return_value = [
            {'a': 98, 'b': 100, 'c': 99},
        ]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        result = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(validation_job=job,
                                           was_successful=True,
                                           failure_description=None)
        self.assertEqual(result, expected)
コード例 #16
0
    def test_sameness_check_strings_multiple_dates(self) -> None:
        """A STRINGS check over two dates reports non-null counts per partition.

        The rows are partitioned on (region, date). Three of the seven rows
        have a None on one side of the comparison and are expected to be
        counted as error rows.
        """

        def make_row(date, a_value, b_value):
            # Every row in this test belongs to the same region.
            return {"region": "US_XX", "date": date, "a": a_value, "b": b_value}

        january_2021 = "2021-01-31"
        december_2020 = "2020-12-31"
        self.mock_client.run_query_async.return_value = [
            make_row(january_2021, "00", "00"),
            make_row(january_2021, "01", None),
            make_row(january_2021, "02", "02"),
            make_row(january_2021, None, "03"),
            make_row(december_2020, "00", "00"),
            make_row(december_2020, "02", "02"),
            make_row(december_2020, None, "04"),
        ]

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_INDIVIDUAL,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b"],
            partition_columns=["region", "date"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=0.0,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=check)

        result = SamenessValidationChecker.run_check(job)

        expected_details = SamenessStringsValidationResultDetails(
            num_error_rows=3,
            total_num_rows=7,
            max_allowed_error=0.0,
            non_null_counts_per_column_per_partition=[
                (("US_XX", "2021-01-31"), {"a": 3, "b": 3}),
                (("US_XX", "2020-12-31"), {"a": 2, "b": 3}),
            ],
        )
        self.assertEqual(
            result,
            DataValidationJobResult(
                validation_job=job, result_details=expected_details
            ),
        )
コード例 #17
0
    def test_string_sameness_check_numbers_one_none(self):
        """A NUMBERS check raises ValueError when one compared value is None.

        The asserted error message names the offending column, [c].
        """
        self.mock_client.run_query_async.return_value = [
            {'a': 3, 'b': 3, 'c': None},
        ]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        actual_message = str(raised.exception)
        self.assertEqual(
            actual_message,
            'Unexpected None value for column [c] in validation [test_view].')
コード例 #18
0
    def test_string_sameness_check_strings_values_all_none(self):
        """A STRINGS row whose compared columns are all None is reported as a success."""
        self.mock_client.run_query_async.return_value = [
            {'a': None, 'b': None, 'c': None},
        ]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        result = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(validation_job=job,
                                           was_successful=True,
                                           failure_description=None)
        self.assertEqual(result, expected)
コード例 #19
0
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
# =============================================================================
"""Utilities for resolving the appropriate validation checker to use."""
from typing import Dict

from recidiviz.validation.checks.existence_check import ExistenceValidationChecker
from recidiviz.validation.checks.sameness_check import SamenessValidationChecker
from recidiviz.validation.checks.validation_checker import ValidationChecker
from recidiviz.validation.validation_models import (
    ValidationCheckType,
    DataValidationJob,
)

# Registry mapping each supported validation check type to the checker
# instance that handles it. The instances are created once, when this module
# is imported, and are looked up by `_checker_for_type`.
_CHECKER_FOR_TYPE: Dict[ValidationCheckType, ValidationChecker] = {
    ValidationCheckType.EXISTENCE: ExistenceValidationChecker(),
    ValidationCheckType.SAMENESS: SamenessValidationChecker(),
}


def _checker_for_type(check_type: ValidationCheckType) -> ValidationChecker:
    """Look up the checker registered for the given validation check type.

    Args:
        check_type: The validation check type to resolve.

    Returns:
        The checker stored under `check_type` in `_CHECKER_FOR_TYPE`.

    Raises:
        ValueError: If `_CHECKER_FOR_TYPE` has no (truthy) entry for
            `check_type`.
    """
    registered = _CHECKER_FOR_TYPE.get(check_type)
    if not registered:
        raise ValueError(
            f"No checker implementation enabled for check type {check_type}")
    return registered


def checker_for_validation(
        validation_job: DataValidationJob) -> ValidationChecker:
    """Retrieves the checker type associated with the given validation table.