Ejemplo n.º 1
0
    def test_samneness_check_validation_name(self) -> None:
        """Checks validation_name with and without a validation_name_suffix.

        Without a suffix the name is expected to equal the view_id ("test_view");
        with the suffix "b_c_only" it is expected to be "test_view_b_c_only".
        """

        def make_view() -> BigQueryView:
            # Build a fresh view for each check rather than sharing one instance.
            return BigQueryView(
                dataset_id="my_dataset",
                view_id="test_view",
                view_query_template="select * from literally_anything",
            )

        unsuffixed_check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            comparison_columns=["a", "b", "c"],
            view=make_view(),
        )
        self.assertEqual(unsuffixed_check.validation_name, "test_view")

        suffixed_check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            validation_name_suffix="b_c_only",
            comparison_columns=["b", "c"],
            view=make_view(),
        )
        self.assertEqual(suffixed_check.validation_name, "test_view_b_c_only")
Ejemplo n.º 2
0
    def test_sameness_check_validation_name(self) -> None:
        """Checks validation_name with and without a validation_name_suffix.

        Both checks are configured with a SimpleBigQueryViewBuilder whose view_id
        is "test_view". The expected names are "test_view" (no suffix) and
        "test_view_b_c_only" (suffix "b_c_only").
        """

        def make_view_builder() -> SimpleBigQueryViewBuilder:
            # Build a fresh builder for each check rather than sharing one instance.
            return SimpleBigQueryViewBuilder(
                dataset_id="my_dataset",
                view_id="test_view",
                description="test_view description",
                view_query_template="select * from literally_anything",
            )

        unsuffixed_check = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
            validation_type=ValidationCheckType.SAMENESS,
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            comparison_columns=["a", "b", "c"],
            view_builder=make_view_builder(),
        )
        self.assertEqual(unsuffixed_check.validation_name, "test_view")

        suffixed_check = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
            validation_type=ValidationCheckType.SAMENESS,
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            validation_name_suffix="b_c_only",
            comparison_columns=["b", "c"],
            view_builder=make_view_builder(),
        )
        self.assertEqual(suffixed_check.validation_name, "test_view_b_c_only")
Ejemplo n.º 3
0
    def test_string_sameness_check_different_values_above_margin(self):
        """Expects a failed STRINGS check when bad rows exceed the allowed error.

        The mocked query result comes from
        return_string_values_with_num_bad_rows(5) and max_allowed_error is set to
        4/100, so the expected result is unsuccessful with a description that
        reports an error rate of 5/100.
        """
        num_bad_rows = 5
        # Below the number of bad rows
        max_allowed_error = ((num_bad_rows - 1) / 100)
        actual_expected_error = (num_bad_rows / 100)

        mocked_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=max_allowed_error,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected_description = (
            f'{num_bad_rows} out of 100 row(s) did not contain matching strings. '
            f'The acceptable margin of error is only {max_allowed_error}, but the '
            f'validation returned an error rate of {actual_expected_error}.')
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 4
0
    def test_string_sameness_check_different_values_handle_non_string_type(
            self):
        """Expects a ValueError when a STRINGS check receives an int value.

        The single mocked row holds the int 1245 in column 'c'; the test pins the
        exact error message raised by run_check.
        """
        mocked_rows = [{'a': 'same', 'b': 'same', 'c': 1245}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = (
            'Unexpected type [<class \'int\'>] for value [1245] in STRING validation [test_view].'
        )
        self.assertEqual(str(raised.exception), expected_message)
Ejemplo n.º 5
0
    def test_string_sameness_check_different_values_handle_empty_string(self):
        """Expects a failed STRINGS check when one compared column is None.

        Despite the test name, the mocked value in column 'c' is None rather than
        an empty string. No max_allowed_error is passed, and the expected failure
        description reports an allowed margin of 0.0 and an error rate of 1.0.
        """
        mocked_rows = [{'a': 'same', 'b': 'same', 'c': None}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected_description = (
            '1 out of 1 row(s) did not contain matching strings. '
            'The acceptable margin of error is only 0.0, but the '
            'validation returned an error rate of 1.0.')
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 6
0
    def test_string_sameness_check_different_values_above_margin(self) -> None:
        """Expects a failed STRINGS check when bad rows exceed the allowed error.

        The mocked query result comes from
        return_string_values_with_num_bad_rows(5) and max_allowed_error is set to
        4/100, so the expected result is unsuccessful with a description that
        reports an error rate of 5/100.
        """
        num_bad_rows = 5
        # Below the number of bad rows
        max_allowed_error = (num_bad_rows - 1) / 100
        actual_expected_error = num_bad_rows / 100

        mocked_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=max_allowed_error,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected_description = (
            f"{num_bad_rows} out of 100 row(s) did not contain matching strings. "
            f"The acceptable margin of error is only {max_allowed_error}, but the "
            f"validation returned an error rate of {actual_expected_error}."
        )
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 7
0
    def test_string_sameness_check_different_values_within_margin(self) -> None:
        """Expects a successful STRINGS check when bad rows match the allowed error.

        The mocked query result comes from
        return_string_values_with_num_bad_rows(2) and max_allowed_error is set to
        2/100; the expected result is successful with no failure description.
        """
        num_bad_rows = 2
        max_allowed_error = num_bad_rows / 100

        mocked_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=max_allowed_error,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job, was_successful=True, failure_description=None
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 8
0
    def test_string_sameness_check_numbers_values_all_none(self) -> None:
        """Expects a ValueError when a NUMBERS check sees a row of all None values.

        Although the test name starts with "string", the check is configured with
        SamenessDataValidationCheckType.NUMBERS. The pinned error message names
        column [a].
        """
        mocked_rows = [{"a": None, "b": None, "c": None}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = (
            "Unexpected None value for column [a] in validation [test_view]."
        )
        self.assertEqual(str(raised.exception), expected_message)
Ejemplo n.º 9
0
    def test_string_sameness_check_strings_values_all_none(self) -> None:
        """Expects a successful STRINGS check when every compared value is None.

        The single mocked row has None in columns a, b and c; the expected result
        is successful with no failure description.
        """
        mocked_rows = [{"a": None, "b": None, "c": None}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job, was_successful=True, failure_description=None
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 10
0
    def test_sameness_check_numbers_different_values_within_margin(self):
        """Expects a successful NUMBERS check for values 98/100/99 at 0.02 error.

        The single mocked row differs across columns, but with
        max_allowed_error=0.02 the expected result is successful with no failure
        description.
        """
        mocked_rows = [{'a': 98, 'b': 100, 'c': 99}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=True,
            failure_description=None,
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 11
0
    def test_string_sameness_check_different_values_handle_non_string_type(
        self,
    ) -> None:
        """Expects a ValueError when a STRINGS check receives an int value.

        The single mocked row holds the int 1245 in column "c"; the test pins the
        exact error message raised by run_check.
        """
        mocked_rows = [{"a": "same", "b": "same", "c": 1245}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = (
            "Unexpected type [<class 'int'>] for value [1245] in STRING validation [test_view]."
        )
        self.assertEqual(str(raised.exception), expected_message)
Ejemplo n.º 12
0
    def test_sameness_check_numbers_multiple_rows_above_margin(self) -> None:
        """Expects a failed NUMBERS check when both mocked rows exceed 0.02 error.

        Two rows are returned by the mocked query; the pinned failure description
        reports 2 failing rows and a highest error of 0.3333.
        """
        mocked_rows = [
            {"a": 97, "b": 100, "c": 99},
            {"a": 14, "b": 21, "c": 14},
        ]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected_description = (
            "2 row(s) had unacceptable margins of error. The acceptable margin "
            "of error is only 0.02, but the validation returned rows with "
            "errors as high as 0.3333."
        )
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 13
0
    def test_sameness_check_numbers_multiple_rows_above_margin(self):
        """Expects a failed NUMBERS check when both mocked rows exceed 0.02 error.

        Two rows are returned by the mocked query; the pinned failure description
        reports 2 failing rows and a highest error of 0.3333.
        """
        mocked_rows = [
            {'a': 97, 'b': 100, 'c': 99},
            {'a': 14, 'b': 21, 'c': 14},
        ]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected_description = (
            '2 row(s) had unacceptable margins of error. The acceptable margin '
            'of error is only 0.02, but the validation returned rows with '
            'errors as high as 0.3333.')
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 14
0
    def test_sameness_check_numbers_one_none(self) -> None:
        """Expects a ValueError when a NUMBERS check sees None in one column.

        The single mocked row has 3 in columns a and b and None in column c; the
        pinned error message names column [c].
        """
        mocked_rows = [{"a": 3, "b": 3, "c": None}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=validation)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = (
            "Unexpected None value for column [c] in validation [test_view]."
        )
        self.assertEqual(str(raised.exception), expected_message)
Ejemplo n.º 15
0
    def test_sameness_check_strings_different_values_no_allowed_error(self) -> None:
        """Pins the result details for one mismatched STRINGS row with no partitions.

        The single mocked row has three different strings, partition_columns is
        empty and no max_allowed_error is passed. The expected details report 1
        error row out of 1, max_allowed_error of 0.0, and one non-null value per
        column under the empty partition key.
        """
        mocked_rows = [{"a": "a", "b": "b", "c": "c"}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_INDIVIDUAL,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            partition_columns=[],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected_details = SamenessStringsValidationResultDetails(
            num_error_rows=1,
            total_num_rows=1,
            max_allowed_error=0.0,
            non_null_counts_per_column_per_partition=[
                (tuple(), {"a": 1, "b": 1, "c": 1}),
            ],
        )
        expected = DataValidationJobResult(
            validation_job=job, result_details=expected_details
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 16
0
    def test_sameness_check_numbers_different_values_within_margin(self) -> None:
        """Pins the result details for a NUMBERS row of 98/100/99 at 0.02 error.

        The expected details carry an empty failed_rows list and the configured
        max_allowed_error of 0.02.
        """
        mocked_rows = [{"a": 98, "b": 100, "c": 99}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected_details = SamenessNumbersValidationResultDetails(
            failed_rows=[], max_allowed_error=0.02
        )
        expected = DataValidationJobResult(
            validation_job=job, result_details=expected_details
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 17
0
    def test_string_sameness_check_different_values_handle_empty_string(self) -> None:
        """Expects a failed STRINGS check when one compared column is None.

        Despite the test name, the mocked value in column "c" is None rather than
        an empty string. No max_allowed_error is passed, and the expected failure
        description reports an allowed margin of 0.0 and an error rate of 1.0.
        """
        mocked_rows = [{"a": "same", "b": "same", "c": None}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected_description = (
            "1 out of 1 row(s) did not contain matching strings. "
            "The acceptable margin of error is only 0.0, but the "
            "validation returned an error rate of 1.0."
        )
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 18
0
 def test_samneness_check_no_comparison_columns(self):
     """Expects a ValueError when the check is built without comparison_columns.

     The pinned message reports that [0] comparison columns were found and that
     at least 2 are expected.
     """
     expected_message = 'Found only [0] comparison columns, expected at least 2.'

     with self.assertRaises(ValueError) as raised:
         # The view is built inside the context so any ValueError raised while
         # constructing either object is captured the same way.
         view = BigQueryView(
             dataset_id='my_dataset',
             view_id='test_view',
             view_query_template='select * from literally_anything')
         SamenessDataValidationCheck(
             validation_type=ValidationCheckType.SAMENESS,
             sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
             view=view)

     self.assertEqual(str(raised.exception), expected_message)
Ejemplo n.º 19
0
    def test_samneness_check_validation_name(self):
        """Checks validation_name with and without a validation_name_suffix.

        Without a suffix the name is expected to equal the view_id ('test_view');
        with the suffix 'b_c_only' it is expected to be 'test_view_b_c_only'.
        """

        def make_view():
            # Build a fresh view for each check rather than sharing one instance.
            return BigQueryView(
                dataset_id='my_dataset',
                view_id='test_view',
                view_query_template='select * from literally_anything')

        unsuffixed_check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            comparison_columns=['a', 'b', 'c'],
            view=make_view())
        self.assertEqual(unsuffixed_check.validation_name, 'test_view')

        suffixed_check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            validation_name_suffix='b_c_only',
            comparison_columns=['b', 'c'],
            view=make_view())
        self.assertEqual(suffixed_check.validation_name, 'test_view_b_c_only')
Ejemplo n.º 20
0
 def test_samneness_check_bad_max_error(self):
     """Expects a ValueError when max_allowed_error is 1.5.

     The pinned message states the allowed error must be between 0.0 and 1.0 and
     echoes the rejected value.
     """
     expected_message = (
         'Allowed error value must be between 0.0 and 1.0. Found instead: [1.5]'
     )

     with self.assertRaises(ValueError) as raised:
         # The view is built inside the context so any ValueError raised while
         # constructing either object is captured the same way.
         view = BigQueryView(
             dataset_id='my_dataset',
             view_id='test_view',
             view_query_template='select * from literally_anything')
         SamenessDataValidationCheck(
             validation_type=ValidationCheckType.SAMENESS,
             sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
             comparison_columns=['a', 'b', 'c'],
             view=view,
             max_allowed_error=1.5)

     self.assertEqual(str(raised.exception), expected_message)
Ejemplo n.º 21
0
 def test_samneness_check_no_comparison_columns(self) -> None:
     """Expects a ValueError when the check is built without comparison_columns.

     The pinned message reports that [0] comparison columns were found and that
     at least 2 are expected.
     """
     expected_message = "Found only [0] comparison columns, expected at least 2."

     with self.assertRaises(ValueError) as raised:
         # The builder is created inside the context so any ValueError raised
         # while constructing either object is captured the same way.
         view_builder = SimpleBigQueryViewBuilder(
             dataset_id="my_dataset",
             view_id="test_view",
             description="test_view description",
             view_query_template="select * from literally_anything",
         )
         SamenessDataValidationCheck(
             validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
             validation_type=ValidationCheckType.SAMENESS,
             sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
             view_builder=view_builder,
         )

     self.assertEqual(str(raised.exception), expected_message)
Ejemplo n.º 22
0
    def test_sameness_check_strings_multiple_dates(self) -> None:
        """Pins STRINGS result details when rows are partitioned by region and date.

        Seven mocked rows cover two dates for region "US_XX". The expected details
        report 3 error rows out of 7 (matching the three rows with None on one
        side) and non-null counts per column for each (region, date) partition.
        """

        def row(date: str, a: Optional[str], b: Optional[str]) -> dict:
            # Every mocked row shares the same region; only date, a and b vary.
            return {"region": "US_XX", "date": date, "a": a, "b": b}

        january_2021 = "2021-01-31"
        december_2020 = "2020-12-31"
        self.mock_client.run_query_async.return_value = [
            row(january_2021, "00", "00"),
            row(january_2021, "01", None),
            row(january_2021, "02", "02"),
            row(january_2021, None, "03"),
            row(december_2020, "00", "00"),
            row(december_2020, "02", "02"),
            row(december_2020, None, "04"),
        ]

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_INDIVIDUAL,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b"],
            partition_columns=["region", "date"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=0.0,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected_details = SamenessStringsValidationResultDetails(
            num_error_rows=3,
            total_num_rows=7,
            max_allowed_error=0.0,
            non_null_counts_per_column_per_partition=[
                (("US_XX", "2021-01-31"), {"a": 3, "b": 3}),
                (("US_XX", "2020-12-31"), {"a": 2, "b": 3}),
            ],
        )
        expected = DataValidationJobResult(
            validation_job=job, result_details=expected_details
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 23
0
 def test_samneness_check_bad_max_error(self) -> None:
     """Expects a ValueError when max_allowed_error is 1.5.

     The pinned message states the allowed error must be between 0.0 and 1.0 and
     echoes the rejected value.
     """
     expected_message = (
         "Allowed error value must be between 0.0 and 1.0. Found instead: [1.5]"
     )

     with self.assertRaises(ValueError) as raised:
         # The view is built inside the context so any ValueError raised while
         # constructing either object is captured the same way.
         view = BigQueryView(
             dataset_id="my_dataset",
             view_id="test_view",
             view_query_template="select * from literally_anything",
         )
         SamenessDataValidationCheck(
             validation_type=ValidationCheckType.SAMENESS,
             sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
             comparison_columns=["a", "b", "c"],
             view=view,
             max_allowed_error=1.5,
         )

     self.assertEqual(str(raised.exception), expected_message)
Ejemplo n.º 24
0
    def test_string_sameness_check_strings_values_all_none(self):
        """Expects a successful STRINGS check when every compared value is None.

        The single mocked row has None in columns a, b and c; the expected result
        is successful with no failure description.
        """
        mocked_rows = [{'a': None, 'b': None, 'c': None}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        actual = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=True,
            failure_description=None,
        )
        self.assertEqual(actual, expected)
Ejemplo n.º 25
0
    def test_string_sameness_check_numbers_one_none(self):
        """Expects a ValueError when a NUMBERS check sees None in one column.

        Although the test name starts with "string", the check is configured with
        SamenessDataValidationCheckType.NUMBERS. The single mocked row has None in
        column 'c', and the pinned error message names column [c].
        """
        mocked_rows = [{'a': 3, 'b': 3, 'c': None}]
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = 'Unexpected None value for column [c] in validation [test_view].'
        self.assertEqual(str(raised.exception), expected_message)
Ejemplo n.º 26
0
def get_all_validations() -> List[DataValidationCheck]:
    """Builds every configured validation check and returns them as one list.

    The checks are constructed inside this function rather than in a
    module-level variable because each one calls `.build()` on a view builder.
    NOTE(review): the original docstring read "the views cannot be built
    locally being run inside of a local_project_id_override block", which
    appears to be missing a word such as "unless" -- confirm the intended
    constraint.

    Returns:
        The existence checks first, followed by the sameness checks, ending
        with the sameness checks marked as external comparison validations.
    """
    # Column groups shared by several checks below. They are kept as tuples and
    # copied with list(...) so every check still receives its own list object.
    demographic_breakdown_columns = (
        "metric_total",
        "age_bucket_breakdown_sum",
        "race_or_ethnicity_breakdown_sum",
        "gender_breakdown_sum",
    )
    race_breakdown_columns = ("metric_total", "race_or_ethnicity_breakdown_sum")
    person_id_columns = (
        "external_person_external_id",
        "internal_person_external_id",
    )
    distribution_columns_by_suffix = (
        ("revocation", ("revocation_count_all", "revocation_count_sum")),
        (
            "supervision",
            ("supervision_count_all", "supervision_population_count_sum"),
        ),
        (
            "recommendation",
            (
                "recommended_for_revocation_count_all",
                "recommended_for_revocation_count_sum",
            ),
        ),
    )
    supervision_success_columns_by_suffix = (
        (
            "termination",
            (
                "dashboard_successful_termination",
                "public_dashboard_successful_termination",
            ),
        ),
        (
            "completion",
            (
                "dashboard_projected_completion",
                "public_dashboard_projected_completion",
            ),
        ),
    )
    strings_type = SamenessDataValidationCheckType.STRINGS
    numbers_type = SamenessDataValidationCheckType.NUMBERS

    # Existence checks: one per view builder, in configured order.
    existence_view_builders = (
        INCARCERATION_ADMISSION_AFTER_OPEN_PERIOD_VIEW_BUILDER,
        INCARCERATION_ADMISSION_NULLS_VIEW_BUILDER,
        INCARCERATION_RELEASE_PRIOR_TO_ADMISSION_VIEW_BUILDER,
        INCARCERATION_RELEASE_REASON_NO_DATE_VIEW_BUILDER,
        OVERLAPPING_INCARCERATION_PERIODS_VIEW_BUILDER,
        INCARCERATION_RELEASE_REASON_NO_RELEASE_DATE_VIEW_BUILDER,
        PO_REPORT_AVGS_PER_DISTRICT_STATE_VIEW_BUILDER,
        PO_REPORT_DISTINCT_BY_OFFICER_MONTH_VIEW_BUILDER,
        SUPERVISION_TERMINATION_PRIOR_TO_START_VIEW_BUILDER,
        SUPERVISION_TERMINATION_REASON_NO_DATE_VIEW_BUILDER,
        OVERLAPPING_SUPERVISION_PERIODS_VIEW_BUILDER,
    )
    checks: List[DataValidationCheck] = [
        ExistenceDataValidationCheck(view=view_builder.build())
        for view_builder in existence_view_builders
    ]

    # Case termination, referral, PO report and revocation matrix comparisons.
    checks += [
        SamenessDataValidationCheck(
            view=CASE_TERMINATIONS_BY_TYPE_COMPARISON_VIEW_BUILDER.build(),
            validation_name_suffix="absconsions",
            comparison_columns=["absconsions_by_month", "absconsions_by_officer"],
        ),
        SamenessDataValidationCheck(
            view=CASE_TERMINATIONS_BY_TYPE_COMPARISON_VIEW_BUILDER.build(),
            validation_name_suffix="discharges",
            comparison_columns=["discharges_by_month", "discharges_by_officer"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=FTR_REFERRALS_COMPARISON_VIEW_BUILDER.build(),
            comparison_columns=["age_bucket_sum", "risk_level_sum", "gender_sum", "race_sum"],
        ),
        SamenessDataValidationCheck(
            view=PO_REPORT_MISSING_FIELDS_VIEW_BUILDER.build(),
            comparison_columns=PO_REPORT_COMPARISON_COLUMNS,
        ),
        SamenessDataValidationCheck(
            view=REVOCATION_MATRIX_COMPARISON_REVOCATION_CELL_VS_CASELOAD_VIEW_BUILDER.build(),
            comparison_columns=["cell_sum", "caseload_sum", "caseload_num_rows"],
        ),
        SamenessDataValidationCheck(
            view=REVOCATION_MATRIX_COMPARISON_REVOCATION_CELL_VS_MONTH_VIEW_BUILDER.build(),
            comparison_columns=["cell_sum", "month_sum"],
        ),
        SamenessDataValidationCheck(
            view=REVOCATION_MATRIX_COMPARISON_SUPERVISION_POPULATION_VIEW_BUILDER.build(),
            comparison_columns=["district_sum", "risk_level_sum", "gender_sum", "race_sum"],
        ),
        SamenessDataValidationCheck(
            view=REVOCATION_MATRIX_COMPARISON_REVOCATIONS_BY_OFFICER_VIEW_BUILDER.build(),
            comparison_columns=["officer_sum", "caseload_sum"],
        ),
    ]

    # Revocation matrix distributions: all three suffixed checks for race, then
    # the same three for gender.
    checks.extend(
        SamenessDataValidationCheck(
            view=view_builder.build(),
            validation_name_suffix=suffix,
            comparison_columns=list(columns),
        )
        for view_builder in (
            REVOCATION_MATRIX_DISTRIBUTION_BY_RACE_COMPARISON_VIEW_BUILDER,
            REVOCATION_MATRIX_DISTRIBUTION_BY_GENDER_COMPARISON_VIEW_BUILDER,
        )
        for suffix, columns in distribution_columns_by_suffix
    )

    # Dashboard vs. public dashboard comparisons.
    checks.append(
        SamenessDataValidationCheck(
            view=REVOCATIONS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            comparison_columns=[
                "dashboard_revocation_count",
                "public_dashboard_revocation_count",
            ],
        )
    )
    checks.extend(
        SamenessDataValidationCheck(
            view=view_builder.build(),
            validation_name_suffix=suffix,
            comparison_columns=list(columns),
        )
        for view_builder in (
            SUPERVISION_SUCCESS_BY_MONTH_DASHBOARD_COMPARISON_VIEW_BUILDER,
            SUPERVISION_SUCCESS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER,
        )
        for suffix, columns in supervision_success_columns_by_suffix
    )

    # Internal comparison and internal consistency checks.
    checks += [
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_FACILITY_INTERNAL_COMPARISON_VIEW_BUILDER.build(),
            comparison_columns=[
                "covid_report_facility_population",
                "public_dashboard_facility_population",
            ],
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_MONTH_INTERNAL_COMPARISON_VIEW_BUILDER.build(),
            comparison_columns=["covid_report_population", "public_dashboard_population"],
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_DEMOGRAPHIC_INTERNAL_COMPARISON_VIEW_BUILDER.build(),
            comparison_columns=[
                "population_by_admission_reason_total_population",
                "population_by_facility_by_demographics_total_population",
            ],
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_ADMISSION_REASON_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=list(demographic_breakdown_columns),
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_FACILITY_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=list(demographic_breakdown_columns),
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_PRIORITIZED_RACE_AND_ETHNICITY_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=list(race_breakdown_columns),
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_BY_PRIORITIZED_RACE_AND_ETHNICITY_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=list(race_breakdown_columns),
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_LENGTHS_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=list(demographic_breakdown_columns),
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_RELEASES_BY_TYPE_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=list(demographic_breakdown_columns),
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_REVOCATIONS_BY_PERIOD_BY_TYPE_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=list(demographic_breakdown_columns),
        ),
        SamenessDataValidationCheck(
            view=SENTENCE_TYPE_BY_DISTRICT_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=list(demographic_breakdown_columns),
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_BY_DISTRICT_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=list(demographic_breakdown_columns),
        ),
        # TODO(#3743): This validation will fail until we fix the view to handle people who age into new buckets
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_PERIOD_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=[
                "metric_total",
                "race_or_ethnicity_breakdown_sum",
                "gender_breakdown_sum",
            ],
        ),
        SamenessDataValidationCheck(
            view=ACTIVE_PROGRAM_PARTICIPATION_BY_REGION_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=list(race_breakdown_columns),
        ),
    ]

    # External comparison validations
    checks += [
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_FACILITY_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            comparison_columns=["external_population_count", "internal_population_count"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=strings_type,
            comparison_columns=list(person_id_columns),
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            validation_name_suffix="facility",
            sameness_check_type=strings_type,
            comparison_columns=["external_facility", "internal_facility"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=strings_type,
            comparison_columns=list(person_id_columns),
            max_allowed_error=0.2,
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            validation_name_suffix="district",
            sameness_check_type=strings_type,
            comparison_columns=["external_district", "internal_district"],
            max_allowed_error=0.01,
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            validation_name_suffix="supervision_level",
            sameness_check_type=strings_type,
            comparison_columns=["external_supervision_level", "internal_supervision_level"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            validation_name_suffix="supervising_officer",
            sameness_check_type=strings_type,
            comparison_columns=["external_supervising_officer", "internal_supervising_officer"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=RECIDIVISM_RELEASE_COHORT_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=strings_type,
            comparison_columns=list(person_id_columns),
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=RECIDIVISM_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            sameness_check_type=numbers_type,
            comparison_columns=["external_recidivated", "internal_recidivated"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_TERMINATION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=strings_type,
            comparison_columns=list(person_id_columns),
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=COUNTY_JAIL_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=strings_type,
            comparison_columns=list(person_id_columns),
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=COUNTY_JAIL_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            validation_name_suffix="facility",
            sameness_check_type=strings_type,
            comparison_columns=["external_facility", "internal_facility"],
        ),
        SamenessDataValidationCheck(
            view=COUNTY_JAIL_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            validation_name_suffix="legal_status",
            sameness_check_type=strings_type,
            comparison_columns=["external_legal_status", "internal_legal_status"],
        ),
        SamenessDataValidationCheck(
            view=POPULATION_PROJECTION_MONTHLY_POPULATION_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers_type,
            comparison_columns=["external_total_population", "internal_total_population"],
            max_allowed_error=0.02,
        ),
    ]

    return checks
Ejemplo n.º 27
0
def get_all_validations() -> List[DataValidationCheck]:
    """Returns the full list of configured validations to perform.

    The list is built inside this function rather than as a top-level variable because every check calls `.build()`
    on its view builder. NOTE(review): the original wording ("the views cannot be built locally being run inside of a
    local_project_id_override block") appears to be missing a word such as "unless" - confirm the intended constraint.

    Returns:
        The existence checks, followed by the sameness checks, ending with the external comparison validations.
    """

    all_data_validations: List[DataValidationCheck] = [
        ExistenceDataValidationCheck(view=INCARCERATION_ADMISSION_AFTER_OPEN_PERIOD_VIEW_BUILDER.build()),
        ExistenceDataValidationCheck(view=INCARCERATION_ADMISSION_NULLS_VIEW_BUILDER.build()),
        ExistenceDataValidationCheck(view=INCARCERATION_RELEASE_PRIOR_TO_ADMISSION_VIEW_BUILDER.build()),
        ExistenceDataValidationCheck(view=INCARCERATION_RELEASE_REASON_NO_DATE_VIEW_BUILDER.build()),
        ExistenceDataValidationCheck(view=OVERLAPPING_INCARCERATION_PERIODS_VIEW_BUILDER.build()),

        # TODO(#4054): This should stop failing for MO once we fix the 600ish periods with end dates of 99999999
        ExistenceDataValidationCheck(view=INCARCERATION_RELEASE_REASON_NO_RELEASE_DATE_VIEW_BUILDER.build()),

        ExistenceDataValidationCheck(view=PO_REPORT_AVGS_PER_DISTRICT_STATE_VIEW_BUILDER.build()),
        ExistenceDataValidationCheck(view=PO_REPORT_DISTINCT_BY_OFFICER_MONTH_VIEW_BUILDER.build()),
        ExistenceDataValidationCheck(view=SUPERVISION_TERMINATION_PRIOR_TO_START_VIEW_BUILDER.build()),
        ExistenceDataValidationCheck(view=SUPERVISION_TERMINATION_REASON_NO_DATE_VIEW_BUILDER.build()),
        ExistenceDataValidationCheck(view=OVERLAPPING_SUPERVISION_PERIODS_VIEW_BUILDER.build()),

        SamenessDataValidationCheck(view=CASE_TERMINATIONS_BY_TYPE_COMPARISON_VIEW_BUILDER.build(),
                                    validation_name_suffix='absconsions',
                                    comparison_columns=['absconsions_by_month', 'absconsions_by_officer']),
        SamenessDataValidationCheck(view=CASE_TERMINATIONS_BY_TYPE_COMPARISON_VIEW_BUILDER.build(),
                                    validation_name_suffix='discharges',
                                    comparison_columns=['discharges_by_month', 'discharges_by_officer'],
                                    max_allowed_error=0.02),
        SamenessDataValidationCheck(view=FTR_REFERRALS_COMPARISON_VIEW_BUILDER.build(),
                                    comparison_columns=['age_bucket_sum', 'risk_level_sum', 'gender_sum', 'race_sum'],
                                    max_allowed_error=0.06),
        SamenessDataValidationCheck(view=PO_REPORT_MISSING_FIELDS_VIEW_BUILDER.build(),
                                    comparison_columns=PO_REPORT_COMPARISON_COLUMNS),
        SamenessDataValidationCheck(view=REVOCATION_MATRIX_COMPARISON_REVOCATION_CELL_VS_CASELOAD_VIEW_BUILDER.build(),
                                    comparison_columns=['cell_sum', 'caseload_sum']),
        SamenessDataValidationCheck(view=REVOCATION_MATRIX_COMPARISON_REVOCATION_CELL_VS_MONTH_VIEW_BUILDER.build(),
                                    comparison_columns=['cell_sum', 'month_sum'],
                                    max_allowed_error=0.03),
        # This version of this validation excludes the race column explicitly since we have chosen to count people with
        # multiple races in counts for each individual race, so the sum of the race breakdowns will not match the total.
        # 'race_sum' is therefore left out here; it is compared, with a tolerance, by the 'with_race' variant below.
        SamenessDataValidationCheck(view=REVOCATION_MATRIX_COMPARISON_SUPERVISION_POPULATION_VIEW_BUILDER.build(),
                                    comparison_columns=['district_sum', 'risk_level_sum', 'gender_sum', 'officer_sum']),
        # This version of the validation checks to make sure the race sum isn't far off from the other sums, even
        # though we expect them to be different (e.g. make sure it isn't zero).
        SamenessDataValidationCheck(
            view=REVOCATION_MATRIX_COMPARISON_SUPERVISION_POPULATION_VIEW_BUILDER.build(),
            validation_name_suffix='with_race',
            comparison_columns=['district_sum', 'risk_level_sum', 'gender_sum', 'race_sum', 'officer_sum'],
            max_allowed_error=.05
        ),
        SamenessDataValidationCheck(
            view=REVOCATIONS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            comparison_columns=['dashboard_revocation_count', 'public_dashboard_revocation_count']
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_MONTH_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            validation_name_suffix='termination',
            comparison_columns=['dashboard_successful_termination', 'public_dashboard_successful_termination']
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_MONTH_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            validation_name_suffix='completion',
            comparison_columns=['dashboard_projected_completion', 'public_dashboard_projected_completion']
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            validation_name_suffix='termination',
            comparison_columns=['dashboard_successful_termination', 'public_dashboard_successful_termination']
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            validation_name_suffix='completion',
            comparison_columns=['dashboard_projected_completion', 'public_dashboard_projected_completion']
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_FACILITY_INTERNAL_COMPARISON_VIEW_BUILDER.build(),
            comparison_columns=['covid_report_facility_population', 'public_dashboard_facility_population']
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_MONTH_INTERNAL_COMPARISON_VIEW_BUILDER.build(),
            comparison_columns=['covid_report_population', 'public_dashboard_population']
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_DEMOGRAPHIC_INTERNAL_COMPARISON_VIEW_BUILDER.build(),
            comparison_columns=['population_by_admission_reason_total_population',
                                'population_by_facility_by_demographics_total_population']
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_ADMISSION_REASON_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'age_bucket_breakdown_sum',
                                'race_or_ethnicity_breakdown_sum', 'gender_breakdown_sum']
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_FACILITY_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'age_bucket_breakdown_sum',
                                'race_or_ethnicity_breakdown_sum', 'gender_breakdown_sum']
        ),
        SamenessDataValidationCheck(
            # pylint: disable=line-too-long
            view=INCARCERATION_POPULATION_BY_PRIORITIZED_RACE_AND_ETHNICITY_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'race_or_ethnicity_breakdown_sum']
        ),
        SamenessDataValidationCheck(
            # pylint: disable=line-too-long
            view=SUPERVISION_POPULATION_BY_PRIORITIZED_RACE_AND_ETHNICITY_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'race_or_ethnicity_breakdown_sum']
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_LENGTHS_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'age_bucket_breakdown_sum',
                                'race_or_ethnicity_breakdown_sum', 'gender_breakdown_sum']
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_RELEASES_BY_TYPE_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'age_bucket_breakdown_sum',
                                'race_or_ethnicity_breakdown_sum', 'gender_breakdown_sum']
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_REVOCATIONS_BY_PERIOD_BY_TYPE_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'age_bucket_breakdown_sum',
                                'race_or_ethnicity_breakdown_sum', 'gender_breakdown_sum']
        ),
        SamenessDataValidationCheck(
            view=SENTENCE_TYPE_BY_DISTRICT_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'age_bucket_breakdown_sum',
                                'race_or_ethnicity_breakdown_sum', 'gender_breakdown_sum']
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_BY_DISTRICT_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'age_bucket_breakdown_sum',
                                'race_or_ethnicity_breakdown_sum', 'gender_breakdown_sum']
        ),
        # TODO(#3743): This validation will fail until we fix the view to handle people who age into new buckets
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_PERIOD_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'race_or_ethnicity_breakdown_sum', 'gender_breakdown_sum']
        ),
        SamenessDataValidationCheck(
            view=ACTIVE_PROGRAM_PARTICIPATION_BY_REGION_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            comparison_columns=['metric_total', 'race_or_ethnicity_breakdown_sum']
        ),

        # External comparison validations
        SamenessDataValidationCheck(view=INCARCERATION_POPULATION_BY_FACILITY_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
                                    comparison_columns=['external_population_count', 'internal_population_count']),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            comparison_columns=['external_person_external_id', 'internal_person_external_id'],
            max_allowed_error=0.02),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            validation_name_suffix='facility',
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            comparison_columns=['external_facility', 'internal_facility'],
            max_allowed_error=0.02),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            comparison_columns=['external_person_external_id', 'internal_person_external_id'],
            max_allowed_error=0.2),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            validation_name_suffix='district',
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            comparison_columns=['external_district', 'internal_district'],
            max_allowed_error=0.01),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            validation_name_suffix='supervision_level',
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            comparison_columns=['external_supervision_level', 'internal_supervision_level'],
            max_allowed_error=0.02),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            validation_name_suffix='supervising_officer',
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            comparison_columns=['external_supervising_officer', 'internal_supervising_officer'],
            max_allowed_error=0.02),
        SamenessDataValidationCheck(
            view=RECIDIVISM_RELEASE_COHORT_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            comparison_columns=['external_person_external_id', 'internal_person_external_id'],
            max_allowed_error=0.02),
        SamenessDataValidationCheck(
            view=RECIDIVISM_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER.build(),
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            comparison_columns=['external_recidivated', 'internal_recidivated'],
            max_allowed_error=0.02),
    ]

    return all_data_validations
Ejemplo n.º 28
0
def get_all_validations() -> List[DataValidationCheck]:
    """Assemble and return every configured data validation check.

    The list is produced by a function rather than stored in a top-level
    variable because the views cannot be built locally unless this runs inside
    of a local_project_id_override block.

    Returns:
        The checks, in order: invariant existence checks, freshness existence
        checks, internal consistency sameness checks, and finally the external
        comparison sameness checks.
    """
    # Short aliases for the enum members that are repeated throughout the list.
    consistency = ValidationCategory.CONSISTENCY
    external_individual = ValidationCategory.EXTERNAL_INDIVIDUAL
    external_aggregate = ValidationCategory.EXTERNAL_AGGREGATE
    strings = SamenessDataValidationCheckType.STRINGS
    numbers = SamenessDataValidationCheckType.NUMBERS

    # Existence checks only vary by view builder, so build them from a tuple.
    invariant_checks: List[DataValidationCheck] = [
        ExistenceDataValidationCheck(
            view_builder=builder,
            validation_category=ValidationCategory.INVARIANT,
        )
        for builder in (
            INCARCERATION_ADMISSION_AFTER_OPEN_PERIOD_VIEW_BUILDER,
            INCARCERATION_ADMISSION_NULLS_VIEW_BUILDER,
            INCARCERATION_RELEASE_PRIOR_TO_ADMISSION_VIEW_BUILDER,
            INCARCERATION_RELEASE_REASON_NO_DATE_VIEW_BUILDER,
            OVERLAPPING_INCARCERATION_PERIODS_VIEW_BUILDER,
            INCARCERATION_RELEASE_REASON_NO_RELEASE_DATE_VIEW_BUILDER,
            PO_REPORT_AVGS_PER_DISTRICT_STATE_VIEW_BUILDER,
            PO_REPORT_DISTINCT_BY_OFFICER_MONTH_VIEW_BUILDER,
            SUPERVISION_TERMINATION_PRIOR_TO_START_VIEW_BUILDER,
            SUPERVISION_TERMINATION_REASON_NO_DATE_VIEW_BUILDER,
            OVERLAPPING_SUPERVISION_PERIODS_VIEW_BUILDER,
            ACTIVE_IN_POPULATION_AFTER_DEATH_DATE_VIEW_BUILDER,
            INVALID_ADMISSION_REASONS_FOR_TEMPORARY_CUSTODY_VIEW_BUILDER,
            INVALID_ADMITTED_FROM_SUPERVISION_ADMISSION_REASON_VIEW_BUILDER,
            INVALID_PFI_FOR_TEMPORARY_CUSTODY_ADMISSIONS_VIEW_BUILDER,
        )
    ]

    freshness_checks: List[DataValidationCheck] = [
        ExistenceDataValidationCheck(
            view_builder=builder,
            validation_category=ValidationCategory.FRESHNESS,
        )
        for builder in (
            ASSESSMENT_FRESHNESS_VALIDATION_VIEW_BUILDER,
            CONTACT_FRESHNESS_VALIDATION_VIEW_BUILDER,
            EMPLOYMENT_FRESHNESS_VALIDATION_VIEW_BUILDER,
            ETL_FRESHNESS_VALIDATION_VIEW_BUILDER,
        )
    ]

    # Internal consistency validations
    consistency_checks: List[DataValidationCheck] = [
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=CASE_TERMINATIONS_BY_TYPE_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="absconsions",
            comparison_columns=["absconsions_by_month", "absconsions_from_po_report"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=CASE_TERMINATIONS_BY_TYPE_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="discharges",
            comparison_columns=["discharges_by_month", "discharges_from_po_report"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=FTR_REFERRALS_COMPARISON_VIEW_BUILDER,
            comparison_columns=[
                "age_bucket_sum",
                "risk_level_sum",
                "gender_sum",
                "race_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=PO_REPORT_MISSING_FIELDS_VIEW_BUILDER,
            comparison_columns=PO_REPORT_COMPARISON_COLUMNS,
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_CASELOAD_ADMISSION_HISTORY_VIEW_BUILDER,
            comparison_columns=[
                "total_revocation_admissions",
                "total_caseload_admissions",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_COMPARISON_REVOCATION_CELL_VS_CASELOAD_VIEW_BUILDER,
            comparison_columns=["cell_sum", "caseload_sum", "caseload_num_rows"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_COMPARISON_REVOCATION_CELL_VS_MONTH_VIEW_BUILDER,
            comparison_columns=["cell_sum", "month_sum"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_COMPARISON_BY_MONTH_VIEW_BUILDER,
            comparison_columns=["reference_sum", "month_sum"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_COMPARISON_SUPERVISION_POPULATION_VIEW_BUILDER,
            comparison_columns=[
                "district_sum",
                "risk_level_sum",
                "gender_sum",
                "race_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_COMPARISON_REVOCATIONS_BY_OFFICER_VIEW_BUILDER,
            comparison_columns=["officer_sum", "caseload_sum"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_DISTRIBUTION_BY_RACE_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="revocation",
            comparison_columns=["revocation_count_all", "revocation_count_sum"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_DISTRIBUTION_BY_RACE_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="supervision",
            comparison_columns=[
                "supervision_count_all",
                "supervision_population_count_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_DISTRIBUTION_BY_RACE_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="recommendation",
            comparison_columns=[
                "recommended_for_revocation_count_all",
                "recommended_for_revocation_count_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_DISTRIBUTION_BY_GENDER_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="revocation",
            comparison_columns=["revocation_count_all", "revocation_count_sum"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_DISTRIBUTION_BY_GENDER_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="supervision",
            comparison_columns=[
                "supervision_count_all",
                "supervision_population_count_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATION_MATRIX_DISTRIBUTION_BY_GENDER_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="recommendation",
            comparison_columns=[
                "recommended_for_revocation_count_all",
                "recommended_for_revocation_count_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=REVOCATIONS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER,
            comparison_columns=[
                "dashboard_revocation_count",
                "public_dashboard_revocation_count",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=SUPERVISION_SUCCESS_BY_MONTH_DASHBOARD_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="termination",
            comparison_columns=[
                "dashboard_successful_termination",
                "public_dashboard_successful_termination",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=SUPERVISION_SUCCESS_BY_MONTH_DASHBOARD_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="completion",
            comparison_columns=[
                "dashboard_projected_completion",
                "public_dashboard_projected_completion",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=SUPERVISION_SUCCESS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="termination",
            comparison_columns=[
                "dashboard_successful_termination",
                "public_dashboard_successful_termination",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=SUPERVISION_SUCCESS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER,
            validation_name_suffix="completion",
            comparison_columns=[
                "dashboard_projected_completion",
                "public_dashboard_projected_completion",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=INCARCERATION_POPULATION_BY_DEMOGRAPHIC_INTERNAL_COMPARISON_VIEW_BUILDER,
            comparison_columns=[
                "population_by_admission_reason_total_population",
                "population_by_facility_by_demographics_total_population",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=INCARCERATION_POPULATION_BY_ADMISSION_REASON_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=[
                "metric_total",
                "age_bucket_breakdown_sum",
                "race_or_ethnicity_breakdown_sum",
                "gender_breakdown_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=INCARCERATION_POPULATION_BY_FACILITY_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=[
                "metric_total",
                "age_bucket_breakdown_sum",
                "race_or_ethnicity_breakdown_sum",
                "gender_breakdown_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=INCARCERATION_POPULATION_BY_PRIORITIZED_RACE_AND_ETHNICITY_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=["metric_total", "race_or_ethnicity_breakdown_sum"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=SUPERVISION_POPULATION_BY_PRIORITIZED_RACE_AND_ETHNICITY_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=["metric_total", "race_or_ethnicity_breakdown_sum"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=INCARCERATION_LENGTHS_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=[
                "metric_total",
                "age_bucket_breakdown_sum",
                "race_or_ethnicity_breakdown_sum",
                "gender_breakdown_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=INCARCERATION_RELEASES_BY_TYPE_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=[
                "metric_total",
                "age_bucket_breakdown_sum",
                "race_or_ethnicity_breakdown_sum",
                "gender_breakdown_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=SUPERVISION_REVOCATIONS_BY_PERIOD_BY_TYPE_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=[
                "metric_total",
                "age_bucket_breakdown_sum",
                "race_or_ethnicity_breakdown_sum",
                "gender_breakdown_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=SENTENCE_TYPE_BY_DISTRICT_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=[
                "metric_total",
                "age_bucket_breakdown_sum",
                "race_or_ethnicity_breakdown_sum",
                "gender_breakdown_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=SUPERVISION_POPULATION_BY_DISTRICT_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=[
                "metric_total",
                "age_bucket_breakdown_sum",
                "race_or_ethnicity_breakdown_sum",
                "gender_breakdown_sum",
            ],
        ),
        # TODO(#3743): This validation will fail until we fix the view to handle people who age into new buckets
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=SUPERVISION_SUCCESS_BY_PERIOD_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=[
                "metric_total",
                "race_or_ethnicity_breakdown_sum",
                "gender_breakdown_sum",
            ],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=ACTIVE_PROGRAM_PARTICIPATION_BY_REGION_INTERNAL_CONSISTENCY_VIEW_BUILDER,
            comparison_columns=["metric_total", "race_or_ethnicity_breakdown_sum"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=MOST_RECENT_ASSESSMENT_DATE_BY_PERSON_BY_STATE_COMPARISON_VIEW_BUILDER,
            comparison_columns=["most_recent_etl_date", "most_recent_state_date"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=MOST_RECENT_ASSESSMENT_SCORE_BY_PERSON_BY_STATE_COMPARISON_VIEW_BUILDER,
            comparison_columns=["most_recent_etl_score", "most_recent_state_score"],
        ),
        SamenessDataValidationCheck(
            validation_category=consistency,
            view_builder=MOST_RECENT_FACE_TO_FACE_CONTACT_DATE_BY_PERSON_BY_STATE_COMPARISON_VIEW_BUILDER,
            comparison_columns=[
                "most_recent_etl_face_to_face_contact_date",
                "most_recent_state_face_to_face_contact_date",
            ],
        ),
    ]

    # External comparison validations
    external_comparison_checks: List[DataValidationCheck] = [
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=INCARCERATION_ADMISSION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER,
            sameness_check_type=strings,
            comparison_columns=["external_data_person_id", "internal_data_person_id"],
            partition_columns=["region_code", "admission_date"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_aggregate,
            view_builder=INCARCERATION_POPULATION_BY_FACILITY_EXTERNAL_COMPARISON_VIEW_BUILDER,
            comparison_columns=[
                "external_population_count",
                "internal_population_count",
            ],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=INCARCERATION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER,
            sameness_check_type=strings,
            comparison_columns=["external_data_person_id", "internal_data_person_id"],
            partition_columns=["region_code", "date_of_stay"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=INCARCERATION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER,
            validation_name_suffix="facility",
            sameness_check_type=strings,
            comparison_columns=["external_facility", "internal_facility"],
            partition_columns=["region_code", "date_of_stay"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=INCARCERATION_RELEASE_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER,
            sameness_check_type=strings,
            comparison_columns=["external_data_person_id", "internal_data_person_id"],
            partition_columns=["region_code", "release_date"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER,
            sameness_check_type=strings,
            comparison_columns=[
                "external_person_external_id",
                "internal_person_external_id",
            ],
            partition_columns=["region_code", "date_of_supervision"],
            max_allowed_error=0.2,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER,
            validation_name_suffix="district",
            sameness_check_type=strings,
            comparison_columns=["external_district", "internal_district"],
            partition_columns=["region_code", "date_of_supervision"],
            max_allowed_error=0.01,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER,
            validation_name_suffix="supervision_level",
            sameness_check_type=strings,
            comparison_columns=[
                "external_supervision_level",
                "internal_supervision_level",
            ],
            partition_columns=["region_code", "date_of_supervision"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=SUPERVISION_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER,
            validation_name_suffix="supervising_officer",
            sameness_check_type=strings,
            comparison_columns=[
                "external_supervising_officer",
                "internal_supervising_officer",
            ],
            partition_columns=["region_code", "date_of_supervision"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=RECIDIVISM_RELEASE_COHORT_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER,
            sameness_check_type=strings,
            comparison_columns=[
                "external_person_external_id",
                "internal_person_external_id",
            ],
            partition_columns=["region_code", "release_cohort", "follow_up_period"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_aggregate,
            view_builder=RECIDIVISM_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER,
            sameness_check_type=numbers,
            comparison_columns=["external_recidivated", "internal_recidivated"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=SUPERVISION_START_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER,
            sameness_check_type=strings,
            comparison_columns=[
                "external_person_external_id",
                "internal_person_external_id",
            ],
            partition_columns=["region_code", "start_date"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=SUPERVISION_TERMINATION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER,
            sameness_check_type=strings,
            comparison_columns=[
                "external_person_external_id",
                "internal_person_external_id",
            ],
            partition_columns=["region_code", "termination_date"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=COUNTY_JAIL_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_VIEW_BUILDER,
            sameness_check_type=strings,
            comparison_columns=[
                "external_person_external_id",
                "internal_person_external_id",
            ],
            partition_columns=["region_code", "date_of_stay"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=COUNTY_JAIL_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER,
            validation_name_suffix="facility",
            sameness_check_type=strings,
            comparison_columns=["external_facility", "internal_facility"],
            partition_columns=["region_code", "date_of_stay"],
        ),
        SamenessDataValidationCheck(
            validation_category=external_individual,
            view_builder=COUNTY_JAIL_POPULATION_PERSON_LEVEL_EXTERNAL_COMPARISON_MATCHING_PEOPLE_VIEW_BUILDER,
            validation_name_suffix="legal_status",
            sameness_check_type=strings,
            comparison_columns=["external_legal_status", "internal_legal_status"],
            partition_columns=["region_code", "date_of_stay"],
        ),
        SamenessDataValidationCheck(
            validation_category=external_aggregate,
            view_builder=POPULATION_PROJECTION_MONTHLY_POPULATION_EXTERNAL_COMPARISON_VIEW_BUILDER,
            sameness_check_type=numbers,
            comparison_columns=[
                "external_total_population",
                "internal_total_population",
            ],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            validation_category=external_aggregate,
            view_builder=INCARCERATION_POPULATION_BY_STATE_BY_DATE_JUSTICE_COUNTS_COMPARISON_VIEW_BUILDER,
            sameness_check_type=numbers,
            comparison_columns=[
                "justice_counts_total_population",
                "internal_total_population",
            ],
            max_allowed_error=0.06,
        ),
    ]

    # Concatenate the groups in their original relative order.
    all_data_validations: List[DataValidationCheck] = [
        *invariant_checks,
        *freshness_checks,
        *consistency_checks,
        *external_comparison_checks,
    ]
    return all_data_validations
    def test_from_successful_result(self) -> None:
        """Checks the storage record, and its serialized form, built from a
        sameness-numbers job result that has no failed rows."""
        # Arrange
        run_datetime = datetime.datetime(2000, 1, 1, 0, 0, 0)
        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view_builder=view_builder,
        )
        validation_job = DataValidationJob(
            region_code="US_XX",
            validation=validation,
        )
        job_result = DataValidationJobResult(
            validation_job=validation_job,
            result_details=SamenessNumbersValidationResultDetails(
                failed_rows=[], max_allowed_error=0.0
            ),
        )

        # Act
        result = ValidationResultForStorage.from_validation_result(
            run_id="abc123",
            run_datetime=run_datetime,
            result=job_result,
        )

        # Assert
        # The expected record gets its own details instance so the comparison
        # relies on value equality rather than object identity.
        expected_result = ValidationResultForStorage(
            run_id="abc123",
            run_date=datetime.date(2000, 1, 1),
            run_datetime=datetime.datetime(2000, 1, 1, 0, 0, 0),
            system_version="v1.0.0",
            check_type=ValidationCheckType.SAMENESS,
            validation_name="test_view",
            region_code="US_XX",
            did_run=True,
            was_successful=True,
            failure_description=None,
            result_details_type="SamenessNumbersValidationResultDetails",
            result_details=SamenessNumbersValidationResultDetails(
                failed_rows=[], max_allowed_error=0.0
            ),
            validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
        )
        self.assertEqual(expected_result, result)

        expected_serialized = {
            "run_id": "abc123",
            "run_date": "2000-01-01",
            "run_datetime": "2000-01-01T00:00:00",
            "system_version": "v1.0.0",
            "check_type": "SAMENESS",
            "validation_name": "test_view",
            "region_code": "US_XX",
            "did_run": True,
            "was_successful": True,
            "failure_description": None,
            "result_details_type": "SamenessNumbersValidationResultDetails",
            "result_details": '{"failed_rows": [], "max_allowed_error": 0.0}',
            "validation_category": "EXTERNAL_AGGREGATE",
        }
        self.assertEqual(expected_serialized, result.to_serializable())
Ejemplo n.º 30
0
def get_all_validations() -> List[DataValidationCheck]:
    """Builds and returns every configured data validation check.

    The checks are assembled on each call instead of being stored in a
    module-level variable because, when running locally, the views can only be
    built inside of a local_project_id_override block.

    Returns:
        A new list holding the existence checks followed by the sameness
        checks, each group in its configured order.
    """
    # Several view builder names referenced below are long enough to push
    # their lines past the configured limit.
    # pylint: disable=line-too-long

    numbers = SamenessDataValidationCheckType.NUMBERS
    strings = SamenessDataValidationCheckType.STRINGS

    def demographic_breakdown_columns() -> List[str]:
        """Returns a new list with the metric total column plus the age
        bucket, race/ethnicity and gender breakdown sum columns."""
        return [
            "metric_total",
            "age_bucket_breakdown_sum",
            "race_or_ethnicity_breakdown_sum",
            "gender_breakdown_sum",
        ]

    def race_breakdown_columns() -> List[str]:
        """Returns a new list with the metric total column plus the
        race/ethnicity breakdown sum column."""
        return ["metric_total", "race_or_ethnicity_breakdown_sum"]

    # Each of these views is wrapped in an existence check, in this order.
    existence_view_builders = (
        INCARCERATION_ADMISSION_AFTER_OPEN_PERIOD_VIEW_BUILDER,
        INCARCERATION_ADMISSION_NULLS_VIEW_BUILDER,
        INCARCERATION_RELEASE_PRIOR_TO_ADMISSION_VIEW_BUILDER,
        # TODO(2981): This should stop failing for MO once we fix the 600ish periods with end dates of 99999999
        INCARCERATION_RELEASE_REASON_NO_RELEASE_DATE_VIEW_BUILDER,
        PO_REPORT_AVGS_PER_DISTRICT_STATE_VIEW_BUILDER,
        PO_REPORT_DISTINCT_BY_OFFICER_MONTH_VIEW_BUILDER,
        SUPERVISION_TERMINATION_PRIOR_TO_START_VIEW_BUILDER,
    )
    existence_checks: List[DataValidationCheck] = [
        ExistenceDataValidationCheck(view=view_builder.build())
        for view_builder in existence_view_builders
    ]

    # Checks that omit max_allowed_error rely on the check class's default.
    sameness_checks: List[DataValidationCheck] = [
        SamenessDataValidationCheck(
            view=CASE_TERMINATIONS_BY_TYPE_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=["absconsions_by_month", "absconsions_by_officer"],
        ),
        SamenessDataValidationCheck(
            view=CASE_TERMINATIONS_BY_TYPE_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=["discharges_by_month", "discharges_by_officer"],
            max_allowed_error=0.02,
        ),
        SamenessDataValidationCheck(
            view=FTR_REFERRALS_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=[
                "age_bucket_sum",
                "risk_level_sum",
                "gender_sum",
                "race_sum",
            ],
            max_allowed_error=0.06,
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_FACILITY_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=[
                "external_population_count",
                "internal_population_count",
            ],
        ),
        SamenessDataValidationCheck(
            view=PO_REPORT_MISSING_FIELDS_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=PO_REPORT_COMPARISON_COLUMNS,
        ),
        SamenessDataValidationCheck(
            view=REVOCATION_MATRIX_COMPARISON_REVOCATION_CELL_VS_CASELOAD_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=["cell_sum", "caseload_sum"],
        ),
        SamenessDataValidationCheck(
            view=REVOCATION_MATRIX_COMPARISON_REVOCATION_CELL_VS_MONTH_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=["cell_sum", "month_sum"],
            max_allowed_error=0.03,
        ),
        SamenessDataValidationCheck(
            view=REVOCATION_MATRIX_COMPARISON_SUPERVISION_POPULATION_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=[
                "district_sum",
                "risk_level_sum",
                "gender_sum",
                "race_sum",
            ],
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_EOM_POPULATION_PERSON_LEVEL_DISTRICT_EXTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=strings,
            comparison_columns=["external_district", "internal_district"],
            max_allowed_error=0.01,
        ),
        SamenessDataValidationCheck(
            view=REVOCATIONS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=[
                "dashboard_revocation_count",
                "public_dashboard_revocation_count",
            ],
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_MONTH_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=[
                "dashboard_successful_termination",
                "public_dashboard_successful_termination",
            ],
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_MONTH_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=[
                "dashboard_projected_completion",
                "public_dashboard_projected_completion",
            ],
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=[
                "dashboard_successful_termination",
                "public_dashboard_successful_termination",
            ],
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_PERIOD_DASHBOARD_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=[
                "dashboard_projected_completion",
                "public_dashboard_projected_completion",
            ],
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_FACILITY_INTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=[
                "covid_report_facility_population",
                "public_dashboard_facility_population",
            ],
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_DEMOGRAPHIC_INTERNAL_COMPARISON_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=[
                "population_by_admission_reason_total_population",
                "population_by_facility_by_demographics_total_population",
            ],
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_ADMISSION_REASON_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=demographic_breakdown_columns(),
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_FACILITY_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=demographic_breakdown_columns(),
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_POPULATION_BY_PRIORITIZED_RACE_AND_ETHNICITY_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=race_breakdown_columns(),
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_BY_PRIORITIZED_RACE_AND_ETHNICITY_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=race_breakdown_columns(),
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_LENGTHS_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=demographic_breakdown_columns(),
        ),
        SamenessDataValidationCheck(
            view=INCARCERATION_RELEASES_BY_TYPE_BY_PERIOD_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=demographic_breakdown_columns(),
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_REVOCATIONS_BY_PERIOD_BY_TYPE_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=demographic_breakdown_columns(),
        ),
        SamenessDataValidationCheck(
            view=SENTENCE_TYPE_BY_DISTRICT_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=demographic_breakdown_columns(),
        ),
        SamenessDataValidationCheck(
            view=SUPERVISION_POPULATION_BY_DISTRICT_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=demographic_breakdown_columns(),
        ),
        # TODO(3743): This validation will fail until we fix the view to handle people who age into new buckets
        SamenessDataValidationCheck(
            view=SUPERVISION_SUCCESS_BY_PERIOD_BY_DEMOGRAPHICS_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=demographic_breakdown_columns(),
        ),
        SamenessDataValidationCheck(
            view=ACTIVE_PROGRAM_PARTICIPATION_BY_REGION_INTERNAL_CONSISTENCY_VIEW_BUILDER.build(),
            sameness_check_type=numbers,
            comparison_columns=race_breakdown_columns(),
        ),
    ]

    return existence_checks + sameness_checks