def test_dag_with_cycle_after_root(self) -> None:
     """A cycle located downstream of a valid root view must be rejected.

     table_1 only reads a source table, while table_2 and table_3 query each
     other. Building the walker is expected to raise a ValueError whose
     message names the root and the two views on the cycle.
     """
     root_view = BigQueryView(
         dataset_id="dataset_1",
         view_id="table_1",
         view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
     )
     cyclic_view_a = BigQueryView(
         dataset_id="dataset_2",
         view_id="table_2",
         view_query_template="""
         SELECT * FROM `{project_id}.dataset_1.table_1`
         JOIN `{project_id}.dataset_3.table_3`
         USING (col)""",
     )
     cyclic_view_b = BigQueryView(
         dataset_id="dataset_3",
         view_id="table_3",
         view_query_template="SELECT * FROM `{project_id}.dataset_2.table_2`",
     )
     expected_message = (
         "Detected cycle in graph reachable from ('dataset_1', 'table_1'): "
         "[('dataset_2', 'table_2'), ('dataset_3', 'table_3')]"
     )

     with self.assertRaises(ValueError) as raised:
         BigQueryViewDagWalker([root_view, cyclic_view_a, cyclic_view_b])

     self.assertEqual(str(raised.exception), expected_message)
 def test_parse_view_materialized_parent(self) -> None:
     """Checks parent resolution when a view queries a parent's materialized table.

     The child view selects from `some_dataset.some_table_materialized`. Once
     the parent's materialized address is mapped to the parent's DagKey, the
     node's parent_keys are expected to hold the key for
     `some_dataset.some_table`.

     Raises:
         ValueError: if the parent view unexpectedly has no
             materialized_address (guards the test setup itself).
     """
     view = BigQueryView(
         dataset_id="my_dataset",
         view_id="my_view_id",
         description="my view description",
         view_query_template="SELECT * FROM `{project_id}.some_dataset.some_table_materialized`",
     )
     parent_view = BigQueryView(
         dataset_id="some_dataset",
         view_id="some_table",
         description="my parent view description",
         view_query_template="SELECT * FROM UNNEST([])",
         should_materialize=True,
     )
     node = BigQueryViewDagNode(view)
     if not parent_view.materialized_address:
         # f-string so the offending view is actually rendered in the message
         # (the original literal printed the raw "{parent_view}" placeholder).
         raise ValueError(f"Null materialized_address for view [{parent_view}]")
     node.set_materialized_addresss(
         {parent_view.materialized_address: DagKey.for_view(parent_view)}
     )
     self.assertEqual(
         node.parent_keys,
         {
             DagKey(
                 view_address=BigQueryAddress(
                     dataset_id="some_dataset", table_id="some_table"
                 )
             )
         },
     )
 def test_dag_two_views_same_materialized_address(self) -> None:
     """Two views that override to the same materialized address cannot share a DAG.

     Both views materialize to `other_dataset.other_table`. Constructing the
     walker is expected to raise a ValueError whose message starts by naming
     the two conflicting views.
     """
     first_view = BigQueryView(
         dataset_id="dataset_1",
         view_id="table_1",
         description="table_1 description",
         should_materialize=True,
         materialized_address_override=BigQueryAddress(
             dataset_id="other_dataset", table_id="other_table"
         ),
         view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
     )
     second_view = BigQueryView(
         dataset_id="dataset_2",
         view_id="table_2",
         description="table_2 description",
         should_materialize=True,
         materialized_address_override=BigQueryAddress(
             dataset_id="other_dataset", table_id="other_table"
         ),
         view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table_2`",
     )
     expected_prefix = (
         "Found materialized view address for view [('dataset_2', 'table_2')] "
         "that matches materialized_address of another view: "
         "[('dataset_1', 'table_1')]."
     )

     with self.assertRaises(ValueError) as raised:
         BigQueryViewDagWalker([first_view, second_view])

     error_text = str(raised.exception)
     self.assertTrue(error_text.startswith(expected_prefix))
    def test_union_dags_same_view_different_object(self) -> None:
        """Unioning two DAGs holding equal-but-distinct view objects yields one view.

        The second walker is built from a separately constructed BigQueryView
        with the same arguments as the first; the union is expected to contain
        only a single matching view.
        """
        view = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            description="table_1 description",
            should_materialize=True,
            materialized_address_override=BigQueryAddress(
                dataset_id="other_dataset_1", table_id="other_table_1"
            ),
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
        )
        first_dag = BigQueryViewDagWalker([view])

        # A second object built with the same arguments (not the same instance).
        duplicate_view = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            description="table_1 description",
            should_materialize=True,
            materialized_address_override=BigQueryAddress(
                dataset_id="other_dataset_1", table_id="other_table_1"
            ),
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
        )
        second_dag = BigQueryViewDagWalker([duplicate_view])

        unioned_dag = BigQueryViewDagWalker.union_dags(first_dag, second_dag)

        self.assertCountEqual([view], unioned_dag.views)
    def test_samneness_check_validation_name(self) -> None:
        """validation_name is the view id, extended by the suffix when one is given."""

        def build_view() -> BigQueryView:
            # Each check gets its own freshly constructed view object.
            return BigQueryView(
                dataset_id="my_dataset",
                view_id="test_view",
                view_query_template="select * from literally_anything",
            )

        plain_check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            comparison_columns=["a", "b", "c"],
            view=build_view(),
        )
        self.assertEqual(plain_check.validation_name, "test_view")

        suffixed_check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            validation_name_suffix="b_c_only",
            comparison_columns=["b", "c"],
            view=build_view(),
        )
        self.assertEqual(suffixed_check.validation_name, "test_view_b_c_only")
Example #6
0
    def test_copy_bq_views_raw_project_id(
            self, mock_table_exists: mock.MagicMock,
            mock_copy_view: mock.MagicMock) -> None:
        """copy_view must still be invoked when the query template embeds the project id.

        The source view's template names the source project literally; the view
        handed to copy_view is expected to name the destination project instead.
        """
        source_query = f"SELECT * FROM {self.mock_source_project_id}.other_dataset.table LIMIT 0"
        source_view = BigQueryView(
            project_id=self.mock_source_project_id,
            dataset_id=self.mock_source_dataset_id,
            view_id="test_view",
            description="test_view description",
            view_query_template=source_query,
            should_materialize=True,
        )

        self.mock_client.list_tables.return_value = [source_view]
        self.mock_client.get_table.return_value = source_view
        mock_table_exists.side_effect = self.table_exists_side_effect

        copy_bq_views(
            source_project_id=self.mock_source_project_id,
            source_dataset_id=self.mock_source_dataset_id,
            destination_project_id=self.mock_destination_project_id,
            destination_dataset_id=self.mock_destination_dataset_id,
        )

        destination_query = f"SELECT * FROM {self.mock_destination_project_id}.other_dataset.table LIMIT 0"
        expected_view = BigQueryView(
            project_id=self.mock_destination_project_id,
            dataset_id=self.mock_destination_dataset_id,
            view_id="test_view",
            description="test_view description",
            view_query_template=destination_query,
            should_materialize=True,
        )
        expected_destination_dataset_ref = bigquery.DatasetReference(
            project=self.mock_destination_project_id,
            dataset_id=self.mock_destination_dataset_id,
        )

        mock_copy_view.assert_called()
        # Keyword arguments of the first recorded copy_view call.
        first_call_kwargs = mock_copy_view.call_args_list[0][1]
        self.assertEqual(expected_view, first_call_kwargs.get("view"))
        self.assertEqual(
            self.mock_destination_project_id,
            first_call_kwargs.get("destination_client").project_id,
        )
        self.assertEqual(
            expected_destination_dataset_ref,
            first_call_kwargs.get("destination_dataset_ref"),
        )
    def test_populate_node_family_full_parentage_complex_dependencies(self) -> None:
        """full_parentage of the leaf view covers every ancestor plus the source table.

        Dependencies: table_1 <- source table; table_2 <- table_1;
        table_3 <- table_1, table_2; table_4 <- table_2, table_3.
        """
        source_reader = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            description="table_1 description",
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
        )
        single_parent = BigQueryView(
            dataset_id="dataset_2",
            view_id="table_2",
            description="table_2 description",
            view_query_template="SELECT * FROM `{project_id}.dataset_1.table_1`",
        )
        double_parent = BigQueryView(
            dataset_id="dataset_3",
            view_id="table_3",
            description="table_3 description",
            view_query_template="""
                           SELECT * FROM `{project_id}.dataset_1.table_1`
                           JOIN `{project_id}.dataset_2.table_2`
                           USING (col)""",
        )
        leaf_view = BigQueryView(
            dataset_id="dataset_4",
            view_id="table_4",
            description="table_4 description",
            view_query_template="""
                           SELECT * FROM `{project_id}.dataset_2.table_2`
                           JOIN `{project_id}.dataset_3.table_3`
                           USING (col)""",
        )

        dag_walker = BigQueryViewDagWalker(
            [source_reader, single_parent, double_parent, leaf_view]
        )
        leaf_node = dag_walker.node_for_view(leaf_view)
        dag_walker.populate_node_family_for_node(
            node=leaf_node, view_source_table_datasets={"source_dataset"}
        )

        source_table_key = DagKey(
            view_address=BigQueryAddress(
                dataset_id="source_dataset", table_id="source_table"
            )
        )
        expected_parent_nodes = {
            source_table_key,
            DagKey.for_view(source_reader),
            DagKey.for_view(single_parent),
            DagKey.for_view(double_parent),
        }
        self.assertEqual(expected_parent_nodes, leaf_node.node_family.full_parentage)
    def test_dag_parents_materialized_non_default(self) -> None:
        """Parents are resolved through overridden materialized addresses.

        view_3 queries view_1 by its view address and view_2 by its overridden
        materialized address (`other_dataset_2.other_table_2`). When the DAG is
        processed, view_3 is expected to receive results from both parents.
        """
        self.maxDiff = None
        view_1 = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            description="table_1 description",
            should_materialize=True,
            materialized_address_override=BigQueryAddress(
                dataset_id="other_dataset_1", table_id="other_table_1"
            ),
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
        )
        view_2 = BigQueryView(
            dataset_id="dataset_2",
            view_id="table_2",
            description="table_2 description",
            should_materialize=True,
            materialized_address_override=BigQueryAddress(
                dataset_id="other_dataset_2", table_id="other_table_2"
            ),
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table_2`",
        )
        view_3 = BigQueryView(
            dataset_id="dataset_3",
            view_id="table_3",
            description="table_3 description",
            view_query_template="""
                SELECT * FROM `{project_id}.dataset_1.table_1`
                JOIN `{project_id}.other_dataset_2.other_table_2`
                USING (col)""",
        )
        walker = BigQueryViewDagWalker([view_1, view_2, view_3])

        # The per-view result is the view id (a str), so parent_results maps
        # each parent view to a str, not to a DagKey.
        def process_simple(
            view: BigQueryView, parent_results: Dict[BigQueryView, str]
        ) -> str:
            """Return the view's id; verify view_3 sees both parents' results."""
            if view == view_3:
                # View 3 should have two parents
                self.assertEqual(
                    {view_1: view_1.view_id, view_2: view_2.view_id}, parent_results
                )

            return view.view_id

        result = walker.process_dag(process_simple)
        self.assertEqual(
            {view_1: view_1.view_id, view_2: view_2.view_id, view_3: view_3.view_id},
            result,
        )
Example #9
0
    def test_copy_bq_views(self, mock_table_exists, mock_copy_view):
        """copy_view is expected to be called for a view handled by copy_bq_views.

        Existence of the view in the destination is driven by
        self.table_exists_side_effect.
        """
        source_view = self.mock_view
        self.mock_client.list_tables.return_value = [source_view]
        self.mock_client.get_table.return_value = source_view
        mock_table_exists.side_effect = self.table_exists_side_effect

        copy_bq_views(
            source_project_id=self.mock_source_project_id,
            source_dataset_id=self.mock_source_dataset_id,
            destination_project_id=self.mock_destination_project_id,
            destination_dataset_id=self.mock_destination_dataset_id,
        )

        expected_view = BigQueryView(
            project_id=self.mock_destination_project_id,
            dataset_id=self.mock_destination_dataset_id,
            view_id=self.mock_view.view_id,
            view_query_template=self.mock_view.view_query,
            should_materialize=True,
        )
        expected_destination_dataset_ref = bigquery.DatasetReference(
            project=self.mock_destination_project_id,
            dataset_id=self.mock_destination_dataset_id,
        )

        mock_copy_view.assert_called()
        # Keyword arguments of the first recorded copy_view call.
        first_call_kwargs = mock_copy_view.call_args_list[0][1]
        self.assertEqual(expected_view, first_call_kwargs.get('view'))
        self.assertEqual(
            self.mock_destination_project_id,
            first_call_kwargs.get('destination_client').project_id,
        )
        self.assertEqual(
            expected_destination_dataset_ref,
            first_call_kwargs.get('destination_dataset_ref'),
        )
    def test_string_sameness_check_different_values_above_margin(self) -> None:
        """A STRINGS check fails when the mismatch rate exceeds max_allowed_error.

        The allowed error is set one row's worth below the number of bad rows
        (the expected message reports the rate out of 100 rows).
        """
        num_bad_rows = 5
        # Threshold sits just under the number of bad rows.
        max_allowed_error = (num_bad_rows - 1) / 100
        actual_expected_error = num_bad_rows / 100

        query_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
        self.mock_client.run_query_async.return_value = query_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=max_allowed_error,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        result = SamenessValidationChecker.run_check(job)

        expected_description = (
            f"{num_bad_rows} out of 100 row(s) did not contain matching strings. "
            f"The acceptable margin of error is only {max_allowed_error}, but the "
            f"validation returned an error rate of {actual_expected_error}."
        )
        expected_result = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(result, expected_result)
    def test_string_sameness_check_different_values_handle_non_string_type(
        self,
    ) -> None:
        """A STRINGS check raises ValueError when a compared value is an int."""
        self.mock_client.run_query_async.return_value = [
            {"a": "same", "b": "same", "c": 1245}
        ]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = "Unexpected type [<class 'int'>] for value [1245] in STRING validation [test_view]."
        self.assertEqual(str(raised.exception), expected_message)
    def test_string_sameness_check_numbers_values_all_none(self) -> None:
        """A NUMBERS check raises ValueError when a row holds only None values."""
        self.mock_client.run_query_async.return_value = [
            {"a": None, "b": None, "c": None}
        ]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = "Unexpected None value for column [a] in validation [test_view]."
        self.assertEqual(str(raised.exception), expected_message)
    def test_sameness_check_numbers_multiple_rows_above_margin(self) -> None:
        """A NUMBERS check fails when two rows both exceed the 0.02 allowed error."""
        self.mock_client.run_query_async.return_value = [
            {"a": 97, "b": 100, "c": 99},
            {"a": 14, "b": 21, "c": 14},
        ]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        result = SamenessValidationChecker.run_check(job)

        expected_description = (
            "2 row(s) had unacceptable margins of error. The acceptable margin "
            "of error is only 0.02, but the validation returned rows with "
            "errors as high as 0.3333."
        )
        expected_result = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(result, expected_result)
def main(*, source_project_id, source_dataset_id, destination_project_id,
         destination_dataset_id):
    """Replicate the views of a source dataset into a destination dataset.

    Every table listed in source_project_id.source_dataset_id is inspected.
    Tables without a view query, and tables whose id already exists in
    destination_project_id.destination_dataset_id, are skipped; everything
    else is copied via the source client's copy_view.
    """
    # One client per project: the source client lists and reads tables, the
    # destination client is used for the existence check and as copy target.
    source_client = BigQueryClientImpl(project_id=source_project_id)
    destination_client = BigQueryClientImpl(project_id=destination_project_id)

    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)

    for table_ref in source_client.list_tables(source_dataset_id):
        source_dataset_ref = source_client.dataset_ref_for_id(
            table_ref.dataset_id)
        table = source_client.get_table(source_dataset_ref,
                                        table_ref.table_id)

        # Nothing to replicate when the table carries no view query.
        if not table.view_query:
            continue

        # Leave anything already present in the destination dataset alone.
        if destination_client.table_exists(destination_dataset,
                                           table_id=table.table_id):
            continue

        view_to_copy = BigQueryView(dataset_id=table_ref.dataset_id,
                                    view_id=table.table_id,
                                    view_query_template=table.view_query)
        source_client.copy_view(view=view_to_copy,
                                destination_client=destination_client,
                                destination_dataset_ref=destination_dataset)
 def test_parse_view_multiple_parents(self) -> None:
     """A query joining two tables produces one parent key per referenced table."""
     child_view = BigQueryView(
         dataset_id="my_dataset",
         view_id="my_view_id",
         description="my view description",
         view_query_template="""SELECT * FROM `{project_id}.some_dataset.some_table`
         LEFT OUTER JOIN `{project_id}.some_dataset.other_table`
         USING (some_col);
         """,
     )
     node = BigQueryViewDagNode(child_view)
     # No materialized addresses are registered for this node.
     node.set_materialized_addresss({})

     first_parent_key = DagKey(
         view_address=BigQueryAddress(
             dataset_id="some_dataset", table_id="some_table"
         )
     )
     second_parent_key = DagKey(
         view_address=BigQueryAddress(
             dataset_id="some_dataset", table_id="other_table"
         )
     )
     self.assertEqual(node.parent_keys, {first_parent_key, second_parent_key})
Example #16
0
    def test_existence_check_failures(self) -> None:
        """An EXISTENCE check fails when the validation query returns two rows."""
        self.mock_client.run_query_async.return_value = [
            "some result row",
            "some other result row",
        ]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = ExistenceDataValidationCheck(
            validation_type=ValidationCheckType.EXISTENCE,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        result = ExistenceValidationChecker.run_check(job)

        expected_result = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description="Found [2] invalid rows, though [0] were expected",
        )
        self.assertEqual(result, expected_result)
    def setUp(self):
        """Patch the project id, BigQuery client and export config, and build a mock view.

        The patchers are started here and kept on self; stopping them is
        presumably handled by a tearDown outside this snippet.
        """
        fake_project_id = 'fake-recidiviz-project'
        self.mock_dataset_name = 'base_dataset'
        self.mock_dataset = bigquery.dataset.DatasetReference(
            fake_project_id, self.mock_dataset_name)

        # metadata.project_id() reports the fake project for the test.
        self.metadata_patcher = mock.patch('recidiviz.utils.metadata.project_id')
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = fake_project_id

        # Replace the client class used by the export manager; tests talk to
        # the instance it would return.
        self.client_patcher = mock.patch(
            'recidiviz.calculator.query.state.dashboard_export_manager.BigQueryClientImpl')
        patched_client_class = self.client_patcher.start()
        self.mock_client = patched_client_class.return_value
        self.mock_client.dataset_ref_for_id.return_value = self.mock_dataset

        self.mock_view = BigQueryView(dataset_id=self.mock_dataset.dataset_id,
                                      view_id='test_view',
                                      view_query_template='SELECT NULL LIMIT 0')
        self.views_to_export = [self.mock_view]

        # Extra keyword arguments to mock.patch configure attributes on the
        # mock that replaces dashboard_export_config.
        self.dashboard_export_config_patcher = mock.patch(
            'recidiviz.calculator.query.state.dashboard_export_manager.dashboard_export_config',
            STATES_TO_EXPORT=['US_CA'],
            VIEWS_TO_EXPORT=self.views_to_export)
        self.mock_export_config = self.dashboard_export_config_patcher.start()

        self.views_to_update = {self.mock_dataset_name: self.views_to_export}
Example #18
0
    def setUp(self) -> None:
        """Create fake project/dataset/table references and a patched BigQuery client.

        The patchers are started here and kept on self; stopping them is
        presumably handled by a tearDown outside this snippet.
        """
        self.location = 'US'
        self.mock_project_id = 'fake-recidiviz-project'
        self.mock_dataset_id = 'fake-dataset'
        self.mock_table_id = 'test_table'

        dataset_ref = bigquery.dataset.DatasetReference(
            self.mock_project_id, self.mock_dataset_id)
        self.mock_dataset_ref = dataset_ref
        self.mock_table = dataset_ref.table(self.mock_table_id)

        # metadata.project_id() reports the fake project for the test.
        self.metadata_patcher = mock.patch(
            'recidiviz.utils.metadata.project_id')
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = self.mock_project_id

        # Patch the client factory in big_query_client; tests inspect the
        # object it would return.
        self.client_patcher = mock.patch(
            'recidiviz.big_query.big_query_client.client')
        patched_client_factory = self.client_patcher.start()
        self.mock_client = patched_client_factory.return_value

        self.mock_view = BigQueryView(
            dataset_id='dataset',
            view_id='test_view',
            view_query_template='SELECT NULL LIMIT 0',
            should_materialize=True)

        self.bq_client = BigQueryClientImpl()
Example #19
0
    def test_sameness_check_numbers_different_values_within_margin(self):
        """A NUMBERS check passes for the row (98, 100, 99) with max_allowed_error 0.02."""
        self.mock_client.run_query_async.return_value = [
            {'a': 98, 'b': 100, 'c': 99},
        ]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        result = SamenessValidationChecker.run_check(job)

        expected_result = DataValidationJobResult(validation_job=job,
                                                  was_successful=True,
                                                  failure_description=None)
        self.assertEqual(result, expected_result)
Example #20
0
    def test_sameness_check_numbers_multiple_rows_above_margin(self):
        """A NUMBERS check fails when two rows both exceed the 0.02 allowed error."""
        self.mock_client.run_query_async.return_value = [
            {'a': 97, 'b': 100, 'c': 99},
            {'a': 14, 'b': 21, 'c': 14},
        ]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        result = SamenessValidationChecker.run_check(job)

        expected_description = (
            '2 row(s) had unacceptable margins of error. The acceptable margin '
            'of error is only 0.02, but the validation returned rows with '
            'errors as high as 0.3333.'
        )
        expected_result = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(result, expected_result)
Example #21
0
    def test_string_sameness_check_different_values_handle_empty_string(self):
        """A STRINGS check counts a row with one None among equal strings as a mismatch."""
        self.mock_client.run_query_async.return_value = [
            {'a': 'same', 'b': 'same', 'c': None},
        ]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        result = SamenessValidationChecker.run_check(job)

        expected_description = (
            '1 out of 1 row(s) did not contain matching strings. '
            'The acceptable margin of error is only 0.0, but the '
            'validation returned an error rate of 1.0.'
        )
        expected_result = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(result, expected_result)
    def test_string_sameness_check_strings_values_all_none(self) -> None:
        """A STRINGS check passes when every compared column in the row is None."""
        self.mock_client.run_query_async.return_value = [
            {"a": None, "b": None, "c": None}
        ]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        result = SamenessValidationChecker.run_check(job)

        expected_result = DataValidationJobResult(
            validation_job=job, was_successful=True, failure_description=None
        )
        self.assertEqual(result, expected_result)
Example #23
0
    def test_string_sameness_check_different_values_handle_non_string_type(
            self):
        """A STRINGS check raises ValueError when a compared value is an int."""
        self.mock_client.run_query_async.return_value = [
            {'a': 'same', 'b': 'same', 'c': 1245},
        ]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = 'Unexpected type [<class \'int\'>] for value [1245] in STRING validation [test_view].'
        self.assertEqual(str(raised.exception), expected_message)
    def test_string_sameness_check_different_values_handle_empty_string(self) -> None:
        """A STRINGS check counts a row with one None among equal strings as a mismatch."""
        self.mock_client.run_query_async.return_value = [
            {"a": "same", "b": "same", "c": None}
        ]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=validation)

        result = SamenessValidationChecker.run_check(job)

        expected_description = (
            "1 out of 1 row(s) did not contain matching strings. "
            "The acceptable margin of error is only 0.0, but the "
            "validation returned an error rate of 1.0."
        )
        expected_result = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(result, expected_result)
Example #25
0
    def test_string_sameness_check_different_values_above_margin(self):
        """A STRINGS check fails when the mismatch rate exceeds max_allowed_error.

        The allowed error is set one row's worth below the number of bad rows
        (the expected message reports the rate out of 100 rows).
        """
        num_bad_rows = 5
        # Threshold sits just under the number of bad rows.
        max_allowed_error = (num_bad_rows - 1) / 100
        actual_expected_error = num_bad_rows / 100

        query_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
        self.mock_client.run_query_async.return_value = query_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        validation = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=max_allowed_error,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=validation)

        result = SamenessValidationChecker.run_check(job)

        expected_description = (
            f'{num_bad_rows} out of 100 row(s) did not contain matching strings. '
            f'The acceptable margin of error is only {max_allowed_error}, but the '
            f'validation returned an error rate of {actual_expected_error}.'
        )
        expected_result = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(result, expected_result)
    def test_string_sameness_check_different_values_within_margin(self) -> None:
        """A STRINGS sameness check passes when bad rows stay within the margin.

        The query result is mocked with
        ``self.return_string_values_with_num_bad_rows(num_bad_rows)`` (helper
        defined outside this block) and ``max_allowed_error`` is set to
        ``num_bad_rows / 100``. The job is expected to succeed with no failure
        description.
        """
        num_bad_rows = 2
        max_allowed_error = num_bad_rows / 100

        mocked_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
        self.mock_client.run_query_async.return_value = mocked_rows

        test_view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        sameness_check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=max_allowed_error,
            view=test_view,
        )
        job = DataValidationJob(region_code="US_VA", validation=sameness_check)

        result = SamenessValidationChecker.run_check(job)

        expected_result = DataValidationJobResult(
            validation_job=job,
            was_successful=True,
            failure_description=None,
        )
        self.assertEqual(result, expected_result)
Example #27
0
    def test_existence_check_failures_below_threshold(self) -> None:
        """An existence check passes when returned rows do not exceed the allowance.

        The mocked query returns two rows and the check is configured with
        ``num_allowed_rows=2``; the job is expected to be successful with no
        failure description.
        """
        returned_rows = ["some result row", "some other result row"]
        self.mock_client.run_query_async.return_value = returned_rows

        test_view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        existence_check = ExistenceDataValidationCheck(
            validation_type=ValidationCheckType.EXISTENCE,
            view=test_view,
            num_allowed_rows=2,
        )
        job = DataValidationJob(region_code="US_VA", validation=existence_check)

        result = ExistenceValidationChecker.run_check(job)

        expected_result = DataValidationJobResult(
            validation_job=job,
            was_successful=True,
            failure_description=None,
        )
        self.assertEqual(result, expected_result)
Example #28
0
    def test_create_dataset_and_update_views(self):
        """Test that create_dataset_and_update_views creates a dataset if necessary, and updates all views.

        Two views are built for ``_DATASET_NAME`` and handed to
        ``view_update_manager._create_dataset_and_update_views``. The mocked
        client is then expected to have looked up the dataset ref by id, been
        asked to create the dataset with ``(dataset, None)``, and received one
        ``create_or_update_view(dataset, view)`` call per view, in order.
        """
        dataset = bigquery.dataset.DatasetReference(_PROJECT_ID, _DATASET_NAME)

        # Each dict is unpacked into BigQueryView as extra keyword arguments.
        sample_views = [
            {"view_id": "my_fake_view", "view_query": "SELECT NULL LIMIT 0"},
            {"view_id": "my_other_fake_view", "view_query": "SELECT NULL LIMIT 0"},
        ]
        mock_views = []
        for view_kwargs in sample_views:
            mock_views.append(
                BigQueryView(
                    dataset_id=_DATASET_NAME,
                    view_query_template="a",
                    **view_kwargs,
                )
            )

        self.mock_client.dataset_ref_for_id.return_value = dataset

        # pylint: disable=protected-access
        view_update_manager._create_dataset_and_update_views(mock_views)

        self.mock_client.dataset_ref_for_id.assert_called_with(_DATASET_NAME)
        self.mock_client.create_dataset_if_necessary.assert_called_with(dataset, None)

        expected_view_calls = [mock.call(dataset, built_view) for built_view in mock_views]
        self.mock_client.create_or_update_view.assert_has_calls(expected_view_calls)
Example #29
0
    def setUp(self) -> None:
        """Start the metadata and BigQuery client patches and build shared fixtures.

        Stores fake project/dataset/table identifiers and the matching
        ``DatasetReference``/table reference on ``self``, patches
        ``recidiviz.utils.metadata.project_id`` to return the fake project id,
        patches ``recidiviz.big_query.big_query_client.client``, and finally
        creates a materialized test view and a ``BigQueryClientImpl``.

        Both patchers are started here and kept on ``self``; stopping them is
        not done in this method.
        """
        project_id = "fake-recidiviz-project"
        dataset_id = "fake-dataset"
        table_id = "test_table"

        self.location = "US"
        self.mock_project_id = project_id
        self.mock_dataset_id = dataset_id
        self.mock_table_id = table_id
        self.mock_dataset_ref = bigquery.dataset.DatasetReference(
            project_id, dataset_id
        )
        self.mock_table = self.mock_dataset_ref.table(table_id)

        # Patch the project id lookup before anything below is constructed.
        self.metadata_patcher = mock.patch("recidiviz.utils.metadata.project_id")
        project_id_fn = self.metadata_patcher.start()
        project_id_fn.return_value = project_id
        self.mock_project_id_fn = project_id_fn

        # mock_client is the return value of the patched `client` callable.
        self.client_patcher = mock.patch("recidiviz.big_query.big_query_client.client")
        patched_client_factory = self.client_patcher.start()
        self.mock_client = patched_client_factory.return_value

        self.mock_view = BigQueryView(
            dataset_id="dataset",
            view_id="test_view",
            view_query_template="SELECT NULL LIMIT 0",
            should_materialize=True,
        )

        # Constructed last, after both patches are active.
        self.bq_client = BigQueryClientImpl()
 def test_parse_view_materialized_parent(self) -> None:
     """A query against a ``*_materialized`` table yields the un-suffixed parent key.

     The view's query references ``some_dataset.some_table_materialized`` and
     the DAG node built from it is expected to report
     ``("some_dataset", "some_table")`` as its only parent key.
     """
     child_view = BigQueryView(
         dataset_id="my_dataset",
         view_id="my_view_id",
         view_query_template="SELECT * FROM `{project_id}.some_dataset.some_table_materialized`",
     )
     dag_node = BigQueryViewDagNode(child_view)

     expected_parent_keys = {("some_dataset", "some_table")}
     self.assertEqual(dag_node.parent_keys, expected_parent_keys)