def update_dataflow_metric_tables_schemas(
    dataflow_metrics_dataset_id: str = DATAFLOW_METRICS_DATASET,
) -> None:
    """Ensures every Dataflow metric table in |dataflow_metrics_dataset_id| has a
    schema matching the attributes of its corresponding metric class.

    Creates the dataset if needed. Tables that do not exist yet are created with
    the schema derived from the metric class; existing tables have their schema
    updated to match. Defaults to the DATAFLOW_METRICS_DATASET when no dataset id
    is provided.
    """
    client = BigQueryClientImpl()
    dataset_ref = client.dataset_ref_for_id(dataflow_metrics_dataset_id)
    client.create_dataset_if_necessary(dataset_ref)

    for metric_class, table_id in DATAFLOW_METRICS_TO_TABLES.items():
        metric_schema = metric_class.bq_schema_for_metric_table()

        if not client.table_exists(dataset_ref, table_id):
            # No table yet for this metric -- create it with the derived schema.
            client.create_table_with_schema(
                dataflow_metrics_dataset_id, table_id, metric_schema
            )
        else:
            # Reconcile the existing table's schema with the metric class.
            client.update_schema(
                dataflow_metrics_dataset_id, table_id, metric_schema
            )
def main(*, source_project_id, source_dataset_id, destination_project_id, destination_dataset_id):
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id."""
    # One client per project: reads go to the source, writes to the destination.
    source_client = BigQueryClientImpl(project_id=source_project_id)
    destination_client = BigQueryClientImpl(project_id=destination_project_id)

    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)

    for table_ref in source_client.list_tables(source_dataset_id):
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id),
            table_ref.table_id)

        # Skip anything that isn't a view, and views that already exist in the
        # destination dataset.
        if not table.view_query or destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            continue

        source_client.copy_view(
            view=BigQueryView(dataset_id=table_ref.dataset_id,
                              view_id=table.table_id,
                              view_query_template=table.view_query),
            destination_client=destination_client,
            destination_dataset_ref=destination_dataset)
def move_old_dataflow_metrics_to_cold_storage():
    """Moves old output in Dataflow metrics tables to tables in a cold storage dataset.

    We only keep the MAX_DAYS_IN_DATAFLOW_METRICS_TABLE days worth of data in a
    Dataflow metric table at once. All other output is moved to cold storage.
    """
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = DATAFLOW_METRICS_COLD_STORAGE_DATASET
    dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset)

    # Loop-invariant: the cold storage dataset ref is identical for every table,
    # so resolve it once instead of once per iteration.
    cold_storage_dataset_ref = bq_client.dataset_ref_for_id(cold_storage_dataset)

    for table_ref in dataflow_metrics_tables:
        table_id = table_ref.table_id

        # Matches every row whose created_on date is NOT among the most recent
        # MAX_DAYS_IN_DATAFLOW_METRICS_TABLE distinct created_on dates.
        filter_clause = """WHERE created_on NOT IN (SELECT DISTINCT created_on FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}` ORDER BY created_on DESC LIMIT {day_count_limit})""".format(
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            day_count_limit=MAX_DAYS_IN_DATAFLOW_METRICS_TABLE)

        if bq_client.table_exists(cold_storage_dataset_ref, table_id):
            # Move data from the Dataflow metrics dataset into the cold storage dataset
            insert_job = bq_client.insert_into_table_from_table_async(
                source_dataset_id=dataflow_metrics_dataset,
                source_table_id=table_id,
                destination_dataset_id=cold_storage_dataset,
                destination_table_id=table_id,
                source_data_filter_clause=filter_clause,
                allow_field_additions=True)

            # Wait for the insert job to complete before running the delete job
            insert_job.result()
        else:
            # This table doesn't yet exist in cold storage. Create it from a
            # query selecting the rows being archived.
            table_query = f"SELECT * FROM `{bq_client.project_id}.{dataflow_metrics_dataset}.{table_id}` " \
                          f"{filter_clause}"

            create_job = bq_client.create_table_from_query_async(
                cold_storage_dataset, table_id, table_query, query_parameters=[])

            # Wait for the create job to complete before running the delete job
            create_job.result()

        # Delete that data from the Dataflow dataset
        delete_job = bq_client.delete_from_table_async(
            dataflow_metrics_dataset, table_id, filter_clause)

        # Wait for the delete job to complete before moving on
        delete_job.result()
def update_dataflow_metric_tables_schemas() -> None:
    """For each table that stores Dataflow metric output, ensures that all
    attributes on the corresponding metric are present in the table in BigQuery."""
    client = BigQueryClientImpl()
    dataflow_metrics_dataset_id = DATAFLOW_METRICS_DATASET
    dataset_ref = client.dataset_ref_for_id(dataflow_metrics_dataset_id)
    client.create_dataset_if_necessary(dataset_ref)

    for metric_class, table_id in DATAFLOW_METRICS_TO_TABLES.items():
        metric_schema = metric_class.bq_schema_for_metric_table()

        if not client.table_exists(dataset_ref, table_id):
            # Table is new -- create it with the full schema for the metric.
            client.create_table_with_schema(
                dataflow_metrics_dataset_id, table_id, metric_schema)
        else:
            # Table exists -- append any attributes missing from its schema.
            client.add_missing_fields_to_schema(
                dataflow_metrics_dataset_id, table_id, metric_schema)
def copy_bq_views(
    source_project_id: str,
    source_dataset_id: str,
    destination_project_id: str,
    destination_dataset_id: str,
) -> None:
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id."""
    # Separate clients: reads hit the source project, writes the destination.
    source_client = BigQueryClientImpl(project_id=source_project_id)
    destination_client = BigQueryClientImpl(project_id=destination_project_id)

    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)

    for table_ref in source_client.list_tables(source_dataset_id):
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id),
            table_ref.table_id)
        view_query = table.view_query

        # Copy only actual views, and only those not already present in the
        # destination dataset.
        if not view_query or destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            continue

        # Swap hard-coded source project references for a template placeholder.
        updated_view_query = view_query.replace(source_project_id, "{project_id}")

        source_client.copy_view(
            view=BigQueryView(
                project_id=destination_project_id,
                dataset_id=destination_dataset_id,
                view_id=table.table_id,
                description=table.description,
                view_query_template=updated_view_query,
            ),
            destination_client=destination_client,
            destination_dataset_ref=destination_dataset,
        )
class BigQueryClientImplTest(unittest.TestCase):
    """Tests for BigQueryClientImpl"""

    def setUp(self):
        # Fixed identifiers reused by every test case.
        self.mock_project_id = 'fake-recidiviz-project'
        self.mock_dataset_id = 'fake-dataset'
        self.mock_table_id = 'test_table'
        self.mock_dataset = bigquery.dataset.DatasetReference(
            self.mock_project_id, self.mock_dataset_id)
        self.mock_table = self.mock_dataset.table(self.mock_table_id)

        # Patch the project-id lookup so BigQueryClientImpl sees the fake project.
        self.metadata_patcher = mock.patch('recidiviz.utils.metadata.project_id')
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = self.mock_project_id

        # Patch the underlying google.cloud client so no real API calls are made.
        self.client_patcher = mock.patch(
            'recidiviz.big_query.big_query_client.client')
        self.mock_client = self.client_patcher.start().return_value

        self.mock_view = BigQueryView(
            dataset_id='dataset',
            view_id='test_view',
            view_query_template='SELECT NULL LIMIT 0',
            materialized_view_table_id='test_view_table'
        )

        self.bq_client = BigQueryClientImpl()

    def tearDown(self):
        self.client_patcher.stop()
        self.metadata_patcher.stop()

    def test_create_dataset_if_necessary(self):
        """Check that a dataset is created if it does not exist."""
        self.mock_client.get_dataset.side_effect = exceptions.NotFound('!')
        self.bq_client.create_dataset_if_necessary(self.mock_dataset)
        self.mock_client.create_dataset.assert_called()

    def test_create_dataset_if_necessary_dataset_exists(self):
        """Check that a dataset is not created if it already exists."""
        self.mock_client.get_dataset.side_effect = None
        self.bq_client.create_dataset_if_necessary(self.mock_dataset)
        self.mock_client.create_dataset.assert_not_called()

    def test_table_exists(self):
        """Check that table_exists returns True if the table exists."""
        self.mock_client.get_table.side_effect = None
        self.assertTrue(
            self.bq_client.table_exists(self.mock_dataset, self.mock_table_id))

    def test_table_exists_does_not_exist(self):
        """Check that table_exists returns False if the table does not exist."""
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        with self.assertLogs(level='WARNING'):
            table_exists = self.bq_client.table_exists(
                self.mock_dataset, self.mock_table_id)
            self.assertFalse(table_exists)

    def test_create_or_update_view_creates_view(self):
        """create_or_update_view creates a View if it does not exist."""
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        self.bq_client.create_or_update_view(self.mock_dataset, self.mock_view)
        self.mock_client.create_table.assert_called()
        self.mock_client.update_table.assert_not_called()

    def test_create_or_update_view_updates_view(self):
        """create_or_update_view updates a View if it already exist."""
        self.mock_client.get_table.side_effect = None
        self.bq_client.create_or_update_view(self.mock_dataset, self.mock_view)
        self.mock_client.update_table.assert_called()
        self.mock_client.create_table.assert_not_called()

    def test_export_to_cloud_storage(self):
        """export_to_cloud_storage extracts the table corresponding to the view."""
        self.assertIsNotNone(self.bq_client.export_table_to_cloud_storage_async(
            source_table_dataset_ref=self.mock_dataset,
            source_table_id='source-table',
            destination_uri=f'gs://{self.mock_project_id}-bucket/destination_path.json',
            destination_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON
        ))
        self.mock_client.extract_table.assert_called()

    def test_export_to_cloud_storage_no_table(self):
        """export_to_cloud_storage does not extract from a table if the table does not exist."""
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        with self.assertLogs(level='WARNING'):
            self.assertIsNone(self.bq_client.export_table_to_cloud_storage_async(
                source_table_dataset_ref=self.mock_dataset,
                source_table_id='source-table',
                destination_uri=f'gs://{self.mock_project_id}-bucket/destination_path.json',
                destination_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON))
        self.mock_client.extract_table.assert_not_called()

    def test_load_table_async_create_dataset(self):
        """Test that load_table_from_cloud_storage_async tries to create a parent dataset."""
        self.mock_client.get_dataset.side_effect = exceptions.NotFound('!')
        self.bq_client.load_table_from_cloud_storage_async(
            destination_dataset_ref=self.mock_dataset,
            destination_table_id=self.mock_table_id,
            destination_table_schema=[SchemaField('my_column', 'STRING', 'NULLABLE', None, ())],
            source_uri='gs://bucket/export-uri')
        self.mock_client.create_dataset.assert_called()
        self.mock_client.load_table_from_uri.assert_called()

    def test_load_table_async_dataset_exists(self):
        """Test that load_table_from_cloud_storage_async does not try to create a parent
        dataset if it already exists.
        """
        self.bq_client.load_table_from_cloud_storage_async(
            destination_dataset_ref=self.mock_dataset,
            destination_table_id=self.mock_table_id,
            destination_table_schema=[SchemaField('my_column', 'STRING', 'NULLABLE', None, ())],
            source_uri='gs://bucket/export-uri')
        self.mock_client.create_dataset.assert_not_called()
        self.mock_client.load_table_from_uri.assert_called()

    def test_export_query_results_to_cloud_storage_no_table(self):
        bucket = self.mock_project_id + '-bucket'
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        with self.assertLogs(level='WARNING'):
            self.bq_client.export_query_results_to_cloud_storage([
                ExportQueryConfig.from_view_query(
                    view=self.mock_view,
                    view_filter_clause='WHERE x = y',
                    intermediate_table_name=self.mock_table_id,
                    output_uri=f'gs://{bucket}/view.json',
                    output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
            ])

    def test_export_query_results_to_cloud_storage(self):
        """export_query_results_to_cloud_storage creates the table from the view query and
        exports the table."""
        bucket = self.mock_project_id + '-bucket'
        # Pre-resolved futures stand in for the query and extract jobs so the
        # export path completes synchronously.
        query_job = futures.Future()
        query_job.set_result([])
        extract_job = futures.Future()
        extract_job.set_result(None)
        self.mock_client.query.return_value = query_job
        self.mock_client.extract_table.return_value = extract_job
        self.bq_client.export_query_results_to_cloud_storage([
            ExportQueryConfig.from_view_query(
                view=self.mock_view,
                view_filter_clause='WHERE x = y',
                intermediate_table_name=self.mock_table_id,
                output_uri=f'gs://{bucket}/view.json',
                output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
        ])
        self.mock_client.query.assert_called()
        self.mock_client.extract_table.assert_called()
        # The intermediate table must be cleaned up after the export.
        self.mock_client.delete_table.assert_called_with(
            bigquery.DatasetReference(self.mock_project_id,
                                      self.mock_view.dataset_id).table(self.mock_table_id))

    def test_create_table_from_query(self):
        """Tests that the create_table_from_query function calls the function to create a table
        from a query."""
        self.bq_client.create_table_from_query_async(self.mock_dataset_id,
                                                     self.mock_table_id,
                                                     query="SELECT * FROM some.fake.table",
                                                     query_parameters=[])
        self.mock_client.query.assert_called()

    def test_insert_into_table_from_table(self):
        """Tests that the insert_into_table_from_table function runs a query."""
        self.bq_client.insert_into_table_from_table_async('fake_source_dataset_id',
                                                          'fake_table_id',
                                                          self.mock_dataset_id,
                                                          self.mock_table_id)
        self.mock_client.get_table.assert_called()
        self.mock_client.query.assert_called()

    def test_insert_into_table_from_table_invalid_destination(self):
        """Tests that the insert_into_table_from_table function does not run the query if the
        destination table does not exist."""
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        # NOTE(review): positional argument order here differs from the
        # success-path test above (mock ids first) -- presumably intentional so
        # the NotFound destination is checked; verify against the client API.
        with pytest.raises(ValueError):
            self.bq_client.insert_into_table_from_table_async(self.mock_dataset_id,
                                                              self.mock_table_id,
                                                              'fake_source_dataset_id',
                                                              'fake_table_id')
        self.mock_client.get_table.assert_called()
        self.mock_client.query.assert_not_called()

    def test_insert_into_table_from_cloud_storage_async(self):
        self.mock_client.get_dataset.side_effect = exceptions.NotFound('!')
        self.bq_client.insert_into_table_from_cloud_storage_async(
            destination_dataset_ref=self.mock_dataset,
            destination_table_id=self.mock_table_id,
            destination_table_schema=[SchemaField('my_column', 'STRING', 'NULLABLE', None, ())],
            source_uri='gs://bucket/export-uri')
        self.mock_client.create_dataset.assert_called()
        self.mock_client.load_table_from_uri.assert_called()

    def test_delete_from_table(self):
        """Tests that the delete_from_table function runs a query."""
        self.bq_client.delete_from_table_async(self.mock_dataset_id, self.mock_table_id,
                                               filter_clause="WHERE x > y")
        self.mock_client.query.assert_called()

    def test_delete_from_table_invalid_filter_clause(self):
        """Tests that the delete_from_table function does not run a query when the filter
        clause is invalid."""
        # Invalid because the clause does not start with WHERE.
        with pytest.raises(ValueError):
            self.bq_client.delete_from_table_async(self.mock_dataset_id, self.mock_table_id,
                                                   filter_clause="x > y")
        self.mock_client.query.assert_not_called()

    def test_materialize_view_to_table(self):
        """Tests that the materialize_view_to_table function calls the function to create a
        table from a query."""
        self.bq_client.materialize_view_to_table(self.mock_view)
        self.mock_client.query.assert_called()

    def test_materialize_view_to_table_no_materialized_view_table_id(self):
        """Tests that the materialize_view_to_table function does not call the function to
        create a table from a query if there is no set materialized_view_table_id on the
        view."""
        invalid_view = BigQueryView(
            dataset_id='dataset',
            view_id='test_view',
            view_query_template='SELECT NULL LIMIT 0',
            materialized_view_table_id=None
        )
        with pytest.raises(ValueError):
            self.bq_client.materialize_view_to_table(invalid_view)
        self.mock_client.query.assert_not_called()

    def test_create_table_with_schema(self):
        """Tests that the create_table_with_schema function calls the create_table function on
        the client."""
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        schema_fields = [bigquery.SchemaField('new_schema_field', 'STRING')]
        self.bq_client.create_table_with_schema(self.mock_dataset_id, self.mock_table_id,
                                                schema_fields)
        self.mock_client.create_table.assert_called()

    def test_create_table_with_schema_table_exists(self):
        """Tests that the create_table_with_schema function raises an error when the table
        already exists."""
        self.mock_client.get_table.side_effect = None
        schema_fields = [bigquery.SchemaField('new_schema_field', 'STRING')]
        with pytest.raises(ValueError):
            self.bq_client.create_table_with_schema(self.mock_dataset_id, self.mock_table_id,
                                                    schema_fields)
        self.mock_client.create_table.assert_not_called()

    def test_add_missing_fields_to_schema(self):
        """Tests that the add_missing_fields_to_schema function calls the client to update the
        table."""
        table_ref = bigquery.TableReference(self.mock_dataset, self.mock_table_id)
        schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING')]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table
        new_schema_fields = [bigquery.SchemaField('new_schema_field', 'STRING')]
        self.bq_client.add_missing_fields_to_schema(self.mock_dataset_id, self.mock_table_id,
                                                    new_schema_fields)
        self.mock_client.update_table.assert_called()

    def test_add_missing_fields_to_schema_no_missing_fields(self):
        """Tests that the add_missing_fields_to_schema function does not call the client to
        update the table when all of the fields are already present."""
        table_ref = bigquery.TableReference(self.mock_dataset, self.mock_table_id)
        schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING')]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table
        new_schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING')]
        self.bq_client.add_missing_fields_to_schema(self.mock_dataset_id, self.mock_table_id,
                                                    new_schema_fields)
        self.mock_client.update_table.assert_not_called()

    def test_add_missing_fields_to_schema_no_table(self):
        """Tests that the add_missing_fields_to_schema function does not call the client to
        update the table when the table does not exist."""
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        new_schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING')]
        with pytest.raises(ValueError):
            self.bq_client.add_missing_fields_to_schema(self.mock_dataset_id,
                                                        self.mock_table_id,
                                                        new_schema_fields)
        self.mock_client.update_table.assert_not_called()

    def test_add_missing_fields_to_schema_fields_with_same_name_different_type(self):
        """Tests that the add_missing_fields_to_schema function raises an error when the user
        is trying to add a field with the same name but different field_type as an existing
        field."""
        table_ref = bigquery.TableReference(self.mock_dataset, self.mock_table_id)
        schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING')]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table
        new_schema_fields = [bigquery.SchemaField('fake_schema_field', 'INTEGER')]
        with pytest.raises(ValueError):
            self.bq_client.add_missing_fields_to_schema(self.mock_dataset_id,
                                                        self.mock_table_id,
                                                        new_schema_fields)
        self.mock_client.update_table.assert_not_called()

    def test_add_missing_fields_to_schema_fields_with_same_name_different_mode(self):
        """Tests that the add_missing_fields_to_schema function raises an error when the user
        is trying to add a field with the same name but different mode as an existing
        field."""
        table_ref = bigquery.TableReference(self.mock_dataset, self.mock_table_id)
        schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING', mode="NULLABLE")]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table
        new_schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING', mode="REQUIRED")]
        with pytest.raises(ValueError):
            self.bq_client.add_missing_fields_to_schema(self.mock_dataset_id,
                                                        self.mock_table_id,
                                                        new_schema_fields)
        self.mock_client.update_table.assert_not_called()
class BigQueryClientImplTest(unittest.TestCase): """Tests for BigQueryClientImpl""" def setUp(self) -> None: self.location = 'US' self.mock_project_id = 'fake-recidiviz-project' self.mock_dataset_id = 'fake-dataset' self.mock_table_id = 'test_table' self.mock_dataset_ref = bigquery.dataset.DatasetReference( self.mock_project_id, self.mock_dataset_id) self.mock_table = self.mock_dataset_ref.table(self.mock_table_id) self.metadata_patcher = mock.patch( 'recidiviz.utils.metadata.project_id') self.mock_project_id_fn = self.metadata_patcher.start() self.mock_project_id_fn.return_value = self.mock_project_id self.client_patcher = mock.patch( 'recidiviz.big_query.big_query_client.client') self.mock_client = self.client_patcher.start().return_value self.mock_view = BigQueryView( dataset_id='dataset', view_id='test_view', view_query_template='SELECT NULL LIMIT 0', should_materialize=True) self.bq_client = BigQueryClientImpl() def tearDown(self) -> None: self.client_patcher.stop() self.metadata_patcher.stop() def test_create_dataset_if_necessary(self) -> None: """Check that a dataset is created if it does not exist.""" self.mock_client.get_dataset.side_effect = exceptions.NotFound('!') self.bq_client.create_dataset_if_necessary(self.mock_dataset_ref) self.mock_client.create_dataset.assert_called() def test_create_dataset_if_necessary_dataset_exists(self) -> None: """Check that a dataset is not created if it already exists.""" self.mock_client.get_dataset.side_effect = None self.bq_client.create_dataset_if_necessary(self.mock_dataset_ref) self.mock_client.create_dataset.assert_not_called() def test_create_dataset_if_necessary_table_expiration(self) -> None: """Check that the dataset is created with a set table expiration if the dataset does not exist and the new_dataset_table_expiration_ms is specified.""" self.mock_client.get_dataset.side_effect = exceptions.NotFound('!') self.bq_client.create_dataset_if_necessary( self.mock_dataset_ref, default_table_expiration_ms=6000) 
self.mock_client.create_dataset.assert_called() def test_table_exists(self) -> None: """Check that table_exists returns True if the table exists.""" self.mock_client.get_table.side_effect = None self.assertTrue( self.bq_client.table_exists(self.mock_dataset_ref, self.mock_table_id)) def test_table_exists_does_not_exist(self) -> None: """Check that table_exists returns False if the table does not exist.""" self.mock_client.get_table.side_effect = exceptions.NotFound('!') with self.assertLogs(level='WARNING'): table_exists = self.bq_client.table_exists(self.mock_dataset_ref, self.mock_table_id) self.assertFalse(table_exists) def test_create_or_update_view_creates_view(self) -> None: """create_or_update_view creates a View if it does not exist.""" self.mock_client.get_table.side_effect = exceptions.NotFound('!') self.bq_client.create_or_update_view(self.mock_dataset_ref, self.mock_view) self.mock_client.create_table.assert_called() self.mock_client.update_table.assert_not_called() def test_create_or_update_view_updates_view(self) -> None: """create_or_update_view updates a View if it already exist.""" self.mock_client.get_table.side_effect = None self.bq_client.create_or_update_view(self.mock_dataset_ref, self.mock_view) self.mock_client.update_table.assert_called() self.mock_client.create_table.assert_not_called() def test_export_to_cloud_storage(self) -> None: """export_to_cloud_storage extracts the table corresponding to the view.""" self.assertIsNotNone( self.bq_client.export_table_to_cloud_storage_async( source_table_dataset_ref=self.mock_dataset_ref, source_table_id='source-table', destination_uri= f'gs://{self.mock_project_id}-bucket/destination_path.json', destination_format=bigquery.DestinationFormat. 
# NOTE(review): the next two lines are the tail of a test method whose `def` line
# falls before this chunk; they are preserved unchanged.
                    NEWLINE_DELIMITED_JSON))
        self.mock_client.extract_table.assert_called()

    def test_export_to_cloud_storage_no_table(self) -> None:
        """export_to_cloud_storage does not extract from a table if the table does not exist."""
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        # A warning is expected to be logged about the missing table.
        with self.assertLogs(level='WARNING'):
            self.assertIsNone(
                self.bq_client.export_table_to_cloud_storage_async(
                    source_table_dataset_ref=self.mock_dataset_ref,
                    source_table_id='source-table',
                    destination_uri=
                    f'gs://{self.mock_project_id}-bucket/destination_path.json',
                    destination_format=bigquery.DestinationFormat.
                    NEWLINE_DELIMITED_JSON))
        self.mock_client.extract_table.assert_not_called()

    def test_load_table_async_create_dataset(self) -> None:
        """Test that load_table_from_cloud_storage_async tries to create a parent dataset."""
        # Simulate the parent dataset not existing yet.
        self.mock_client.get_dataset.side_effect = exceptions.NotFound('!')
        self.bq_client.load_table_from_cloud_storage_async(
            destination_dataset_ref=self.mock_dataset_ref,
            destination_table_id=self.mock_table_id,
            destination_table_schema=[
                SchemaField('my_column', 'STRING', 'NULLABLE', None, ())
            ],
            source_uri='gs://bucket/export-uri')
        self.mock_client.create_dataset.assert_called()
        self.mock_client.load_table_from_uri.assert_called()

    def test_load_table_async_dataset_exists(self) -> None:
        """Test that load_table_from_cloud_storage_async does not try to create a
        parent dataset if it already exists.
        """
        self.bq_client.load_table_from_cloud_storage_async(
            destination_dataset_ref=self.mock_dataset_ref,
            destination_table_id=self.mock_table_id,
            destination_table_schema=[
                SchemaField('my_column', 'STRING', 'NULLABLE', None, ())
            ],
            source_uri='gs://bucket/export-uri')
        self.mock_client.create_dataset.assert_not_called()
        self.mock_client.load_table_from_uri.assert_called()

    def test_export_query_results_to_cloud_storage_no_table(self) -> None:
        """export_query_results_to_cloud_storage logs a warning when the intermediate
        table cannot be found."""
        bucket = self.mock_project_id + '-bucket'
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        with self.assertLogs(level='WARNING'):
            self.bq_client.export_query_results_to_cloud_storage([
                ExportQueryConfig.from_view_query(
                    view=self.mock_view,
                    view_filter_clause='WHERE x = y',
                    intermediate_table_name=self.mock_table_id,
                    output_uri=f'gs://{bucket}/view.json',
                    output_format=bigquery.DestinationFormat.
                    NEWLINE_DELIMITED_JSON)
            ])

    def test_export_query_results_to_cloud_storage(self) -> None:
        """export_query_results_to_cloud_storage creates the table from the view query and
        exports the table."""
        bucket = self.mock_project_id + '-bucket'
        # Pre-resolved futures stand in for already-completed BigQuery jobs.
        query_job: futures.Future = futures.Future()
        query_job.set_result([])
        extract_job: futures.Future = futures.Future()
        extract_job.set_result(None)
        self.mock_client.query.return_value = query_job
        self.mock_client.extract_table.return_value = extract_job
        self.bq_client.export_query_results_to_cloud_storage([
            ExportQueryConfig.from_view_query(
                view=self.mock_view,
                view_filter_clause='WHERE x = y',
                intermediate_table_name=self.mock_table_id,
                output_uri=f'gs://{bucket}/view.json',
                output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON
            )
        ])
        self.mock_client.query.assert_called()
        self.mock_client.extract_table.assert_called()
        # The intermediate table must be cleaned up after the export completes.
        self.mock_client.delete_table.assert_called_with(
            bigquery.DatasetReference(self.mock_project_id,
                                      self.mock_view.dataset_id).table(
                                          self.mock_table_id))

    def test_create_table_from_query(self) -> None:
        """Tests that the create_table_from_query function calls the
        function to create a table from a query."""
        self.bq_client.create_table_from_query_async(
            self.mock_dataset_id,
            self.mock_table_id,
            query="SELECT * FROM some.fake.table",
            query_parameters=[])
        self.mock_client.query.assert_called()

    @mock.patch('google.cloud.bigquery.job.QueryJobConfig')
    def test_insert_into_table_from_table_async(
            self, mock_job_config: mock.MagicMock) -> None:
        """Tests that the insert_into_table_from_table_async function runs a query."""
        self.bq_client.insert_into_table_from_table_async(
            source_dataset_id=self.mock_dataset_id,
            source_table_id=self.mock_table_id,
            destination_dataset_id=self.mock_dataset_id,
            destination_table_id='fake_table_temp')
        expected_query = f"SELECT * FROM `fake-recidiviz-project.{self.mock_dataset_id}.{self.mock_table_id}`"
        self.mock_client.get_table.assert_called()
        self.mock_client.query.assert_called_with(query=expected_query,
                                                  location=self.location,
                                                  job_config=mock_job_config())

    @mock.patch('google.cloud.bigquery.job.QueryJobConfig')
    def test_insert_into_table_from_table_async_hydrate_missing_columns(
            self, mock_job_config: mock.MagicMock) -> None:
        """Tests that the insert_into_table_from_table_async generates a query with
        missing columns as NULL."""
        # Patch the private helper so the destination schema appears to have two
        # columns that the source table lacks.
        with mock.patch(
                'recidiviz.big_query.big_query_client.BigQueryClientImpl'
                '._get_schema_fields_missing_from_table') as mock_missing:
            mock_missing.return_value = [
                bigquery.SchemaField('state_code', 'STRING', 'REQUIRED'),
                bigquery.SchemaField('new_column_name', 'INTEGER', 'REQUIRED')
            ]
            self.mock_destination_id = 'fake_table_temp'
            self.bq_client.insert_into_table_from_table_async(
                source_dataset_id=self.mock_dataset_id,
                source_table_id=self.mock_table_id,
                destination_dataset_id=self.mock_dataset_id,
                destination_table_id=self.mock_destination_id,
                hydrate_missing_columns_with_null=True,
                allow_field_additions=True)
            # Missing columns should be hydrated with typed NULL casts.
            expected_query = "SELECT *, CAST(NULL AS STRING) AS state_code, CAST(NULL AS INTEGER) AS new_column_name " \
                             f"FROM `fake-recidiviz-project.{self.mock_dataset_id}.{self.mock_table_id}`"
            self.mock_client.query.assert_called_with(
                query=expected_query,
                location=self.location,
                job_config=mock_job_config())

    def test_insert_into_table_from_table_invalid_destination(self) -> None:
        """Tests that the insert_into_table_from_table_async function does not run the
        query if the destination table does not exist."""
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        with pytest.raises(ValueError):
            self.bq_client.insert_into_table_from_table_async(
                self.mock_dataset_id, self.mock_table_id,
                'fake_source_dataset_id', 'fake_table_id')
        self.mock_client.get_table.assert_called()
        self.mock_client.query.assert_not_called()

    def test_insert_into_table_from_table_invalid_filter_clause(self) -> None:
        """Tests that the insert_into_table_from_table_async function does not run the
        query if the filter clause does not start with a WHERE."""
        with pytest.raises(ValueError):
            self.bq_client.insert_into_table_from_table_async(
                self.mock_dataset_id,
                self.mock_table_id,
                'fake_source_dataset_id',
                'fake_table_id',
                source_data_filter_clause='bad filter clause')
        self.mock_client.query.assert_not_called()

    @mock.patch('google.cloud.bigquery.job.QueryJobConfig')
    def test_insert_into_table_from_table_with_filter_clause(
            self, mock_job_config: mock.MagicMock) -> None:
        """Tests that the insert_into_table_from_table_async generates a valid query
        when given a filter clause."""
        filter_clause = "WHERE state_code IN ('US_ND')"
        job_config = mock_job_config()
        self.bq_client.insert_into_table_from_table_async(
            self.mock_dataset_id,
            self.mock_table_id,
            'fake_source_dataset_id',
            'fake_table_id',
            source_data_filter_clause=filter_clause)
        expected_query = "SELECT * FROM `fake-recidiviz-project.fake-dataset.test_table` " \
                         "WHERE state_code IN ('US_ND')"
        self.mock_client.query.assert_called_with(query=expected_query,
                                                  location=self.location,
                                                  job_config=job_config)

    def test_insert_into_table_from_cloud_storage_async(self) -> None:
        """insert_into_table_from_cloud_storage_async creates the parent dataset when
        it does not exist and starts a load job."""
        self.mock_client.get_dataset.side_effect = exceptions.NotFound('!')
        self.bq_client.insert_into_table_from_cloud_storage_async(
            destination_dataset_ref=self.mock_dataset_ref,
            destination_table_id=self.mock_table_id,
            destination_table_schema=[
                SchemaField('my_column', 'STRING', 'NULLABLE', None, ())
            ],
            source_uri='gs://bucket/export-uri')
        self.mock_client.create_dataset.assert_called()
        self.mock_client.load_table_from_uri.assert_called()

    def test_delete_from_table(self) -> None:
        """Tests that the delete_from_table function runs a query."""
        self.bq_client.delete_from_table_async(self.mock_dataset_id,
                                               self.mock_table_id,
                                               filter_clause="WHERE x > y")
        self.mock_client.query.assert_called()

    def test_delete_from_table_invalid_filter_clause(self) -> None:
        """Tests that the delete_from_table function does not run a query when the
        filter clause is invalid."""
        # The filter clause must start with WHERE; this one does not.
        with pytest.raises(ValueError):
            self.bq_client.delete_from_table_async(self.mock_dataset_id,
                                                   self.mock_table_id,
                                                   filter_clause="x > y")
        self.mock_client.query.assert_not_called()

    def test_materialize_view_to_table(self) -> None:
        """Tests that the materialize_view_to_table function calls the function to
        create a table from a query."""
        self.bq_client.materialize_view_to_table(self.mock_view)
        self.mock_client.query.assert_called()

    def test_materialize_view_to_table_no_materialized_view_table_id(
            self) -> None:
        """Tests that the materialize_view_to_table function does not call the function
        to create a table from a query if there is no set materialized_view_table_id
        on the view."""
        invalid_view = BigQueryView(dataset_id='dataset',
                                    view_id='test_view',
                                    view_query_template='SELECT NULL LIMIT 0',
                                    should_materialize=False)
        with pytest.raises(ValueError):
            self.bq_client.materialize_view_to_table(invalid_view)
        self.mock_client.query.assert_not_called()

    def test_create_table_with_schema(self) -> None:
        """Tests that the create_table_with_schema function calls the create_table
        function on the client."""
        # Table must not already exist for creation to proceed.
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        schema_fields = [bigquery.SchemaField('new_schema_field', 'STRING')]
        self.bq_client.create_table_with_schema(self.mock_dataset_id,
                                                self.mock_table_id,
                                                schema_fields)
        self.mock_client.create_table.assert_called()

    def test_create_table_with_schema_table_exists(self) -> None:
        """Tests that the create_table_with_schema function raises an error when the
        table already exists."""
        self.mock_client.get_table.side_effect = None
        schema_fields = [bigquery.SchemaField('new_schema_field', 'STRING')]
        with pytest.raises(ValueError):
            self.bq_client.create_table_with_schema(self.mock_dataset_id,
                                                    self.mock_table_id,
                                                    schema_fields)
        self.mock_client.create_table.assert_not_called()

    def test_add_missing_fields_to_schema(self) -> None:
        """Tests that the add_missing_fields_to_schema function calls the client to
        update the table."""
        table_ref = bigquery.TableReference(self.mock_dataset_ref,
                                            self.mock_table_id)
        schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING')]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table
        new_schema_fields = [
            bigquery.SchemaField('new_schema_field', 'STRING')
        ]
        self.bq_client.add_missing_fields_to_schema(self.mock_dataset_id,
                                                    self.mock_table_id,
                                                    new_schema_fields)
        self.mock_client.update_table.assert_called()

    def test_add_missing_fields_to_schema_no_missing_fields(self) -> None:
        """Tests that the add_missing_fields_to_schema function does not call the
        client to update the table when all of the fields are already present."""
        table_ref = bigquery.TableReference(self.mock_dataset_ref,
                                            self.mock_table_id)
        schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING')]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table
        # Identical to the existing schema, so no update should happen.
        new_schema_fields = [
            bigquery.SchemaField('fake_schema_field', 'STRING')
        ]
        self.bq_client.add_missing_fields_to_schema(self.mock_dataset_id,
                                                    self.mock_table_id,
                                                    new_schema_fields)
        self.mock_client.update_table.assert_not_called()

    def test_add_missing_fields_to_schema_no_table(self) -> None:
        """Tests that the add_missing_fields_to_schema function does not call the
        client to update the table when the table does not exist."""
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        new_schema_fields = [
            bigquery.SchemaField('fake_schema_field', 'STRING')
        ]
        with pytest.raises(ValueError):
            self.bq_client.add_missing_fields_to_schema(
                self.mock_dataset_id, self.mock_table_id, new_schema_fields)
        self.mock_client.update_table.assert_not_called()

    def test_add_missing_fields_to_schema_fields_with_same_name_different_type(
            self) -> None:
        """Tests that the add_missing_fields_to_schema function raises an error when
        the user is trying to add a field with the same name but different field_type
        as an existing field."""
        table_ref = bigquery.TableReference(self.mock_dataset_ref,
                                            self.mock_table_id)
        schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING')]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table
        # Same name, conflicting type (STRING vs INTEGER).
        new_schema_fields = [
            bigquery.SchemaField('fake_schema_field', 'INTEGER')
        ]
        with pytest.raises(ValueError):
            self.bq_client.add_missing_fields_to_schema(
                self.mock_dataset_id, self.mock_table_id, new_schema_fields)
        self.mock_client.update_table.assert_not_called()

    def test_add_missing_fields_to_schema_fields_with_same_name_different_mode(
            self) -> None:
        """Tests that the add_missing_fields_to_schema function raises an error when
        the user is trying to add a field with the same name but different mode as an
        existing field."""
        table_ref = bigquery.TableReference(self.mock_dataset_ref,
                                            self.mock_table_id)
        schema_fields = [
            bigquery.SchemaField('fake_schema_field',
                                 'STRING',
                                 mode="NULLABLE")
        ]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table
        # Same name and type, conflicting mode (NULLABLE vs REQUIRED).
        new_schema_fields = [
            bigquery.SchemaField('fake_schema_field',
                                 'STRING',
                                 mode="REQUIRED")
        ]
        with pytest.raises(ValueError):
            self.bq_client.add_missing_fields_to_schema(
                self.mock_dataset_id, self.mock_table_id, new_schema_fields)
        self.mock_client.update_table.assert_not_called()

    def test_delete_table(self) -> None:
        """Tests that our delete table function calls the correct client method."""
        self.bq_client.delete_table(self.mock_dataset_id, self.mock_table_id)
        self.mock_client.delete_table.assert_called()

    @mock.patch('google.cloud.bigquery.QueryJob')
    def test_paged_read_single_page_single_row(
            self, mock_query_job: mock.MagicMock) -> None:
        """paged_read_and_process processes a single row and stops once a page comes
        back empty."""
        first_row = bigquery.table.Row(
            ['parole', 15, '10N'],
            {
                'supervision_type': 0,
                'revocations': 1,
                'district': 2
            },
        )
        # First call returns a single row, second call returns nothing
        mock_query_job.result.side_effect = [[first_row], []]

        processed_results = []

        def _process_fn(row: bigquery.table.Row) -> None:
            processed_results.append(dict(row))

        self.bq_client.paged_read_and_process(mock_query_job, 1, _process_fn)

        self.assertEqual([dict(first_row)], processed_results)
        mock_query_job.result.assert_has_calls([
            call(max_results=1, start_index=0),
            call(max_results=1, start_index=1),
        ])

    @mock.patch('google.cloud.bigquery.QueryJob')
    def test_paged_read_single_page_multiple_rows(
            self, mock_query_job: mock.MagicMock) -> None:
        """paged_read_and_process processes all rows of a single page and advances the
        start index by the number of rows seen."""
        first_row = bigquery.table.Row(
            ['parole', 15, '10N'],
            {
                'supervision_type': 0,
                'revocations': 1,
                'district': 2
            },
        )
        second_row = bigquery.table.Row(
            ['probation', 7, '10N'],
            {
                'supervision_type': 0,
                'revocations': 1,
                'district': 2
            },
        )
        # First call returns a single row, second call returns nothing
        mock_query_job.result.side_effect = [[first_row, second_row], []]

        processed_results = []

        def _process_fn(row: bigquery.table.Row) -> None:
            processed_results.append(dict(row))

        self.bq_client.paged_read_and_process(mock_query_job, 10, _process_fn)

        self.assertEqual([dict(first_row), dict(second_row)],
                         processed_results)
        mock_query_job.result.assert_has_calls([
            call(max_results=10, start_index=0),
            call(max_results=10, start_index=2),
        ])

    @mock.patch('google.cloud.bigquery.QueryJob')
    def test_paged_read_multiple_pages(self,
                                       mock_query_job: mock.MagicMock) -> None:
        """paged_read_and_process keeps fetching pages until an empty page is
        returned, processing every row in order."""
        p1_r1 = bigquery.table.Row(
            ['parole', 15, '10N'],
            {
                'supervision_type': 0,
                'revocations': 1,
                'district': 2
            },
        )
        p1_r2 = bigquery.table.Row(
            ['probation', 7, '10N'],
            {
                'supervision_type': 0,
                'revocations': 1,
                'district': 2
            },
        )
        p2_r1 = bigquery.table.Row(
            ['parole', 8, '10F'],
            {
                'supervision_type': 0,
                'revocations': 1,
                'district': 2
            },
        )
        p2_r2 = bigquery.table.Row(
            ['probation', 3, '10F'],
            {
                'supervision_type': 0,
                'revocations': 1,
                'district': 2
            },
        )
        # First two calls returns results, third call returns nothing
        mock_query_job.result.side_effect = [[p1_r1, p1_r2], [p2_r1, p2_r2],
                                             []]

        processed_results = []

        def _process_fn(row: bigquery.table.Row) -> None:
            processed_results.append(dict(row))

        self.bq_client.paged_read_and_process(mock_query_job, 2, _process_fn)

        self.assertEqual(
            [dict(p1_r1), dict(p1_r2), dict(p2_r1), dict(p2_r2)],
            processed_results)
        mock_query_job.result.assert_has_calls([
            call(max_results=2, start_index=0),
            call(max_results=2, start_index=2),
            call(max_results=2, start_index=4),
        ])
def _table_exists(self) -> bool:
    """Returns whether this object's (dataset_id, table_id) pair names an existing
    BigQuery table."""
    client = BigQueryClientImpl()
    dataset_ref = client.dataset_ref_for_id(self.dataset_id)
    return client.table_exists(dataset_ref, self.table_id)
def compare_raw_data_between_projects(
    region_code: str,
    source_project_id: str = environment.GCP_PROJECT_STAGING,
    comparison_project_id: str = environment.GCP_PROJECT_PRODUCTION,
) -> List[str]:
    """Compares the raw data between staging and production for a given region.

    Runs one async comparison query per documented raw file (with primary keys)
    whose table exists in the source project, then waits on each job and logs the
    result.

    Returns:
        The list of file tags whose data in `comparison_project_id` is missing
        rows (or whose table is missing entirely).
    """
    logging.info(
        "**** Ensuring all raw data for [%s] in [%s] also exists in [%s] ****",
        region_code.upper(),
        source_project_id,
        comparison_project_id,
    )

    raw_file_config = DirectIngestRegionRawFileConfig(region_code)

    # All queries run against the source project; the comparison project is only
    # referenced inside the query template.
    bq_client = BigQueryClientImpl(project_id=source_project_id)
    dataset_id = DirectIngestRawFileImportManager.raw_tables_dataset_for_region(
        region_code)
    source_dataset = bq_client.dataset_ref_for_id(dataset_id)

    query_jobs: Dict[str, bigquery.QueryJob] = {}
    for file_tag, file_config in raw_file_config.raw_file_configs.items():
        # Skip files with no source table, undocumented files, and files without
        # primary keys (no way to join rows between projects).
        if (not bq_client.table_exists(source_dataset, file_tag)
                or file_config.is_undocumented
                or not file_config.primary_key_cols):
            continue

        columns = ", ".join(
            [column.name for column in file_config.available_columns])

        # Kick off all comparison queries asynchronously; results are collected
        # in the loop below.
        query_job = bq_client.run_query_async(
            query_str=COMPARISON_TEMPLATE.format(
                source_project_id=source_project_id,
                comparison_project_id=comparison_project_id,
                raw_data_dataset_id=dataset_id,
                raw_data_table_id=file_tag,
                columns=columns,
            ))
        query_jobs[file_tag] = query_job

    # Width used to left-justify file tags in log output, capped at 30 chars.
    table_column_width = min(
        max(len(tag) for tag in raw_file_config.raw_file_configs), 30)

    failed_tables: List[str] = []
    for file_tag in sorted(raw_file_config.raw_file_tags):
        justified_name = file_tag.ljust(table_column_width)

        if file_tag not in query_jobs:
            # This file did not exist in the project that is the source of truth.
            continue

        query_job = query_jobs[file_tag]
        try:
            rows = query_job.result()
        except exceptions.NotFound:
            # The table is missing entirely in the comparison project.
            logging.warning(
                "%s | Missing table %s.%s.%s",
                justified_name,
                comparison_project_id,
                dataset_id,
                file_tag,
            )
            failed_tables.append(file_tag)
            continue

        # Each result row is (update_datetime, number of missing rows) —
        # presumably produced by COMPARISON_TEMPLATE; verify against the template.
        counts: List[Tuple[datetime.datetime, int]] = [row.values() for row in rows]
        if counts:
            logging.warning(
                "%s | Missing data in the %s table",
                justified_name,
                comparison_project_id,
            )
            for update_datetime, num_missing in counts:
                # NOTE(review): "%ss" renders a literal 's' after the timestamp —
                # possibly a typo for "%s:"; confirm intended log format.
                logging.warning("\t%ss: %d", update_datetime.isoformat(),
                                num_missing)
            failed_tables.append(file_tag)
        else:
            logging.info(
                "%s | %s contains all of the data from %s",
                justified_name,
                comparison_project_id,
                source_project_id,
            )

    return failed_tables
def compare_metric_view_output_to_sandbox(
    sandbox_dataset_prefix: str,
    load_sandbox_views: bool,
    check_determinism: bool,
    allow_schema_changes: bool,
    dataset_id_filters: Optional[List[str]],
) -> None:
    """Compares the output of all deployed metric views to the output of the
    corresponding views in the sandbox dataset.

    Comparison results are written to a `<prefix>_metric_view_comparison_output`
    dataset; tables with zero diverging rows are deleted as the jobs complete, so
    any table left at the end marks a view whose output differs.

    Args:
        sandbox_dataset_prefix: Prefix of the sandbox datasets to compare against.
        load_sandbox_views: If True, (re)loads the views into the sandbox first.
        check_determinism: If True, compares a view against itself instead of the
            deployed copy, to detect non-deterministic queries.
        allow_schema_changes: Passed through to the per-view comparison job.
        dataset_id_filters: If set, only views in these datasets are compared.

    Raises:
        ValueError: If the comparison output dataset is non-empty, a sandbox
            dataset is missing, or a view has no usable view id.
    """
    if load_sandbox_views:
        logging.info(
            "Loading views into sandbox datasets prefixed with %s",
            sandbox_dataset_prefix,
        )
        load_views_to_sandbox(sandbox_dataset_prefix)

    bq_client = BigQueryClientImpl()
    sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix +
                                            "_metric_view_comparison_output")
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id)

    # Refuse to clobber output from a previous comparison run.
    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
            bq_client.list_tables(sandbox_comparison_output_dataset_id)):
        raise ValueError(
            f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: "
            f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}.")

    # Output tables expire automatically after the default TTL.
    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    query_jobs: List[Tuple[QueryJob, str]] = []
    skipped_views: List[str] = []

    for view_builders in VIEW_BUILDERS_BY_NAMESPACE.values():
        for view_builder in view_builders:
            # Only compare output of metric views
            if not isinstance(view_builder, MetricBigQueryViewBuilder):
                continue

            base_dataset_id = view_builder.dataset_id

            if dataset_id_filters and base_dataset_id not in dataset_id_filters:
                continue

            if view_builder in VIEW_BUILDERS_WITH_KNOWN_NOT_DETERMINISTIC_OUTPUT:
                logging.warning(
                    "View %s.%s has known non-deterministic output. Skipping output comparison.",
                    view_builder.dataset_id,
                    view_builder.view_id,
                )
                skipped_views.append(
                    f"{view_builder.dataset_id}.{view_builder.view_id}")
                continue

            sandbox_dataset_id = sandbox_dataset_prefix + "_" + base_dataset_id

            if not bq_client.dataset_exists(
                    bq_client.dataset_ref_for_id(sandbox_dataset_id)):
                raise ValueError(
                    f"Trying to compare output to a sandbox dataset that does not exist: "
                    f"{bq_client.project_id}.{sandbox_dataset_id}")

            base_dataset_ref = bq_client.dataset_ref_for_id(base_dataset_id)
            # Compare against the materialized table when the view materializes,
            # except in determinism mode where the raw view query is used.
            base_view_id = (view_builder.build().materialized_view_table_id
                            if view_builder.should_materialize
                            and not check_determinism else
                            view_builder.view_id)

            if not base_view_id:
                # NOTE(review): the adjacent literals below concatenate to
                # "...unsetfor ..." — there appears to be a missing space.
                raise ValueError(
                    "Unexpected empty base_view_id. view_id or materialized_view_table_id unset"
                    f"for {view_builder}.")

            if not check_determinism and not bq_client.table_exists(
                    base_dataset_ref, base_view_id):
                logging.warning(
                    "View %s.%s does not exist. Skipping output comparison.",
                    base_dataset_ref.dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{base_dataset_id}.{base_view_id}")
                continue

            if not bq_client.table_exists(
                    bq_client.dataset_ref_for_id(sandbox_dataset_id),
                    base_view_id):
                logging.warning(
                    "View %s.%s does not exist in sandbox. Skipping output comparison.",
                    sandbox_dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{sandbox_dataset_id}.{base_view_id}")
                continue

            query_job, output_table_id = _view_output_comparison_job(
                bq_client,
                view_builder,
                base_view_id,
                base_dataset_id,
                sandbox_dataset_id,
                sandbox_comparison_output_dataset_id,
                check_determinism,
                allow_schema_changes,
            )

            # Add query job to the list of running jobs
            query_jobs.append((query_job, output_table_id))

    for query_job, output_table_id in query_jobs:
        # Wait for the insert job to complete before looking for the table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id)

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the view output was identical
            bq_client.delete_table(sandbox_comparison_output_dataset_id,
                                   output_table_id)

    views_with_different_output = bq_client.list_tables(
        sandbox_comparison_output_dataset_id)
    # peekable lets us truthiness-test the iterator without consuming it.
    views_with_different_output = peekable(views_with_different_output)

    logging.info(
        "\n*************** METRIC VIEW OUTPUT RESULTS ***************\n")

    if dataset_id_filters:
        logging.info(
            "Only compared metric view output for the following datasets: \n %s \n",
            dataset_id_filters,
        )

    logging.info(
        "Skipped output comparison for the following metric views: \n %s \n",
        skipped_views,
    )

    if views_with_different_output:
        for view in views_with_different_output:
            # Output table ids encode "<dataset>--<view>".
            base_dataset_id, base_view_id = view.table_id.split("--")
            logging.warning(
                "View output differs for view %s.%s. See %s.%s for diverging rows.",
                base_dataset_id,
                base_view_id,
                sandbox_comparison_output_dataset_id,
                view.table_id,
            )
    else:
        output_message = (
            "identical between deployed views and sandbox datasets"
            if not check_determinism else "deterministic")
        logging.info(
            "View output %s. Deleting dataset %s.",
            output_message,
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        # Nothing diverged, so the (empty) output dataset is removed.
        bq_client.delete_dataset(sandbox_comparison_output_dataset_ref,
                                 delete_contents=True)
class BigQueryClientImplTest(unittest.TestCase): """Tests for BigQueryClientImpl""" def setUp(self) -> None: self.location = "US" self.mock_project_id = "fake-recidiviz-project" self.mock_dataset_id = "fake-dataset" self.mock_table_id = "test_table" self.mock_dataset_ref = bigquery.dataset.DatasetReference( self.mock_project_id, self.mock_dataset_id ) self.mock_table = self.mock_dataset_ref.table(self.mock_table_id) self.metadata_patcher = mock.patch("recidiviz.utils.metadata.project_id") self.mock_project_id_fn = self.metadata_patcher.start() self.mock_project_id_fn.return_value = self.mock_project_id self.client_patcher = mock.patch("recidiviz.big_query.big_query_client.client") self.mock_client = self.client_patcher.start().return_value self.mock_view = BigQueryView( dataset_id="dataset", view_id="test_view", view_query_template="SELECT NULL LIMIT 0", should_materialize=True, ) self.bq_client = BigQueryClientImpl() def tearDown(self) -> None: self.client_patcher.stop() self.metadata_patcher.stop() def test_create_dataset_if_necessary(self) -> None: """Check that a dataset is created if it does not exist.""" self.mock_client.get_dataset.side_effect = exceptions.NotFound("!") self.bq_client.create_dataset_if_necessary(self.mock_dataset_ref) self.mock_client.create_dataset.assert_called() def test_create_dataset_if_necessary_dataset_exists(self) -> None: """Check that a dataset is not created if it already exists.""" self.mock_client.get_dataset.side_effect = None self.bq_client.create_dataset_if_necessary(self.mock_dataset_ref) self.mock_client.create_dataset.assert_not_called() def test_create_dataset_if_necessary_table_expiration(self) -> None: """Check that the dataset is created with a set table expiration if the dataset does not exist and the new_dataset_table_expiration_ms is specified.""" self.mock_client.get_dataset.side_effect = exceptions.NotFound("!") self.bq_client.create_dataset_if_necessary( self.mock_dataset_ref, default_table_expiration_ms=6000 ) 
self.mock_client.create_dataset.assert_called() def test_table_exists(self) -> None: """Check that table_exists returns True if the table exists.""" self.mock_client.get_table.side_effect = None self.assertTrue( self.bq_client.table_exists(self.mock_dataset_ref, self.mock_table_id) ) def test_table_exists_does_not_exist(self) -> None: """Check that table_exists returns False if the table does not exist.""" self.mock_client.get_table.side_effect = exceptions.NotFound("!") with self.assertLogs(level="WARNING"): table_exists = self.bq_client.table_exists( self.mock_dataset_ref, self.mock_table_id ) self.assertFalse(table_exists) def test_create_or_update_view_creates_view(self) -> None: """create_or_update_view creates a View if it does not exist.""" self.mock_client.get_table.side_effect = exceptions.NotFound("!") self.bq_client.create_or_update_view(self.mock_dataset_ref, self.mock_view) self.mock_client.create_table.assert_called() self.mock_client.update_table.assert_not_called() def test_create_or_update_view_updates_view(self) -> None: """create_or_update_view updates a View if it already exist.""" self.mock_client.get_table.side_effect = None self.bq_client.create_or_update_view(self.mock_dataset_ref, self.mock_view) self.mock_client.update_table.assert_called() self.mock_client.create_table.assert_not_called() def test_export_to_cloud_storage(self) -> None: """export_to_cloud_storage extracts the table corresponding to the view.""" self.assertIsNotNone( self.bq_client.export_table_to_cloud_storage_async( source_table_dataset_ref=self.mock_dataset_ref, source_table_id="source-table", destination_uri=f"gs://{self.mock_project_id}-bucket/destination_path.json", destination_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON, print_header=True, ) ) self.mock_client.extract_table.assert_called() def test_export_to_cloud_storage_no_table(self) -> None: """export_to_cloud_storage does not extract from a table if the table does not exist.""" 
self.mock_client.get_table.side_effect = exceptions.NotFound("!") with self.assertLogs(level="WARNING"): self.assertIsNone( self.bq_client.export_table_to_cloud_storage_async( source_table_dataset_ref=self.mock_dataset_ref, source_table_id="source-table", destination_uri=f"gs://{self.mock_project_id}-bucket/destination_path.json", destination_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON, print_header=True, ) ) self.mock_client.extract_table.assert_not_called() def test_load_table_async_create_dataset(self) -> None: """Test that load_table_from_cloud_storage_async tries to create a parent dataset.""" self.mock_client.get_dataset.side_effect = exceptions.NotFound("!") self.bq_client.load_table_from_cloud_storage_async( destination_dataset_ref=self.mock_dataset_ref, destination_table_id=self.mock_table_id, destination_table_schema=[ SchemaField("my_column", "STRING", "NULLABLE", None, ()) ], source_uri="gs://bucket/export-uri", ) self.mock_client.create_dataset.assert_called() self.mock_client.load_table_from_uri.assert_called() def test_load_table_async_dataset_exists(self) -> None: """Test that load_table_from_cloud_storage_async does not try to create a parent dataset if it already exists.""" self.bq_client.load_table_from_cloud_storage_async( destination_dataset_ref=self.mock_dataset_ref, destination_table_id=self.mock_table_id, destination_table_schema=[ SchemaField("my_column", "STRING", "NULLABLE", None, ()) ], source_uri="gs://bucket/export-uri", ) self.mock_client.create_dataset.assert_not_called() self.mock_client.load_table_from_uri.assert_called() def test_export_query_results_to_cloud_storage_no_table(self) -> None: bucket = self.mock_project_id + "-bucket" self.mock_client.get_table.side_effect = exceptions.NotFound("!") with self.assertLogs(level="WARNING"): self.bq_client.export_query_results_to_cloud_storage( [ ExportQueryConfig.from_view_query( view=self.mock_view, view_filter_clause="WHERE x = y", 
intermediate_table_name=self.mock_table_id, output_uri=f"gs://{bucket}/view.json", output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON, ) ], print_header=True, ) def test_export_query_results_to_cloud_storage(self) -> None: """export_query_results_to_cloud_storage creates the table from the view query and exports the table.""" bucket = self.mock_project_id + "-bucket" query_job: futures.Future = futures.Future() query_job.set_result([]) extract_job: futures.Future = futures.Future() extract_job.set_result(None) self.mock_client.query.return_value = query_job self.mock_client.extract_table.return_value = extract_job self.bq_client.export_query_results_to_cloud_storage( [ ExportQueryConfig.from_view_query( view=self.mock_view, view_filter_clause="WHERE x = y", intermediate_table_name=self.mock_table_id, output_uri=f"gs://{bucket}/view.json", output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON, ) ], print_header=True, ) self.mock_client.query.assert_called() self.mock_client.extract_table.assert_called() self.mock_client.delete_table.assert_called_with( bigquery.DatasetReference( self.mock_project_id, self.mock_view.dataset_id ).table(self.mock_table_id) ) def test_create_table_from_query(self) -> None: """Tests that the create_table_from_query function calls the function to create a table from a query.""" self.bq_client.create_table_from_query_async( self.mock_dataset_id, self.mock_table_id, query="SELECT * FROM some.fake.table", query_parameters=[], ) self.mock_client.query.assert_called() @mock.patch("google.cloud.bigquery.job.QueryJobConfig") def test_insert_into_table_from_table_async( self, mock_job_config: mock.MagicMock ) -> None: """Tests that the insert_into_table_from_table_async function runs a query.""" self.bq_client.insert_into_table_from_table_async( source_dataset_id=self.mock_dataset_id, source_table_id=self.mock_table_id, destination_dataset_id=self.mock_dataset_id, destination_table_id="fake_table_temp", ) expected_query = 
f"SELECT * FROM `fake-recidiviz-project.{self.mock_dataset_id}.{self.mock_table_id}`" self.mock_client.get_table.assert_called() self.mock_client.query.assert_called_with( query=expected_query, location=self.location, job_config=mock_job_config() ) @mock.patch("google.cloud.bigquery.job.QueryJobConfig") def test_insert_into_table_from_table_async_hydrate_missing_columns( self, mock_job_config: mock.MagicMock ) -> None: """Tests that the insert_into_table_from_table_async generates a query with missing columns as NULL.""" with mock.patch( "recidiviz.big_query.big_query_client.BigQueryClientImpl" "._get_excess_schema_fields" ) as mock_missing: mock_missing.return_value = [ bigquery.SchemaField("state_code", "STRING", "REQUIRED"), bigquery.SchemaField("new_column_name", "INTEGER", "REQUIRED"), ] self.mock_destination_id = "fake_table_temp" self.bq_client.insert_into_table_from_table_async( source_dataset_id=self.mock_dataset_id, source_table_id=self.mock_table_id, destination_dataset_id=self.mock_dataset_id, destination_table_id=self.mock_destination_id, hydrate_missing_columns_with_null=True, allow_field_additions=True, ) expected_query = ( "SELECT *, CAST(NULL AS STRING) AS state_code, CAST(NULL AS INTEGER) AS new_column_name " f"FROM `fake-recidiviz-project.{self.mock_dataset_id}.{self.mock_table_id}`" ) self.mock_client.query.assert_called_with( query=expected_query, location=self.location, job_config=mock_job_config(), ) def test_insert_into_table_from_table_invalid_destination(self) -> None: """Tests that the insert_into_table_from_table_async function does not run the query if the destination table does not exist.""" self.mock_client.get_table.side_effect = exceptions.NotFound("!") with pytest.raises(ValueError): self.bq_client.insert_into_table_from_table_async( self.mock_dataset_id, self.mock_table_id, "fake_source_dataset_id", "fake_table_id", ) self.mock_client.get_table.assert_called() self.mock_client.query.assert_not_called() def 
test_insert_into_table_from_table_invalid_filter_clause(self) -> None: """Tests that the insert_into_table_from_table_async function does not run the query if the filter clause does not start with a WHERE.""" with pytest.raises(ValueError): self.bq_client.insert_into_table_from_table_async( self.mock_dataset_id, self.mock_table_id, "fake_source_dataset_id", "fake_table_id", source_data_filter_clause="bad filter clause", ) self.mock_client.query.assert_not_called() @mock.patch("google.cloud.bigquery.job.QueryJobConfig") def test_insert_into_table_from_table_with_filter_clause( self, mock_job_config: mock.MagicMock ) -> None: """Tests that the insert_into_table_from_table_async generates a valid query when given a filter clause.""" filter_clause = "WHERE state_code IN ('US_ND')" job_config = mock_job_config() self.bq_client.insert_into_table_from_table_async( self.mock_dataset_id, self.mock_table_id, "fake_source_dataset_id", "fake_table_id", source_data_filter_clause=filter_clause, ) expected_query = ( "SELECT * FROM `fake-recidiviz-project.fake-dataset.test_table` " "WHERE state_code IN ('US_ND')" ) self.mock_client.query.assert_called_with( query=expected_query, location=self.location, job_config=job_config ) def test_insert_into_table_from_cloud_storage_async(self) -> None: self.mock_client.get_dataset.side_effect = exceptions.NotFound("!") self.bq_client.insert_into_table_from_cloud_storage_async( destination_dataset_ref=self.mock_dataset_ref, destination_table_id=self.mock_table_id, destination_table_schema=[ SchemaField("my_column", "STRING", "NULLABLE", None, ()) ], source_uri="gs://bucket/export-uri", ) self.mock_client.create_dataset.assert_called() self.mock_client.load_table_from_uri.assert_called() def test_delete_from_table(self) -> None: """Tests that the delete_from_table function runs a query.""" self.bq_client.delete_from_table_async( self.mock_dataset_id, self.mock_table_id, filter_clause="WHERE x > y" ) self.mock_client.query.assert_called() def 
test_delete_from_table_invalid_filter_clause(self) -> None: """Tests that the delete_from_table function does not run a query when the filter clause is invalid.""" with pytest.raises(ValueError): self.bq_client.delete_from_table_async( self.mock_dataset_id, self.mock_table_id, filter_clause="x > y" ) self.mock_client.query.assert_not_called() def test_materialize_view_to_table(self) -> None: """Tests that the materialize_view_to_table function calls the function to create a table from a query.""" self.bq_client.materialize_view_to_table(self.mock_view) self.mock_client.query.assert_called() def test_materialize_view_to_table_no_materialized_view_table_id(self) -> None: """Tests that the materialize_view_to_table function does not call the function to create a table from a query if there is no set materialized_view_table_id on the view.""" invalid_view = BigQueryView( dataset_id="dataset", view_id="test_view", view_query_template="SELECT NULL LIMIT 0", should_materialize=False, ) with pytest.raises(ValueError): self.bq_client.materialize_view_to_table(invalid_view) self.mock_client.query.assert_not_called() def test_create_table_with_schema(self) -> None: """Tests that the create_table_with_schema function calls the create_table function on the client.""" self.mock_client.get_table.side_effect = exceptions.NotFound("!") schema_fields = [bigquery.SchemaField("new_schema_field", "STRING")] self.bq_client.create_table_with_schema( self.mock_dataset_id, self.mock_table_id, schema_fields ) self.mock_client.create_table.assert_called() def test_create_table_with_schema_table_exists(self) -> None: """Tests that the create_table_with_schema function raises an error when the table already exists.""" self.mock_client.get_table.side_effect = None schema_fields = [bigquery.SchemaField("new_schema_field", "STRING")] with pytest.raises(ValueError): self.bq_client.create_table_with_schema( self.mock_dataset_id, self.mock_table_id, schema_fields ) 
self.mock_client.create_table.assert_not_called() def test_add_missing_fields_to_schema(self) -> None: """Tests that the add_missing_fields_to_schema function calls the client to update the table.""" table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id) schema_fields = [bigquery.SchemaField("fake_schema_field", "STRING")] table = bigquery.Table(table_ref, schema_fields) self.mock_client.get_table.return_value = table new_schema_fields = [bigquery.SchemaField("new_schema_field", "STRING")] self.bq_client.add_missing_fields_to_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) self.mock_client.update_table.assert_called() def test_add_missing_fields_to_schema_no_missing_fields(self) -> None: """Tests that the add_missing_fields_to_schema function does not call the client to update the table when all of the fields are already present.""" table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id) schema_fields = [bigquery.SchemaField("fake_schema_field", "STRING")] table = bigquery.Table(table_ref, schema_fields) self.mock_client.get_table.return_value = table new_schema_fields = [bigquery.SchemaField("fake_schema_field", "STRING")] self.bq_client.add_missing_fields_to_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) self.mock_client.update_table.assert_not_called() def test_add_missing_fields_to_schema_no_table(self) -> None: """Tests that the add_missing_fields_to_schema function does not call the client to update the table when the table does not exist.""" self.mock_client.get_table.side_effect = exceptions.NotFound("!") new_schema_fields = [bigquery.SchemaField("fake_schema_field", "STRING")] with pytest.raises(ValueError): self.bq_client.add_missing_fields_to_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) self.mock_client.update_table.assert_not_called() def test_add_missing_fields_to_schema_fields_with_same_name_different_type( self, ) -> None: """Tests 
that the add_missing_fields_to_schema function raises an error when the user is trying to add a field with the same name but different field_type as an existing field.""" table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id) schema_fields = [bigquery.SchemaField("fake_schema_field", "STRING")] table = bigquery.Table(table_ref, schema_fields) self.mock_client.get_table.return_value = table new_schema_fields = [bigquery.SchemaField("fake_schema_field", "INTEGER")] with pytest.raises(ValueError): self.bq_client.add_missing_fields_to_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) self.mock_client.update_table.assert_not_called() def test_add_missing_fields_to_schema_fields_with_same_name_different_mode( self, ) -> None: """Tests that the add_missing_fields_to_schema function raises an error when the user is trying to add a field with the same name but different mode as an existing field.""" table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id) schema_fields = [ bigquery.SchemaField("fake_schema_field", "STRING", mode="NULLABLE") ] table = bigquery.Table(table_ref, schema_fields) self.mock_client.get_table.return_value = table new_schema_fields = [ bigquery.SchemaField("fake_schema_field", "STRING", mode="REQUIRED") ] with pytest.raises(ValueError): self.bq_client.add_missing_fields_to_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) self.mock_client.update_table.assert_not_called() def test_remove_unused_fields_from_schema(self) -> None: """Tests that remove_unused_fields_from_schema() calls the client to update the table with a query.""" table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id) schema_fields = [ bigquery.SchemaField("field_1", "STRING"), bigquery.SchemaField("field_2", "STRING"), ] table = bigquery.Table(table_ref, schema_fields) self.mock_client.get_table.return_value = table new_schema_fields = [bigquery.SchemaField("field_1", 
"STRING")] self.bq_client.remove_unused_fields_from_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) self.mock_client.query.assert_called() def test_remove_unused_fields_from_schema_no_missing_fields(self) -> None: """Tests that remove_unused_fields_from_schema() does nothing if there are no missing fields.""" table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id) schema_fields = [bigquery.SchemaField("field_1", "STRING")] table = bigquery.Table(table_ref, schema_fields) self.mock_client.get_table.return_value = table new_schema_fields = [bigquery.SchemaField("field_1", "STRING")] self.bq_client.remove_unused_fields_from_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) self.mock_client.query.assert_not_called() def test_remove_unused_fields_from_schema_ignore_excess_desired_fields( self, ) -> None: """Tests that remove_unused_fields_from_schema() drops columns even when there are excess desired fields.""" table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id) schema_fields = [ bigquery.SchemaField("field_1", "STRING"), bigquery.SchemaField("field_2", "STRING"), ] table = bigquery.Table(table_ref, schema_fields) self.mock_client.get_table.return_value = table new_schema_fields = [ bigquery.SchemaField("field_1", "STRING"), bigquery.SchemaField("field_3", "STRING"), ] self.bq_client.remove_unused_fields_from_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) self.mock_client.query.assert_called() @mock.patch( "recidiviz.big_query.big_query_client.BigQueryClientImpl.remove_unused_fields_from_schema" ) @mock.patch( "recidiviz.big_query.big_query_client.BigQueryClientImpl.add_missing_fields_to_schema" ) def test_update_schema( self, remove_unused_mock: mock.MagicMock, add_missing_mock: mock.MagicMock ) -> None: """Tests that update_schema() calls both field updaters if the inputs are valid.""" table_ref = bigquery.TableReference(self.mock_dataset_ref, 
self.mock_table_id) schema_fields = [ bigquery.SchemaField("field_1", "STRING"), bigquery.SchemaField("field_2", "STRING"), ] table = bigquery.Table(table_ref, schema_fields) self.mock_client.get_table.return_value = table new_schema_fields = [ bigquery.SchemaField("field_1", "STRING"), bigquery.SchemaField("field_3", "STRING"), ] self.bq_client.update_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) remove_unused_mock.assert_called() add_missing_mock.assert_called() @mock.patch( "recidiviz.big_query.big_query_client.BigQueryClientImpl.remove_unused_fields_from_schema" ) @mock.patch( "recidiviz.big_query.big_query_client.BigQueryClientImpl.add_missing_fields_to_schema" ) def test_update_schema_fails_on_changed_type( self, remove_unused_mock: mock.MagicMock, add_missing_mock: mock.MagicMock ) -> None: """Tests that update_schema() throws if we try to change a field type.""" table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id) schema_fields = [ bigquery.SchemaField("field_1", "STRING"), bigquery.SchemaField("field_2", "STRING"), ] table = bigquery.Table(table_ref, schema_fields) self.mock_client.get_table.return_value = table new_schema_fields = [ bigquery.SchemaField("field_1", "STRING"), bigquery.SchemaField("field_2", "INT"), ] with pytest.raises(ValueError): self.bq_client.update_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) remove_unused_mock.assert_not_called() add_missing_mock.assert_not_called() @mock.patch( "recidiviz.big_query.big_query_client.BigQueryClientImpl.remove_unused_fields_from_schema" ) @mock.patch( "recidiviz.big_query.big_query_client.BigQueryClientImpl.add_missing_fields_to_schema" ) def test_update_schema_fails_on_changed_mode( self, remove_unused_mock: mock.MagicMock, add_missing_mock: mock.MagicMock ) -> None: """Tests that update_schema() throws if we try to change a field mode.""" table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id) 
schema_fields = [ bigquery.SchemaField("field_1", "STRING", "NULLABLE"), bigquery.SchemaField("field_2", "STRING"), ] table = bigquery.Table(table_ref, schema_fields) self.mock_client.get_table.return_value = table new_schema_fields = [ bigquery.SchemaField("field_1", "STRING", "REQUIRED"), bigquery.SchemaField("field_2", "INT"), ] with pytest.raises(ValueError): self.bq_client.update_schema( self.mock_dataset_id, self.mock_table_id, new_schema_fields ) remove_unused_mock.assert_not_called() add_missing_mock.assert_not_called() def test__get_excess_schema_fields_simple_excess(self) -> None: """Tests _get_excess_schema_fields() when extended_schema is a strict superset of base_schema.""" base_schema = [bigquery.SchemaField("field_1", "INT")] extended_schema = [ bigquery.SchemaField("field_1", "INT"), bigquery.SchemaField("field_2", "INT"), bigquery.SchemaField("field_3", "INT"), ] excess_fields = BigQueryClientImpl._get_excess_schema_fields( base_schema, extended_schema ) self.assertEqual( excess_fields, [ bigquery.SchemaField("field_2", "INT"), bigquery.SchemaField("field_3", "INT"), ], ) def test__get_excess_schema_fields_with_extra_base_schema(self) -> None: """Tests _get_excess_schema_fields() when base_schema has fields not in extended_schema.""" base_schema = [ bigquery.SchemaField("field_1", "INT"), bigquery.SchemaField("field_2", "INT"), ] extended_schema = [ bigquery.SchemaField("field_1", "INT"), bigquery.SchemaField("field_3", "INT"), bigquery.SchemaField("field_4", "INT"), ] excess_fields = BigQueryClientImpl._get_excess_schema_fields( base_schema, extended_schema ) self.assertEqual( excess_fields, [ bigquery.SchemaField("field_3", "INT"), bigquery.SchemaField("field_4", "INT"), ], ) def test__get_excess_schema_fields_with_matching_schema(self) -> None: """Tests _get_excess_schema_fields() when base_schema is the same as extended_schema.""" base_schema = [ bigquery.SchemaField("field_1", "INT"), bigquery.SchemaField("field_2", "INT"), ] excess_fields = 
BigQueryClientImpl._get_excess_schema_fields( base_schema, base_schema ) self.assertListEqual(excess_fields, []) def test__get_excess_schema_fields_no_excess(self) -> None: """Tests _get_excess_schema_fields() when base_schema is a superset of extended_schema.""" base_schema = [ bigquery.SchemaField("field_1", "INT"), bigquery.SchemaField("field_2", "INT"), ] extended_schema = [bigquery.SchemaField("field_2", "INT")] excess_fields = BigQueryClientImpl._get_excess_schema_fields( base_schema, extended_schema ) self.assertListEqual(excess_fields, []) def test_delete_table(self) -> None: """Tests that our delete table function calls the correct client method.""" self.bq_client.delete_table(self.mock_dataset_id, self.mock_table_id) self.mock_client.delete_table.assert_called() @mock.patch("google.cloud.bigquery.QueryJob") def test_paged_read_single_page_single_row( self, mock_query_job: mock.MagicMock ) -> None: first_row = bigquery.table.Row( ["parole", 15, "10N"], {"supervision_type": 0, "revocations": 1, "district": 2}, ) # First call returns a single row, second call returns nothing mock_query_job.result.side_effect = [[first_row], []] processed_results = [] def _process_fn(row: bigquery.table.Row) -> None: processed_results.append(dict(row)) self.bq_client.paged_read_and_process(mock_query_job, 1, _process_fn) self.assertEqual([dict(first_row)], processed_results) mock_query_job.result.assert_has_calls( [ call(max_results=1, start_index=0), call(max_results=1, start_index=1), ] ) @mock.patch("google.cloud.bigquery.QueryJob") def test_paged_read_single_page_multiple_rows( self, mock_query_job: mock.MagicMock ) -> None: first_row = bigquery.table.Row( ["parole", 15, "10N"], {"supervision_type": 0, "revocations": 1, "district": 2}, ) second_row = bigquery.table.Row( ["probation", 7, "10N"], {"supervision_type": 0, "revocations": 1, "district": 2}, ) # First call returns a single row, second call returns nothing mock_query_job.result.side_effect = [[first_row, 
second_row], []] processed_results = [] def _process_fn(row: bigquery.table.Row) -> None: processed_results.append(dict(row)) self.bq_client.paged_read_and_process(mock_query_job, 10, _process_fn) self.assertEqual([dict(first_row), dict(second_row)], processed_results) mock_query_job.result.assert_has_calls( [ call(max_results=10, start_index=0), call(max_results=10, start_index=2), ] ) @mock.patch("google.cloud.bigquery.QueryJob") def test_paged_read_multiple_pages(self, mock_query_job: mock.MagicMock) -> None: p1_r1 = bigquery.table.Row( ["parole", 15, "10N"], {"supervision_type": 0, "revocations": 1, "district": 2}, ) p1_r2 = bigquery.table.Row( ["probation", 7, "10N"], {"supervision_type": 0, "revocations": 1, "district": 2}, ) p2_r1 = bigquery.table.Row( ["parole", 8, "10F"], {"supervision_type": 0, "revocations": 1, "district": 2}, ) p2_r2 = bigquery.table.Row( ["probation", 3, "10F"], {"supervision_type": 0, "revocations": 1, "district": 2}, ) # First two calls returns results, third call returns nothing mock_query_job.result.side_effect = [[p1_r1, p1_r2], [p2_r1, p2_r2], []] processed_results = [] def _process_fn(row: bigquery.table.Row) -> None: processed_results.append(dict(row)) self.bq_client.paged_read_and_process(mock_query_job, 2, _process_fn) self.assertEqual( [dict(p1_r1), dict(p1_r2), dict(p2_r1), dict(p2_r2)], processed_results ) mock_query_job.result.assert_has_calls( [ call(max_results=2, start_index=0), call(max_results=2, start_index=2), call(max_results=2, start_index=4), ] )