def _insert_load_job(self,
                     project_id,
                     job_id,
                     table_reference,
                     source_uris,
                     schema=None,
                     write_disposition=None,
                     create_disposition=None):
  """Submit a BigQuery load job and return its job reference.

  Args:
    project_id: Project under which the job runs.
    job_id: Caller-supplied unique id for the job.
    table_reference: Destination table for the load.
    source_uris: GCS URIs of newline-delimited JSON files to load.
    schema: Optional table schema; when omitted, autodetection is enabled.
    write_disposition: Optional BigQuery write disposition.
    create_disposition: Optional BigQuery create disposition.

  Returns:
    The jobReference of the inserted job.
  """
  load_config = bigquery.JobConfigurationLoad(
      sourceUris=source_uris,
      destinationTable=table_reference,
      schema=schema,
      writeDisposition=write_disposition,
      createDisposition=create_disposition,
      sourceFormat='NEWLINE_DELIMITED_JSON',
      # Let BigQuery infer the schema only when none was supplied.
      autodetect=schema is None,
  )
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project_id)
  insert_request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(load=load_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(insert_request).jobReference
def perform_extract_job(self,
                        destination,
                        job_id,
                        table_reference,
                        destination_format,
                        include_header=True,
                        compression=ExportCompression.NONE):
  """Starts a job to export data from BigQuery.

  Returns:
    bigquery.JobReference with the information about the job that was
    started.
  """
  # The extract job runs in the project that owns the source table.
  project = table_reference.projectId
  extract_config = bigquery.JobConfigurationExtract(
      destinationUris=destination,
      sourceTable=table_reference,
      printHeader=include_header,
      destinationFormat=destination_format,
      compression=compression,
  )
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(extract=extract_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(request).jobReference
def test_records_traverse_transform_with_mocks(self):
  """Runs BigQueryBatchFileLoads end-to-end against a mocked BQ client.

  The mock client reports every inserted job as immediately DONE, so the
  test exercises the file-writing and job-triggering wiring of the
  transform: it checks that temp files are created on disk, that exactly
  one file per destination is produced, and that the emitted job
  references match the one returned by the mocked Insert call.
  """
  destination = 'project1:dataset1.table1'
  # Job reference that the mocked client will hand back for any insert.
  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'project1'
  job_reference.jobId = 'job_name1'
  result_job = bigquery_api.Job()
  result_job.jobReference = job_reference
  # Polled job state: already finished and error-free.
  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference
  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job
  transform = bqfl.BigQueryBatchFileLoads(
      destination,
      custom_gcs_temp_location=self._new_tempdir(),
      test_client=bq_client,
      validate=False,
      coder=CustomRowCoder())
  # Need to test this with the DirectRunner to avoid serializing mocks
  with TestPipeline('DirectRunner') as p:
    outputs = p | beam.Create(_ELEMENTS) | transform
    dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

    jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

    files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1])
    # Count distinct destinations seen among the (destination, file) pairs.
    destinations = (
        dest_files
        | "GetDests" >> beam.Map(
            lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
        | "GetUniques" >> beam.combiners.Count.PerKey()
        | "GetFinalDests" >> beam.Keys())

    # All files exist
    _ = (files | beam.Map(
        lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

    # One file per destination
    assert_that(files | beam.combiners.Count.Globally(),
                equal_to([1]),
                label='CountFiles')
    assert_that(destinations,
                equal_to([destination]),
                label='CheckDestinations')
    assert_that(jobs, equal_to([job_reference]), label='CheckJobs')
def _insert_copy_job(self,
                     project_id,
                     job_id,
                     from_table_reference,
                     to_table_reference,
                     create_disposition=None,
                     write_disposition=None):
  """Start a BigQuery table-copy job and return its job reference.

  Args:
    project_id: Project under which the copy job runs.
    job_id: Caller-supplied unique id for the job.
    from_table_reference: Source table to copy from.
    to_table_reference: Destination table to copy into.
    create_disposition: Optional BigQuery create disposition.
    write_disposition: Optional BigQuery write disposition.

  Returns:
    The jobReference of the inserted copy job.
  """
  copy_config = bigquery.JobConfigurationTableCopy(
      destinationTable=to_table_reference,
      sourceTable=from_table_reference,
      createDisposition=create_disposition,
      writeDisposition=write_disposition,
  )
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(copy=copy_config),
          jobReference=job_ref))
  logging.info("Inserting job request: %s", request)
  response = self.client.jobs.Insert(request)
  logging.info("Response was %s", response)
  return response.jobReference
def test_read_from_table_and_multiple_pages(self):
  """The reader follows pageToken across multiple pages of results."""
  client = mock.Mock()
  client.jobs.Insert.return_value = bigquery.Job(
      jobReference=bigquery.JobReference(jobId='somejob'))
  table_rows, schema, expected_rows = self.get_test_rows()
  # The first response carries a pageToken, forcing the reader down the
  # code path that fetches a second page of results.
  first_page = bigquery.GetQueryResultsResponse(
      jobComplete=True, rows=table_rows, schema=schema, pageToken='token')
  last_page = bigquery.GetQueryResultsResponse(
      jobComplete=True, rows=table_rows, schema=schema)
  client.jobs.GetQueryResults.side_effect = [first_page, last_page]

  source = beam.io.BigQuerySource(
      'dataset.table', use_dataflow_native_source=True)
  with source.reader(client) as reader:
    actual_rows = [row for row in reader]

  # Each of the two pages yields the same rows, hence the doubled
  # expectation.
  self.assertEqual(actual_rows, expected_rows * 2)
def test_load_job_id_used(self):
  """load_job_project_id determines the project of the emitted job refs."""
  # The mocked client returns jobs under 'loadJobProject', matching the
  # load_job_project_id passed to the transform below.
  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'loadJobProject'
  job_reference.jobId = 'job_name1'
  result_job = bigquery_api.Job()
  result_job.jobReference = job_reference

  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference

  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job

  transform = bqfl.BigQueryBatchFileLoads(
      'project1:dataset1.table1',
      custom_gcs_temp_location=self._new_tempdir(),
      test_client=bq_client,
      validate=False,
      load_job_project_id='loadJobProject')

  with TestPipeline('DirectRunner') as p:
    outputs = p | beam.Create(_ELEMENTS) | transform
    job_pairs = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
    jobs = job_pairs | "GetJobs" >> beam.Map(lambda x: x[1])
    assert_that(jobs, equal_to([job_reference]), label='CheckJobProjectIds')
def _insert_load_job(self,
                     project_id,
                     job_id,
                     table_reference,
                     source_uris,
                     schema=None,
                     write_disposition=None,
                     create_disposition=None,
                     additional_load_parameters=None):
  """Submit a BigQuery load job, optionally with schema autodetection.

  Args:
    project_id: Project under which the job runs.
    job_id: Caller-supplied unique id for the job.
    table_reference: Destination table for the load.
    source_uris: GCS URIs of newline-delimited JSON files to load.
    schema: Table schema, or the sentinel string 'SCHEMA_AUTODETECT' to
      let BigQuery infer the schema.
    write_disposition: Optional BigQuery write disposition.
    create_disposition: Optional BigQuery create disposition.
    additional_load_parameters: Optional dict of extra keyword fields to
      set on the JobConfigurationLoad message.

  Returns:
    The jobReference of the inserted job.
  """
  extra_params = additional_load_parameters or {}
  autodetect = schema == 'SCHEMA_AUTODETECT'
  load_config = bigquery.JobConfigurationLoad(
      sourceUris=source_uris,
      destinationTable=table_reference,
      # When autodetecting, no explicit schema is sent with the job.
      schema=None if autodetect else schema,
      writeDisposition=write_disposition,
      createDisposition=create_disposition,
      sourceFormat='NEWLINE_DELIMITED_JSON',
      autodetect=autodetect,
      **extra_params)
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(load=load_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(request).jobReference
def test_read_from_table(self):
  """Rows read from a table match the expected rows and schema."""
  client = mock.Mock()
  client.jobs.Insert.return_value = bigquery.Job(
      jobReference=bigquery.JobReference(jobId='somejob'))
  table_rows, schema, expected_rows = self.get_test_rows()
  client.jobs.GetQueryResults.return_value = (
      bigquery.GetQueryResultsResponse(
          jobComplete=True, rows=table_rows, schema=schema))

  with beam.io.BigQuerySource('dataset.table').reader(client) as reader:
    actual_rows = [row for row in reader]

  self.assertEqual(actual_rows, expected_rows)
  self.assertEqual(schema, reader.schema)
def test_read_from_table_as_tablerows(self):
  """With TableRowJsonCoder the reader yields TableRow objects directly."""
  client = mock.Mock()
  client.jobs.Insert.return_value = bigquery.Job(
      jobReference=bigquery.JobReference(jobId='somejob'))
  table_rows, schema, _ = self.get_test_rows()
  client.jobs.GetQueryResults.return_value = (
      bigquery.GetQueryResultsResponse(
          jobComplete=True, rows=table_rows, schema=schema))

  # Setting the coder to TableRowJsonCoder is the signal that the caller
  # wants to see the rows as TableRows rather than dicts.
  source = beam.io.BigQuerySource('dataset.table', coder=TableRowJsonCoder)
  with source.reader(client) as reader:
    actual_rows = [row for row in reader]

  self.assertEqual(actual_rows, table_rows)
  self.assertEqual(schema, reader.schema)
def test_read_from_query_unflatten_records(self):
  """flatten_results=False is propagated through to the query reader."""
  client = mock.Mock()
  client.jobs.Insert.return_value = bigquery.Job(
      jobReference=bigquery.JobReference(jobId='somejob'))
  table_rows, schema, expected_rows = self.get_test_rows()
  client.jobs.GetQueryResults.return_value = (
      bigquery.GetQueryResultsResponse(
          jobComplete=True, rows=table_rows, schema=schema))

  source = beam.io.BigQuerySource(query='query', flatten_results=False)
  with source.reader(client) as reader:
    actual_rows = [row for row in reader]

  self.assertEqual(actual_rows, expected_rows)
  self.assertEqual(schema, reader.schema)
  # Query sources default to legacy SQL; flattening was explicitly off.
  self.assertTrue(reader.use_legacy_sql)
  self.assertFalse(reader.flatten_results)
def test_read_from_table_and_job_complete_retry(self, patched_time_sleep):
  """The reader polls again when GetQueryResults reports an unfinished job."""
  client = mock.Mock()
  client.jobs.Insert.return_value = bigquery.Job(
      jobReference=bigquery.JobReference(jobId='somejob'))
  table_rows, schema, expected_rows = self.get_test_rows()
  # jobComplete=False on the first poll forces the wait-and-retry path;
  # the second poll succeeds with the actual rows.
  incomplete = bigquery.GetQueryResultsResponse(jobComplete=False)
  complete = bigquery.GetQueryResultsResponse(
      jobComplete=True, rows=table_rows, schema=schema)
  client.jobs.GetQueryResults.side_effect = [incomplete, complete]

  with beam.io.BigQuerySource('dataset.table').reader(client) as reader:
    actual_rows = [row for row in reader]

  self.assertEqual(actual_rows, expected_rows)
def load_table(self, job_id, project_id, table_ref, table_schema, gcs_urls,
               create_disposition, write_disposition):
  """Start a BigQuery load job from GCS files and return its job id.

  Args:
    job_id: Caller-supplied unique id for the job.
    project_id: Project under which the job runs.
    table_ref: Destination table for the load.
    table_schema: Schema of the destination table.
    gcs_urls: GCS URIs of newline-delimited JSON files to load.
    create_disposition: BigQuery create disposition.
    write_disposition: BigQuery write disposition.

  Returns:
    The jobId string of the inserted job.
  """
  load_config = bq.JobConfigurationLoad(
      createDisposition=create_disposition,
      destinationTable=table_ref,
      schema=table_schema,
      sourceFormat="NEWLINE_DELIMITED_JSON",
      sourceUris=gcs_urls,
      writeDisposition=write_disposition)
  job_ref = bq.JobReference(jobId=job_id, projectId=project_id)
  request = bq.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bq.Job(
          configuration=bq.JobConfiguration(load=load_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(request).jobReference.jobId
def _start_query_job(self,
                     project_id,
                     query,
                     use_legacy_sql,
                     flatten_results,
                     job_id,
                     dry_run=False):
  """Insert a BigQuery query job and return the id it was assigned.

  Results are written to a temporary table in project_id so that large
  result sets are allowed.
  """
  query_config = bigquery.JobConfigurationQuery(
      query=query,
      useLegacySql=use_legacy_sql,
      # allowLargeResults requires an explicit destination table.
      allowLargeResults=True,
      destinationTable=self._get_temp_table(project_id),
      flattenResults=flatten_results)
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(
              dryRun=dry_run, query=query_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(request).jobReference.jobId
def _insert_load_job(self, project_id, job_id, table_reference, source_uris,
                     schema=None):
  """Start a BigQuery load job and return its job id.

  Args:
    project_id: Project under which the job runs (and is billed).
    job_id: Caller-supplied unique id for the job.
    table_reference: Destination table for the load.
    schema: Optional table schema; forwarded to the job when given.
    source_uris: GCS URIs of the files to load.

  Returns:
    The jobId string of the inserted job.
  """
  reference = bigquery.JobReference(jobId=job_id, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      # Run the job under the caller-supplied project, consistent with the
      # project used in the job reference above. (The previous code read
      # table_reference.project_id, but apitools TableReference exposes
      # the camelCase attribute projectId — see perform_extract_job.)
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(
              load=bigquery.JobConfigurationLoad(
                  # apitools-generated message fields are camelCase, as in
                  # the sibling _insert_load_job implementations.
                  sourceUris=source_uris,
                  destinationTable=table_reference,
                  # Forward the schema instead of silently dropping it.
                  schema=schema,
              )
          ),
          jobReference=reference,
      )
  )
  response = self.client.jobs.Insert(request)
  return response.jobReference.jobId
def get_query_location(self, project_id, query, use_legacy_sql):
  """
  Get the location of tables referenced in a query.

  Only the first referenced table's location is returned; the BigQuery
  service itself provides error handling for queries that reference
  tables in multiple locations.
  """
  # A dry-run job parses the query and reports referenced tables without
  # actually executing anything.
  dry_run_config = bigquery.JobConfiguration(
      dryRun=True,
      query=bigquery.JobConfigurationQuery(
          query=query,
          useLegacySql=use_legacy_sql,
      ))
  job_ref = bigquery.JobReference(
      jobId=uuid.uuid4().hex, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(configuration=dry_run_config, jobReference=job_ref))
  response = self.client.jobs.Insert(request)

  if response.statistics is None:
    # This behavior is only expected in tests
    logging.warning(
        "Unable to get location, missing response.statistics. Query: %s",
        query)
    return None

  referenced_tables = response.statistics.query.referencedTables
  # Guards against both empty and None.
  if not referenced_tables:
    logging.debug("Query %s does not reference any tables.", query)
    return None

  first_table = referenced_tables[0]
  location = self.get_table_location(
      first_table.projectId, first_table.datasetId, first_table.tableId)
  logging.info("Using location %r from table %r referenced by query %s",
               location, first_table, query)
  return location
def _start_query_job(self,
                     project_id,
                     query,
                     use_legacy_sql,
                     flatten_results,
                     job_id,
                     dry_run=False,
                     kms_key=None):
  """Insert a BigQuery query job and return the raw insert response.

  For real (non-dry-run) jobs, large results are allowed and written to
  a temporary table; dry runs get no destination table.
  """
  if dry_run:
    destination = None
  else:
    destination = self._get_temp_table(project_id)
  query_config = bigquery.JobConfigurationQuery(
      query=query,
      useLegacySql=use_legacy_sql,
      allowLargeResults=not dry_run,
      destinationTable=destination,
      flattenResults=flatten_results,
      destinationEncryptionConfiguration=bigquery.EncryptionConfiguration(
          kmsKeyName=kms_key))
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(
              dryRun=dry_run, query=query_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(request)
def test_triggering_frequency(self, is_streaming, with_auto_sharding):
  """Checks per-trigger grouping of BigQueryBatchFileLoads.

  In the streaming case, elements arrive in two batches separated by
  enough processing time to fire the trigger twice, so two files / two
  load jobs are expected; the batch case expects one of each. The BQ
  client is fully mocked and every inserted job reports DONE.
  """
  destination = 'project1:dataset1.table1'
  # Job reference that the mocked client will hand back for any insert.
  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'project1'
  job_reference.jobId = 'job_name1'
  result_job = bigquery_api.Job()
  result_job.jobReference = job_reference
  # Polled job state: already finished and error-free.
  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference
  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job

  # Insert a fake clock to work with auto-sharding which needs a processing
  # time timer.
  class _FakeClock(object):
    def __init__(self, now=time.time()):
      self._now = now

    def __call__(self):
      return self._now

  start_time = timestamp.Timestamp(0)
  bq_client.test_clock = _FakeClock(now=start_time)

  # Only streaming pipelines use a triggering frequency.
  triggering_frequency = 20 if is_streaming else None
  transform = bqfl.BigQueryBatchFileLoads(
      destination,
      custom_gcs_temp_location=self._new_tempdir(),
      test_client=bq_client,
      validate=False,
      temp_file_format=bigquery_tools.FileFormat.JSON,
      is_streaming_pipeline=is_streaming,
      triggering_frequency=triggering_frequency,
      with_auto_sharding=with_auto_sharding)

  # Need to test this with the DirectRunner to avoid serializing mocks
  with TestPipeline(
      runner='BundleBasedDirectRunner',
      options=StandardOptions(streaming=is_streaming)) as p:
    if is_streaming:
      # Split the elements into two timestamped batches so each batch
      # lands in its own trigger firing.
      _SIZE = len(_ELEMENTS)
      fisrt_batch = [
          TimestampedValue(value, start_time + i + 1)
          for i, value in enumerate(_ELEMENTS[:_SIZE // 2])
      ]
      second_batch = [
          TimestampedValue(value, start_time + _SIZE // 2 + i + 1)
          for i, value in enumerate(_ELEMENTS[_SIZE // 2:])
      ]
      # Advance processing time between batches of input elements to fire the
      # user triggers. Intentionally advance the processing time twice for the
      # auto-sharding case since we need to first fire the timer and then
      # fire the trigger.
      test_stream = (
          TestStream().advance_watermark_to(start_time).add_elements(
              fisrt_batch).advance_processing_time(30).
          advance_processing_time(30).add_elements(second_batch).
          advance_processing_time(30).advance_processing_time(
              30).advance_watermark_to_infinity())
      input = p | test_stream
    else:
      input = p | beam.Create(_ELEMENTS)
    outputs = input | transform
    dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

    files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
    # Count distinct destinations seen among the (destination, file) pairs.
    destinations = (
        dest_files
        | "GetDests" >> beam.Map(
            lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
        | "GetUniques" >> combiners.Count.PerKey()
        | "GetFinalDests" >> beam.Keys())
    jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

    # Check that all files exist.
    _ = (files | beam.Map(
        lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

    # Expect two load jobs are generated in the streaming case due to the
    # triggering frequency. Grouping is per trigger so we expect two entries
    # in the output as opposed to one.
    file_count = files | combiners.Count.Globally().without_defaults()
    expected_file_count = [1, 1] if is_streaming else [1]
    expected_destinations = [destination, destination
                            ] if is_streaming else [destination]
    expected_jobs = [job_reference, job_reference
                    ] if is_streaming else [job_reference]
    assert_that(file_count, equal_to(expected_file_count), label='CountFiles')
    assert_that(
        destinations, equal_to(expected_destinations),
        label='CheckDestinations')
    assert_that(jobs, equal_to(expected_jobs), label='CheckJobs')