def _insert_load_job(self,
                     project_id,
                     job_id,
                     table_reference,
                     source_uris,
                     schema=None,
                     write_disposition=None,
                     create_disposition=None):
  """Submit a BigQuery load job and return its job reference.

  Args:
    project_id: Project under which the job runs.
    job_id: Caller-supplied unique id for the job.
    table_reference: Destination table for the load.
    source_uris: GCS URIs of newline-delimited JSON files to load.
    schema: Optional table schema; when omitted, autodetection is enabled.
    write_disposition: Optional BigQuery write disposition.
    create_disposition: Optional BigQuery create disposition.

  Returns:
    The jobReference of the inserted job.
  """
  load_config = bigquery.JobConfigurationLoad(
      sourceUris=source_uris,
      destinationTable=table_reference,
      schema=schema,
      writeDisposition=write_disposition,
      createDisposition=create_disposition,
      sourceFormat='NEWLINE_DELIMITED_JSON',
      # Let BigQuery infer the schema only when none was supplied.
      autodetect=schema is None,
  )
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project_id)
  insert_request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(load=load_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(insert_request).jobReference
def perform_extract_job(self,
                        destination,
                        job_id,
                        table_reference,
                        destination_format,
                        include_header=True,
                        compression=ExportCompression.NONE):
  """Starts a job to export data from BigQuery.

  Returns:
    bigquery.JobReference with the information about the job that was
    started.
  """
  # The extract job runs in the project that owns the source table.
  project = table_reference.projectId
  extract_config = bigquery.JobConfigurationExtract(
      destinationUris=destination,
      sourceTable=table_reference,
      printHeader=include_header,
      destinationFormat=destination_format,
      compression=compression,
  )
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(extract=extract_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(request).jobReference
def test_records_traverse_transform_with_mocks(self):
  """Runs BigQueryBatchFileLoads end-to-end against a mocked BQ client.

  The mock client reports every inserted job as immediately DONE, so the
  test exercises the file-writing and job-triggering wiring of the
  transform: it checks that temp files are created on disk, that exactly
  one file per destination is produced, and that the emitted job
  references match the one returned by the mocked Insert call.
  """
  destination = 'project1:dataset1.table1'
  # Job reference that the mocked client will hand back for any insert.
  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'project1'
  job_reference.jobId = 'job_name1'
  result_job = bigquery_api.Job()
  result_job.jobReference = job_reference
  # Polled job state: already finished and error-free.
  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference
  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job
  transform = bqfl.BigQueryBatchFileLoads(
      destination,
      custom_gcs_temp_location=self._new_tempdir(),
      test_client=bq_client,
      validate=False,
      coder=CustomRowCoder())
  # Need to test this with the DirectRunner to avoid serializing mocks
  with TestPipeline('DirectRunner') as p:
    outputs = p | beam.Create(_ELEMENTS) | transform
    dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

    jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

    files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1])
    # Count distinct destinations seen among the (destination, file) pairs.
    destinations = (
        dest_files
        | "GetDests" >> beam.Map(
            lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
        | "GetUniques" >> beam.combiners.Count.PerKey()
        | "GetFinalDests" >> beam.Keys())

    # All files exist
    _ = (files | beam.Map(
        lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

    # One file per destination
    assert_that(files | beam.combiners.Count.Globally(),
                equal_to([1]),
                label='CountFiles')
    assert_that(destinations,
                equal_to([destination]),
                label='CheckDestinations')
    assert_that(jobs, equal_to([job_reference]), label='CheckJobs')
def _insert_copy_job(self,
                     project_id,
                     job_id,
                     from_table_reference,
                     to_table_reference,
                     create_disposition=None,
                     write_disposition=None):
  """Start a BigQuery table-copy job and return its job reference.

  Args:
    project_id: Project under which the copy job runs.
    job_id: Caller-supplied unique id for the job.
    from_table_reference: Source table to copy from.
    to_table_reference: Destination table to copy into.
    create_disposition: Optional BigQuery create disposition.
    write_disposition: Optional BigQuery write disposition.

  Returns:
    The jobReference of the inserted copy job.
  """
  copy_config = bigquery.JobConfigurationTableCopy(
      destinationTable=to_table_reference,
      sourceTable=from_table_reference,
      createDisposition=create_disposition,
      writeDisposition=write_disposition,
  )
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(copy=copy_config),
          jobReference=job_ref))
  logging.info("Inserting job request: %s", request)
  response = self.client.jobs.Insert(request)
  logging.info("Response was %s", response)
  return response.jobReference
def test_read_from_table_and_multiple_pages(self):
  """The reader follows pageToken across multiple pages of results."""
  client = mock.Mock()
  client.jobs.Insert.return_value = bigquery.Job(
      jobReference=bigquery.JobReference(jobId='somejob'))
  table_rows, schema, expected_rows = self.get_test_rows()
  # The first response carries a pageToken, forcing the reader down the
  # code path that fetches a second page of results.
  first_page = bigquery.GetQueryResultsResponse(
      jobComplete=True, rows=table_rows, schema=schema, pageToken='token')
  last_page = bigquery.GetQueryResultsResponse(
      jobComplete=True, rows=table_rows, schema=schema)
  client.jobs.GetQueryResults.side_effect = [first_page, last_page]

  source = beam.io.BigQuerySource(
      'dataset.table', use_dataflow_native_source=True)
  with source.reader(client) as reader:
    actual_rows = [row for row in reader]

  # Each of the two pages yields the same rows, hence the doubled
  # expectation.
  self.assertEqual(actual_rows, expected_rows * 2)
def test_load_job_id_used(self):
  """load_job_project_id determines the project of the emitted job refs."""
  # The mocked client returns jobs under 'loadJobProject', matching the
  # load_job_project_id passed to the transform below.
  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'loadJobProject'
  job_reference.jobId = 'job_name1'
  result_job = bigquery_api.Job()
  result_job.jobReference = job_reference

  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference

  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job

  transform = bqfl.BigQueryBatchFileLoads(
      'project1:dataset1.table1',
      custom_gcs_temp_location=self._new_tempdir(),
      test_client=bq_client,
      validate=False,
      load_job_project_id='loadJobProject')

  with TestPipeline('DirectRunner') as p:
    outputs = p | beam.Create(_ELEMENTS) | transform
    job_pairs = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
    jobs = job_pairs | "GetJobs" >> beam.Map(lambda x: x[1])
    assert_that(jobs, equal_to([job_reference]), label='CheckJobProjectIds')
def _insert_load_job(self,
                     project_id,
                     job_id,
                     table_reference,
                     source_uris,
                     schema=None,
                     write_disposition=None,
                     create_disposition=None,
                     additional_load_parameters=None):
  """Submit a BigQuery load job, optionally with schema autodetection.

  Args:
    project_id: Project under which the job runs.
    job_id: Caller-supplied unique id for the job.
    table_reference: Destination table for the load.
    source_uris: GCS URIs of newline-delimited JSON files to load.
    schema: Table schema, or the sentinel string 'SCHEMA_AUTODETECT' to
      let BigQuery infer the schema.
    write_disposition: Optional BigQuery write disposition.
    create_disposition: Optional BigQuery create disposition.
    additional_load_parameters: Optional dict of extra keyword fields to
      set on the JobConfigurationLoad message.

  Returns:
    The jobReference of the inserted job.
  """
  extra_params = additional_load_parameters or {}
  autodetect = schema == 'SCHEMA_AUTODETECT'
  load_config = bigquery.JobConfigurationLoad(
      sourceUris=source_uris,
      destinationTable=table_reference,
      # When autodetecting, no explicit schema is sent with the job.
      schema=None if autodetect else schema,
      writeDisposition=write_disposition,
      createDisposition=create_disposition,
      sourceFormat='NEWLINE_DELIMITED_JSON',
      autodetect=autodetect,
      **extra_params)
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(load=load_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(request).jobReference
def test_read_from_table(self):
  """Rows read from a table match the expected rows and schema."""
  client = mock.Mock()
  client.jobs.Insert.return_value = bigquery.Job(
      jobReference=bigquery.JobReference(jobId='somejob'))
  table_rows, schema, expected_rows = self.get_test_rows()
  client.jobs.GetQueryResults.return_value = (
      bigquery.GetQueryResultsResponse(
          jobComplete=True, rows=table_rows, schema=schema))

  with beam.io.BigQuerySource('dataset.table').reader(client) as reader:
    actual_rows = [row for row in reader]

  self.assertEqual(actual_rows, expected_rows)
  self.assertEqual(schema, reader.schema)
def test_read_from_table_as_tablerows(self):
  """With TableRowJsonCoder the reader yields TableRow objects directly."""
  client = mock.Mock()
  client.jobs.Insert.return_value = bigquery.Job(
      jobReference=bigquery.JobReference(jobId='somejob'))
  table_rows, schema, _ = self.get_test_rows()
  client.jobs.GetQueryResults.return_value = (
      bigquery.GetQueryResultsResponse(
          jobComplete=True, rows=table_rows, schema=schema))

  # Setting the coder to TableRowJsonCoder is the signal that the caller
  # wants to see the rows as TableRows rather than dicts.
  source = beam.io.BigQuerySource('dataset.table', coder=TableRowJsonCoder)
  with source.reader(client) as reader:
    actual_rows = [row for row in reader]

  self.assertEqual(actual_rows, table_rows)
  self.assertEqual(schema, reader.schema)
def test_read_from_query_unflatten_records(self):
  """flatten_results=False is propagated through to the query reader."""
  client = mock.Mock()
  client.jobs.Insert.return_value = bigquery.Job(
      jobReference=bigquery.JobReference(jobId='somejob'))
  table_rows, schema, expected_rows = self.get_test_rows()
  client.jobs.GetQueryResults.return_value = (
      bigquery.GetQueryResultsResponse(
          jobComplete=True, rows=table_rows, schema=schema))

  source = beam.io.BigQuerySource(query='query', flatten_results=False)
  with source.reader(client) as reader:
    actual_rows = [row for row in reader]

  self.assertEqual(actual_rows, expected_rows)
  self.assertEqual(schema, reader.schema)
  # Query sources default to legacy SQL; flattening was explicitly off.
  self.assertTrue(reader.use_legacy_sql)
  self.assertFalse(reader.flatten_results)
def test_read_from_table_and_job_complete_retry(self, patched_time_sleep):
  """The reader polls again when GetQueryResults reports an unfinished job."""
  client = mock.Mock()
  client.jobs.Insert.return_value = bigquery.Job(
      jobReference=bigquery.JobReference(jobId='somejob'))
  table_rows, schema, expected_rows = self.get_test_rows()
  # jobComplete=False on the first poll forces the wait-and-retry path;
  # the second poll succeeds with the actual rows.
  incomplete = bigquery.GetQueryResultsResponse(jobComplete=False)
  complete = bigquery.GetQueryResultsResponse(
      jobComplete=True, rows=table_rows, schema=schema)
  client.jobs.GetQueryResults.side_effect = [incomplete, complete]

  with beam.io.BigQuerySource('dataset.table').reader(client) as reader:
    actual_rows = [row for row in reader]

  self.assertEqual(actual_rows, expected_rows)
def load_table(self, job_id, project_id, table_ref, table_schema, gcs_urls,
               create_disposition, write_disposition):
  """Start a BigQuery load job from GCS files and return its job id.

  Args:
    job_id: Caller-supplied unique id for the job.
    project_id: Project under which the job runs.
    table_ref: Destination table for the load.
    table_schema: Schema of the destination table.
    gcs_urls: GCS URIs of newline-delimited JSON files to load.
    create_disposition: BigQuery create disposition.
    write_disposition: BigQuery write disposition.

  Returns:
    The jobId string of the inserted job.
  """
  load_config = bq.JobConfigurationLoad(
      createDisposition=create_disposition,
      destinationTable=table_ref,
      schema=table_schema,
      sourceFormat="NEWLINE_DELIMITED_JSON",
      sourceUris=gcs_urls,
      writeDisposition=write_disposition)
  job_ref = bq.JobReference(jobId=job_id, projectId=project_id)
  request = bq.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bq.Job(
          configuration=bq.JobConfiguration(load=load_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(request).jobReference.jobId
def _start_query_job(self,
                     project_id,
                     query,
                     use_legacy_sql,
                     flatten_results,
                     job_id,
                     dry_run=False):
  """Insert a BigQuery query job and return the id it was assigned.

  Results are written to a temporary table in project_id so that large
  result sets are allowed.
  """
  query_config = bigquery.JobConfigurationQuery(
      query=query,
      useLegacySql=use_legacy_sql,
      # allowLargeResults requires an explicit destination table.
      allowLargeResults=True,
      destinationTable=self._get_temp_table(project_id),
      flattenResults=flatten_results)
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(
              dryRun=dry_run, query=query_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(request).jobReference.jobId
def _insert_load_job(self, project_id, job_id, table_reference, source_uris,
                     schema=None):
  """Start a BigQuery load job and return its job id.

  Args:
    project_id: Project under which the job runs (and is billed).
    job_id: Caller-supplied unique id for the job.
    table_reference: Destination table for the load.
    schema: Optional table schema; forwarded to the job when given.
    source_uris: GCS URIs of the files to load.

  Returns:
    The jobId string of the inserted job.
  """
  reference = bigquery.JobReference(jobId=job_id, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      # Run the job under the caller-supplied project, consistent with the
      # project used in the job reference above. (The previous code read
      # table_reference.project_id, but apitools TableReference exposes
      # the camelCase attribute projectId — see perform_extract_job.)
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(
              load=bigquery.JobConfigurationLoad(
                  # apitools-generated message fields are camelCase, as in
                  # the sibling _insert_load_job implementations.
                  sourceUris=source_uris,
                  destinationTable=table_reference,
                  # Forward the schema instead of silently dropping it.
                  schema=schema,
              )
          ),
          jobReference=reference,
      )
  )
  response = self.client.jobs.Insert(request)
  return response.jobReference.jobId
def get_query_location(self, project_id, query, use_legacy_sql):
  """
  Get the location of tables referenced in a query.

  Only the first referenced table's location is returned; the BigQuery
  service itself provides error handling for queries that reference
  tables in multiple locations.
  """
  # A dry-run job parses the query and reports referenced tables without
  # actually executing anything.
  dry_run_config = bigquery.JobConfiguration(
      dryRun=True,
      query=bigquery.JobConfigurationQuery(
          query=query,
          useLegacySql=use_legacy_sql,
      ))
  job_ref = bigquery.JobReference(
      jobId=uuid.uuid4().hex, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(configuration=dry_run_config, jobReference=job_ref))
  response = self.client.jobs.Insert(request)

  if response.statistics is None:
    # This behavior is only expected in tests
    logging.warning(
        "Unable to get location, missing response.statistics. Query: %s",
        query)
    return None

  referenced_tables = response.statistics.query.referencedTables
  # Guards against both empty and None.
  if not referenced_tables:
    logging.debug("Query %s does not reference any tables.", query)
    return None

  first_table = referenced_tables[0]
  location = self.get_table_location(
      first_table.projectId, first_table.datasetId, first_table.tableId)
  logging.info("Using location %r from table %r referenced by query %s",
               location, first_table, query)
  return location
def _start_query_job(self,
                     project_id,
                     query,
                     use_legacy_sql,
                     flatten_results,
                     job_id,
                     dry_run=False,
                     kms_key=None):
  """Insert a BigQuery query job and return the raw insert response.

  For real (non-dry-run) jobs, large results are allowed and written to
  a temporary table; dry runs get no destination table.
  """
  if dry_run:
    destination = None
  else:
    destination = self._get_temp_table(project_id)
  query_config = bigquery.JobConfigurationQuery(
      query=query,
      useLegacySql=use_legacy_sql,
      allowLargeResults=not dry_run,
      destinationTable=destination,
      flattenResults=flatten_results,
      destinationEncryptionConfiguration=bigquery.EncryptionConfiguration(
          kmsKeyName=kms_key))
  job_ref = bigquery.JobReference(jobId=job_id, projectId=project_id)
  request = bigquery.BigqueryJobsInsertRequest(
      projectId=project_id,
      job=bigquery.Job(
          configuration=bigquery.JobConfiguration(
              dryRun=dry_run, query=query_config),
          jobReference=job_ref))
  return self.client.jobs.Insert(request)
def test_triggering_frequency(self, is_streaming, with_auto_sharding):
  """Checks per-trigger grouping of BigQueryBatchFileLoads.

  In the streaming case, elements arrive in two batches separated by
  enough processing time to fire the trigger twice, so two files / two
  load jobs are expected; the batch case expects one of each. The BQ
  client is fully mocked and every inserted job reports DONE.
  """
  destination = 'project1:dataset1.table1'
  # Job reference that the mocked client will hand back for any insert.
  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'project1'
  job_reference.jobId = 'job_name1'
  result_job = bigquery_api.Job()
  result_job.jobReference = job_reference
  # Polled job state: already finished and error-free.
  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference
  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job

  # Insert a fake clock to work with auto-sharding which needs a processing
  # time timer.
  class _FakeClock(object):
    def __init__(self, now=time.time()):
      self._now = now

    def __call__(self):
      return self._now

  start_time = timestamp.Timestamp(0)
  bq_client.test_clock = _FakeClock(now=start_time)

  # Only streaming pipelines use a triggering frequency.
  triggering_frequency = 20 if is_streaming else None
  transform = bqfl.BigQueryBatchFileLoads(
      destination,
      custom_gcs_temp_location=self._new_tempdir(),
      test_client=bq_client,
      validate=False,
      temp_file_format=bigquery_tools.FileFormat.JSON,
      is_streaming_pipeline=is_streaming,
      triggering_frequency=triggering_frequency,
      with_auto_sharding=with_auto_sharding)

  # Need to test this with the DirectRunner to avoid serializing mocks
  with TestPipeline(
      runner='BundleBasedDirectRunner',
      options=StandardOptions(streaming=is_streaming)) as p:
    if is_streaming:
      # Split the elements into two timestamped batches so each batch
      # lands in its own trigger firing.
      _SIZE = len(_ELEMENTS)
      fisrt_batch = [
          TimestampedValue(value, start_time + i + 1)
          for i, value in enumerate(_ELEMENTS[:_SIZE // 2])
      ]
      second_batch = [
          TimestampedValue(value, start_time + _SIZE // 2 + i + 1)
          for i, value in enumerate(_ELEMENTS[_SIZE // 2:])
      ]
      # Advance processing time between batches of input elements to fire the
      # user triggers. Intentionally advance the processing time twice for the
      # auto-sharding case since we need to first fire the timer and then
      # fire the trigger.
      test_stream = (
          TestStream().advance_watermark_to(start_time).add_elements(
              fisrt_batch).advance_processing_time(30).
          advance_processing_time(30).add_elements(second_batch).
          advance_processing_time(30).advance_processing_time(
              30).advance_watermark_to_infinity())
      input = p | test_stream
    else:
      input = p | beam.Create(_ELEMENTS)
    outputs = input | transform
    dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

    files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
    # Count distinct destinations seen among the (destination, file) pairs.
    destinations = (
        dest_files
        | "GetDests" >> beam.Map(
            lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
        | "GetUniques" >> combiners.Count.PerKey()
        | "GetFinalDests" >> beam.Keys())
    jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

    # Check that all files exist.
    _ = (files | beam.Map(
        lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

    # Expect two load jobs are generated in the streaming case due to the
    # triggering frequency. Grouping is per trigger so we expect two entries
    # in the output as opposed to one.
    file_count = files | combiners.Count.Globally().without_defaults()
    expected_file_count = [1, 1] if is_streaming else [1]
    expected_destinations = [destination, destination
                            ] if is_streaming else [destination]
    expected_jobs = [job_reference, job_reference
                    ] if is_streaming else [job_reference]
    assert_that(file_count, equal_to(expected_file_count), label='CountFiles')
    assert_that(
        destinations, equal_to(expected_destinations),
        label='CheckDestinations')
    assert_that(jobs, equal_to(expected_jobs), label='CheckJobs')