def process(self,
                element,
                job_name_prefix=None,
                unused_schema_mod_jobs=None):
        destination = element[0]
        job_reference = element[1]

        copy_to_reference = bigquery_tools.parse_table_reference(destination)
        if copy_to_reference.projectId is None:
            copy_to_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        copy_from_reference = bigquery_tools.parse_table_reference(destination)
        copy_from_reference.tableId = job_reference.jobId
        if copy_from_reference.projectId is None:
            copy_from_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        copy_job_name = '%s_%s' % (job_name_prefix,
                                   _bq_uuid('%s:%s.%s' %
                                            (copy_from_reference.projectId,
                                             copy_from_reference.datasetId,
                                             copy_from_reference.tableId)))

        _LOGGER.info("Triggering copy job from %s to %s", copy_from_reference,
                     copy_to_reference)
        if copy_to_reference.tableId not in self._observed_tables:
            # When the write_disposition for a job is WRITE_TRUNCATE,
            # multiple copy jobs to the same destination can stomp on
            # each other, truncating data and rewriting the BQ table over
            # and over.
            # Thus, the first copy job runs with the user's write_disposition,
            # but afterwards, all jobs must always WRITE_APPEND to the table.
            # If they do not, subsequent copy jobs will clear out data appended
            # by previous jobs.
            write_disposition = self.write_disposition
            wait_for_job = True
            self._observed_tables.add(copy_to_reference.tableId)
        else:
            wait_for_job = False
            write_disposition = 'WRITE_APPEND'

        if not self.bq_io_metadata:
            self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
        job_reference = self.bq_wrapper._insert_copy_job(
            copy_to_reference.projectId,
            copy_job_name,
            copy_from_reference,
            copy_to_reference,
            create_disposition=self.create_disposition,
            write_disposition=write_disposition,
            job_labels=self.bq_io_metadata.add_additional_bq_job_labels())

        if wait_for_job:
            self.bq_wrapper.wait_for_bq_job(job_reference,
                                            sleep_duration_sec=10)

        yield (destination, job_reference)
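
The branch above is the heart of the copy-job logic: the first copy job for a destination honors the user's write_disposition, and every later job is forced to WRITE_APPEND so it cannot wipe out previously copied rows. Below is a minimal standalone sketch of that dispatch, not the actual Beam implementation; the function name pick_write_disposition and the observed_tables argument are hypothetical illustrations of self._observed_tables.

def pick_write_disposition(table_id, observed_tables, user_disposition):
    """Return (write_disposition, wait_for_job) for one copy job.

    The first copy job for a table honors the user's write_disposition
    (which may be WRITE_TRUNCATE); every later job must WRITE_APPEND so it
    does not clear out rows written by earlier jobs.
    """
    if table_id not in observed_tables:
        observed_tables.add(table_id)
        return user_disposition, True   # first job: honor the user's choice and wait for it
    return 'WRITE_APPEND', False        # later jobs: append only, no need to wait

seen = set()
print(pick_write_disposition('my_table', seen, 'WRITE_TRUNCATE'))  # ('WRITE_TRUNCATE', True)
print(pick_write_disposition('my_table', seen, 'WRITE_TRUNCATE'))  # ('WRITE_APPEND', False)
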
    def process(self, element, load_job_name_prefix, *schema_side_inputs):
        # Each load job is assumed to have files respecting these constraints:
        # 1. Total size of all files < 15 TB (Max size for load jobs)
        # 2. Total no. of files in a single load job < 10,000
        # This assumption means that there will always be a single load job
        # triggered for each partition of files.
        destination = element[0]
        files = element[1]

        if callable(self.schema):
            schema = self.schema(destination, *schema_side_inputs)
        elif isinstance(self.schema, vp.ValueProvider):
            schema = self.schema.get()
        else:
            schema = self.schema

        if callable(self.additional_bq_parameters):
            additional_parameters = self.additional_bq_parameters(destination)
        elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
            additional_parameters = self.additional_bq_parameters.get()
        else:
            additional_parameters = self.additional_bq_parameters

        table_reference = bigquery_tools.parse_table_reference(destination)
        if table_reference.projectId is None:
            table_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')
        # Load jobs for a single destination are always triggered from the same
        # worker. This means that we can generate a deterministic numbered job
        # id and not need to worry about name collisions across workers.
        destination_hash = _bq_uuid(
            '%s:%s.%s' % (table_reference.projectId, table_reference.datasetId,
                          table_reference.tableId))
        uid = _bq_uuid()
        job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, uid)
        _LOGGER.debug('Load job has %s files. Job name is %s.', len(files),
                      job_name)

        create_disposition = self.create_disposition
        if self.temporary_tables:
            # If we are using temporary tables, then we must always create the
            # temporary tables, so we replace the create_disposition.
            create_disposition = 'CREATE_IF_NEEDED'
            # For temporary tables, we create a new table whose name includes
            # the job id.
            table_reference.tableId = job_name
            yield pvalue.TaggedOutput(
                TriggerLoadJobs.TEMP_TABLES,
                bigquery_tools.get_hashable_destination(table_reference))

        _LOGGER.info(
            'Triggering job %s to load data to BigQuery table %s. '
            'Schema: %s. Additional parameters: %s', job_name, table_reference,
            schema, additional_parameters)
        if not self.bq_io_metadata:
            self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
        job_reference = self.bq_wrapper.perform_load_job(
            destination=table_reference,
            source_uris=files,
            job_id=job_name,
            schema=schema,
            write_disposition=self.write_disposition,
            create_disposition=create_disposition,
            additional_load_parameters=additional_parameters,
            source_format=self.source_format,
            job_labels=self.bq_io_metadata.add_additional_bq_job_labels())
        yield (destination, job_reference)
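
Both self.schema and self.additional_bq_parameters are resolved with the same three-way dispatch: a callable is invoked with the destination (plus side inputs), a ValueProvider is resolved with .get() at run time, and anything else is used as-is. A minimal sketch of that pattern follows; the helper name resolve_option is hypothetical and only illustrates the branching used above.

from apache_beam.options import value_provider as vp

def resolve_option(option, *call_args):
    """Resolve a value that may be a callable, a ValueProvider, or a plain value."""
    if callable(option):
        return option(*call_args)           # e.g. a schema computed per destination
    if isinstance(option, vp.ValueProvider):
        return option.get()                 # resolved at pipeline run time
    return option                           # static value given at construction time

schema = 'name:STRING,age:INTEGER'
assert resolve_option(schema) == schema
assert resolve_option(lambda dest: schema, 'project:dataset.table') == schema
assert resolve_option(vp.StaticValueProvider(str, schema)) == schema
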
def start_bundle(self):
    self.bq_wrapper = bigquery_tools.BigQueryWrapper(
        client=self.test_client)
    if not self.bq_io_metadata:
        self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
def setup(self):
    self._bq_wrapper = bigquery_tools.BigQueryWrapper(
        client=self._test_client)
    self._bq_io_metadata = create_bigquery_io_metadata(self._step_name)
def _get_bq_metadata(self):
    if not self.bq_io_metadata:
        self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
    return self.bq_io_metadata
def start_bundle(self):
    self._observed_tables = set()
    self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self.test_client)
    if not self.bq_io_metadata:
        self.bq_io_metadata = create_bigquery_io_metadata()
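
These snippets split resource creation between setup (called once per DoFn instance, for long-lived clients) and start_bundle (called once per bundle, for per-bundle state such as the _observed_tables cache). A minimal sketch of that lifecycle, assuming apache_beam is available; the class name _ClientDoFn and its fields are hypothetical, not part of the Beam source above.

import apache_beam as beam

class _ClientDoFn(beam.DoFn):  # hypothetical name, for illustration only
    def __init__(self, test_client=None):
        self._test_client = test_client
        self._client = None

    def setup(self):
        # Called once per DoFn instance: build long-lived, reusable clients here.
        self._client = self._test_client or object()

    def start_bundle(self):
        # Called once per bundle: reset per-bundle state such as caches.
        self._observed_tables = set()

    def process(self, element):
        yield element
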