def process(self, element, job_name_prefix=None, unused_schema_mod_jobs=None):
  destination = element[0]
  job_reference = element[1]

  copy_to_reference = bigquery_tools.parse_table_reference(destination)
  if copy_to_reference.projectId is None:
    copy_to_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  copy_from_reference = bigquery_tools.parse_table_reference(destination)
  copy_from_reference.tableId = job_reference.jobId
  if copy_from_reference.projectId is None:
    copy_from_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  copy_job_name = '%s_%s' % (
      job_name_prefix,
      _bq_uuid(
          '%s:%s.%s' % (
              copy_from_reference.projectId,
              copy_from_reference.datasetId,
              copy_from_reference.tableId)))

  _LOGGER.info(
      "Triggering copy job from %s to %s",
      copy_from_reference,
      copy_to_reference)
  if copy_to_reference.tableId not in self._observed_tables:
    # When the write_disposition for a job is WRITE_TRUNCATE, multiple
    # copy jobs to the same destination can stomp on each other, truncate
    # data, and write to the BQ table over and over.
    # Thus, the first copy job runs with the user's write_disposition, but
    # afterwards, all jobs must always WRITE_APPEND to the table. If they
    # do not, subsequent copy jobs will clear out data appended by
    # previous jobs.
    write_disposition = self.write_disposition
    wait_for_job = True
    self._observed_tables.add(copy_to_reference.tableId)
  else:
    wait_for_job = False
    write_disposition = 'WRITE_APPEND'

  if not self.bq_io_metadata:
    self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
  job_reference = self.bq_wrapper._insert_copy_job(
      copy_to_reference.projectId,
      copy_job_name,
      copy_from_reference,
      copy_to_reference,
      create_disposition=self.create_disposition,
      write_disposition=write_disposition,
      job_labels=self.bq_io_metadata.add_additional_bq_job_labels())

  if wait_for_job:
    self.bq_wrapper.wait_for_bq_job(job_reference, sleep_duration_sec=10)

  yield (destination, job_reference)
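
# A minimal sketch, separate from the snippet above, of the write-disposition
# dispatch it performs per destination table: the first copy job for a table
# keeps the user's write_disposition and is waited on; every later job must
# append. The helper name and module-level set below are hypothetical, for
# illustration only.
_observed_tables = set()

def _choose_copy_disposition(table_id, user_write_disposition):
  """Returns (write_disposition, wait_for_job) for one destination table."""
  if table_id not in _observed_tables:
    _observed_tables.add(table_id)
    # The first job may truncate, so wait for it before any appends run.
    return user_write_disposition, True
  # Later jobs append; a second WRITE_TRUNCATE would wipe earlier copies.
  return 'WRITE_APPEND', False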
def process(self, element, load_job_name_prefix, *schema_side_inputs):
  # Each load job is assumed to have files respecting these constraints:
  # 1. Total size of all files < 15 TB (max size for load jobs)
  # 2. Total no. of files in a single load job < 10,000
  # This assumption means that there will always be a single load job
  # triggered for each partition of files.
  destination = element[0]
  files = element[1]

  if callable(self.schema):
    schema = self.schema(destination, *schema_side_inputs)
  elif isinstance(self.schema, vp.ValueProvider):
    schema = self.schema.get()
  else:
    schema = self.schema

  if callable(self.additional_bq_parameters):
    additional_parameters = self.additional_bq_parameters(destination)
  elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
    additional_parameters = self.additional_bq_parameters.get()
  else:
    additional_parameters = self.additional_bq_parameters

  table_reference = bigquery_tools.parse_table_reference(destination)
  if table_reference.projectId is None:
    table_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')
  # Load jobs for a single destination are always triggered from the same
  # worker. This means that we can generate a deterministic numbered job id,
  # and not need to worry about collisions.
  destination_hash = _bq_uuid(
      '%s:%s.%s' % (
          table_reference.projectId,
          table_reference.datasetId,
          table_reference.tableId))
  uid = _bq_uuid()
  job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, uid)
  _LOGGER.debug(
      'Load job has %s files. Job name is %s.', len(files), job_name)

  create_disposition = self.create_disposition
  if self.temporary_tables:
    # If we are using temporary tables, then we must always create the
    # temporary tables, so we replace the create_disposition.
    create_disposition = 'CREATE_IF_NEEDED'
    # For temporary tables, we create a new table whose name includes the
    # job id.
    table_reference.tableId = job_name
    yield pvalue.TaggedOutput(
        TriggerLoadJobs.TEMP_TABLES,
        bigquery_tools.get_hashable_destination(table_reference))

  _LOGGER.info(
      'Triggering job %s to load data to BigQuery table %s. '
      'Schema: %s. Additional parameters: %s',
      job_name,
      table_reference,
      schema,
      additional_parameters)
  if not self.bq_io_metadata:
    self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
  job_reference = self.bq_wrapper.perform_load_job(
      destination=table_reference,
      source_uris=files,
      job_id=job_name,
      schema=schema,
      write_disposition=self.write_disposition,
      create_disposition=create_disposition,
      additional_load_parameters=additional_parameters,
      source_format=self.source_format,
      job_labels=self.bq_io_metadata.add_additional_bq_job_labels())
  yield (destination, job_reference)
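
# A minimal sketch, separate from the snippet above, of how the load job name
# is assembled: the prefix supplied as a side input, a hash derived from the
# fully qualified destination, and a fresh per-call uuid. The helper below is
# hypothetical and stands in for the _bq_uuid helper used in the snippet.
import hashlib
import uuid

def _example_load_job_name(load_job_name_prefix, project, dataset, table):
  destination_hash = hashlib.md5(
      ('%s:%s.%s' % (project, dataset, table)).encode('utf-8')).hexdigest()
  return '%s_%s_%s' % (
      load_job_name_prefix, destination_hash, uuid.uuid4().hex)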
def start_bundle(self):
  self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self.test_client)

  if not self.bq_io_metadata:
    self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
def setup(self):
  self._bq_wrapper = bigquery_tools.BigQueryWrapper(client=self._test_client)
  self._bq_io_metadata = create_bigquery_io_metadata(self._step_name)
def _get_bq_metadata(self):
  if not self.bq_io_metadata:
    self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
  return self.bq_io_metadata
def start_bundle(self):
  self._observed_tables = set()
  self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self.test_client)
  if not self.bq_io_metadata:
    self.bq_io_metadata = create_bigquery_io_metadata()