def _write_files_with_auto_sharding(
    self, destination_data_kv_pc, file_prefix_pcv):
  clock = self.test_client.test_clock if self.test_client else time.time

  # Auto-sharding is achieved via the GroupIntoBatches.WithShardedKey
  # transform, which shards, groups, and batches the table rows to be
  # inserted into BigQuery.

  # First, the keys of tagged_data (table references) are converted to a
  # hashable format. This is needed to work with the keyed states used by
  # GroupIntoBatches. After grouping and batching is done, the table
  # references are restored.
  destination_files_kv_pc = (
      destination_data_kv_pc
      | 'ToHashableTableRef' >> beam.Map(
          bigquery_tools.to_hashable_table_ref)
      | 'WithAutoSharding' >> GroupIntoBatches.WithShardedKey(
          batch_size=_FILE_TRIGGERING_RECORD_COUNT,
          max_buffering_duration_secs=_FILE_TRIGGERING_BATCHING_DURATION_SECS,
          clock=clock)
      | 'FromHashableTableRefAndDropShard' >> beam.Map(
          lambda kvs: (
              bigquery_tools.parse_table_reference(kvs[0].key), kvs[1]))
      | beam.ParDo(
          WriteGroupedRecordsToFile(
              schema=self.schema, file_format=self._temp_file_format),
          file_prefix_pcv,
          *self.schema_side_inputs))

  return self._maybe_apply_user_trigger(destination_files_kv_pc)
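
# Illustrative sketch, not part of the original sink: a minimal, self-contained
# pipeline showing the same auto-sharding pattern in isolation. Elements are
# keyed, GroupIntoBatches.WithShardedKey shards and batches them, and the
# resulting ShardedKey wrapper is unwrapped via its `.key` attribute. The batch
# size below is a hypothetical stand-in for _FILE_TRIGGERING_RECORD_COUNT; the
# real sink additionally sets max_buffering_duration_secs and a test clock.
def _example_auto_sharding_pipeline():
  import apache_beam as beam
  from apache_beam.transforms.util import GroupIntoBatches

  with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('table_a', 1), ('table_a', 2), ('table_b', 3)])
        | GroupIntoBatches.WithShardedKey(batch_size=2)
        # Each output element is (ShardedKey(key, shard_id), [values]);
        # drop the shard id and keep only the original key.
        | beam.MapTuple(
            lambda sharded_key, values: (sharded_key.key, values))
        | beam.Map(print))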
def expand(self, pcoll):
  if self.project is None:
    self.project = pcoll.pipeline.options.view_as(GoogleCloudOptions).project
  if self.project is None:
    raise ValueError(
        'GCP project name needs to be specified in "project" pipeline option')
  return (
      pcoll
      | GroupIntoBatches.WithShardedKey(self.max_batch_size)
      | ParDo(
          _ImportCatalogItemsFn(
              self.project,
              self.retry,
              self.timeout,
              self.metadata,
              self.catalog_name)))
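
# Illustrative sketch, assuming the general shape of the transform above:
# _ImportCatalogItemsFn is not shown here, so _PrintBatchFn below is a
# hypothetical stand-in for a DoFn that would issue one external API request
# per batch. The point is the pattern expand() uses: key the elements, batch
# them with GroupIntoBatches.WithShardedKey, then process whole batches in a
# ParDo.
def _example_batched_api_calls():
  import apache_beam as beam
  from apache_beam.transforms.util import GroupIntoBatches

  class _PrintBatchFn(beam.DoFn):
    def process(self, element):
      sharded_key, batch = element
      # A real DoFn would send the whole batch to the external service here.
      print(sharded_key.key, list(batch))

  with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('catalog', {'id': '1'}), ('catalog', {'id': '2'})])
        | GroupIntoBatches.WithShardedKey(2)
        | beam.ParDo(_PrintBatchFn()))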