def _write_files_with_auto_sharding(self, destination_data_kv_pc,
                                        file_prefix_pcv):
        clock = self.test_client.test_clock if self.test_client else time.time

        # Auto-sharding is achieved via the GroupIntoBatches.WithShardedKey
        # transform, which shards, groups and batches the table rows to be
        # inserted into BigQuery.

        # First, the keys of tagged_data (table references) are converted to a
        # hashable format. This is needed to work with the keyed states used by
        # GroupIntoBatches. After grouping and batching is done, the table
        # references are restored.
        destination_files_kv_pc = (
            destination_data_kv_pc
            | 'ToHashableTableRef' >> beam.Map(
                bigquery_tools.to_hashable_table_ref)
            | 'WithAutoSharding' >> GroupIntoBatches.WithShardedKey(
                batch_size=_FILE_TRIGGERING_RECORD_COUNT,
                max_buffering_duration_secs=
                _FILE_TRIGGERING_BATCHING_DURATION_SECS,
                clock=clock)
            | 'FromHashableTableRefAndDropShard' >> beam.Map(lambda kvs: (
                bigquery_tools.parse_table_reference(kvs[0].key), kvs[1]))
            | beam.ParDo(
                WriteGroupedRecordsToFile(schema=self.schema,
                                          file_format=self._temp_file_format),
                file_prefix_pcv, *self.schema_side_inputs))

        return self._maybe_apply_user_trigger(destination_files_kv_pc)
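
For orientation, here is a minimal sketch of the same GroupIntoBatches.WithShardedKey pattern outside the BigQuery sink (the keys and values are invented): elements are sharded, grouped and batched per key, and the output key is a ShardedKey, so the plain key has to be recovered via its .key attribute, just as 'FromHashableTableRefAndDropShard' does above.

import apache_beam as beam
from apache_beam.transforms.util import GroupIntoBatches

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('tableA', 1), ('tableA', 2), ('tableB', 3)])
        # Shard, group and batch elements per key.
        | GroupIntoBatches.WithShardedKey(batch_size=2)
        # Drop the shard and restore the plain key, mirroring the
        # 'FromHashableTableRefAndDropShard' step above.
        | beam.Map(lambda kv: (kv[0].key, list(kv[1])))
        | beam.Map(print))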
Example #2
def expand(self, pcoll):
    # Resolve the GCP project from the pipeline options if it was not
    # passed to the transform explicitly.
    if self.project is None:
        self.project = pcoll.pipeline.options.view_as(
            GoogleCloudOptions).project
    if self.project is None:
        raise ValueError(
            'GCP project name needs to be specified in "project" pipeline option'
        )
    # Batch catalog items per sharded key, then import each batch.
    return (
        pcoll
        | GroupIntoBatches.WithShardedKey(self.max_batch_size)
        | ParDo(
            _ImportCatalogItemsFn(self.project, self.retry, self.timeout,
                                  self.metadata, self.catalog_name)))
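
As a small aside on the project fallback in expand() above: when no project is given to the transform, it reads the one set via the --project pipeline option. A tiny, hypothetical illustration ('my-gcp-project' is a placeholder id):

from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions

# Placeholder project id; in a real pipeline this comes from --project.
options = PipelineOptions(['--project=my-gcp-project'])
print(options.view_as(GoogleCloudOptions).project)  # prints: my-gcp-project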