def expand(self, pcoll):
  p = pcoll.pipeline

  if (isinstance(self.table_reference, bigquery.TableReference)
      and self.table_reference.projectId is None):
    self.table_reference.projectId = pcoll.pipeline.options.view_as(
        GoogleCloudOptions).project

  method_to_use = self._compute_method(p, p.options)

  if method_to_use == WriteToBigQuery.Method.STREAMING_INSERTS:
    # TODO: Support load jobs for streaming pipelines.
    bigquery_write_fn = BigQueryWriteFn(
        schema=self.schema,
        batch_size=self.batch_size,
        create_disposition=self.create_disposition,
        write_disposition=self.write_disposition,
        kms_key=self.kms_key,
        retry_strategy=self.insert_retry_strategy,
        test_client=self.test_client)

    outputs = (
        pcoll
        | 'AppendDestination' >> beam.ParDo(
            bigquery_tools.AppendDestinationsFn(self.table_reference))
        | 'StreamInsertRows' >> ParDo(bigquery_write_fn).with_outputs(
            BigQueryWriteFn.FAILED_ROWS, main='main'))

    return {
        BigQueryWriteFn.FAILED_ROWS: outputs[BigQueryWriteFn.FAILED_ROWS],
    }
  else:
    if p.options.view_as(StandardOptions).streaming:
      raise NotImplementedError(
          'File Loads to BigQuery are only supported on Batch pipelines.')

    from apache_beam.io.gcp import bigquery_file_loads
    return (
        pcoll
        | bigquery_file_loads.BigQueryBatchFileLoads(
            destination=self.table_reference,
            schema=self.schema,
            create_disposition=self.create_disposition,
            write_disposition=self.write_disposition,
            max_file_size=self.max_file_size,
            max_files_per_bundle=self.max_files_per_bundle,
            custom_gcs_temp_location=self.custom_gcs_temp_location,
            test_client=self.test_client,
            validate=self._validate))
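# Usage sketch (illustrative, not part of the source): how the streaming-inserts
# branch above surfaces failed rows to the caller. The project, dataset, table,
# and schema below are hypothetical placeholders; actually running this requires
# GCP credentials and an existing dataset.
import apache_beam as beam
from apache_beam.io.gcp.bigquery import BigQueryWriteFn


def run_streaming_insert_example():
  with beam.Pipeline() as p:
    result = (
        p
        | beam.Create([{'name': 'example', 'value': 1}])
        | beam.io.WriteToBigQuery(
            'my-project:my_dataset.my_table',
            schema='name:STRING,value:INTEGER',
            method=beam.io.WriteToBigQuery.Method.STREAMING_INSERTS))
    # In the version of the transform shown above, expand() returns a dict
    # keyed by BigQueryWriteFn.FAILED_ROWS with the rows that could not be
    # inserted.
    failed_rows = result[BigQueryWriteFn.FAILED_ROWS]
    _ = failed_rows | 'LogFailedRows' >> beam.Map(print)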
def expand(self, pcoll):
  p = pcoll.pipeline

  temp_location = p.options.view_as(GoogleCloudOptions).temp_location

  empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
  singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

  load_job_name_pcv = pvalue.AsSingleton(
      singleton_pc
      | beam.Map(lambda _: _generate_load_job_name()))

  file_prefix_pcv = pvalue.AsSingleton(
      singleton_pc
      | "GenerateFilePrefix" >> beam.Map(
          file_prefix_generator(self._validate,
                                self._custom_gcs_temp_location,
                                temp_location)))

  destination_data_kv_pc = (
      pcoll
      | "RewindowIntoGlobal" >> self._window_fn()
      | "AppendDestination" >> beam.ParDo(
          bigquery_tools.AppendDestinationsFn(self.destination),
          *self.table_side_inputs))

  all_destination_file_pairs_pc = self._write_files(
      destination_data_kv_pc, file_prefix_pcv)

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  partitions = (
      grouped_files_pc
      | beam.ParDo(
          PartitionFiles(self.max_partition_size,
                         self.max_files_per_partition)).with_outputs(
                             PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                             PartitionFiles.SINGLE_PARTITION_TAG))

  multiple_partitions_per_destination_pc = partitions[
      PartitionFiles.MULTIPLE_PARTITIONS_TAG]
  single_partition_per_destination_pc = partitions[
      PartitionFiles.SINGLE_PARTITION_TAG]

  # When using dynamic destinations, elements with both single and multiple
  # partitions are loaded into BigQuery using temporary tables to ensure
  # atomicity.
  if self.dynamic_destinations:
    all_partitions = ((multiple_partitions_per_destination_pc,
                       single_partition_per_destination_pc)
                      | "FlattenPartitions" >> beam.Flatten())
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(all_partitions, empty_pc, load_job_name_pcv,
                        singleton_pc))
  else:
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(multiple_partitions_per_destination_pc,
                        single_partition_per_destination_pc,
                        load_job_name_pcv, singleton_pc))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
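# Illustrative sketch (toy DoFn, not from the source): the PartitionFiles step
# above relies on ParDo(...).with_outputs(...) to split one input PCollection
# into tagged output streams. The same pattern in isolation, runnable on the
# DirectRunner with made-up data:
import apache_beam as beam


class SplitBySize(beam.DoFn):
  MULTIPLE = 'multiple'

  def process(self, element):
    destination, files = element
    if len(files) > 1:
      # More than one file for this destination: emit to the tagged output.
      yield beam.pvalue.TaggedOutput(self.MULTIPLE, element)
    else:
      # Single file: emit to the main output.
      yield element


def run_with_outputs_example():
  with beam.Pipeline() as p:
    outputs = (
        p
        | beam.Create([('tableA', ['f1']), ('tableB', ['f1', 'f2', 'f3'])])
        | beam.ParDo(SplitBySize()).with_outputs(
            SplitBySize.MULTIPLE, main='single'))
    single_pc = outputs['single']
    multiple_pc = outputs[SplitBySize.MULTIPLE]
    _ = single_pc | 'PrintSingle' >> beam.Map(print)
    _ = multiple_pc | 'PrintMultiple' >> beam.Map(print)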
def expand(self, pcoll):
  p = pcoll.pipeline
  try:
    step_name = self.label
  except AttributeError:
    step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT
    BigQueryBatchFileLoads.COUNT += 1

  temp_location = p.options.view_as(GoogleCloudOptions).temp_location
  job_name = (
      p.options.view_as(GoogleCloudOptions).job_name or 'AUTOMATIC_JOB_NAME')

  empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
  singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

  load_job_name_pcv = pvalue.AsSingleton(
      singleton_pc
      | "LoadJobNamePrefix" >> beam.Map(
          lambda _: _generate_job_name(
              job_name, bigquery_tools.BigQueryJobTypes.LOAD, 'LOAD_STEP')))

  schema_mod_job_name_pcv = pvalue.AsSingleton(
      singleton_pc
      | "SchemaModJobNamePrefix" >> beam.Map(
          lambda _: _generate_job_name(
              job_name, bigquery_tools.BigQueryJobTypes.LOAD,
              'SCHEMA_MOD_STEP')))

  copy_job_name_pcv = pvalue.AsSingleton(
      singleton_pc
      | "CopyJobNamePrefix" >> beam.Map(
          lambda _: _generate_job_name(
              job_name, bigquery_tools.BigQueryJobTypes.COPY, 'COPY_STEP')))

  file_prefix_pcv = pvalue.AsSingleton(
      singleton_pc
      | "GenerateFilePrefix" >> beam.Map(
          file_prefix_generator(self._validate,
                                self._custom_gcs_temp_location,
                                temp_location)))

  destination_data_kv_pc = (
      pcoll
      | "RewindowIntoGlobal" >> self._window_fn()
      | "AppendDestination" >> beam.ParDo(
          bigquery_tools.AppendDestinationsFn(self.destination),
          *self.table_side_inputs))

  if not self.with_auto_sharding:
    all_destination_file_pairs_pc = self._write_files(
        destination_data_kv_pc, file_prefix_pcv)
  else:
    all_destination_file_pairs_pc = self._write_files_with_auto_sharding(
        destination_data_kv_pc, file_prefix_pcv)

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  partitions = (
      grouped_files_pc
      | beam.ParDo(
          PartitionFiles(self.max_partition_size,
                         self.max_files_per_partition)).with_outputs(
                             PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                             PartitionFiles.SINGLE_PARTITION_TAG))

  multiple_partitions_per_destination_pc = partitions[
      PartitionFiles.MULTIPLE_PARTITIONS_TAG]
  single_partition_per_destination_pc = partitions[
      PartitionFiles.SINGLE_PARTITION_TAG]

  # When using dynamic destinations, elements with both single and multiple
  # partitions are loaded into BigQuery using temporary tables to ensure
  # atomicity.
  if self.dynamic_destinations:
    all_partitions = ((multiple_partitions_per_destination_pc,
                       single_partition_per_destination_pc)
                      | "FlattenPartitions" >> beam.Flatten())
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(all_partitions, empty_pc, load_job_name_pcv,
                        schema_mod_job_name_pcv, copy_job_name_pcv, p,
                        step_name))
  else:
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(multiple_partitions_per_destination_pc,
                        single_partition_per_destination_pc,
                        load_job_name_pcv, schema_mod_job_name_pcv,
                        copy_job_name_pcv, p, step_name))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
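# Usage sketch (illustrative, not from the source): the expand() above is
# normally reached via WriteToBigQuery with the FILE_LOADS method, but the
# transform can also be applied directly; its return value is a dict keyed by
# the DESTINATION_* constants. The bucket, project, dataset, table, and schema
# dict below are hypothetical placeholders, and running this for real requires
# GCS and BigQuery access.
import apache_beam as beam
from apache_beam.io.gcp import bigquery_file_loads


def run_file_loads_example():
  with beam.Pipeline() as p:
    outputs = (
        p
        | beam.Create([{'name': 'example', 'value': 1}])
        | bigquery_file_loads.BigQueryBatchFileLoads(
            destination='my-project:my_dataset.my_table',
            schema={'fields': [
                {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
                {'name': 'value', 'type': 'INTEGER', 'mode': 'NULLABLE'}]},
            custom_gcs_temp_location='gs://my-bucket/tmp',
            validate=False))
    # (destination, load job id) pairs and (destination, file) pairs.
    load_jobs_pc = outputs[
        bigquery_file_loads.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
    files_pc = outputs[
        bigquery_file_loads.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    _ = load_jobs_pc | 'PrintLoadJobs' >> beam.Map(print)
    _ = files_pc | 'PrintFiles' >> beam.Map(print)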
def expand(self, pcoll):
  p = pcoll.pipeline

  self._custom_gcs_temp_location = (
      self._custom_gcs_temp_location
      or p.options.view_as(GoogleCloudOptions).temp_location)

  load_job_name_pcv = pvalue.AsSingleton(
      p
      | "ImpulseJobName" >> beam.Create([None])
      | beam.Map(lambda _: _generate_load_job_name()))

  file_prefix_pcv = pvalue.AsSingleton(
      p
      | "CreateFilePrefixView" >> beam.Create(
          [self._custom_gcs_temp_location])
      | "GenerateFilePrefix" >> beam.Map(
          file_prefix_generator(self._validate)))

  outputs = (
      pcoll
      | "ApplyGlobalWindow" >> beam.WindowInto(beam.window.GlobalWindows())
      | "AppendDestination" >> beam.ParDo(
          bigquery_tools.AppendDestinationsFn(self.destination))
      | beam.ParDo(
          WriteRecordsToFile(max_files_per_bundle=self.max_files_per_bundle,
                             max_file_size=self.max_file_size,
                             coder=self.coder),
          file_prefix=file_prefix_pcv).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded and
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(coder=self.coder),
          file_prefix=file_prefix_pcv))

  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten())

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  # Load jobs are triggered to temporary tables, and those are later copied to
  # the actual destination tables. This ensures atomicity when only some of
  # the load jobs fail but not others: if any of them fails, copy jobs are not
  # triggered.
  trigger_loads_outputs = (
      grouped_files_pc
      | beam.ParDo(
          TriggerLoadJobs(schema=self.schema,
                          write_disposition=self.write_disposition,
                          create_disposition=self.create_disposition,
                          test_client=self.test_client,
                          temporary_tables=self.temp_tables),
          load_job_name_pcv).with_outputs(TriggerLoadJobs.TEMP_TABLES,
                                          main='main'))

  destination_job_ids_pc = trigger_loads_outputs['main']
  temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

  destination_copy_job_ids_pc = (
      p
      | "ImpulseMonitorLoadJobs" >> beam.Create([None])
      | "WaitForLoadJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_job_ids_pc))
      | beam.ParDo(
          TriggerCopyJobs(create_disposition=self.create_disposition,
                          write_disposition=self.write_disposition,
                          temporary_tables=self.temp_tables,
                          test_client=self.test_client),
          load_job_name_pcv))

  finished_copy_jobs_pc = (
      p
      | "ImpulseMonitorCopyJobs" >> beam.Create([None])
      | "WaitForCopyJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_copy_job_ids_pc)))

  _ = (
      finished_copy_jobs_pc
      | "RemoveTempTables/PassTables" >> beam.FlatMap(
          lambda x, deleting_tables: deleting_tables,
          pvalue.AsIter(temp_tables_pc))
      | "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
      | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
      | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
      | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
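# Illustrative sketch (assumed data and shard count, not from the source): the
# _ShardDestinations / GroupShardedRows / DropShardNumber steps above key each
# unwritten record by (destination, random shard), group, and then drop the
# shard number so each group can be written as one file. The same pattern in
# isolation, runnable on the DirectRunner:
import random

import apache_beam as beam

NUM_SHARDS = 3  # assumption; the real transform chooses its own shard count


def run_shard_group_example():
  with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('tableA', 'row1'), ('tableA', 'row2'),
                       ('tableB', 'row3')])
        | 'ShardDestinations' >> beam.Map(
            lambda kv: ((kv[0], random.randint(0, NUM_SHARDS - 1)), kv[1]))
        | 'GroupShardedRows' >> beam.GroupByKey()
        | 'DropShardNumber' >> beam.Map(lambda kv: (kv[0][0], list(kv[1])))
        | beam.Map(print))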
def expand(self, pcoll):
  p = pcoll.pipeline

  temp_location = p.options.view_as(GoogleCloudOptions).temp_location

  load_job_name_pcv = pvalue.AsSingleton(
      p
      | "ImpulseJobName" >> beam.Create([None])
      | beam.Map(lambda _: _generate_load_job_name()))

  file_prefix_pcv = pvalue.AsSingleton(
      p
      | "CreateFilePrefixView" >> beam.Create([''])
      | "GenerateFilePrefix" >> beam.Map(
          file_prefix_generator(self._validate,
                                self._custom_gcs_temp_location,
                                temp_location)))

  destination_data_kv_pc = (
      pcoll
      | "RewindowIntoGlobal" >> self._window_fn()
      | "AppendDestination" >> beam.ParDo(
          bigquery_tools.AppendDestinationsFn(self.destination),
          *self.table_side_inputs))

  all_destination_file_pairs_pc = self._write_files(
      destination_data_kv_pc, file_prefix_pcv)

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  # Load jobs are triggered to temporary tables, and those are later copied to
  # the actual destination tables. This ensures atomicity when only some of
  # the load jobs fail but not others: if any of them fails, copy jobs are not
  # triggered.
  trigger_loads_outputs = (
      grouped_files_pc
      | beam.ParDo(
          TriggerLoadJobs(
              schema=self.schema,
              write_disposition=self.write_disposition,
              create_disposition=self.create_disposition,
              test_client=self.test_client,
              temporary_tables=self.temp_tables,
              additional_bq_parameters=self.additional_bq_parameters),
          load_job_name_pcv,
          *self.schema_side_inputs).with_outputs(
              TriggerLoadJobs.TEMP_TABLES, main='main'))

  destination_job_ids_pc = trigger_loads_outputs['main']
  temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

  destination_copy_job_ids_pc = (
      p
      | "ImpulseMonitorLoadJobs" >> beam.Create([None])
      | "WaitForLoadJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_job_ids_pc))
      | beam.ParDo(
          TriggerCopyJobs(create_disposition=self.create_disposition,
                          write_disposition=self.write_disposition,
                          temporary_tables=self.temp_tables,
                          test_client=self.test_client),
          load_job_name_pcv))

  finished_copy_jobs_pc = (
      p
      | "ImpulseMonitorCopyJobs" >> beam.Create([None])
      | "WaitForCopyJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_copy_job_ids_pc)))

  _ = (
      finished_copy_jobs_pc
      | "RemoveTempTables/PassTables" >> beam.FlatMap(
          lambda x, deleting_tables: deleting_tables,
          pvalue.AsIter(temp_tables_pc))
      | "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
      | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
      | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
      | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
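# Self-contained sketch (assumed table names, not from the source) of the
# deduplicate-then-act pattern used by the RemoveTempTables steps above: pair
# each table name with None, group to collapse duplicates, keep only the key,
# then act once per unique name. Runnable on the DirectRunner; print stands in
# for DeleteTablesFn.
import apache_beam as beam


def run_dedup_delete_example():
  with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['proj:ds.tmp_a', 'proj:ds.tmp_a', 'proj:ds.tmp_b'])
        | 'AddUselessValue' >> beam.Map(lambda table: (table, None))
        | 'DeduplicateTables' >> beam.GroupByKey()
        | 'GetTableNames' >> beam.Map(lambda kv: kv[0])
        | 'Delete' >> beam.Map(print))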