def testSideInputNotCopied(self):
  with beam.Pipeline() as p:
    side = (
        p
        | 'CreateSide' >> beam.Create(['s1', 's2', 's3'])
        | beam.Map(
            lambda x: DeepCopyTest._CountingIdentityFn('SideInput', x)))
    main = (
        p
        | 'CreateMain' >> beam.Create([1, 2, 3])
        | beam.Map(lambda x: DeepCopyTest._CountingIdentityFn('Main', x))
        | beam.Map(lambda e, s: (e, list(s)), pvalue.AsList(side)))
    copied = deep_copy.deep_copy(main)

    # Check that deep copy was performed.
    self.assertIsNot(main, copied)
    self.assertIsNot(main.producer, copied.producer)

    # Check that deep copy stops at the side input materialization boundary.
    self.assertIs(main.producer.side_inputs[0],
                  copied.producer.side_inputs[0])
    self.assertIs(main.producer.side_inputs[0].pvalue, side)

  # Check counts of processed items.
  self.assertEqual(DeepCopyTest._counts['SideInput'], 3)
  self.assertEqual(DeepCopyTest._counts['Main'], 6)
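# Context for the assertions above: the test relies on a counting identity
# helper and a per-label counter defined on DeepCopyTest but not shown in
# this excerpt. Below is a hypothetical, module-level sketch of that helper
# (names and structure assumed, not the real test fixture). The side-input
# branch is not duplicated by deep_copy, so its 3 elements are processed
# once, while the deep-copied main branch processes its 3 elements twice,
# which is why the expected counts are 3 and 6.
import collections

_counts = collections.Counter()


def counting_identity(label, x):
  # Record one invocation for this label, then pass the element through.
  _counts[label] += 1
  return x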
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--key_file',
      dest='key_file',
      required=True,
      help='Path to service account credentials JSON.')
  parser.add_argument(
      '--input',
      dest='input',
      required=True,
      help='GCS input file to sign.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  p = beam.Pipeline(options=pipeline_options)

  credentials = (
      p | 'Read Credentials from GCS' >> ReadFromText(known_args.key_file))

  (p
   | 'Read File from GCS' >> beam.Create([known_args.input])
   | 'Sign File' >> beam.ParDo(SignFileFn(), pvalue.AsList(credentials)))

  result = p.run()
  result.wait_until_finish()
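# SignFileFn is referenced above but not defined in this excerpt. The sketch
# below is a hypothetical stand-in that only illustrates the calling
# convention: because the credentials PCollection is attached with
# pvalue.AsList(...), Beam passes the materialized list of text lines read
# from the key file to process() as an extra argument.
import json

import apache_beam as beam


class SignFileFn(beam.DoFn):
  """Signs a GCS path using service-account credentials from a side input."""

  def process(self, gcs_path, credentials_lines):
    # ReadFromText yields the key file line by line; reassemble and parse it.
    key_json = json.loads(''.join(credentials_lines))
    # Placeholder output: the real DoFn would build a signer from key_json
    # and emit a signed URL for gcs_path.
    yield gcs_path, key_json.get('client_email')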
def run():
  parser = argparse.ArgumentParser()
  _, pipeline_args = parser.parse_known_args()

  # Google Cloud parameters
  pipeline_options = PipelineOptions(pipeline_args)
  google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
  if not google_cloud_options.project:
    raise ValueError("Project is a required input.")
  logging.info("Using project: %s", google_cloud_options.project)
  if not google_cloud_options.job_name:
    google_cloud_options.job_name = 'name-counts-{}'.format(
        dt.datetime.utcnow().strftime('%Y%m%d-%H%M%S'))
  logging.info("Dataflow job name: %s", google_cloud_options.job_name)

  # Parameters used in the pipeline
  name_count_options = pipeline_options.view_as(NameCountOptions)
  gcs_input_path = name_count_options.input_path
  gcs_output_path_first_names = name_count_options.output_path_template.format(
      'first_names')
  gcs_output_path_last_names = name_count_options.output_path_template.format(
      'last_names')
  logging.info("Using input path: %s", gcs_input_path)
  logging.info("Using output path for first names: %s",
               gcs_output_path_first_names)
  logging.info("Using output path for last names: %s",
               gcs_output_path_last_names)

  p = beam.Pipeline(options=pipeline_options)

  # Set up the pipeline
  pipeline_startup = (
      p
      | 'Create File Glob' >> Create([gcs_input_path])
      | 'Read All Files' >> beam.io.ReadAllFromText()
      | 'Split Into Names' >> beam.Map(lambda string: string.split(',')))

  filter_letters = (
      p | 'Letters Side Input' >> Create(['A', 'B', 'C', 'X', 'Y', 'Z']))

  names_filtered = (
      pipeline_startup
      | 'Filter Names' >> beam.ParDo(
          SplitAndFilterNames(),
          filter_letters=pvalue.AsList(filter_letters)))

  first_names = (
      names_filtered
      | 'Get First Names' >> beam.ParDo(GetFirstName())
      | 'Group By First Name' >> beam.CombinePerKey(max)
      | 'Store First Name Result' >> beam.io.WriteToText(
          gcs_output_path_first_names, num_shards=1))

  last_names = (
      names_filtered
      | 'Get Last Names' >> beam.ParDo(GetLastName())
      | 'Group By Last Name' >> beam.CombinePerKey(min)
      | 'Store Last Name Result' >> beam.io.WriteToText(
          gcs_output_path_last_names, num_shards=1))

  result = p.run()
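# NameCountOptions and the DoFns used above are defined elsewhere; the sketch
# below is a hypothetical reconstruction for context only. NameCountOptions
# follows the standard Beam pattern of subclassing PipelineOptions and
# registering flags in _add_argparse_args, which is what makes
# pipeline_options.view_as(NameCountOptions) work. SplitAndFilterNames shows
# how the filter_letters keyword receives the materialized AsList side input;
# its exact filtering logic here is assumed, not taken from the original.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


class NameCountOptions(PipelineOptions):

  @classmethod
  def _add_argparse_args(cls, parser):
    parser.add_argument(
        '--input_path',
        help='Glob of input CSV files, e.g. gs://<bucket>/names/*.csv')
    parser.add_argument(
        '--output_path_template',
        help='Output path template with a single {} placeholder.')


class SplitAndFilterNames(beam.DoFn):
  """Keeps only the names whose first letter appears in the side input."""

  def process(self, name_parts, filter_letters):
    for name in name_parts:
      if name and name[0].upper() in filter_letters:
        yield name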
def _load_data(
    self,
    partitions_using_temp_tables,
    partitions_direct_to_destination,
    load_job_name_pcv,
    schema_mod_job_name_pcv,
    copy_job_name_pcv,
    p,
    step_name):
  """Load data to BigQuery.

  Data is loaded into BigQuery in the following two ways:
    1. Single partition:
       When there is a single partition of files destined to a single
       destination, a single load job is triggered.
    2. Multiple partitions and/or Dynamic Destinations:
       When there are multiple partitions of files destined for a single
       destination or when Dynamic Destinations are used, multiple load jobs
       need to be triggered for each partition/destination. Load jobs are
       triggered to temporary tables, and those are later copied to the
       actual appropriate destination table. This ensures atomicity when only
       some of the load jobs would fail but not others. If any of them fails,
       then copy jobs are not triggered.
  """
  # Load data using temp tables
  trigger_loads_outputs = (
      partitions_using_temp_tables
      | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
          TriggerLoadJobs(
              schema=self.schema,
              write_disposition=self.write_disposition,
              create_disposition=self.create_disposition,
              test_client=self.test_client,
              temporary_tables=True,
              additional_bq_parameters=self.additional_bq_parameters,
              source_format=self._temp_file_format,
              step_name=step_name),
          load_job_name_pcv,
          *self.schema_side_inputs).with_outputs(
              TriggerLoadJobs.TEMP_TABLES, main='main'))

  temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
  temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

  finished_temp_tables_load_jobs_pc = (
      p
      | "ImpulseMonitorLoadJobs" >> beam.Create([None])
      | "WaitForTempTableLoadJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          pvalue.AsList(temp_tables_load_job_ids_pc)))

  schema_mod_job_ids_pc = (
      finished_temp_tables_load_jobs_pc
      | beam.ParDo(
          UpdateDestinationSchema(
              write_disposition=self.write_disposition,
              test_client=self.test_client,
              additional_bq_parameters=self.additional_bq_parameters,
              step_name=step_name),
          schema_mod_job_name_pcv))

  finished_schema_mod_jobs_pc = (
      p
      | "ImpulseMonitorSchemaModJobs" >> beam.Create([None])
      | "WaitForSchemaModJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          pvalue.AsList(schema_mod_job_ids_pc)))

  destination_copy_job_ids_pc = (
      finished_temp_tables_load_jobs_pc
      | beam.ParDo(
          TriggerCopyJobs(
              create_disposition=self.create_disposition,
              write_disposition=self.write_disposition,
              test_client=self.test_client,
              step_name=step_name),
          copy_job_name_pcv,
          pvalue.AsIter(finished_schema_mod_jobs_pc)))

  finished_copy_jobs_pc = (
      p
      | "ImpulseMonitorCopyJobs" >> beam.Create([None])
      | "WaitForCopyJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          pvalue.AsList(destination_copy_job_ids_pc)))

  _ = (
      p
      | "RemoveTempTables/Impulse" >> beam.Create([None])
      | "RemoveTempTables/PassTables" >> beam.FlatMap(
          lambda _, unused_copy_jobs, deleting_tables: deleting_tables,
          pvalue.AsIter(finished_copy_jobs_pc),
          pvalue.AsIter(temp_tables_pc))
      | "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
      | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
      | "RemoveTempTables/GetTableNames" >> beam.Keys()
      | "RemoveTempTables/Delete" >> beam.ParDo(
          DeleteTablesFn(self.test_client)))

  # Load data directly to destination table
  destination_load_job_ids_pc = (
      partitions_direct_to_destination
      | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
          TriggerLoadJobs(
              schema=self.schema,
              write_disposition=self.write_disposition,
              create_disposition=self.create_disposition,
              test_client=self.test_client,
              temporary_tables=False,
              additional_bq_parameters=self.additional_bq_parameters,
              source_format=self._temp_file_format,
              step_name=step_name),
          load_job_name_pcv,
          *self.schema_side_inputs))

  _ = (
      p
      | "ImpulseMonitorDestinationLoadJobs" >> beam.Create([None])
      | "WaitForDestinationLoadJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          pvalue.AsList(destination_load_job_ids_pc)))

  destination_load_job_ids_pc = (
      (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
      | beam.Flatten())

  return destination_load_job_ids_pc, destination_copy_job_ids_pc
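# The function above uses the same synchronization idiom several times: a
# single-element Create acts as an impulse, and the PCollection of BigQuery
# job ids is attached as an AsList side input, so the waiting step fires once
# with the complete list of ids. Below is a minimal, self-contained sketch of
# that idiom. WaitForBQJobs is the real transform used above; wait_for_all is
# a hypothetical stand-in that just passes the ids through instead of polling
# BigQuery.
import apache_beam as beam
from apache_beam import pvalue


def wait_for_all(_, job_ids):
  # The real DoFn would poll BigQuery until every job in `job_ids` finishes;
  # here we only demonstrate that the full list arrives in one call.
  return job_ids


with beam.Pipeline() as p:
  job_ids = p | 'FakeJobIds' >> beam.Create(['job-1', 'job-2'])
  done = (
      p
      | 'ImpulseMonitor' >> beam.Create([None])
      | 'WaitForJobs' >> beam.FlatMap(wait_for_all, pvalue.AsList(job_ids)))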