def test(self):
    self.result = (
        self.pipeline
        | 'Read from BigQuery' >> Read(BigQuerySource(
            dataset=self.input_dataset, table=self.input_table))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Count' >> Count.Globally())
def test(self):
    output = (
        self.pipeline
        | 'Read from BigQuery' >> Read(BigQuerySource(
            dataset=self.input_dataset, table=self.input_table))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Count' >> Count.Globally())
    assert_that(output, equal_to([self.input_options['num_records']]))
def test(self):
    output = (
        self.pipeline
        | 'Read from Spanner' >> ReadFromSpanner(
            self.project,
            self.spanner_instance,
            self.spanner_database,
            sql="select data from test_data")
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Count' >> Count.Globally())
    assert_that(output, equal_to([self.input_options['num_records']]))
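# Hedged sketch of the imports the read-performance tests above rely on. The
# exact module paths are an assumption based on the Beam Python SDK layout at
# the time (BigQuerySource and the experimental Spanner connector have since
# moved or been deprecated), so treat this as an illustration, not the
# canonical list:
from apache_beam import ParDo
from apache_beam.io import Read
from apache_beam.io.gcp.bigquery import BigQuerySource
from apache_beam.io.gcp.experimental.spannerio import ReadFromSpanner
from apache_beam.testing.load_tests.load_test_metrics_utils import CountMessages
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.transforms.combiners import Count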
def _verify_data(self, pcol, init_size, data_size):
    read = pcol | 'read' >> ReadAllFromParquet()
    v1 = (
        read
        | 'get_number' >> Map(lambda x: x['number'])
        | 'sum_globally' >> CombineGlobally(sum)
        | 'validate_number' >> FlatMap(
            lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)))
    v2 = (
        read
        | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
        | 'count_per_key' >> Count.PerKey()
        | 'validate_name' >> FlatMap(
            lambda x: TestParquetIT._count_verifier(init_size, data_size, x)))
    _ = ((v1, v2, pcol)
         | 'flatten' >> Flatten()
         | 'reshuffle' >> Reshuffle()
         | 'cleanup' >> Map(lambda x: FileSystems.delete([x])))
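# Hedged sketch of the imports _verify_data assumes; module locations are an
# assumption based on the Beam Python SDK layout and may vary by version:
from apache_beam import CombineGlobally
from apache_beam import FlatMap
from apache_beam import Flatten
from apache_beam import Map
from apache_beam import Reshuffle
from apache_beam.io.filesystems import FileSystems
from apache_beam.io.parquetio import ReadAllFromParquet
from apache_beam.transforms.combiners import Count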
def expand(self, pcoll):
    p = pcoll.pipeline

    load_job_name_pcv = pvalue.AsSingleton(
        p
        | "ImpulseJobName" >> beam.Create([None])
        | beam.Map(lambda _: _generate_load_job_name()))

    file_prefix_pcv = pvalue.AsSingleton(
        p
        | "CreateFilePrefixView" >> beam.Create([self._input_gs_location])
        | "GenerateFilePrefix" >> beam.Map(_generate_file_prefix))

    outputs = (
        pcoll
        | "ApplyGlobalWindow" >> beam.WindowInto(beam.window.GlobalWindows())
        | "AppendDestination" >> beam.ParDo(
            _AppendDestinationsFn(self.destination))
        | beam.ParDo(
            WriteRecordsToFile(
                max_files_per_bundle=self.max_files_per_bundle,
                max_file_size=self.max_file_size,
                coder=self.coder),
            file_prefix=file_prefix_pcv).with_outputs(
                WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                WriteRecordsToFile.WRITTEN_FILE_TAG))

    # A PCollection of (destination, file) tuples. It lists files with records,
    # and the destination each file is meant to be imported into.
    destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

    # A PCollection of (destination, record) tuples. These are later sharded,
    # grouped, and all records for each destination-shard are written to files.
    # This PCollection is necessary because not all records can be written into
    # files in ``WriteRecordsToFile``.
    unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

    more_destination_files_kv_pc = (
        unwritten_records_pc
        | beam.ParDo(_ShardDestinations())
        | "GroupShardedRows" >> beam.GroupByKey()
        | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
        | "WriteGroupedRecordsToFile" >> beam.ParDo(
            WriteGroupedRecordsToFile(coder=self.coder),
            file_prefix=file_prefix_pcv))

    all_destination_file_pairs_pc = (
        (destination_files_kv_pc, more_destination_files_kv_pc)
        | "DestinationFilesUnion" >> beam.Flatten())

    grouped_files_pc = (
        all_destination_file_pairs_pc
        | "GroupFilesByTableDestinations" >> beam.GroupByKey())

    # Load jobs are triggered into temporary tables, which are later copied to
    # the actual destination table. This ensures atomicity when only some of
    # the load jobs fail but not others: if any load job fails, the copy jobs
    # are not triggered.
    trigger_loads_outputs = (
        grouped_files_pc
        | beam.ParDo(
            TriggerLoadJobs(
                schema=self.schema,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition,
                test_client=self.test_client,
                temporary_tables=self.temp_tables),
            load_job_name_pcv).with_outputs(
                TriggerLoadJobs.TEMP_TABLES, main='main'))

    destination_job_ids_pc = trigger_loads_outputs['main']
    temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

    destination_copy_job_ids_pc = (
        p
        | "ImpulseMonitorLoadJobs" >> beam.Create([None])
        | "WaitForLoadJobs" >> beam.ParDo(
            WaitForBQJobs(self.test_client),
            beam.pvalue.AsList(destination_job_ids_pc))
        | beam.ParDo(
            TriggerCopyJobs(
                create_disposition=self.create_disposition,
                write_disposition=self.write_disposition,
                temporary_tables=self.temp_tables,
                test_client=self.test_client),
            load_job_name_pcv))

    finished_copy_jobs_pc = (
        p
        | "ImpulseMonitorCopyJobs" >> beam.Create([None])
        | "WaitForCopyJobs" >> beam.ParDo(
            WaitForBQJobs(self.test_client),
            beam.pvalue.AsList(destination_copy_job_ids_pc)))

    _ = (
        finished_copy_jobs_pc
        | "RemoveTempTables/PassTables" >> beam.FlatMap(
            lambda x, deleting_tables: deleting_tables,
            pvalue.AsIter(temp_tables_pc))
        | "RemoveTempTables/DeduplicateTables" >> Count.PerElement()
        | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
        | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

    return {
        self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
        self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
        self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
    }
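# Hedged usage sketch for the expand() above: it is the body of a PTransform
# (the class name is not shown in the snippet), so a pipeline applies it like
# any other transform and reads the returned PCollections by key. The names
# MyBatchFileLoads, rows_pcoll, and output_table below are placeholders for
# illustration only, not identifiers from the snippet:
#
#   results = rows_pcoll | MyBatchFileLoads(destination=output_table)
#   load_job_pairs = results[MyBatchFileLoads.DESTINATION_JOBID_PAIRS]
#   copy_job_pairs = results[MyBatchFileLoads.DESTINATION_COPY_JOBID_PAIRS]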
def test(self):
    self.result = (
        self.pipeline
        | 'Read from BigQuery' >> Read(BigQuerySource(
            dataset=self.input_dataset, table=self.input_table))
        | 'Count' >> Count.Globally())