Code Example #1
 def test(self):
   self.result = (self.pipeline
                  | 'Read from BigQuery' >> Read(BigQuerySource(
                      dataset=self.input_dataset, table=self.input_table))
                  | 'Count messages' >> ParDo(CountMessages(
                      self.metrics_namespace))
                  | 'Measure time' >> ParDo(MeasureTime(
                      self.metrics_namespace))
                  | 'Count' >> Count.Globally())
Code Example #2
 def test(self):
   output = (
       self.pipeline
       | 'Read from BigQuery' >> Read(
           BigQuerySource(dataset=self.input_dataset, table=self.input_table))
       | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
       | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
       | 'Count' >> Count.Globally())
   assert_that(output, equal_to([self.input_options['num_records']]))
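Code Examples #1 and #2 read from BigQuery, so running them requires a real dataset, table, and credentials; #2 additionally asserts that the global count equals the expected number of records. As a rough, self-contained sketch of the same Count.Globally() pattern (not taken from the Beam sources; the in-memory input and the expected count of 100 are invented for illustration), the pipeline below swaps the BigQuery read for beam.Create so it runs locally on the DirectRunner:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to
from apache_beam.transforms.combiners import Count


def test_count_globally():
  # TestPipeline uses the DirectRunner by default, so no BigQuery dataset
  # or credentials are needed for this sketch.
  with TestPipeline() as p:
    output = (
        p
        | 'Create records' >> beam.Create([{'data': i} for i in range(100)])
        | 'Count' >> Count.Globally())
    # Count.Globally() emits a single element: the total number of records.
    assert_that(output, equal_to([100]))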
Code Example #3
 def test(self):
     output = (
         self.pipeline
         | 'Read from Spanner' >> ReadFromSpanner(
             self.project,
             self.spanner_instance,
             self.spanner_database,
             sql="select data from test_data")
         | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
         | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
         | 'Count' >> Count.Globally())
     assert_that(output, equal_to([self.input_options['num_records']]))
Code Example #4
File: parquetio_it_test.py  Project: sanjayksh/beam
 def _verify_data(self, pcol, init_size, data_size):
   read = pcol | 'read' >> ReadAllFromParquet()
    # Branch 1: sum the 'number' field globally and check it against the
    # expected total.
    v1 = (
       read
       | 'get_number' >> Map(lambda x: x['number'])
       | 'sum_globally' >> CombineGlobally(sum)
       | 'validate_number' >>
       FlatMap(lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)))
    # Branch 2: count records per 'name' key and check the per-key counts.
    v2 = (
       read
       | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
       | 'count_per_key' >> Count.PerKey()
       | 'validate_name' >> FlatMap(
           lambda x: TestParquetIT._count_verifier(init_size, data_size, x)))
    # Flatten the verification outputs with the original file paths and delete
    # each remaining element as a file.
    _ = ((v1, v2, pcol)
        | 'flatten' >> Flatten()
        | 'reshuffle' >> Reshuffle()
        | 'cleanup' >> Map(lambda x: FileSystems.delete([x])))
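Code Example #4 validates Parquet output along two branches: one sums the 'number' field globally, the other uses Count.PerKey() to count records per 'name' key. Below is a minimal, self-contained sketch of the Count.PerKey() step on its own (the key/value pairs and expected counts are invented for illustration):

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to
from apache_beam.transforms.combiners import Count


def test_count_per_key():
  with TestPipeline() as p:
    output = (
        p
        | 'Create pairs' >> beam.Create(
            [('alice', 1), ('alice', 7), ('bob', 3)])
        | 'Count per key' >> Count.PerKey())
    # Count.PerKey() emits one (key, count) pair per distinct key.
    assert_that(output, equal_to([('alice', 2), ('bob', 1)]))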
Code Example #5
    def expand(self, pcoll):
        p = pcoll.pipeline

        # Singleton side input holding the generated load job name.
        load_job_name_pcv = pvalue.AsSingleton(
            p
            | "ImpulseJobName" >> beam.Create([None])
            | beam.Map(lambda _: _generate_load_job_name()))

        # Singleton side input holding the prefix used for the written files.
        file_prefix_pcv = pvalue.AsSingleton(
            p
            | "CreateFilePrefixView" >> beam.Create([self._input_gs_location])
            | "GenerateFilePrefix" >> beam.Map(_generate_file_prefix))

        outputs = (
            pcoll
            | "ApplyGlobalWindow" >> beam.WindowInto(
                beam.window.GlobalWindows())
            | "AppendDestination" >> beam.ParDo(
                _AppendDestinationsFn(self.destination))
            | beam.ParDo(WriteRecordsToFile(
                max_files_per_bundle=self.max_files_per_bundle,
                max_file_size=self.max_file_size,
                coder=self.coder),
                         file_prefix=file_prefix_pcv).with_outputs(
                             WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                             WriteRecordsToFile.WRITTEN_FILE_TAG))

        # A PCollection of (destination, file) tuples. It lists files with records,
        # and the destination each file is meant to be imported into.
        destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

        # A PCollection of (destination, record) tuples. These are later sharded,
        # grouped, and all records for each destination-shard are written to files.
        # This PCollection is necessary because not all records can be written into
        # files in ``WriteRecordsToFile``.
        unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

        more_destination_files_kv_pc = (
            unwritten_records_pc
            | beam.ParDo(_ShardDestinations())
            | "GroupShardedRows" >> beam.GroupByKey()
            | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
            | "WriteGroupedRecordsToFile" >> beam.ParDo(
                WriteGroupedRecordsToFile(coder=self.coder),
                file_prefix=file_prefix_pcv))

        all_destination_file_pairs_pc = (
            (destination_files_kv_pc, more_destination_files_kv_pc)
            | "DestinationFilesUnion" >> beam.Flatten())

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        # Load jobs are triggered to temporary tables, and those are later copied
        # to the actual destination tables. This ensures atomicity when only some
        # of the load jobs fail but others succeed: if any load job fails, the
        # copy jobs are not triggered.
        trigger_loads_outputs = (grouped_files_pc | beam.ParDo(
            TriggerLoadJobs(schema=self.schema,
                            write_disposition=self.write_disposition,
                            create_disposition=self.create_disposition,
                            test_client=self.test_client,
                            temporary_tables=self.temp_tables),
            load_job_name_pcv).with_outputs(TriggerLoadJobs.TEMP_TABLES,
                                            main='main'))

        destination_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                temporary_tables=self.temp_tables,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (finished_copy_jobs_pc
             | "RemoveTempTables/PassTables" >> beam.FlatMap(
                 lambda x, deleting_tables: deleting_tables,
                 pvalue.AsIter(temp_tables_pc))
             | "RemoveTempTables/DeduplicateTables" >> Count.PerElement()
             | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
             | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

        return {
            self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
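In the RemoveTempTables steps of Code Example #5, Count.PerElement() is used as a deduplication trick: the temporary table references gathered from the side input may repeat, and counting per element collapses the duplicates before the names are extracted and the tables deleted. A minimal sketch of that idiom, with made-up table names standing in for the real references:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to
from apache_beam.transforms.combiners import Count


def test_deduplicate_with_count_per_element():
  with TestPipeline() as p:
    output = (
        p
        | 'Create names' >> beam.Create(
            ['proj:ds.tmp_1', 'proj:ds.tmp_2', 'proj:ds.tmp_1'])
        # Count.PerElement() emits (element, count) pairs, so duplicates
        # collapse into a single pair.
        | 'Deduplicate' >> Count.PerElement()
        | 'Drop counts' >> beam.Map(lambda kv: kv[0]))
    assert_that(output, equal_to(['proj:ds.tmp_1', 'proj:ds.tmp_2']))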
Code Example #6
 def test(self):
     self.result = (self.pipeline
                    | 'Read from BigQuery' >> Read(
                        BigQuerySource(dataset=self.input_dataset,
                                       table=self.input_table))
                    | 'Count' >> Count.Globally())