def expand(self, pbegin):
  if self._read_operations is not None and isinstance(pbegin, PBegin):
    pcoll = pbegin.pipeline | Create(self._read_operations)
  elif not isinstance(pbegin, PBegin):
    if self._read_operations is not None:
      raise ValueError(
          "Read operation in the constructor only works with "
          "the root of the pipeline.")
    pcoll = pbegin
  else:
    raise ValueError(
        "Spanner requires a read operation, sql, or table "
        "with columns.")

  if self._transaction is None:
    # Batch read: use the Spanner partitioning query to create partitions,
    # then read each partition in parallel.
    p = (
        pcoll
        | 'Generate Partitions' >> ParDo(
            _CreateReadPartitions(spanner_configuration=self._configuration))
        | 'Reshuffle' >> Reshuffle()
        | 'Read From Partitions' >> ParDo(
            _ReadFromPartitionFn(spanner_configuration=self._configuration)))
  else:
    # Naive read: no partitioning; each query is executed as a single read
    # within the supplied transaction.
    p = (
        pcoll
        | 'Reshuffle' >> Reshuffle().with_input_types(ReadOperation)
        | 'Perform Read' >> ParDo(
            _NaiveSpannerReadDoFn(spanner_configuration=self._configuration),
            AsSingleton(self._transaction)))
  return p
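# Illustrative usage sketch (not part of the original source). It assumes the
# ReadFromSpanner transform whose expand() is shown above, with placeholder
# project/instance/database names and query. When the read operation (sql, or
# table with columns) is given to the constructor, the transform is applied at
# the pipeline root and expand() receives a PBegin; otherwise it consumes an
# upstream PCollection of ReadOperation elements.

import apache_beam as beam
from apache_beam.io.gcp.experimental.spannerio import ReadFromSpanner

with beam.Pipeline() as p:
  rows = (
      p
      | 'Read Users' >> ReadFromSpanner(
          project_id='my-project',      # placeholder
          instance_id='my-instance',    # placeholder
          database_id='my-database',    # placeholder
          sql='SELECT * FROM users'))   # placeholder query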
def _verify_data(self, pcol, init_size, data_size):
  # Read back all the generated Parquet files.
  read = pcol | 'read' >> ReadAllFromParquet()
  # Verify the global sum of the 'number' field.
  v1 = (
      read
      | 'get_number' >> Map(lambda x: x['number'])
      | 'sum_globally' >> CombineGlobally(sum)
      | 'validate_number' >> FlatMap(
          lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)))
  # Verify the per-name record counts.
  v2 = (
      read
      | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
      | 'count_per_key' >> Count.PerKey()
      | 'validate_name' >> FlatMap(
          lambda x: TestParquetIT._count_verifier(init_size, data_size, x)))
  # Flatten the verifier outputs together with the file-path collection,
  # reshuffle, and delete each emitted path.
  _ = ((v1, v2, pcol)
       | 'flatten' >> Flatten()
       | 'reshuffle' >> Reshuffle()
       | 'cleanup' >> Map(lambda x: FileSystems.delete([x])))
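# Illustrative driver sketch (an assumption, not taken from the original test
# file). _verify_data expects `pcol` to be a PCollection of Parquet file paths
# whose records carry 'name' and 'number' fields; the exact relationship
# between init_size, data_size, and the generated records is defined by the
# original test's data generator, so the values and records below are
# placeholders only.

import apache_beam as beam
import pyarrow as pa
from apache_beam.io.parquetio import WriteToParquet

init_size = 10     # placeholder: number of shards / distinct names
data_size = 100    # placeholder: total number of records

schema = pa.schema([('name', pa.string()), ('number', pa.int64())])
records = [{'name': 'name_%d' % (i % init_size), 'number': i}
           for i in range(data_size)]

with beam.Pipeline() as p:
  # WriteToParquet returns the written file paths; that PCollection is what
  # would be handed to _verify_data(file_paths, init_size, data_size).
  file_paths = (
      p
      | 'Create Records' >> beam.Create(records)
      | 'Write Parquet' >> WriteToParquet(
          '/tmp/parquet_it_test', schema, num_shards=init_size))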