# Example 1
 def _verify_data(self, pcol, init_size, data_size):
   """Run two validations over parquet files named in ``pcol``, then delete them.

   The same parquet read feeds two independent checks: a global sum over the
   'number' column and a per-'name' record count. Both checks plus the
   original file-name collection are flattened together so that every
   element is finally treated as a file path and removed.

   Args:
     pcol: PCollection of parquet file paths to read and verify.
     init_size: expected-size parameter forwarded to the verifier helpers.
     data_size: expected-size parameter forwarded to the verifier helpers.
   """
   records = pcol | 'read' >> ReadAllFromParquet()
   # Check 1: the global sum of the 'number' column matches expectations.
   sum_check = (
       records
       | 'get_number' >> Map(lambda rec: rec['number'])
       | 'sum_globally' >> CombineGlobally(sum)
       | 'validate_number' >> FlatMap(
           lambda total: TestParquetIT._sum_verifier(
               init_size, data_size, total)))
   # Check 2: each 'name' key occurs the expected number of times.
   count_check = (
       records
       | 'make_pair' >> Map(lambda rec: (rec['name'], rec['number']))
       | 'count_per_key' >> Count.PerKey()
       | 'validate_name' >> FlatMap(
           lambda pair: TestParquetIT._count_verifier(
               init_size, data_size, pair)))
   # Merge both verifier outputs with the file names; after the reshuffle
   # barrier, every element is deleted as a file path.
   _ = ((sum_check, count_check, pcol)
        | 'flatten' >> Flatten()
        | 'reshuffle' >> Reshuffle()
        | 'cleanup' >> Map(lambda path: FileSystems.delete([path])))
# Example 2
    def expand(self, pcoll):
        """Group mutations, route them by batchability, and emit one stream.

        Incoming elements are first wrapped into mutation groups. Groups that
        fit within the configured size/row/cell limits go to the main
        ('batchable') output and are packed into batches; oversized groups are
        tagged unbatchable and bypass batching. Both streams are then merged
        into a single output PCollection.
        """
        # Tag each mutation group: main output 'batchable', side output
        # _BatchableFilterFn.OUTPUT_TAG_UNBATCHABLE for oversized groups.
        tagged = (
            pcoll
            | 'Making mutation groups' >> ParDo(_MakeMutationGroupsFn())
            | 'Filtering Batchable Mutations' >> ParDo(
                _BatchableFilterFn(
                    max_batch_size_bytes=self._max_batch_size_bytes,
                    max_number_rows=self._max_number_rows,
                    max_number_cells=self._max_number_cells)).with_outputs(
                        _BatchableFilterFn.OUTPUT_TAG_UNBATCHABLE,
                        main='batchable'))

        # Pack the batchable stream into batches bounded by the same limits.
        batched = (
            tagged['batchable']
            | ParDo(
                _BatchFn(
                    max_batch_size_bytes=self._max_batch_size_bytes,
                    max_number_rows=self._max_number_rows,
                    max_number_cells=self._max_number_cells)))

        unbatchable = tagged[_BatchableFilterFn.OUTPUT_TAG_UNBATCHABLE]
        return ((batched, unbatchable)
                | 'Merging batchable and unbatchable' >> Flatten())