def expand(self, pbegin):
  from apache_beam.options.pipeline_options import DebugOptions
  from apache_beam.transforms import util

  assert isinstance(pbegin, pvalue.PBegin)
  self.pipeline = pbegin.pipeline

  debug_options = self.pipeline._options.view_as(DebugOptions)
  if debug_options.experiments and 'beam_fn_api' in debug_options.experiments:
    source = self.source

    def split_source(unused_impulse):
      total_size = source.estimate_size()
      if total_size:
        # 1MB = 1 shard, 1GB = 32 shards, 1TB = 1000 shards, 1PB = 32k shards
        chunk_size = max(1 << 20, 1000 * int(math.sqrt(total_size)))
      else:
        chunk_size = 64 << 20  # 64MB
      return source.split(chunk_size)

    return (
        pbegin
        | core.Impulse()
        | 'Split' >> core.FlatMap(split_source)
        | util.Reshuffle()
        | 'ReadSplits' >> core.FlatMap(
            lambda split: split.source.read(
                split.source.get_range_tracker(
                    split.start_position, split.stop_position))))
  else:
    # Treat Read itself as a primitive.
    return pvalue.PCollection(self.pipeline)
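# A quick standalone check of the heuristic above (a sketch, not part of the
# transform; `_approx_shards` is a throwaway name for this illustration):
# evaluating max(1 << 20, 1000 * int(math.sqrt(size))) at the sizes named in
# the comment reproduces its shard counts approximately.
import math

def _approx_shards(total_size):
  chunk_size = max(1 << 20, 1000 * int(math.sqrt(total_size)))
  return round(total_size / chunk_size)

for label, size in [('1MB', 1 << 20), ('1GB', 1 << 30),
                    ('1TB', 1 << 40), ('1PB', 1 << 50)]:
  print(label, '->', _approx_shards(size), 'shard(s)')
# Prints: 1MB -> 1, 1GB -> 33, 1TB -> 1049, 1PB -> 33554 shard(s)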
def expand(self, pbegin):
  from apache_beam.transforms import util

  # Bind the source locally so the lambdas below don't capture self.
  source = self.source
  return (
      pbegin
      | core.Impulse()
      | 'Split' >> core.FlatMap(
          lambda _: source.split(
              Read.get_desired_chunk_size(source.estimate_size())))
      | util.Reshuffle()
      | 'ReadSplits' >> core.FlatMap(
          lambda split: split.source.read(
              split.source.get_range_tracker(
                  split.start_position, split.stop_position))))
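# Read.get_desired_chunk_size is not shown in this snippet. A plausible
# sketch, assuming it simply factors out the chunk-size heuristic inlined in
# the first version above (the real method may differ):
import math

from apache_beam.transforms import ptransform

class Read(ptransform.PTransform):  # only the relevant method is sketched
  @staticmethod
  def get_desired_chunk_size(total_size):
    if total_size:
      # 1MB = 1 shard, 1GB = 32 shards, 1TB = 1000 shards, 1PB = 32k shards
      return max(1 << 20, 1000 * int(math.sqrt(total_size)))
    return 64 << 20  # 64MB default when the size estimate is unavailable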
def expand(self, pbegin):
  from apache_beam.options.pipeline_options import DebugOptions
  from apache_beam.transforms import util

  assert isinstance(pbegin, pvalue.PBegin)
  self.pipeline = pbegin.pipeline

  debug_options = self.pipeline._options.view_as(DebugOptions)
  if debug_options.experiments and 'beam_fn_api' in debug_options.experiments:
    NUM_SPLITS = 1000
    source = self.source
    return (
        pbegin
        | core.Impulse()
        | 'Split' >> core.FlatMap(lambda _: source.split(NUM_SPLITS))
        | util.Reshuffle()
        | 'ReadSplits' >> core.FlatMap(
            lambda split: split.source.read(
                split.source.get_range_tracker(
                    split.start_position, split.stop_position))))
  else:
    # Treat Read itself as a primitive.
    return pvalue.PCollection(self.pipeline)
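# All three versions rely on the same BoundedSource contract in the
# 'ReadSplits' stage: split() yields SourceBundle(weight, source,
# start_position, stop_position), and each bundle is read through its own
# range tracker. A minimal toy source (hypothetical, for illustration only)
# driving that split -> get_range_tracker -> read sequence by hand:
from apache_beam.io import iobase
from apache_beam.io.range_trackers import OffsetRangeTracker

class CountingSource(iobase.BoundedSource):
  """Toy source emitting the integers 0..n-1."""

  def __init__(self, n):
    self._n = n

  def estimate_size(self):
    return self._n

  def get_range_tracker(self, start_position, stop_position):
    start = 0 if start_position is None else start_position
    stop = self._n if stop_position is None else stop_position
    return OffsetRangeTracker(start, stop)

  def read(self, range_tracker):
    for i in range(range_tracker.start_position(),
                   range_tracker.stop_position()):
      if not range_tracker.try_claim(i):
        return
      yield i

  def split(self, desired_bundle_size, start_position=None,
            stop_position=None):
    start = start_position or 0
    stop = self._n if stop_position is None else stop_position
    step = max(1, desired_bundle_size)
    while start < stop:
      end = min(start + step, stop)
      yield iobase.SourceBundle(end - start, self, start, end)
      start = end

# The same sequence 'ReadSplits' performs, outside a pipeline:
for bundle in CountingSource(10).split(desired_bundle_size=4):
  tracker = bundle.source.get_range_tracker(
      bundle.start_position, bundle.stop_position)
  print(list(bundle.source.read(tracker)))  # [0..3], [4..7], [8, 9]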