def split(self, desired_bundle_size, start_position=None, stop_position=None):
  """Split the serialized values into bundles of roughly the desired size.

  Yields SourceBundle objects covering [start_position, stop_position).
  When there are fewer than two values there is nothing to split, so a
  single zero-weight bundle covering everything is emitted.
  """
  from apache_beam.io import iobase
  total = len(self._serialized_values)
  if total < 2:
    # Nothing worth splitting: one bundle for the whole source.
    yield iobase.SourceBundle(
        weight=0,
        source=self,
        start_position=0,
        stop_position=total)
    return

  lo = 0 if start_position is None else start_position
  hi = total if stop_position is None else stop_position

  # Estimate how many values fit into a bundle of the desired byte size.
  avg_value_size = self._total_size / total
  values_per_split = max(int(desired_bundle_size / avg_value_size), 1)

  begin = lo
  while begin < hi:
    end = min(begin + values_per_split, hi)
    # Fold a tiny trailing remainder into the current bundle instead of
    # emitting a disproportionately small final split.
    if (hi - end) < (values_per_split / 4):
      end = hi
    sub_source = Create._create_source(
        self._serialized_values[begin:end], self._coder)
    yield iobase.SourceBundle(
        weight=(end - begin),
        source=sub_source,
        start_position=0,
        stop_position=(end - begin))
    begin = end
def expand(self, pcoll):
  """Sum pcoll globally and re-emit the result as a single element.

  The combined value is materialized as a singleton side input and
  attached to a one-element main input, so the output is one element
  regardless of the input size.
  """
  summed_view = pcoll | CombineGlobally(sum).as_singleton_view()
  seed = pcoll.pipeline | Create([None])
  return seed | Map(lambda _, total: total, summed_view)
def test_combine_globally_with_default(self):
  """Empty input with the default combine yields sum's identity, 0."""
  with TestPipeline() as p:
    result = p | Create([]) | CombineGlobally(sum)
    assert_that(result, equal_to([0]))
def test_combine_globally_without_default(self):
  """without_defaults() suppresses the identity output on empty input."""
  with TestPipeline() as p:
    combined = p | Create([]) | CombineGlobally(sum).without_defaults()
    assert_that(combined, equal_to([]))
def test_per_key_empty(self):
  """Latest.PerKey over an empty collection produces no output."""
  empty = []
  with TestPipeline() as p:
    source = p | Create(empty) | Map(lambda element: element)
    result = source | combine.Latest.PerKey()
    assert_that(result, equal_to([]))
def test_globally_empty(self):
  """Latest.Globally over an empty collection emits the default, [None]."""
  empty = []
  with TestPipeline() as p:
    source = p | Create(empty) | Map(lambda element: element)
    result = source | combine.Latest.Globally()
    assert_that(result, equal_to([None]))
def test_log_distribution(self):
  """Quantiles of int(log(x)) for x in [1, 1000) cluster at the high end."""
  with TestPipeline() as p:
    values = [int(math.log(v)) for v in range(1, 1000)]
    source = p | Create(values)
    result = source | beam.ApproximateQuantiles.Globally(5)
    assert_that(result, equal_to([[0, 5, 6, 6, 6]]))
def test_singleton(self):
  """All five quantiles of a single-element collection equal that element."""
  with TestPipeline() as p:
    data = [389]
    pc = p | Create(data)
    # Fixed misspelled local name: 'qunatiles' -> 'quantiles'.
    quantiles = pc | beam.ApproximateQuantiles.Globally(5)
    assert_that(quantiles, equal_to([[389, 389, 389, 389, 389]]))
def expand(self, pcoll):
  """Create [1, 2] and multiply each element by every side-input value.

  NOTE(review): `side` is a free variable — it is not a parameter and is
  not defined in this method, so it presumably refers to a PCollection
  defined elsewhere in the original file; confirm it is in scope here.
  Also note Create is applied to `pcoll` rather than a pipeline —
  verify this matches the caller's intent.
  """
  return (
      pcoll
      | 'main' >> Create([1, 2])
      # Each main element x is expanded to x*y for every side element y.
      | 'compute' >> beam.FlatMap(
          lambda x, s: [x * y for y in s],
          beam.pvalue.AsIter(side)))