def split(self, desired_bundle_size, start_position=None,
          stop_position=None):
  """Splits this source into bundles of roughly ``desired_bundle_size``.

  Args:
    desired_bundle_size: target bundle size, in the same units as
      ``self._total_size`` (bytes of serialized values).
    start_position: index of the first serialized value to cover;
      defaults to 0.
    stop_position: index one past the last serialized value to cover;
      defaults to ``len(self._serialized_values)``.

  Yields:
    ``iobase.SourceBundle`` objects that together cover
    ``[start_position, stop_position)``.
  """
  from apache_beam.io import iobase
  if len(self._serialized_values) < 2:
    # Nothing meaningful to split: emit one bundle covering everything.
    yield iobase.SourceBundle(
        weight=0, source=self, start_position=0,
        stop_position=len(self._serialized_values))
  else:
    if start_position is None:
      start_position = 0
    if stop_position is None:
      stop_position = len(self._serialized_values)
    avg_size_per_value = self._total_size / len(self._serialized_values)
    if avg_size_per_value > 0:
      num_values_per_split = max(
          int(desired_bundle_size / avg_size_per_value), 1)
    else:
      # All serialized values are empty: avoid ZeroDivisionError and fall
      # back to one value per split.
      num_values_per_split = 1
    start = start_position
    while start < stop_position:
      end = min(start + num_values_per_split, stop_position)
      remaining = stop_position - end
      # Avoid having a too small bundle at the end.
      if remaining < (num_values_per_split / 4):
        end = stop_position
      sub_source = Create._create_source(
          self._serialized_values[start:end], self._coder)
      yield iobase.SourceBundle(weight=(end - start),
                                source=sub_source,
                                start_position=0,
                                stop_position=(end - start))
      start = end
# Beispiel #2 (score: 0)
 def expand(self, pcoll):
     # Reduce the input to its global sum, materialized as a singleton
     # side input, then emit that sum once by pairing it with a single
     # dummy element.
     total_view = pcoll | CombineGlobally(sum).as_singleton_view()
     driver = pcoll.pipeline | Create([None])
     return driver | Map(lambda _, total: total, total_view)
# Beispiel #3 (score: 0)
 def test_combine_globally_with_default(self):
     # An empty input still produces the combine's default value: the
     # sum of nothing is 0.
     with TestPipeline() as p:
         result = p | Create([]) | CombineGlobally(sum)
         assert_that(result, equal_to([0]))
# Beispiel #4 (score: 0)
 def test_combine_globally_without_default(self):
     # without_defaults() suppresses the default output, so an empty
     # input yields an empty result.
     with TestPipeline() as p:
         combined = p | Create([]) | CombineGlobally(sum).without_defaults()
         assert_that(combined, equal_to([]))
# Beispiel #5 (score: 0)
 def test_per_key_empty(self):
   # Latest.PerKey over an empty collection produces no output.
   with TestPipeline() as p:
     empty = p | Create([]) | Map(lambda element: element)
     result = empty | combine.Latest.PerKey()
     assert_that(result, equal_to([]))
# Beispiel #6 (score: 0)
 def test_globally_empty(self):
   # Latest.Globally on an empty collection emits a single None.
   with TestPipeline() as p:
     empty = p | Create([]) | Map(lambda element: element)
     result = empty | combine.Latest.Globally()
     assert_that(result, equal_to([None]))
# Beispiel #7 (score: 0)
 def test_log_distribution(self):
     # int(log(x)) over [1, 1000) is heavily skewed toward the larger
     # buckets, hence the repeated 6s among the 5 quantile boundaries.
     values = [int(math.log(v)) for v in range(1, 1000)]
     with TestPipeline() as p:
         quantiles = (
             p | Create(values) | beam.ApproximateQuantiles.Globally(5))
         assert_that(quantiles, equal_to([[0, 5, 6, 6, 6]]))
# Beispiel #8 (score: 0)
 def test_singleton(self):
     """A single-element input repeats that element at every quantile."""
     with TestPipeline() as p:
         data = [389]
         pc = p | Create(data)
         # Fixed local-variable typo: 'qunatiles' -> 'quantiles'
         # (consistent with the sibling quantiles tests).
         quantiles = pc | beam.ApproximateQuantiles.Globally(5)
         assert_that(quantiles, equal_to([[389, 389, 389, 389, 389]]))
# Beispiel #9 (score: 0)
 def expand(self, pcoll):
     # NOTE(review): 'side' is a free variable here — not defined in this
     # method, presumably a PCollection captured from an enclosing scope;
     # confirm against the full file.
     main = pcoll | 'main' >> Create([1, 2])
     # Cross each main element with every side-input element.
     return main | 'compute' >> beam.FlatMap(
         lambda elem, factors: [elem * factor for factor in factors],
         beam.pvalue.AsIter(side))