コード例 #1
0
 def expand(self, pcoll):
     """Apply the top-N computation to ``pcoll``.

     When no extra comparator arguments were supplied and
     ``pcoll.windowing.is_default()`` holds, elements are pre-aggregated
     with ``_TopPerBundle`` and merged with ``_MergeTopPerBundle``.
     Otherwise the work is delegated to ``core.CombineGlobally`` wrapping
     a ``TopCombineFn``, built ``without_defaults()`` when
     ``self.has_defaults`` is falsy.

     Args:
       pcoll: the input this transform is applied to.

     Returns:
       The result of piping ``pcoll`` through the selected transforms.
     """
     compare = self._compare
     takes_fast_path = (
         not self._args and not self._kwargs
         and pcoll.windowing.is_default())

     if not takes_fast_path:
         with_defaults = self.has_defaults
         combine = core.CombineGlobally(
             TopCombineFn(self._n, compare, self._key, self._reverse),
             *self._args, **self._kwargs)
         if not with_defaults:
             combine = combine.without_defaults()
         return pcoll | combine

     if self._reverse:
         # Reverse the ordering by flipping the comparator itself.
         if compare is None or compare is operator.lt:
             compare = operator.gt
         else:
             forward_compare = compare
             compare = lambda a, b: forward_compare(b, a)

     # This is a more efficient global algorithm.
     per_bundle = pcoll | core.ParDo(
         _TopPerBundle(self._n, compare, self._key))
     # If pcoll is empty, we can't guarantee that the per-bundle output
     # won't be empty, so inject at least one empty accumulator so that
     # downstream is guaranteed to produce non-empty output.
     empty_accumulator = pcoll.pipeline | core.Create([(None, [])])
     flattened = (per_bundle, empty_accumulator) | core.Flatten()
     grouped = flattened | core.GroupByKey()
     return grouped | core.ParDo(
         _MergeTopPerBundle(self._n, compare, self._key))
コード例 #2
0
 def expand(self, pcoll):
     """Combine ``pcoll`` globally with ``ToDictCombineFn``.

     The combine step is labelled with ``self.label``.  When
     ``self.has_defaults`` is falsy the combine is built with
     ``without_defaults()``.

     Args:
       pcoll: the input this transform is applied to.

     Returns:
       The result of applying the labelled combine to ``pcoll``.
     """
     with_defaults = self.has_defaults
     combine = core.CombineGlobally(ToDictCombineFn())
     if not with_defaults:
         combine = combine.without_defaults()
     return pcoll | self.label >> combine
コード例 #3
0
 def expand(self, pcoll):
   """Assert twice that the global ``min`` of ``pcoll`` equals ``[-1]``.

   Two separately labelled ``core.CombineGlobally(min)`` stages are
   attached to ``pcoll``, each checked with ``assert_that``.  Nothing is
   returned.

   Args:
     pcoll: the input this transform is applied to.
   """
   min_4 = pcoll | 'min-4-globally' >> core.CombineGlobally(min)
   assert_that(min_4, equal_to([-1]), label='assert-min-4-globally')

   min_5 = pcoll | 'min-5-globally' >> core.CombineGlobally(min)
   assert_that(min_5, equal_to([-1]), label='assert-min-5-globally')
コード例 #4
0
 def expand(self, pcoll):
     """Run ``self.add_timestamp`` over ``pcoll`` and combine globally.

     ``self.add_timestamp`` is applied through ``core.ParDo`` with its
     output typed as ``Tuple[T, TimestampType]``.  The result is combined
     with ``LatestCombineFn``; when ``self.has_defaults`` is falsy the
     combine is built with ``without_defaults()``.

     Args:
       pcoll: the input this transform is applied to.

     Returns:
       The result of the global combine.
     """
     with_defaults = self.has_defaults
     timestamped = pcoll | core.ParDo(
         self.add_timestamp).with_output_types(Tuple[T, TimestampType])
     latest = core.CombineGlobally(LatestCombineFn())
     if not with_defaults:
         latest = latest.without_defaults()
     return timestamped | latest
コード例 #5
0
    def Of(pcoll, n, compare=None, *args, **kwargs):
        """Obtain a list of the compare-most N elements in a PCollection.

        Retrieves the n greatest elements of the PCollection, where
        "greatest" is decided by the comparator given as ``compare``.

        ``compare`` should implement "a < b" and accept at least two
        arguments (a and b).  Extra positional and keyword arguments are
        forwarded to ``core.CombineGlobally`` alongside the combine
        function.  When ``compare`` is omitted, ``None`` is passed through
        to ``TopCombineFn``.

        The keywords 'key' and 'reverse' are removed from ``kwargs`` and
        handed to ``TopCombineFn``; they have the same meaning as for
        Python's sort functions.

        Args:
          pcoll: PCollection to process.
          n: number of elements to extract from pcoll.
          compare: as described above.
          *args: as described above.
          **kwargs: as described above.

        Returns:
          The result of applying the global combine to ``pcoll``.
        """
        key = kwargs.pop('key', None)
        reverse = kwargs.pop('reverse', False)
        top_fn = TopCombineFn(n, compare, key, reverse)
        combine = core.CombineGlobally(top_fn, *args, **kwargs)
        return pcoll | combine
コード例 #6
0
ファイル: combiners.py プロジェクト: iindyk/beam
 def expand(self, pcoll):
     """Apply the top-N computation to ``pcoll``.

     When ``pcoll.windowing.is_default()`` holds, elements are
     pre-aggregated with ``_TopPerBundle`` and merged with
     ``_MergeTopPerBundle``.  Otherwise the work is delegated to
     ``core.CombineGlobally`` wrapping a ``TopCombineFn``, built
     ``without_defaults()`` when ``self.has_defaults`` is falsy.

     Args:
       pcoll: the input this transform is applied to.

     Returns:
       The result of piping ``pcoll`` through the selected transforms.
     """
     if not pcoll.windowing.is_default():
         with_defaults = self.has_defaults
         combine = core.CombineGlobally(
             TopCombineFn(self._n, self._key, self._reverse))
         if not with_defaults:
             combine = combine.without_defaults()
         return pcoll | combine

     # This is a more efficient global algorithm.
     per_bundle = pcoll | core.ParDo(
         _TopPerBundle(self._n, self._key, self._reverse))
     # If pcoll is empty, we can't guarantee that the per-bundle output
     # won't be empty, so inject at least one empty accumulator so that
     # downstream is guaranteed to produce non-empty output.
     empty_accumulator = pcoll.pipeline | core.Create([(None, [])])
     flattened = (per_bundle, empty_accumulator) | core.Flatten()
     grouped = flattened | core.GroupByKey()
     return grouped | core.ParDo(
         _MergeTopPerBundle(self._n, self._key, self._reverse))
コード例 #7
0
 def expand(self, pcoll):
     """Attach three global combiners to ``pcoll`` and assert on each.

     Checks that the global ``min`` is ``[-1]``, the global count is
     ``[10]`` and the two largest elements are ``[[9, 6]]``.  Nothing is
     returned.

     Args:
       pcoll: the input this transform is applied to.
     """
     # These CombineGlobally stages will be packed if and only if
     # translations.eliminate_common_key_with_void and
     # translations.pack_combiners are enabled in the TestPipeline runner.
     minimum = pcoll | 'min-globally' >> core.CombineGlobally(min)
     assert_that(minimum, equal_to([-1]), label='assert-min-globally')

     count = pcoll | 'count-globally' >> combiners.Count.Globally()
     assert_that(count, equal_to([10]), label='assert-count-globally')

     largest = pcoll | 'largest-globally' >> combiners.Top.Largest(2)
     assert_that(
         largest, equal_to([[9, 6]]), label='assert-largest-globally')
コード例 #8
0
 def test_optimize_multiple_combine_globally(self):
   """Rebuild a pipeline after optimizing it with ``pack_combiners``.

   Three global combiners are attached to one created collection, the
   pipeline is converted with ``to_runner_api()``, run through
   ``translations.optimize_pipeline`` and then passed to
   ``beam.Pipeline.from_runner_api``.  The test passes as long as no
   exception is raised.
   """
   source_pipeline = beam.Pipeline()
   elements = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
   pcoll = source_pipeline | Create(elements)
   _ = pcoll | 'mean-globally' >> combiners.Mean.Globally()
   _ = pcoll | 'count-globally' >> combiners.Count.Globally()
   _ = pcoll | 'largest-globally' >> core.CombineGlobally(
       combiners.Largest(1))

   original_proto = source_pipeline.to_runner_api()
   optimized_proto = translations.optimize_pipeline(
       original_proto,
       [translations.pack_combiners],
       known_runner_urns=frozenset(),
       partial=True)

   # Tests that Pipeline.from_runner_api() does not throw an exception.
   direct_runner = runners.DirectRunner()
   beam.Pipeline.from_runner_api(
       optimized_proto, direct_runner, pipeline_options.PipelineOptions())
コード例 #9
0
 def expand(self, pcoll):
   """Apply the top-N computation to ``pcoll``.

   When no extra comparator arguments and no key were supplied and
   ``pcoll.windowing.is_default()`` holds, elements are pre-aggregated
   with ``_TopPerBundle`` and merged with ``_MergeTopPerBundle``.
   Otherwise the work is delegated to ``core.CombineGlobally`` wrapping
   a ``TopCombineFn``.

   Args:
     pcoll: the input this transform is applied to.

   Returns:
     The result of piping ``pcoll`` through the selected transforms.
   """
   compare = self._compare
   takes_fast_path = (
       not self._args and not self._kwargs and
       not self._key and pcoll.windowing.is_default())

   if not takes_fast_path:
     return pcoll | core.CombineGlobally(
         TopCombineFn(self._n, compare, self._key, self._reverse),
         *self._args, **self._kwargs)

   if self._reverse:
     # Reverse the ordering by flipping the comparator itself.
     if compare is None or compare is operator.lt:
       compare = operator.gt
     else:
       forward_compare = compare
       compare = lambda a, b: forward_compare(b, a)

   # This is a more efficient global algorithm.
   per_bundle = pcoll | core.ParDo(_TopPerBundle(self._n, compare))
   grouped = per_bundle | core.GroupByKey()
   return grouped | core.ParDo(_MergeTopPerBundle(self._n, compare))
コード例 #10
0
    def Of(pcoll, n, compare=None, *args, **kwargs):
        """Obtain a list of the compare-most N elements in a PCollection.

        Retrieves the n greatest elements of the PCollection, where
        "greatest" is decided by the comparator given as ``compare``.

        ``compare`` should implement "a < b" and accept at least two
        arguments (a and b).  Extra positional and keyword arguments are
        forwarded to ``core.CombineGlobally`` alongside the combine
        function.

        The keywords 'key' and 'reverse' are removed from ``kwargs``; they
        have the same meaning as for Python's sort functions.

        When there are no extra arguments, no key, and
        ``pcoll.windowing.is_default()`` holds, a per-bundle
        pre-aggregation followed by a merge step is used instead of
        ``core.CombineGlobally``.

        Args:
          pcoll: PCollection to process.
          n: number of elements to extract from pcoll.
          compare: as described above.
          *args: as described above.
          **kwargs: as described above.

        Returns:
          The result of piping ``pcoll`` through the selected transforms.
        """
        key = kwargs.pop('key', None)
        reverse = kwargs.pop('reverse', False)
        takes_fast_path = (
            not args and not kwargs and not key
            and pcoll.windowing.is_default())

        if not takes_fast_path:
            return pcoll | core.CombineGlobally(
                TopCombineFn(n, compare, key, reverse), *args, **kwargs)

        if reverse:
            # Reverse the ordering by flipping the comparator itself.
            if compare is None or compare is operator.lt:
                compare = operator.gt
            else:
                forward_compare = compare
                compare = lambda a, b: forward_compare(b, a)

        # This is a more efficient global algorithm.
        per_bundle = pcoll | core.ParDo(_TopPerBundle(n, compare))
        grouped = per_bundle | core.GroupByKey()
        return grouped | core.ParDo(_MergeTopPerBundle(n, compare))
コード例 #11
0
 def expand(self, pcoll):
     """Combine ``pcoll`` globally with ``MeanCombineFn``.

     Args:
       pcoll: the input this transform is applied to.

     Returns:
       The result of applying the global combine to ``pcoll``.
     """
     mean_combine = core.CombineGlobally(MeanCombineFn())
     return pcoll | mean_combine
コード例 #12
0
 def FixedSizeGlobally(pcoll, n):
     """Combine ``pcoll`` globally with ``SampleCombineFn(n)``.

     Args:
       pcoll: the input to combine.
       n: value passed to ``SampleCombineFn``.

     Returns:
       The result of applying the global combine to ``pcoll``.
     """
     sample_combine = core.CombineGlobally(SampleCombineFn(n))
     return pcoll | sample_combine
コード例 #13
0
 def expand(self, pcoll):
     """Combine ``pcoll`` globally with ``ToDictCombineFn``.

     The combine step is labelled with ``self.label``.

     Args:
       pcoll: the input this transform is applied to.

     Returns:
       The result of applying the labelled combine to ``pcoll``.
     """
     to_dict = core.CombineGlobally(ToDictCombineFn())
     return pcoll | self.label >> to_dict
コード例 #14
0
 def expand(self, pcoll):
     """Run ``self.add_timestamp`` over ``pcoll`` and combine globally.

     ``self.add_timestamp`` is applied through ``core.ParDo`` with its
     output typed as ``Tuple[T, TimestampType]``, and the result is
     combined with ``LatestCombineFn``.

     Args:
       pcoll: the input this transform is applied to.

     Returns:
       The result of the global combine.
     """
     stamp = core.ParDo(self.add_timestamp).with_output_types(
         Tuple[T, TimestampType])  # type: ignore[misc]
     timestamped = pcoll | stamp
     return timestamped | core.CombineGlobally(LatestCombineFn())
コード例 #15
0
 def expand(self, pcoll):
     """Combine ``pcoll`` globally with ``MeanCombineFn``.

     When ``self.has_defaults`` is falsy the combine is built with
     ``without_defaults()``.

     Args:
       pcoll: the input this transform is applied to.

     Returns:
       The result of applying the global combine to ``pcoll``.
     """
     with_defaults = self.has_defaults
     mean_combine = core.CombineGlobally(MeanCombineFn())
     if not with_defaults:
         mean_combine = mean_combine.without_defaults()
     return pcoll | mean_combine
コード例 #16
0
 def expand(self, pcoll):
     """Combine ``pcoll`` globally with ``SampleCombineFn(self._n)``.

     Args:
       pcoll: the input this transform is applied to.

     Returns:
       The result of applying the global combine to ``pcoll``.
     """
     sample_combine = core.CombineGlobally(SampleCombineFn(self._n))
     return pcoll | sample_combine
コード例 #17
0
ファイル: combiners.py プロジェクト: wikier/beam
 def expand(self, pcoll):
     """Combine ``pcoll`` globally with ``ToListCombineFn``.

     NOTE(review): ``self.label`` is handed to ``core.CombineGlobally``
     as its first positional argument here -- confirm this matches the
     signature of the Beam version this file targets.

     Args:
       pcoll: the input this transform is applied to.

     Returns:
       The result of applying the global combine to ``pcoll``.
     """
     to_list = core.CombineGlobally(self.label, ToListCombineFn())
     return pcoll | to_list
コード例 #18
0
 def expand(self, pcoll):
   """Attach three labelled global combiners to ``pcoll``.

   The mean, count and ``combiners.Largest(1)`` stages are applied and
   their outputs discarded.  Nothing is returned.

   Args:
     pcoll: the input this transform is applied to.
   """
   mean_stage = 'mean-globally' >> combiners.Mean.Globally()
   _ = pcoll | mean_stage

   count_stage = 'count-globally' >> combiners.Count.Globally()
   _ = pcoll | count_stage

   largest_stage = 'largest-globally' >> core.CombineGlobally(
       combiners.Largest(1))
   _ = pcoll | largest_stage