Example #1
0
  def test_optimize_multiple_combine_globally(self):
    """pack_combiners packs sibling global combines inside an opted-in composite."""
    class MultipleCombines(beam.PTransform):
      def annotations(self):
        # An empty packing limit opts this composite in to combiner packing.
        return {python_urns.APPLY_COMBINER_PACKING: b''}

      def expand(self, pcoll):
        _ = pcoll | 'mean-globally' >> combiners.Mean.Globally()
        _ = pcoll | 'count-globally' >> combiners.Count.Globally()
        _ = pcoll | 'largest-globally' >> core.CombineGlobally(
            combiners.Largest(1))

    pipeline = beam.Pipeline()
    input_values = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    _ = pipeline | Create(input_values) | MultipleCombines()
    proto = pipeline.to_runner_api()
    optimized_proto = translations.optimize_pipeline(
        proto, [translations.pack_combiners],
        known_runner_urns=frozenset(),
        partial=True)
    # Tests that Pipeline.from_runner_api() does not throw an exception.
    beam.Pipeline.from_runner_api(
        optimized_proto,
        runners.DirectRunner(),
        pipeline_options.PipelineOptions())
Example #2
0
  def test_pipeline_from_sorted_stages_is_topologically_ordered(self):
    """sort_stages restores a topological order even from reversed stage input.

    Fixes the typo 'toplogically' -> 'topologically' in the test name.
    The pipeline deliberately includes a side input so that there is a real
    cross-stage dependency for the sort to respect.
    """
    pipeline = beam.Pipeline()
    side = pipeline | 'side' >> Create([3, 4])

    class CreateAndMultiplyBySide(beam.PTransform):
      def expand(self, pcoll):
        return (
            pcoll | 'main' >> Create([1, 2]) | 'compute' >> beam.FlatMap(
                lambda x, s: [x * y for y in s], beam.pvalue.AsIter(side)))

    _ = pipeline | 'create-and-multiply-by-side' >> CreateAndMultiplyBySide()
    pipeline_proto = pipeline.to_runner_api()
    # Reverse the stages first so sort_stages has real work to do.
    optimized_pipeline_proto = translations.optimize_pipeline(
        pipeline_proto, [
            (lambda stages, _: reversed(list(stages))),
            translations.sort_stages,
        ],
        known_runner_urns=frozenset(),
        partial=True)

    def assert_is_topologically_sorted(transform_id, visited_pcolls):
      # Every input of a transform must have been produced by an
      # already-visited transform; recurse through subtransforms in order.
      transform = optimized_pipeline_proto.components.transforms[transform_id]
      self.assertTrue(set(transform.inputs.values()).issubset(visited_pcolls))
      visited_pcolls.update(transform.outputs.values())
      for subtransform in transform.subtransforms:
        assert_is_topologically_sorted(subtransform, visited_pcolls)

    self.assertEqual(len(optimized_pipeline_proto.root_transform_ids), 1)
    assert_is_topologically_sorted(
        optimized_pipeline_proto.root_transform_ids[0], set())
Example #3
0
 def test_optimize_empty_pipeline(self):
     """An empty pipeline survives optimization and proto round-tripping."""
     empty_pipeline = beam.Pipeline()
     proto = empty_pipeline.to_runner_api()
     optimized = translations.optimize_pipeline(
         proto, [], known_runner_urns=frozenset(), partial=True)
     # Round-trip through from_runner_api(); must not raise.
     beam.Pipeline.from_runner_api(
         optimized,
         runners.DirectRunner(),
         pipeline_options.PipelineOptions())
Example #4
0
 def test_optimize_single_combine_globally(self):
   """pack_combiners on a pipeline containing a single global combine."""
   pipeline = beam.Pipeline()
   input_values = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
   _ = pipeline | Create(input_values) | combiners.Count.Globally()
   proto = pipeline.to_runner_api()
   optimized = translations.optimize_pipeline(
       proto, [translations.pack_combiners],
       known_runner_urns=frozenset(),
       partial=True)
   # Tests that Pipeline.from_runner_api() does not throw an exception.
   beam.Pipeline.from_runner_api(
       optimized, runners.DirectRunner(), pipeline_options.PipelineOptions())
Example #5
0
    def test_conditionally_packed_combiners(self):
        """Combiner packing applies only in subtrees whose annotation opts in."""
        class RecursiveCombine(beam.PTransform):
            def __init__(self, labels):
                self._labels = labels

            def expand(self, pcoll):
                summed = pcoll | 'Sum' >> beam.CombineGlobally(sum)
                if not self._labels:
                    return summed
                # Recurse with the first label as this level's name.
                deeper = pcoll | self._labels[0] >> RecursiveCombine(
                    self._labels[1:])
                return (summed, deeper) | beam.Flatten()

            def annotations(self):
                # Only the level exactly two labels deep opts in to packing.
                if len(self._labels) == 2:
                    return {python_urns.APPLY_COMBINER_PACKING: b''}
                return {}

        # Verify the results are as expected.
        with TestPipeline() as pipeline:
            result = (
                pipeline | beam.Create([1, 2, 3]) | RecursiveCombine('ABCD'))
            assert_that(result, equal_to([6, 6, 6, 6, 6]))

        # Verify the optimization is as expected.
        proto = pipeline.to_runner_api(
            default_environment=environments.EmbeddedPythonEnvironment(
                capabilities=environments.python_sdk_capabilities()))
        optimized = translations.optimize_pipeline(
            proto,
            phases=[translations.pack_combiners],
            known_runner_urns=frozenset(),
            partial=True)
        stage_names = sorted(
            t.unique_name for t in optimized.components.transforms.values())
        # Levels outside the opted-in subtree keep their individual combines.
        self.assertIn('RecursiveCombine/Sum/CombinePerKey', stage_names)
        self.assertIn('RecursiveCombine/A/Sum/CombinePerKey', stage_names)
        # Inside the opted-in subtree the combines are packed together.
        self.assertNotIn('RecursiveCombine/A/B/Sum/CombinePerKey', stage_names)
        self.assertIn(
            'RecursiveCombine/A/B/Packed[Sum_CombinePerKey, '
            'C_Sum_CombinePerKey, C_D_Sum_CombinePerKey]/Pack',
            stage_names)
Example #6
0
 def test_optimize_multiple_combine_globally(self):
   """pack_combiners on a pipeline with several sibling global combines."""
   pipeline = beam.Pipeline()
   source = pipeline | Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
   _ = source | 'mean-globally' >> combiners.Mean.Globally()
   _ = source | 'count-globally' >> combiners.Count.Globally()
   _ = source | 'largest-globally' >> core.CombineGlobally(
       combiners.Largest(1))
   proto = pipeline.to_runner_api()
   optimized = translations.optimize_pipeline(
       proto, [translations.pack_combiners],
       known_runner_urns=frozenset(),
       partial=True)
   # Tests that Pipeline.from_runner_api() does not throw an exception.
   beam.Pipeline.from_runner_api(
       optimized, runners.DirectRunner(), pipeline_options.PipelineOptions())
Example #7
0
  def test_run_packable_combine_limit(self):
    """The APPLY_COMBINER_PACKING value caps combiners per packed combine."""
    class MultipleLargeCombines(beam.PTransform):
      def annotations(self):
        # Limit to at most 2 combiners per packed combiner.
        return {python_urns.APPLY_COMBINER_PACKING: b'2'}

      def expand(self, pcoll):
        # Three combines under a limit of two: only two of them can pack here.
        for i in (1, 2, 3):
          assert_that(
              pcoll | 'min-%d-globally' % i >> core.CombineGlobally(min),
              equal_to([-1]),
              label='assert-min-%d-globally' % i)

    class MultipleSmallCombines(beam.PTransform):
      def annotations(self):
        # Limit to at most 4 combiners per packed combiner.
        return {python_urns.APPLY_COMBINER_PACKING: b'4'}

      def expand(self, pcoll):
        for i in (4, 5):
          assert_that(
              pcoll | 'min-%d-globally' % i >> core.CombineGlobally(min),
              equal_to([-1]),
              label='assert-min-%d-globally' % i)

    with TestPipeline() as pipeline:
      source = pipeline | Create([6, 3, 1, -1, 9, 1, 5, 2, 0, 6])
      _ = source | 'multiple-large-combines' >> MultipleLargeCombines()
      _ = source | 'multiple-small-combines' >> MultipleSmallCombines()

    proto = pipeline.to_runner_api(
        default_environment=environments.EmbeddedPythonEnvironment(
            capabilities=environments.python_sdk_capabilities()))
    optimized = translations.optimize_pipeline(
        proto,
        phases=[translations.pack_combiners],
        known_runner_urns=frozenset(),
        partial=True)
    stage_names = [
        t.unique_name for t in optimized.components.transforms.values()
    ]
    # The packed stages that should exist after optimization.
    for packed_name in (
        'multiple-large-combines/Packed[min-1-globally_CombinePerKey, '
        'min-2-globally_CombinePerKey]/Pack',
        'Packed[multiple-large-combines_min-3-globally_CombinePerKey, '
        'multiple-small-combines_min-4-globally_CombinePerKey]/Pack',
        'multiple-small-combines/min-5-globally/CombinePerKey',
    ):
      self.assertIn(packed_name, stage_names)
    # The original unpacked combines must be gone.
    for i in (1, 2, 3):
      self.assertNotIn(
          'multiple-large-combines/min-%d-globally/CombinePerKey' % i,
          stage_names)
    self.assertNotIn(
        'multiple-small-combines/min-4-globally/CombinePerKey', stage_names)