def __init__(
        self,
        default_environment=None,  # type: Optional[environments.Environment]
        bundle_repeat=0,
        use_state_iterables=False,
        provision_info=None,  # type: Optional[ExtendedProvisionInfo]
        progress_request_frequency=None):
        # type: (...) -> None
        """Initializes a new Fn API Runner.

    Args:
      default_environment: environment used for UserFns when none is
          specified; falls back to an embedded Python environment.
      bundle_repeat: number of extra times each bundle is replayed, for
          profiling and debugging.
      use_state_iterables: if True, deliberately split gbk iterables over
          the state API (testing hook).
      provision_info: provisioning info made available to workers; a
          placeholder one is built when None.
      progress_request_frequency: seconds the runner waits between progress
          requests to the SDK.
    """
        super(FnApiRunner, self).__init__()
        if default_environment is None:
            default_environment = environments.EmbeddedPythonEnvironment()
        self._default_environment = default_environment
        self._bundle_repeat = bundle_repeat
        self._use_state_iterables = use_state_iterables
        self._num_workers = 1
        self._progress_frequency = progress_request_frequency
        # Optional factory producing per-bundle profilers; assigned externally.
        self._profiler_factory = None  # type: Optional[Callable[..., profiler.Profile]]
        if provision_info is None:
            provision_info = ExtendedProvisionInfo(
                beam_provision_api_pb2.ProvisionInfo(
                    retrieval_token='unused-retrieval-token'))
        self._provision_info = provision_info
# Example #2 (score: 0)
    def test_conditionally_packed_combiners(self):
        class RecursiveCombine(beam.PTransform):
            """Applies one CombineGlobally per label, recursing on the tail."""
            def __init__(self, labels):
                self._labels = labels

            def expand(self, pcoll):
                summed = pcoll | 'Sum' >> beam.CombineGlobally(sum)
                if not self._labels:
                    return summed
                head, tail = self._labels[0], self._labels[1:]
                recursed = pcoll | head >> RecursiveCombine(tail)
                return (summed, recursed) | beam.Flatten()

            def annotations(self):
                # Request combiner packing at exactly one recursion depth.
                if len(self._labels) == 2:
                    return {python_urns.APPLY_COMBINER_PACKING: b''}
                return {}

        # Check the pipeline still produces the expected sums.
        with TestPipeline() as pipeline:
            result = (
                pipeline
                | beam.Create([1, 2, 3])
                | RecursiveCombine('ABCD'))
            assert_that(result, equal_to([6, 6, 6, 6, 6]))

        # Check that packing happened exactly where it was requested.
        proto = pipeline.to_runner_api(
            default_environment=environments.EmbeddedPythonEnvironment(
                capabilities=environments.python_sdk_capabilities()))
        optimized = translations.optimize_pipeline(
            proto,
            phases=[translations.pack_combiners],
            known_runner_urns=frozenset(),
            partial=True)
        optimized_stage_names = sorted(
            t.unique_name for t in optimized.components.transforms.values())
        self.assertIn('RecursiveCombine/Sum/CombinePerKey',
                      optimized_stage_names)
        self.assertIn('RecursiveCombine/A/Sum/CombinePerKey',
                      optimized_stage_names)
        self.assertNotIn('RecursiveCombine/A/B/Sum/CombinePerKey',
                         optimized_stage_names)
        self.assertIn(
            'RecursiveCombine/A/B/Packed[Sum_CombinePerKey, '
            'C_Sum_CombinePerKey, C_D_Sum_CombinePerKey]/Pack',
            optimized_stage_names)
# Example #3 (score: 0)
  def test_run_packable_combine_limit(self):
    class MultipleLargeCombines(beam.PTransform):
      def annotations(self):
        # Allow at most 2 combiners per packed combiner.
        return {python_urns.APPLY_COMBINER_PACKING: b'2'}

      def expand(self, pcoll):
        # Three global mins, each checked independently.
        for i in (1, 2, 3):
          assert_that(
              pcoll | 'min-%d-globally' % i >> core.CombineGlobally(min),
              equal_to([-1]),
              label='assert-min-%d-globally' % i)

    class MultipleSmallCombines(beam.PTransform):
      def annotations(self):
        # Allow at most 4 combiners per packed combiner.
        return {python_urns.APPLY_COMBINER_PACKING: b'4'}

      def expand(self, pcoll):
        # Two more global mins under the looser packing limit.
        for i in (4, 5):
          assert_that(
              pcoll | 'min-%d-globally' % i >> core.CombineGlobally(min),
              equal_to([-1]),
              label='assert-min-%d-globally' % i)

    with TestPipeline() as pipeline:
      pcoll = pipeline | Create([6, 3, 1, -1, 9, 1, 5, 2, 0, 6])
      _ = pcoll | 'multiple-large-combines' >> MultipleLargeCombines()
      _ = pcoll | 'multiple-small-combines' >> MultipleSmallCombines()

    proto = pipeline.to_runner_api(
        default_environment=environments.EmbeddedPythonEnvironment(
            capabilities=environments.python_sdk_capabilities()))
    optimized = translations.optimize_pipeline(
        proto,
        phases=[translations.pack_combiners],
        known_runner_urns=frozenset(),
        partial=True)
    optimized_stage_names = [
        t.unique_name for t in optimized.components.transforms.values()
    ]
    # Stage names that must survive the optimization.
    present = [
        'multiple-large-combines/Packed[min-1-globally_CombinePerKey, '
        'min-2-globally_CombinePerKey]/Pack',
        'Packed[multiple-large-combines_min-3-globally_CombinePerKey, '
        'multiple-small-combines_min-4-globally_CombinePerKey]/Pack',
        'multiple-small-combines/min-5-globally/CombinePerKey',
    ]
    for name in present:
      self.assertIn(name, optimized_stage_names)
    # Unpacked originals that should have been replaced by packed stages.
    absent = [
        'multiple-large-combines/min-%d-globally/CombinePerKey' % i
        for i in (1, 2, 3)
    ] + ['multiple-small-combines/min-4-globally/CombinePerKey']
    for name in absent:
      self.assertNotIn(name, optimized_stage_names)