Example #1
  def matches(self, applied_ptransform):
    # Imported here to avoid circular dependencies.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam import util
    # Option classes used below (module-level imports in the full file).
    from apache_beam.options.pipeline_options import DebugOptions
    from apache_beam.options.pipeline_options import GoogleCloudOptions
    from apache_beam.options.pipeline_options import StandardOptions

    transform = applied_ptransform.transform

    if not isinstance(transform, util.GroupIntoBatches.WithShardedKey):
      return False

    # The replacement is only valid for portable Streaming Engine jobs with
    # runner v2.
    standard_options = self.options.view_as(StandardOptions)
    if not standard_options.streaming:
      return False
    google_cloud_options = self.options.view_as(GoogleCloudOptions)
    if not google_cloud_options.enable_streaming_engine:
      return False

    from apache_beam.runners.dataflow.internal import apiclient
    if not apiclient._use_unified_worker(self.options):
      return False
    experiments = self.options.view_as(DebugOptions).experiments or []
    if 'enable_streaming_auto_sharding' not in experiments:
      return False

    self.dataflow_runner.add_pcoll_with_auto_sharding(applied_ptransform)
    return True
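
This override only fires for streaming jobs on Streaming Engine with Runner
V2 (the unified worker) and the auto-sharding experiment enabled. As an
illustrative sketch, the flag set below satisfies every check in matches()
above; the project and region values are placeholders.

from apache_beam.options.pipeline_options import PipelineOptions

# Flags covering each check in matches(): streaming mode, Streaming
# Engine, Runner V2, and the auto-sharding experiment. The project and
# region values are placeholders.
options = PipelineOptions([
    '--runner=DataflowRunner',
    '--project=my-project',  # placeholder
    '--region=us-central1',  # placeholder
    '--streaming',
    '--enable_streaming_engine',
    '--experiments=use_runner_v2',
    '--experiments=enable_streaming_auto_sharding',
])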
Example #2
    def matches(self, applied_ptransform):
        # Imported here to avoid circular dependencies.
        # pylint: disable=wrong-import-order, wrong-import-position
        from apache_beam import util
        # Option classes used below (module-level imports in the full file).
        from apache_beam.options.pipeline_options import GoogleCloudOptions
        from apache_beam.options.pipeline_options import StandardOptions

        transform = applied_ptransform.transform

        if not isinstance(transform, util.GroupIntoBatches.WithShardedKey):
            return False

        # The replacement is only valid for portable Streaming Engine jobs with
        # runner v2.
        standard_options = self.options.view_as(StandardOptions)
        if not standard_options.streaming:
            return False
        google_cloud_options = self.options.view_as(GoogleCloudOptions)
        if not google_cloud_options.enable_streaming_engine:
            raise ValueError(
                'Runner determined sharding not available in Dataflow for '
                'GroupIntoBatches for non-Streaming-Engine jobs. In order to use '
                'runner determined sharding, please use '
                '--streaming --enable_streaming_engine --experiments=use_runner_v2'
            )

        from apache_beam.runners.dataflow.internal import apiclient
        if not apiclient._use_unified_worker(self.options):
            raise ValueError(
                'Runner determined sharding not available in Dataflow for '
                'GroupIntoBatches for jobs not using Runner V2. In order to use '
                'runner determined sharding, please use '
                '--streaming --enable_streaming_engine --experiments=use_runner_v2'
            )

        self.dataflow_runner.add_pcoll_with_auto_sharding(applied_ptransform)
        return True
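
For context, the sketch below shows the user-facing transform this override
matches: GroupIntoBatches(...).with_sharded_key() yields the
GroupIntoBatches.WithShardedKey instance checked by isinstance() above. The
input data and batch size are arbitrary illustrative values.

import apache_beam as beam

# Minimal sketch: with_sharded_key() wraps GroupIntoBatches so the
# Dataflow runner may choose the sharding of each key dynamically.
with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('key', i) for i in range(100)])
        | beam.GroupIntoBatches(10).with_sharded_key())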
Example #3
  def test_use_unified_worker(self):
    # PipelineOptions and apiclient are module-level imports in the full
    # test file; repeated here so the snippet is self-contained.
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.runners.dataflow.internal import apiclient

    # No experiments: the unified worker is off by default.
    pipeline_options = PipelineOptions([])
    self.assertFalse(apiclient._use_unified_worker(pipeline_options))

    # 'beam_fn_api' alone does not enable the unified worker.
    pipeline_options = PipelineOptions(['--experiments=beam_fn_api'])
    self.assertFalse(apiclient._use_unified_worker(pipeline_options))

    # 'use_unified_worker' enables it directly.
    pipeline_options = PipelineOptions(['--experiments=use_unified_worker'])
    self.assertTrue(apiclient._use_unified_worker(pipeline_options))

    pipeline_options = PipelineOptions(
        ['--experiments=use_unified_worker', '--experiments=beam_fn_api'])
    self.assertTrue(apiclient._use_unified_worker(pipeline_options))

    # 'use_runner_v2' implies the unified worker as well.
    pipeline_options = PipelineOptions(
        ['--experiments=use_runner_v2', '--experiments=beam_fn_api'])
    self.assertTrue(apiclient._use_unified_worker(pipeline_options))

    # Combining all three experiments still enables it.
    pipeline_options = PipelineOptions([
        '--experiments=use_unified_worker',
        '--experiments=use_runner_v2',
        '--experiments=beam_fn_api'
    ])
    self.assertTrue(apiclient._use_unified_worker(pipeline_options))
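
Read together, the assertions imply that the helper returns True exactly when
either the use_unified_worker or the use_runner_v2 experiment is present, and
that beam_fn_api alone is not enough. Below is a minimal sketch consistent
with those assertions; it is an inferred stand-in, not the actual
apiclient._use_unified_worker implementation.

from apache_beam.options.pipeline_options import DebugOptions

def _use_unified_worker_sketch(pipeline_options):
  # Inferred from the test above: either experiment flag enables the
  # unified worker; 'beam_fn_api' on its own does not.
  experiments = pipeline_options.view_as(DebugOptions).experiments or []
  return ('use_unified_worker' in experiments or
          'use_runner_v2' in experiments)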