def test_java_sdk_harness_dedup(self):
    """Two identical Java Docker environments should be de-duplicated.

    After ``_apply_sdk_environment_overrides`` runs, exactly one of the two
    dummy environment IDs should still be referenced by transforms.
    """
    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp'
    ])
    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned
    proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

    # Register two environments carrying the exact same Java container image,
    # each referenced by its own transform.
    for idx in (1, 2):
        java_env = beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=beam_runner_api_pb2.DockerPayload(
                container_image='apache/beam_java:dummy_tag'
            ).SerializeToString())
        proto_pipeline.components.environments[
            'dummy_env_id_%d' % idx].CopyFrom(java_env)
        referencing_transform = beam_runner_api_pb2.PTransform(
            environment_id='dummy_env_id_%d' % idx)
        proto_pipeline.components.transforms[
            'dummy_transform_id_%d' % idx].CopyFrom(referencing_transform)

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, dict(), pipeline_options)

    # Only one of 'dummy_env_id_1' or 'dummy_env_id_2' should be in the set of
    # environment IDs used by the proto after Java environment de-duping.
    env_ids_from_transforms = [
        proto_pipeline.components.transforms[transform_id].environment_id
        for transform_id in proto_pipeline.components.transforms
    ]
    if 'dummy_env_id_1' in env_ids_from_transforms:
        self.assertTrue('dummy_env_id_2' not in env_ids_from_transforms)
    else:
        self.assertTrue('dummy_env_id_2' in env_ids_from_transforms)
def _create_environment(options):
    """Builds the Environment proto selected by the portable options.

    Supports 'DOCKER' (also the default for unset/unknown types) and
    'PROCESS' environment types.
    """
    portable_options = options.view_as(PortableOptions)

    # Any value other than 'PROCESS' (including unset) resolves to Docker.
    if portable_options.environment_type == 'PROCESS':
        environment_urn = common_urns.environments.PROCESS.urn
    else:
        environment_urn = common_urns.environments.DOCKER.urn

    if environment_urn == common_urns.environments.PROCESS.urn:
        # environment_config is expected to hold a JSON process description.
        config = json.loads(portable_options.environment_config)
        return beam_runner_api_pb2.Environment(
            urn=common_urns.environments.PROCESS.urn,
            payload=beam_runner_api_pb2.ProcessPayload(
                os=config.get('os') or '',
                arch=config.get('arch') or '',
                command=config.get('command'),
                env=config.get('env') or '').SerializeToString())

    docker_image = (
        portable_options.environment_config
        or PortableRunner.default_docker_image())
    return beam_runner_api_pb2.Environment(
        url=docker_image,
        urn=common_urns.environments.DOCKER.urn,
        payload=beam_runner_api_pb2.DockerPayload(
            container_image=docker_image).SerializeToString())
def test__create_default_environment(self):
    """Empty options should produce the default Docker environment."""
    docker_image = PortableRunner.default_docker_image()
    expected = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=beam_runner_api_pb2.DockerPayload(
            container_image=docker_image).SerializeToString())
    actual = PortableRunner._create_environment(
        PipelineOptions.from_dictionary({}))
    self.assertEqual(actual, expected)
def test__create_docker_environment(self):
    """An explicit DOCKER type should use the configured container image."""
    docker_image = 'py-docker'
    options = PipelineOptions.from_dictionary({
        'environment_type': 'DOCKER',
        'environment_config': docker_image,
    })
    expected = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=beam_runner_api_pb2.DockerPayload(
            container_image=docker_image).SerializeToString())
    self.assertEqual(PortableRunner._create_environment(options), expected)
def test_sdk_harness_container_images_get_set(self):
    """sdkHarnessContainerImages should be populated and Dataflow-overridden."""
    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp'
    ])
    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    test_environment = DockerEnvironment(container_image='test_default_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=test_environment)

    # We have to manually add environments since Dataflow only sets
    # 'sdkHarnessContainerImages' when there are at least two environments.
    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=beam_runner_api_pb2.DockerPayload(
            container_image='dummy_image').SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)
    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    env = apiclient.Environment(
        [],  # packages
        pipeline_options,
        '2.0.0',  # any environment version
        FAKE_PIPELINE_URL,
        proto_pipeline,
        _sdk_image_overrides={
            '.*dummy.*': 'dummy_image',
            '.*test.*': 'test_default_image'
        })
    worker_pool = env.proto.workerPools[0]

    # For the test, a third environment get added since actual default
    # container image for Dataflow is different from 'test_default_image'
    # we've provided above.
    self.assertEqual(3, len(worker_pool.sdkHarnessContainerImages))

    # Container image should be overridden by a Dataflow specific URL.
    first_image = worker_pool.sdkHarnessContainerImages[0].containerImage
    self.assertTrue(
        str.startswith(first_image, 'gcr.io/cloud-dataflow/v1beta3/python'))
def _create_environment(options):
    """Builds the Environment proto selected by the portable options.

    Supports 'DOCKER' (the default), 'PROCESS', 'EXTERNAL'/'LOOPBACK', and
    raw 'beam:env:*' URNs supplied directly as the environment type.

    Raises:
      ValueError: if the environment type is set but not recognized.
    """
    portable_options = options.view_as(PortableOptions)
    env_type = portable_options.environment_type

    named_urns = {
        'DOCKER': common_urns.environments.DOCKER.urn,
        'PROCESS': common_urns.environments.PROCESS.urn,
        'EXTERNAL': common_urns.environments.EXTERNAL.urn,
        'LOOPBACK': common_urns.environments.EXTERNAL.urn,
    }
    if not env_type:
        # Unset type falls back to Docker.
        environment_urn = common_urns.environments.DOCKER.urn
    elif env_type in named_urns:
        environment_urn = named_urns[env_type]
    elif env_type.startswith('beam:env:'):
        # The type is itself a raw environment URN.
        environment_urn = env_type
    else:
        raise ValueError(
            'Unknown environment type: %s' % portable_options.environment_type)

    if environment_urn == common_urns.environments.DOCKER.urn:
        docker_image = (
            portable_options.environment_config
            or PortableRunner.default_docker_image())
        return beam_runner_api_pb2.Environment(
            url=docker_image,
            urn=common_urns.environments.DOCKER.urn,
            payload=beam_runner_api_pb2.DockerPayload(
                container_image=docker_image).SerializeToString())
    if environment_urn == common_urns.environments.PROCESS.urn:
        # environment_config is expected to hold a JSON process description.
        config = json.loads(portable_options.environment_config)
        return beam_runner_api_pb2.Environment(
            urn=common_urns.environments.PROCESS.urn,
            payload=beam_runner_api_pb2.ProcessPayload(
                os=config.get('os') or '',
                arch=config.get('arch') or '',
                command=config.get('command'),
                env=config.get('env') or '').SerializeToString())
    if environment_urn == common_urns.environments.EXTERNAL.urn:
        # environment_config is the service endpoint URL.
        return beam_runner_api_pb2.Environment(
            urn=common_urns.environments.EXTERNAL.urn,
            payload=beam_runner_api_pb2.ExternalPayload(
                endpoint=endpoints_pb2.ApiServiceDescriptor(
                    url=portable_options.environment_config)
            ).SerializeToString())
    # Raw URN: pass the config through as an opaque payload.
    return beam_runner_api_pb2.Environment(
        urn=environment_urn,
        payload=(portable_options.environment_config.encode('ascii')
                 if portable_options.environment_config else None))
def test_environment_override_translation(self):
    """The worker harness image flag should end up in the pipeline proto."""
    self.default_properties.append('--experiments=beam_fn_api')
    self.default_properties.append('--worker_harness_container_image=FOO')
    remote_runner = DataflowRunner()
    p = Pipeline(
        remote_runner, options=PipelineOptions(self.default_properties))
    (p
     | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
     | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
     | ptransform.GroupByKey())
    p.run()
    expected_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=beam_runner_api_pb2.DockerPayload(
            container_image='FOO').SerializeToString())
    self.assertEqual(
        list(remote_runner.proto_pipeline.components.environments.values()),
        [expected_env])
def test_default_environment_get_set(self):
    """The default environment's image should appear in the worker pool."""
    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp'
    ])
    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    test_environment = DockerEnvironment(container_image='test_default_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=test_environment)

    # Add a second environment so that sdkHarnessContainerImages is populated.
    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=beam_runner_api_pb2.DockerPayload(
            container_image='dummy_image').SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)
    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    env = apiclient.Environment(
        [],  # packages
        pipeline_options,
        '2.0.0',  # any environment version
        FAKE_PIPELINE_URL,
        proto_pipeline,
        _sdk_image_overrides={
            '.*dummy.*': 'dummy_image',
            '.*test.*': 'test_default_image'
        })
    worker_pool = env.proto.workerPools[0]
    self.assertEqual(2, len(worker_pool.sdkHarnessContainerImages))

    images_from_proto = [
        sdk_info.containerImage
        for sdk_info in worker_pool.sdkHarnessContainerImages
    ]
    self.assertIn('test_default_image', images_from_proto)
def test_pipeline_sdk_not_overridden(self):
    """A user-specified harness container image must survive overrides.

    No environment's container image should be replaced with a Dataflow
    repository image when the user explicitly set one.
    """
    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
        '--worker_harness_container_image=dummy_prefix/dummy_name:dummy_tag'
    ])
    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=(
            beam_runner_api_pb2.DockerPayload(
                container_image='dummy_prefix/dummy_name:dummy_tag')
        ).SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)
    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, dict(), pipeline_options)

    # Bug fix: the original used assertIsNotNone(2, len(...)), which only
    # checks that the constant 2 is not None -- the length was silently
    # treated as the assertion *message*. assertEqual performs the intended
    # check on the environment count.
    self.assertEqual(2, len(proto_pipeline.components.environments))

    from apache_beam.utils import proto_utils
    found_override = False
    for env in proto_pipeline.components.environments.values():
        docker_payload = proto_utils.parse_Bytes(
            env.payload, beam_runner_api_pb2.DockerPayload)
        if docker_payload.container_image.startswith(
            names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
            found_override = True

    self.assertFalse(found_override)
def __init__(self, proto=None, default_environment_url=None):
    """Initializes component maps from *proto* and an optional default env.

    Args:
      proto: a Components proto or a ProcessBundleDescriptor (converted to
        Components); may be None, in which case empty maps are created.
      default_environment_url: if set, a Docker environment using this image
        is registered and recorded as the default environment ID.
    """
    if isinstance(proto, beam_fn_api_pb2.ProcessBundleDescriptor):
        # A ProcessBundleDescriptor only carries these three component maps.
        proto = beam_runner_api_pb2.Components(
            coders=dict(proto.coders.items()),
            windowing_strategies=dict(proto.windowing_strategies.items()),
            environments=dict(proto.environments.items()))
    for name, cls in self._COMPONENT_TYPES.items():
        # getattr(None, name, None) -> None when no proto was supplied.
        setattr(
            self, name,
            _PipelineContextMap(self, cls, getattr(proto, name, None)))
    if not default_environment_url:
        self._default_environment_id = None
        return
    default_env_proto = beam_runner_api_pb2.Environment(
        url=default_environment_url,
        urn=common_urns.environments.DOCKER.urn,
        payload=beam_runner_api_pb2.DockerPayload(
            container_image=default_environment_url).SerializeToString())
    self._default_environment_id = self.environments.get_id(
        Environment(default_env_proto))
def to_runner_api_parameter(self, context):
    # type: (PipelineContext) -> Tuple[str, beam_runner_api_pb2.DockerPayload]
    """Returns the Docker environment URN paired with its payload proto."""
    payload = beam_runner_api_pb2.DockerPayload(
        container_image=self.container_image)
    return common_urns.environments.DOCKER.urn, payload
def _create_environment(options):
    """Builds the Environment proto selected by the portable options.

    Supports 'DOCKER' (the default), 'PROCESS', 'EXTERNAL'/'LOOPBACK' (with
    either a plain URL or a JSON config), and raw 'beam:env:*' URNs.

    Raises:
      ValueError: if the environment type is unrecognized, or an EXTERNAL
        JSON config is missing its 'url'.
    """
    portable_options = options.view_as(PortableOptions)
    # Do not set a Runner. Otherwise this can cause problems in Java's
    # PipelineOptions, i.e. ClassNotFoundException, if the corresponding Runner
    # does not exist in the Java SDK. In portability, the entry point is
    # clearly defined via the JobService.
    portable_options.view_as(StandardOptions).runner = None

    env_type = portable_options.environment_type
    named_urns = {
        'DOCKER': common_urns.environments.DOCKER.urn,
        'PROCESS': common_urns.environments.PROCESS.urn,
        'EXTERNAL': common_urns.environments.EXTERNAL.urn,
        'LOOPBACK': common_urns.environments.EXTERNAL.urn,
    }
    if not env_type:
        # Unset type falls back to Docker.
        environment_urn = common_urns.environments.DOCKER.urn
    elif env_type in named_urns:
        environment_urn = named_urns[env_type]
    elif env_type.startswith('beam:env:'):
        # The type is itself a raw environment URN.
        environment_urn = env_type
    else:
        raise ValueError('Unknown environment type: %s' %
                         portable_options.environment_type)

    if environment_urn == common_urns.environments.DOCKER.urn:
        docker_image = (portable_options.environment_config
                        or PortableRunner.default_docker_image())
        return beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=beam_runner_api_pb2.DockerPayload(
                container_image=docker_image).SerializeToString())
    if environment_urn == common_urns.environments.PROCESS.urn:
        # environment_config is expected to hold a JSON process description.
        config = json.loads(portable_options.environment_config)
        return beam_runner_api_pb2.Environment(
            urn=common_urns.environments.PROCESS.urn,
            payload=beam_runner_api_pb2.ProcessPayload(
                os=config.get('os') or '',
                arch=config.get('arch') or '',
                command=config.get('command'),
                env=config.get('env') or '').SerializeToString())
    if environment_urn == common_urns.environments.EXTERNAL.urn:

        def looks_like_json(environment_config):
            import re
            return re.match(r'\s*\{.*\}\s*$', environment_config)

        if looks_like_json(portable_options.environment_config):
            # JSON form: {'url': ..., 'params': ...}
            config = json.loads(portable_options.environment_config)
            url = config.get('url')
            if not url:
                raise ValueError('External environment endpoint must be set.')
            params = config.get('params')
        else:
            # Plain form: the config string is the endpoint URL itself.
            url = portable_options.environment_config
            params = None
        return beam_runner_api_pb2.Environment(
            urn=common_urns.environments.EXTERNAL.urn,
            payload=beam_runner_api_pb2.ExternalPayload(
                endpoint=endpoints_pb2.ApiServiceDescriptor(url=url),
                params=params).SerializeToString())
    # Raw URN: pass the config through as an opaque payload.
    return beam_runner_api_pb2.Environment(
        urn=environment_urn,
        payload=(portable_options.environment_config.encode('ascii')
                 if portable_options.environment_config else None))
def to_runner_api_parameter(self, context):
    """Returns the Docker environment URN paired with its payload proto."""
    payload = beam_runner_api_pb2.DockerPayload(
        container_image=self.container_image)
    return common_urns.environments.DOCKER.urn, payload