def test_parent_pointer(self):
    """Check that parent links are present after a runner-API round trip.

    A composite transform is applied, the pipeline is converted to its
    runner-API form (with ``use_fake_coders=True``) and rebuilt, and the
    first part of the first ``transforms_stack`` entry must reference that
    entry as its parent.
    """
    class MyPTransform(beam.PTransform):
        def expand(self, p):
            self.p = p
            return p | beam.Create([None])

    p = beam.Pipeline()
    p | MyPTransform()  # pylint: disable=expression-not-assigned

    serialized = Pipeline.to_runner_api(p, use_fake_coders=True)
    p = Pipeline.from_runner_api(serialized, None, None)

    outer = p.transforms_stack[0]
    first_part = outer.parts[0]
    self.assertIsNotNone(first_part.parent)
    self.assertEqual(first_part.parent, outer)
def test_parent_pointer(self):
    """Check that parent links are present after a runner-API round trip.

    A composite transform is applied, the pipeline is converted to its
    runner-API form and rebuilt, and the first part of the first
    ``transforms_stack`` entry must reference that entry as its parent.
    """
    class MyPTransform(beam.PTransform):
        def expand(self, p):
            self.p = p
            return p | beam.Create([None])

    p = beam.Pipeline()
    p | MyPTransform()  # pylint: disable=expression-not-assigned
    p = Pipeline.from_runner_api(Pipeline.to_runner_api(p), None, None)
    self.assertIsNotNone(p.transforms_stack[0].parts[0].parent)
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # unittest releases no longer provide.
    self.assertEqual(
        p.transforms_stack[0].parts[0].parent, p.transforms_stack[0])
def test_sdk_harness_container_images_get_set(self):
    """Check that the worker pool lists the SDK harness container images.

    Builds a pipeline proto with a custom default Docker environment plus a
    manually added second environment, wraps it in ``apiclient.Environment``
    and inspects ``sdkHarnessContainerImages`` on the first worker pool.
    """
    options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
    ])
    pipeline = Pipeline(options=options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    default_env = DockerEnvironment(container_image='test_default_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=default_env)

    # Environments have to be added manually because Dataflow only sets
    # 'sdkHarnessContainerImages' when there are at least two environments.
    components = proto_pipeline.components
    dummy_payload = beam_runner_api_pb2.DockerPayload(
        container_image='dummy_image')
    components.environments['dummy_env_id'].CopyFrom(
        beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=dummy_payload.SerializeToString()))
    components.transforms['dummy_transform_id'].CopyFrom(
        beam_runner_api_pb2.PTransform(environment_id='dummy_env_id'))

    image_overrides = {
        '.*dummy.*': 'dummy_image',
        '.*test.*': 'test_default_image',
    }
    env = apiclient.Environment(
        [],  # packages
        options,
        '2.0.0',  # any environment version
        FAKE_PIPELINE_URL,
        proto_pipeline,
        _sdk_image_overrides=image_overrides)
    harness_images = env.proto.workerPools[0].sdkHarnessContainerImages

    # For this test a third environment gets added, since the actual default
    # container image for Dataflow differs from the 'test_default_image'
    # provided above.
    self.assertEqual(3, len(harness_images))

    # The container image should be overridden by a Dataflow-specific URL.
    first_image = harness_images[0].containerImage
    self.assertTrue(
        first_image.startswith('gcr.io/cloud-dataflow/v1beta3/python'))
def test_java_sdk_harness_dedup(self):
    """Check that duplicate Java environments collapse to a single id.

    Two environment ids carrying the same ``apache/beam_java:dummy_tag``
    image are added to the pipeline proto, each referenced by its own
    placeholder transform.  After the SDK environment overrides are applied,
    exactly one of the two ids may still be referenced by the transforms.
    """
    options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
    ])
    pipeline = Pipeline(options=options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    proto_pipeline, _ = pipeline.to_runner_api(return_context=True)
    components = proto_pipeline.components

    dummy_ids = (
        ('dummy_env_id_1', 'dummy_transform_id_1'),
        ('dummy_env_id_2', 'dummy_transform_id_2'),
    )
    for env_id, transform_id in dummy_ids:
        java_payload = beam_runner_api_pb2.DockerPayload(
            container_image='apache/beam_java:dummy_tag')
        components.environments[env_id].CopyFrom(
            beam_runner_api_pb2.Environment(
                urn=common_urns.environments.DOCKER.urn,
                payload=java_payload.SerializeToString()))
        components.transforms[transform_id].CopyFrom(
            beam_runner_api_pb2.PTransform(environment_id=env_id))

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, {}, options)

    # Only one of 'dummy_env_id_1' or 'dummy_env_id_2' should be in the set
    # of environment IDs used by the proto after Java environment de-duping.
    used_env_ids = [
        transform.environment_id
        for transform in components.transforms.values()
    ]
    first_used = 'dummy_env_id_1' in used_env_ids
    second_used = 'dummy_env_id_2' in used_env_ids
    if first_used:
        self.assertTrue(not second_used)
    else:
        self.assertTrue(second_used)
def test_visitor_not_sorted(self):
    """Check the consumer-tracking visitor ignores subtransform order.

    The same pipeline is serialized twice; in one copy the first two
    subtransforms of the first root transform are swapped.  Both copies are
    rebuilt and visited, and the stringified ``value_to_consumers`` mappings
    must be equal.
    """
    from apache_beam.testing.test_stream import TestStream

    def rebuild_and_visit(graph):
        # Rebuild a pipeline from the proto and run the visitor over it.
        rebuilt = beam.Pipeline().from_runner_api(
            graph, runner='BundleBasedDirectRunner', options=None)
        visitor = ConsumerTrackingPipelineVisitor()
        rebuilt.visit(visitor)
        return visitor

    def as_labels(visitor):
        # Convert keys and consumers to strings so they can be compared.
        return {
            str(value): [str(consumer) for consumer in consumers]
            for value, consumers in visitor.value_to_consumers.items()
        }

    p = Pipeline()
    p | TestStream().add_elements(['']) | beam.Map(lambda _: _)  # pylint: disable=expression-not-assigned

    original_graph = p.to_runner_api(return_context=False)
    out_of_order_graph = p.to_runner_api(return_context=False)

    # Swap the first two subtransforms of the first root transform.
    root_id = out_of_order_graph.root_transform_ids[0]
    subtransforms = out_of_order_graph.components.transforms[
        root_id].subtransforms
    subtransforms[0], subtransforms[1] = subtransforms[1], subtransforms[0]

    v_out_of_order = rebuild_and_visit(out_of_order_graph)
    v_original = rebuild_and_visit(original_graph)

    self.assertDictEqual(as_labels(v_out_of_order), as_labels(v_original))
def test_bad_path(self):
    """Expect IOError when running with ``--template_location=/bad/path``.

    A placeholder temporary file stands in for the SDK location.  It is
    managed by a ``with`` block so the handle is closed even if the test
    fails part-way through.
    """
    with tempfile.NamedTemporaryFile() as dummy_sdk_file:
        remote_runner = DataflowRunner()
        pipeline = Pipeline(
            remote_runner,
            options=PipelineOptions([
                '--dataflow_endpoint=ignored',
                '--sdk_location=' + dummy_sdk_file.name,
                '--job_name=test-job',
                '--project=test-project',
                '--staging_location=ignored',
                '--temp_location=/dev/null',
                '--template_location=/bad/path',
                '--no_auth'
            ]))
        remote_runner.job = apiclient.Job(
            pipeline._options, pipeline.to_runner_api())

        with self.assertRaises(IOError):
            pipeline.run().wait_until_finish()
def test_default_environment_get_set(self):
    """Check the custom default environment image reaches the worker pool.

    The pipeline proto is built with ``test_default_image`` as the default
    Docker environment plus one manually added dummy environment; the
    resulting worker pool must list two harness images, one of which is
    ``test_default_image``.
    """
    options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
    ])
    pipeline = Pipeline(options=options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    default_env = DockerEnvironment(container_image='test_default_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=default_env)

    # Add a second environment and a transform that refers to it.
    components = proto_pipeline.components
    dummy_payload = beam_runner_api_pb2.DockerPayload(
        container_image='dummy_image')
    components.environments['dummy_env_id'].CopyFrom(
        beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=dummy_payload.SerializeToString()))
    components.transforms['dummy_transform_id'].CopyFrom(
        beam_runner_api_pb2.PTransform(environment_id='dummy_env_id'))

    image_overrides = {
        '.*dummy.*': 'dummy_image',
        '.*test.*': 'test_default_image',
    }
    env = apiclient.Environment(
        [],  # packages
        options,
        '2.0.0',  # any environment version
        FAKE_PIPELINE_URL,
        proto_pipeline,
        _sdk_image_overrides=image_overrides)

    harness_images = env.proto.workerPools[0].sdkHarnessContainerImages
    self.assertEqual(2, len(harness_images))
    self.assertIn(
        'test_default_image',
        [sdk_info.containerImage for sdk_info in harness_images])
def test_bad_path(self):
    """Expect IOError when running with ``--template_location=/bad/path``.

    A placeholder temporary file stands in for the SDK location.  It is
    managed by a ``with`` block so the handle is closed even if the test
    fails part-way through.
    """
    with tempfile.NamedTemporaryFile() as dummy_sdk_file:
        remote_runner = DataflowRunner()
        pipeline = Pipeline(
            remote_runner,
            options=PipelineOptions([
                '--dataflow_endpoint=ignored',
                '--sdk_location=' + dummy_sdk_file.name,
                '--job_name=test-job',
                '--project=test-project',
                '--staging_location=ignored',
                '--temp_location=/dev/null',
                '--template_location=/bad/path',
                '--no_auth=True'
            ]))
        remote_runner.job = apiclient.Job(
            pipeline._options, pipeline.to_runner_api())

        with self.assertRaises(IOError):
            pipeline.run().wait_until_finish()
def test_pipeline_sdk_not_overridden(self):
    """Check a user-supplied container image is left untouched.

    The pipeline is configured with an explicit
    ``--worker_harness_container_image`` and an extra environment using the
    same image.  After the SDK environment overrides are applied, no
    environment's container image may start with the Dataflow container
    image repository prefix.
    """
    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
        '--worker_harness_container_image=dummy_prefix/dummy_name:dummy_tag'
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=(
            beam_runner_api_pb2.DockerPayload(
                container_image='dummy_prefix/dummy_name:dummy_tag')
        ).SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, {}, pipeline_options)

    # This used to be assertIsNotNone(2, len(...)), which can never fail
    # because the literal 2 is not None; compare the count instead.
    self.assertEqual(2, len(proto_pipeline.components.environments))

    from apache_beam.utils import proto_utils
    found_override = False
    for env in proto_pipeline.components.environments.values():
        docker_payload = proto_utils.parse_Bytes(
            env.payload, beam_runner_api_pb2.DockerPayload)
        if docker_payload.container_image.startswith(
                names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
            found_override = True

    self.assertFalse(found_override)
def test_sdk_harness_container_image_overrides(self):
    """Check ``--sdk_harness_container_image_overrides`` rewrites the image.

    The default environment uses ``dummy_container_image``; the option maps
    the pattern ``.*dummy.*`` to ``new_dummy_container_image``.  After the
    overrides are applied, the single environment in the proto must carry
    the new image.  The test returns early (with a warning) when the
    Dataflow ``WorkerPool`` message lacks ``sdkHarnessContainerImages``.
    """
    if 'sdkHarnessContainerImages' not in dataflow.WorkerPool.__dict__:
        _LOGGER.warning(
            'Skipping test \'test_sdk_harness_container_image_overrides\' since '
            'Dataflow API WorkerPool does not have attribute '
            '\'sdkHarnessContainerImages\'')
        return

    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
        '--project',
        'dummy_project',
        '--sdk_harness_container_image_overrides',
        '.*dummy.*,new_dummy_container_image',
    ])

    pipeline = Pipeline(options=pipeline_options)

    test_environment = DockerEnvironment(
        container_image='dummy_container_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=test_environment)
    dataflow_client = apiclient.DataflowApplicationClient(pipeline_options)

    # Accessing non-public method for testing.
    dataflow_client._apply_sdk_environment_overrides(proto_pipeline)

    # This used to be assertIsNotNone(1, len(...)), which can never fail
    # because the literal 1 is not None; compare the count instead.
    self.assertEqual(1, len(proto_pipeline.components.environments))
    env = list(proto_pipeline.components.environments.values())[0]

    from apache_beam.utils import proto_utils
    docker_payload = proto_utils.parse_Bytes(
        env.payload, beam_runner_api_pb2.DockerPayload)

    # Container image should be overridden by the given override.
    self.assertEqual(
        docker_payload.container_image, 'new_dummy_container_image')
def test_display_data(self):
    """Check display data from a transform hierarchy is put in the proto.

    ``MyPTransform`` extends ``MyParentTransform``; each adds a string, a
    bool and a numeric display-data item.  The proto transform named
    'MyPTransform' must list the parent's items followed by the child's,
    each encoded as a LABELLED payload.
    """
    class MyParentTransform(beam.PTransform):
        def expand(self, p):
            self.p = p
            return p | beam.Create([None])

        def display_data(self):
            # type: () -> dict
            items = super(MyParentTransform, self).display_data()
            items['p_dd_string'] = DisplayDataItem(
                'p_dd_string_value', label='p_dd_string_label')
            items['p_dd_bool'] = DisplayDataItem(True, label='p_dd_bool_label')
            items['p_dd_int'] = DisplayDataItem(1, label='p_dd_int_label')
            return items

    class MyPTransform(MyParentTransform):
        def expand(self, p):
            self.p = p
            return p | beam.Create([None])

        def display_data(self):
            # type: () -> dict
            items = super(MyPTransform, self).display_data()
            items['dd_string'] = DisplayDataItem(
                'dd_string_value', label='dd_string_label')
            items['dd_bool'] = DisplayDataItem(False, label='dd_bool_label')
            items['dd_int'] = DisplayDataItem(1.1, label='dd_int_label')
            return items

    p = beam.Pipeline()
    p | MyPTransform()  # pylint: disable=expression-not-assigned

    from apache_beam.portability.api import beam_runner_api_pb2

    def labelled(**payload_fields):
        # Build one expected DisplayData entry with a LABELLED payload.
        payload = beam_runner_api_pb2.LabelledPayload(**payload_fields)
        return beam_runner_api_pb2.DisplayData(
            urn=common_urns.StandardDisplayData.DisplayData.LABELLED.urn,
            payload=payload.SerializeToString())

    proto_pipeline = Pipeline.to_runner_api(p, use_fake_coders=True)
    matches = [
        transform
        for transform in proto_pipeline.components.transforms.values()
        if transform.unique_name == 'MyPTransform'
    ]
    # Unpacking fails unless exactly one transform carries that name.
    my_transform, = matches
    self.assertIsNotNone(my_transform)

    expected = [
        labelled(label='p_dd_string_label', string_value='p_dd_string_value'),
        labelled(label='p_dd_bool_label', bool_value=True),
        labelled(label='p_dd_int_label', double_value=1),
        labelled(label='dd_string_label', string_value='dd_string_value'),
        labelled(label='dd_bool_label', bool_value=False),
        labelled(label='dd_int_label', double_value=1.1),
    ]
    self.assertListEqual(list(my_transform.display_data), expected)