def test_flexrs_cost(self):
  pipeline_options = PipelineOptions(
      ['--flexrs_goal', 'COST_OPTIMIZED',
       '--temp_location', 'gs://any-location/temp'])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  self.assertEqual(
      env.proto.flexResourceSchedulingGoal,
      (dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
       FLEXRS_COST_OPTIMIZED))
def test_set_subnetwork(self):
  pipeline_options = PipelineOptions([
      '--subnetwork', '/regions/MY/subnetworks/SUBNETWORK',
      '--temp_location', 'gs://any-location/temp'
  ])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  self.assertEqual(
      env.proto.workerPools[0].subnetwork,
      '/regions/MY/subnetworks/SUBNETWORK')
def test_harness_override_uses_base_version_in_rc_releases(self):
  pipeline_options = PipelineOptions(
      ['--temp_location', 'gs://any-location/temp', '--streaming'])
  override = ''.join([
      'runner_harness_container_image=',
      names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY,
      '/harness:2.2.0'
  ])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  self.assertIn(override, env.proto.experiments)
def test_harness_override_absent_in_released_sdks_with_runner_v2(self):
  pipeline_options = PipelineOptions([
      '--temp_location',
      'gs://any-location/temp',
      '--streaming',
      '--experiments=use_runner_v2'
  ])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  if env.proto.experiments:
    for experiment in env.proto.experiments:
      self.assertNotIn('runner_harness_container_image=', experiment)
def test_sdk_harness_container_images_get_set(self):
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp'
  ])
  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

  test_environment = DockerEnvironment(container_image='test_default_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  # We have to manually add environments since Dataflow only sets
  # 'sdkHarnessContainerImages' when there are at least two environments.
  dummy_env = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=(beam_runner_api_pb2.DockerPayload(
          container_image='dummy_image')).SerializeToString())
  proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

  dummy_transform = beam_runner_api_pb2.PTransform(
      environment_id='dummy_env_id')
  proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
      dummy_transform)

  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL,
      proto_pipeline,
      _sdk_image_overrides={
          '.*dummy.*': 'dummy_image', '.*test.*': 'test_default_image'
      })
  worker_pool = env.proto.workerPools[0]

  # For the test, a third environment gets added since the actual default
  # container image for Dataflow is different from the 'test_default_image'
  # we provided above.
  self.assertEqual(3, len(worker_pool.sdkHarnessContainerImages))

  # Container image should be overridden by a Dataflow-specific URL.
  self.assertTrue(
      str.startswith(
          (worker_pool.sdkHarnessContainerImages[0]).containerImage,
          'gcr.io/cloud-dataflow/v1beta3/python'))
def test_pinned_worker_harness_image_tag_used_in_dev_sdk(self):
  # streaming, fnapi pipeline.
  pipeline_options = PipelineOptions(
      ['--temp_location', 'gs://any-location/temp', '--streaming'])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  if sys.version_info[0] == 3:
    self.assertEqual(
        env.proto.workerPools[0].workerHarnessContainerImage,
        (names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY + '/python3-fnapi:' +
         names.BEAM_FNAPI_CONTAINER_VERSION))
  else:
    self.assertEqual(
        env.proto.workerPools[0].workerHarnessContainerImage,
        (names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY + '/python-fnapi:' +
         names.BEAM_FNAPI_CONTAINER_VERSION))

  # batch, legacy pipeline.
  pipeline_options = PipelineOptions(
      ['--temp_location', 'gs://any-location/temp'])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  if sys.version_info[0] == 3:
    self.assertEqual(
        env.proto.workerPools[0].workerHarnessContainerImage,
        (names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY + '/python3:' +
         names.BEAM_CONTAINER_VERSION))
  else:
    self.assertEqual(
        env.proto.workerPools[0].workerHarnessContainerImage,
        (names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY + '/python:' +
         names.BEAM_CONTAINER_VERSION))
def test_enable_hot_key_logging(self):
  # Tests that enable_hot_key_logging is not set by default.
  pipeline_options = PipelineOptions(
      ['--temp_location', 'gs://any-location/temp'])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  self.assertIsNone(env.proto.debugOptions)

  # Now test that it is set when given.
  pipeline_options = PipelineOptions([
      '--enable_hot_key_logging',
      '--temp_location', 'gs://any-location/temp'
  ])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  self.assertEqual(
      env.proto.debugOptions,
      dataflow.DebugOptions(enableHotKeyLogging=True))
def test_worker_harness_override_takes_precedence_over_sdk_defaults(self):
  # streaming, fnapi pipeline.
  pipeline_options = PipelineOptions([
      '--temp_location',
      'gs://any-location/temp',
      '--streaming',
      '--worker_harness_container_image=some:image'
  ])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  self.assertEqual(
      env.proto.workerPools[0].workerHarnessContainerImage, 'some:image')

  # batch, legacy pipeline.
  pipeline_options = PipelineOptions([
      '--temp_location',
      'gs://any-location/temp',
      '--worker_harness_container_image=some:image'
  ])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  self.assertEqual(
      env.proto.workerPools[0].workerHarnessContainerImage, 'some:image')
def test_harness_override_present_in_dataflow_distributions(self):
  pipeline_options = PipelineOptions(
      ['--temp_location', 'gs://any-location/temp', '--streaming'])
  override = ''.join([
      'runner_harness_container_image=',
      dependency.DATAFLOW_CONTAINER_IMAGE_REPOSITORY,
      '/harness:2.2.0'
  ])
  distribution = pkg_resources.Distribution(version='2.2.0')
  with mock.patch(
      'apache_beam.runners.dataflow.internal.dependency.pkg_resources'
      '.get_distribution',
      mock.MagicMock(return_value=distribution)):
    env = apiclient.Environment([], pipeline_options, '2.2.0')
    self.assertIn(override, env.proto.experiments)
def test_worker_harness_image_tag_matches_released_sdk_version(self):
  # streaming, fnapi pipeline.
  pipeline_options = PipelineOptions(
      ['--temp_location', 'gs://any-location/temp', '--streaming'])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  self.assertEqual(
      env.proto.workerPools[0].workerHarnessContainerImage,
      (names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY + '/python-fnapi:2.2.0'))

  # batch, legacy pipeline.
  pipeline_options = PipelineOptions(
      ['--temp_location', 'gs://any-location/temp'])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  self.assertEqual(
      env.proto.workerPools[0].workerHarnessContainerImage,
      (names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY + '/python:2.2.0'))
def test_harness_override_absent_in_unreleased_sdk(self):
  pipeline_options = PipelineOptions(
      ['--temp_location', 'gs://any-location/temp', '--streaming'])
  with mock.patch(
      'apache_beam.runners.dataflow.internal.dependency.pkg_resources'
      '.get_distribution',
      mock.Mock(side_effect=pkg_resources.DistributionNotFound())):
    env = apiclient.Environment(
        [],  # packages
        pipeline_options,
        '2.0.0')  # any environment version
    if env.proto.experiments:
      for experiment in env.proto.experiments:
        self.assertNotIn('runner_harness_container_image=', experiment)
def test_harness_override_present_in_beam_releases(self):
  pipeline_options = PipelineOptions(
      ['--temp_location', 'gs://any-location/temp', '--streaming'])
  override = ''.join([
      'runner_harness_container_image=',
      dependency.DATAFLOW_CONTAINER_IMAGE_REPOSITORY,
      '/harness:2.2.0'
  ])
  with mock.patch(
      'apache_beam.runners.dataflow.internal.dependency.pkg_resources'
      '.get_distribution',
      mock.Mock(side_effect=pkg_resources.DistributionNotFound())):
    env = apiclient.Environment(
        [],  # packages
        pipeline_options,
        '2.0.0')  # any environment version
    self.assertIn(override, env.proto.experiments)
def test_default_environment_get_set(self):
  pipeline_options = PipelineOptions([
      '--experiments=beam_fn_api',
      '--experiments=use_unified_worker',
      '--temp_location',
      'gs://any-location/temp'
  ])
  pipeline = Pipeline(options=pipeline_options)
  pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

  test_environment = DockerEnvironment(container_image='test_default_image')
  proto_pipeline, _ = pipeline.to_runner_api(
      return_context=True, default_environment=test_environment)

  dummy_env = beam_runner_api_pb2.Environment(
      urn=common_urns.environments.DOCKER.urn,
      payload=(beam_runner_api_pb2.DockerPayload(
          container_image='dummy_image')).SerializeToString())
  proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

  dummy_transform = beam_runner_api_pb2.PTransform(
      environment_id='dummy_env_id')
  proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
      dummy_transform)

  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL,
      proto_pipeline,
      _sdk_image_overrides={
          '.*dummy.*': 'dummy_image', '.*test.*': 'test_default_image'
      })
  worker_pool = env.proto.workerPools[0]
  self.assertEqual(2, len(worker_pool.sdkHarnessContainerImages))

  images_from_proto = [
      sdk_info.containerImage
      for sdk_info in worker_pool.sdkHarnessContainerImages
  ]
  self.assertIn('test_default_image', images_from_proto)
def test_harness_override_custom_in_released_sdks(self):
  pipeline_options = PipelineOptions([
      '--temp_location',
      'gs://any-location/temp',
      '--streaming',
      '--experiments=runner_harness_container_image=fake_image'
  ])
  env = apiclient.Environment(
      [],  # packages
      pipeline_options,
      '2.0.0',  # any environment version
      FAKE_PIPELINE_URL)
  self.assertEqual(
      1,
      len([
          x for x in env.proto.experiments
          if x.startswith('runner_harness_container_image=')
      ]))
  self.assertIn(
      'runner_harness_container_image=fake_image', env.proto.experiments)
def test_get_python_sdk_name(self):
  pipeline_options = PipelineOptions([
      '--project', 'test_project',
      '--job_name', 'test_job_name',
      '--temp_location', 'gs://test-location/temp',
      '--experiments', 'beam_fn_api',
      '--experiments', 'use_multiple_sdk_containers'
  ])
  environment = apiclient.Environment(
      [], pipeline_options, 1, FAKE_PIPELINE_URL)
  self.assertEqual(
      'Apache Beam Python 3.5 SDK', environment._get_python_sdk_name())
def test_default_ip_configuration(self):
  pipeline_options = PipelineOptions(
      ['--temp_location', 'gs://any-location/temp'])
  env = apiclient.Environment([], pipeline_options, '2.0.0')
  self.assertEqual(env.proto.workerPools[0].ipConfiguration, None)