Example #1
    def test_parent_pointer(self):
        class MyPTransform(beam.PTransform):
            def expand(self, p):
                self.p = p
                return p | beam.Create([None])

        p = beam.Pipeline()
        p | MyPTransform()  # pylint: disable=expression-not-assigned
        p = Pipeline.from_runner_api(
            Pipeline.to_runner_api(p, use_fake_coders=True), None, None)
        self.assertIsNotNone(p.transforms_stack[0].parts[0].parent)
        self.assertEqual(p.transforms_stack[0].parts[0].parent,
                         p.transforms_stack[0])
Example #2
  def test_parent_pointer(self):
    class MyPTransform(beam.PTransform):

      def expand(self, p):
        self.p = p
        return p | beam.Create([None])

    p = beam.Pipeline()
    p | MyPTransform()  # pylint: disable=expression-not-assigned
    p = Pipeline.from_runner_api(Pipeline.to_runner_api(p), None, None)
    self.assertIsNotNone(p.transforms_stack[0].parts[0].parent)
    self.assertEqual(p.transforms_stack[0].parts[0].parent,
                     p.transforms_stack[0])
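
Both variants exercise the same round trip: the pipeline is serialized to its Runner API proto and rebuilt from it, and the rebuilt transform tree must still carry correct parent pointers. The round-trip pattern in isolation, as a minimal sketch (the trivial pipeline here is illustrative, not from the tests):

    import apache_beam as beam
    from apache_beam.pipeline import Pipeline

    p = beam.Pipeline()
    p | beam.Create([None])  # pylint: disable=expression-not-assigned

    # Serialize to the portable proto, then reconstruct. Runner and options
    # may be None for purely structural checks like the assertions above.
    proto = Pipeline.to_runner_api(p)
    restored = Pipeline.from_runner_api(proto, None, None)
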
Example #3
    def test_sdk_harness_container_images_get_set(self):

        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api', '--experiments=use_unified_worker',
            '--temp_location', 'gs://any-location/temp'
        ])

        pipeline = Pipeline(options=pipeline_options)
        pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

        test_environment = DockerEnvironment(
            container_image='test_default_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)

        # We have to manually add environments since Dataflow only sets
        # 'sdkHarnessContainerImages' when there are at least two environments.
        dummy_env = beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=(beam_runner_api_pb2.DockerPayload(
                container_image='dummy_image')).SerializeToString())
        proto_pipeline.components.environments['dummy_env_id'].CopyFrom(
            dummy_env)

        dummy_transform = beam_runner_api_pb2.PTransform(
            environment_id='dummy_env_id')
        proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
            dummy_transform)

        env = apiclient.Environment(
            [],  # packages
            pipeline_options,
            '2.0.0',  # any environment version
            FAKE_PIPELINE_URL,
            proto_pipeline,
            _sdk_image_overrides={
                '.*dummy.*': 'dummy_image',
                '.*test.*': 'test_default_image'
            })
        worker_pool = env.proto.workerPools[0]

        # For this test, a third environment gets added since the actual
        # default container image for Dataflow is different from the
        # 'test_default_image' we provided above.
        self.assertEqual(3, len(worker_pool.sdkHarnessContainerImages))

        # Container image should be overridden by a Dataflow-specific URL.
        self.assertTrue(
            worker_pool.sdkHarnessContainerImages[0].containerImage.startswith(
                'gcr.io/cloud-dataflow/v1beta3/python'))
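
The container images recorded in a pipeline proto can be recovered by decoding each environment's DockerPayload with proto_utils.parse_Bytes, the same pattern the later examples use inline. A small hedged helper (the function name is ours, not Beam's):

    from apache_beam.portability.api import beam_runner_api_pb2
    from apache_beam.utils import proto_utils

    def container_images(proto_pipeline):
      # Yield the container image of every environment in the proto.
      # Assumes all environments are Docker environments, as in these tests.
      for env in proto_pipeline.components.environments.values():
        payload = proto_utils.parse_Bytes(
            env.payload, beam_runner_api_pb2.DockerPayload)
        yield payload.container_image
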
Example #4
    def test_java_sdk_harness_dedup(self):
        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api', '--experiments=use_unified_worker',
            '--temp_location', 'gs://any-location/temp'
        ])

        pipeline = Pipeline(options=pipeline_options)
        pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

        proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

        dummy_env_1 = beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=(beam_runner_api_pb2.DockerPayload(
                container_image='apache/beam_java:dummy_tag')
                     ).SerializeToString())
        proto_pipeline.components.environments['dummy_env_id_1'].CopyFrom(
            dummy_env_1)

        dummy_transform_1 = beam_runner_api_pb2.PTransform(
            environment_id='dummy_env_id_1')
        proto_pipeline.components.transforms['dummy_transform_id_1'].CopyFrom(
            dummy_transform_1)

        dummy_env_2 = beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=(beam_runner_api_pb2.DockerPayload(
                container_image='apache/beam_java:dummy_tag')
                     ).SerializeToString())
        proto_pipeline.components.environments['dummy_env_id_2'].CopyFrom(
            dummy_env_2)

        dummy_transform_2 = beam_runner_api_pb2.PTransform(
            environment_id='dummy_env_id_2')
        proto_pipeline.components.transforms['dummy_transform_id_2'].CopyFrom(
            dummy_transform_2)

        # Accessing non-public method for testing.
        apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
            proto_pipeline, dict(), pipeline_options)

        # Only one of 'dummy_env_id_1' or 'dummy_env_id_2' should be in the set of
        # environment IDs used by the proto after Java environment de-duping.
        env_ids_from_transforms = [
            proto_pipeline.components.transforms[transform_id].environment_id
            for transform_id in proto_pipeline.components.transforms
        ]
        if 'dummy_env_id_1' in env_ids_from_transforms:
            self.assertNotIn('dummy_env_id_2', env_ids_from_transforms)
        else:
            self.assertIn('dummy_env_id_2', env_ids_from_transforms)
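
The de-duplication being tested treats environments with identical payloads as interchangeable. A conceptual sketch of payload-based de-duplication (our illustration, not Beam's actual implementation):

    def dedup_environments(environments):
      # Map every environment id to one canonical survivor, keyed on the
      # environment's urn and serialized payload.
      seen = {}     # (urn, payload) -> canonical environment id
      mapping = {}  # environment id -> canonical environment id
      for env_id, env in environments.items():
        mapping[env_id] = seen.setdefault((env.urn, env.payload), env_id)
      return mapping

Applied to the two dummy environments above, which share the image 'apache/beam_java:dummy_tag', exactly one of 'dummy_env_id_1' and 'dummy_env_id_2' would survive, which is what the final assertions check.
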
Example #5
    def test_visitor_not_sorted(self):
        p = Pipeline()
        # pylint: disable=expression-not-assigned
        from apache_beam.testing.test_stream import TestStream
        p | TestStream().add_elements(['']) | beam.Map(lambda _: _)

        original_graph = p.to_runner_api(return_context=False)
        out_of_order_graph = p.to_runner_api(return_context=False)

        root_id = out_of_order_graph.root_transform_ids[0]
        root = out_of_order_graph.components.transforms[root_id]
        tmp = root.subtransforms[0]
        root.subtransforms[0] = root.subtransforms[1]
        root.subtransforms[1] = tmp

        p = beam.Pipeline.from_runner_api(out_of_order_graph,
                                          runner='BundleBasedDirectRunner',
                                          options=None)
        v_out_of_order = ConsumerTrackingPipelineVisitor()
        p.visit(v_out_of_order)

        p = beam.Pipeline.from_runner_api(original_graph,
                                          runner='BundleBasedDirectRunner',
                                          options=None)
        v_original = ConsumerTrackingPipelineVisitor()
        p.visit(v_original)

        # Convert keys and transforms to strings so the two dicts can be
        # compared for equality.
        out_of_order_labels = {
            str(k): [str(t) for t in v_out_of_order.value_to_consumers[k]]
            for k in v_out_of_order.value_to_consumers
        }

        original_labels = {
            str(k): [str(t) for t in v_original.value_to_consumers[k]]
            for k in v_original.value_to_consumers
        }
        self.assertDictEqual(out_of_order_labels, original_labels)
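
The element-by-element swap of root.subtransforms in the middle of this test can also be written as a tuple assignment, which works on protobuf repeated fields:

    root.subtransforms[0], root.subtransforms[1] = (
        root.subtransforms[1], root.subtransforms[0])
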
Example #6
    def test_bad_path(self):
        dummy_sdk_file = tempfile.NamedTemporaryFile()
        remote_runner = DataflowRunner()
        pipeline = Pipeline(
            remote_runner,
            options=PipelineOptions([
                '--dataflow_endpoint=ignored',
                '--sdk_location=' + dummy_sdk_file.name, '--job_name=test-job',
                '--project=test-project', '--staging_location=ignored',
                '--temp_location=/dev/null', '--template_location=/bad/path',
                '--no_auth'
            ]))
        remote_runner.job = apiclient.Job(pipeline._options,
                                          pipeline.to_runner_api())

        with self.assertRaises(IOError):
            pipeline.run().wait_until_finish()
Example #7
  def test_default_environment_get_set(self):

    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp'
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    test_environment = DockerEnvironment(container_image='test_default_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=test_environment)

    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=(
            beam_runner_api_pb2.DockerPayload(
                container_image='dummy_image')).SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    env = apiclient.Environment(
        [],  # packages
        pipeline_options,
        '2.0.0',  # any environment version
        FAKE_PIPELINE_URL,
        proto_pipeline,
        _sdk_image_overrides={
            '.*dummy.*': 'dummy_image', '.*test.*': 'test_default_image'
        })
    worker_pool = env.proto.workerPools[0]

    self.assertEqual(2, len(worker_pool.sdkHarnessContainerImages))

    images_from_proto = [
        sdk_info.containerImage
        for sdk_info in worker_pool.sdkHarnessContainerImages
    ]
    self.assertIn('test_default_image', images_from_proto)
Example #8
  def test_bad_path(self):
    dummy_sdk_file = tempfile.NamedTemporaryFile()
    remote_runner = DataflowRunner()
    pipeline = Pipeline(remote_runner,
                        options=PipelineOptions([
                            '--dataflow_endpoint=ignored',
                            '--sdk_location=' + dummy_sdk_file.name,
                            '--job_name=test-job',
                            '--project=test-project',
                            '--staging_location=ignored',
                            '--temp_location=/dev/null',
                            '--template_location=/bad/path',
                            '--no_auth=True']))
    remote_runner.job = apiclient.Job(pipeline._options,
                                      pipeline.to_runner_api())

    with self.assertRaises(IOError):
      pipeline.run().wait_until_finish()
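
Both versions of test_bad_path exercise the same failure mode: with --template_location pointing at an unwritable path, run() tries to write the job template there, and the resulting IOError surfaces through wait_until_finish(). The assertion pattern in isolation, with a plain file write standing in for the template write:

    import unittest

    class BadPathSketch(unittest.TestCase):
      def test_unwritable_template_path(self):
        # Writing under a nonexistent directory raises FileNotFoundError,
        # a subclass of OSError (aliased as IOError in Python 3).
        with self.assertRaises(IOError):
          with open('/bad/path/template.json', 'w') as f:
            f.write('{}')
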
Example #9
  def test_pipeline_sdk_not_overridden(self):
    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
        '--worker_harness_container_image=dummy_prefix/dummy_name:dummy_tag'
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=(
            beam_runner_api_pb2.DockerPayload(
                container_image='dummy_prefix/dummy_name:dummy_tag')
        ).SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, dict(), pipeline_options)

    self.assertEqual(2, len(proto_pipeline.components.environments))

    from apache_beam.utils import proto_utils
    found_override = False
    for env in proto_pipeline.components.environments.values():
      docker_payload = proto_utils.parse_Bytes(
          env.payload, beam_runner_api_pb2.DockerPayload)
      if docker_payload.container_image.startswith(
          names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
        found_override = True

    self.assertFalse(found_override)
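
The override scan above can be collapsed into a single expression using the same names already in scope in the test; an equivalent sketch:

    found_override = any(
        proto_utils.parse_Bytes(env.payload,
                                beam_runner_api_pb2.DockerPayload)
        .container_image.startswith(names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY)
        for env in proto_pipeline.components.environments.values())
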
Example #10
    def test_sdk_harness_container_image_overrides(self):
        if 'sdkHarnessContainerImages' not in dataflow.WorkerPool.__dict__:
            _LOGGER.warning(
                'Skipping test \'test_sdk_harness_container_image_overrides\' since '
                'Dataflow API WorkerPool does not have attribute '
                '\'sdkHarnessContainerImages\'')
            return
        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api',
            '--experiments=use_unified_worker',
            '--temp_location',
            'gs://any-location/temp',
            '--project',
            'dummy_project',
            '--sdk_harness_container_image_overrides',
            '.*dummy.*,new_dummy_container_image',
        ])

        pipeline = Pipeline(options=pipeline_options)

        test_environment = DockerEnvironment(
            container_image='dummy_container_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)
        dataflow_client = apiclient.DataflowApplicationClient(pipeline_options)

        # Accessing non-public method for testing.
        dataflow_client._apply_sdk_environment_overrides(proto_pipeline)

        self.assertEqual(1, len(proto_pipeline.components.environments))
        env = list(proto_pipeline.components.environments.values())[0]

        from apache_beam.utils import proto_utils
        docker_payload = proto_utils.parse_Bytes(
            env.payload, beam_runner_api_pb2.DockerPayload)

        # Container image should be overridden by the given override.
        self.assertEqual(docker_payload.container_image,
                         'new_dummy_container_image')
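
The --sdk_harness_container_image_overrides flag takes a 'regex,replacement' pair, as in the options above. A conceptual sketch of applying such a mapping (our illustration, not Beam's actual implementation):

    import re

    def apply_image_overrides(image, overrides):
      # overrides maps a regex to a replacement image, mirroring the
      # _sdk_image_overrides dicts used elsewhere in these examples.
      for pattern, replacement in overrides.items():
        if re.match(pattern, image):
          return replacement
      return image

    assert apply_image_overrides(
        'dummy_container_image',
        {'.*dummy.*': 'new_dummy_container_image'}
    ) == 'new_dummy_container_image'
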
Example #11
  def test_display_data(self):
    class MyParentTransform(beam.PTransform):
      def expand(self, p):
        self.p = p
        return p | beam.Create([None])

      def display_data(self):  # type: () -> dict
        parent_dd = super(MyParentTransform, self).display_data()
        parent_dd['p_dd_string'] = DisplayDataItem(
            'p_dd_string_value', label='p_dd_string_label')
        parent_dd['p_dd_bool'] = DisplayDataItem(True, label='p_dd_bool_label')
        parent_dd['p_dd_int'] = DisplayDataItem(1, label='p_dd_int_label')
        return parent_dd

    class MyPTransform(MyParentTransform):
      def expand(self, p):
        self.p = p
        return p | beam.Create([None])

      def display_data(self):  # type: () -> dict
        parent_dd = super(MyPTransform, self).display_data()
        parent_dd['dd_string'] = DisplayDataItem(
            'dd_string_value', label='dd_string_label')
        parent_dd['dd_bool'] = DisplayDataItem(False, label='dd_bool_label')
        parent_dd['dd_int'] = DisplayDataItem(1.1, label='dd_int_label')
        return parent_dd

    p = beam.Pipeline()
    p | MyPTransform()  # pylint: disable=expression-not-assigned
    from apache_beam.portability.api import beam_runner_api_pb2

    proto_pipeline = Pipeline.to_runner_api(p, use_fake_coders=True)
    my_transform, = [
        transform
        for transform in proto_pipeline.components.transforms.values()
        if transform.unique_name == 'MyPTransform'
    ]
    self.assertIsNotNone(my_transform)
    self.assertListEqual(
        list(my_transform.display_data),
        [
            beam_runner_api_pb2.DisplayData(
                urn=common_urns.StandardDisplayData.DisplayData.LABELLED.urn,
                payload=beam_runner_api_pb2.LabelledPayload(
                    label='p_dd_string_label',
                    string_value='p_dd_string_value').SerializeToString()),
            beam_runner_api_pb2.DisplayData(
                urn=common_urns.StandardDisplayData.DisplayData.LABELLED.urn,
                payload=beam_runner_api_pb2.LabelledPayload(
                    label='p_dd_bool_label',
                    bool_value=True).SerializeToString()),
            beam_runner_api_pb2.DisplayData(
                urn=common_urns.StandardDisplayData.DisplayData.LABELLED.urn,
                payload=beam_runner_api_pb2.LabelledPayload(
                    label='p_dd_int_label',
                    double_value=1).SerializeToString()),
            beam_runner_api_pb2.DisplayData(
                urn=common_urns.StandardDisplayData.DisplayData.LABELLED.urn,
                payload=beam_runner_api_pb2.LabelledPayload(
                    label='dd_string_label',
                    string_value='dd_string_value').SerializeToString()),
            beam_runner_api_pb2.DisplayData(
                urn=common_urns.StandardDisplayData.DisplayData.LABELLED.urn,
                payload=beam_runner_api_pb2.LabelledPayload(
                    label='dd_bool_label',
                    bool_value=False).SerializeToString()),
            beam_runner_api_pb2.DisplayData(
                urn=common_urns.StandardDisplayData.DisplayData.LABELLED.urn,
                payload=beam_runner_api_pb2.LabelledPayload(
                    label='dd_int_label',
                    double_value=1.1).SerializeToString()),
        ])
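
The assertions imply a mapping from Python display-data values to LabelledPayload fields: strings populate string_value, booleans bool_value, and numbers double_value (note that the int item 1 surfaces as double_value=1). A hypothetical helper capturing that mapping:

    from apache_beam.portability.api import beam_runner_api_pb2

    def to_labelled_payload(label, value):
      # Hypothetical helper, not Beam's API. bool is tested before
      # int/float because bool is a subclass of int in Python.
      kwargs = {'label': label}
      if isinstance(value, bool):
        kwargs['bool_value'] = value
      elif isinstance(value, str):
        kwargs['string_value'] = value
      elif isinstance(value, (int, float)):
        kwargs['double_value'] = float(value)
      return beam_runner_api_pb2.LabelledPayload(**kwargs)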