Example #1
0
    def record_pipeline(self):
        # type: () -> bool
        """Tries to launch the background caching job for the user pipeline.

        Returns:
          True when ``attempt_to_run_background_caching_job`` reports a truthy
          result, in which case ``self._start_time_sec`` is set to the current
          time; False otherwise.
        """
        # An InteractiveRunner is unwrapped so that the job is handed to the
        # runner it delegates to.
        target_runner = self.user_pipeline.runner
        if isinstance(target_runner, ir.InteractiveRunner):
            target_runner = target_runner._underlying_runner

        # Sources the user never bound to a variable must still be cached, so
        # register the pipeline and watch its sources explicitly.
        ie.current_env().add_user_pipeline(self.user_pipeline)
        utils.watch_sources(self.user_pipeline)

        # Reading `<pipeline>.options` below is deprecated; silence that
        # specific DeprecationWarning before touching the attribute.
        warnings.filterwarnings(
            'ignore',
            'options is deprecated since First stable release. References to '
            '<pipeline>.options will not be supported',
            category=DeprecationWarning)

        job_started = bcj.attempt_to_run_background_caching_job(
            target_runner,
            self.user_pipeline,
            options=self.user_pipeline.options,
            limiters=self._test_limiters)
        if not job_started:
            return False

        self._start_time_sec = time.time()
        return True
    def test_instrument_example_unbounded_pipeline_to_multiple_read_cache(
            self):
        """Tests that the instrumenter works for multiple unbounded sources.

        Two PubSub reads are each squared, every local PCollection gets a mocked
        cache entry, and the instrumented pipeline is then compared against an
        expected pipeline in which both sources come from one TestStream.
        """
        # Create the pipeline that will be instrumented.
        p_original = beam.Pipeline(interactive_runner.InteractiveRunner())
        ie.current_env().set_cache_manager(StreamingCache(cache_dir=None),
                                           p_original)
        source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        source_2 = p_original | 'source2' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        # pylint: disable=possibly-unused-variable
        pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)
        # pylint: disable=possibly-unused-variable
        pcoll_2 = source_2 | 'square2' >> beam.Map(lambda x: x * x)

        # Mock as if cacheable PCollections are cached.
        # NOTE: the local variable names matter here, since locals() is what is
        # handed to watch() and what drives the cache keys computed below.
        ib.watch(locals())
        # This should be noop.
        utils.watch_sources(p_original)
        # Only locals that are PCollections receive a mocked (empty) cache entry.
        for name, pcoll in locals().items():
            if not isinstance(pcoll, beam.pvalue.PCollection):
                continue
            cache_key = self.cache_key_of(name, pcoll)
            self._mock_write_cache(p_original, [], cache_key)

        # Instrument the original pipeline to create the pipeline the user will see.
        instrumenter = instr.build_pipeline_instrument(p_original)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=None)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        source_1_cache_key = self.cache_key_of('source_1', source_1)
        source_2_cache_key = self.cache_key_of('source_2', source_2)
        p_expected = beam.Pipeline()
        test_stream = (p_expected
                       | TestStream(output_tags=[
                           self.cache_key_of('source_1', source_1),
                           self.cache_key_of('source_2', source_2)
                       ]))
        # pylint: disable=expression-not-assigned
        test_stream[source_1_cache_key] | 'square1' >> beam.Map(
            lambda x: x * x)
        # pylint: disable=expression-not-assigned
        test_stream[source_2_cache_key] | 'square2' >> beam.Map(
            lambda x: x * x)

        # Test that the TestStream is outputting to the correct PCollection.
        # The visitor remembers the output_tags of the TestStream transform it
        # encounters (the last one seen, if there are several).
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                # Composite transforms get the same check as leaf transforms.
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([source_1_cache_key, source_2_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(self, p_expected.to_runner_api(),
                                    instrumenter.instrumented_pipeline_proto())
    def test_instrument_example_unbounded_pipeline_to_read_cache_not_cached(
            self):
        """Tests that the instrumenter works when the PCollection is not cached.

        A streaming pipeline with one PubSub source and one Map is watched but
        nothing is written to the cache. The expected pipeline reads the source
        from a TestStream and ends in a WriteCache for the mapped PCollection.
        """
        # Create the pipeline that will be instrumented.
        from apache_beam.options.pipeline_options import StandardOptions
        options = StandardOptions(streaming=True)
        p_original_read_cache = beam.Pipeline(
            interactive_runner.InteractiveRunner(), options)
        ie.current_env().set_cache_manager(StreamingCache(cache_dir=None),
                                           p_original_read_cache)
        source_1 = p_original_read_cache | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        # pylint: disable=possibly-unused-variable
        pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)

        # Watch but do not cache the PCollections.
        # NOTE: locals() is passed to watch(), so the local variable names above
        # are part of the test's behavior.
        ib.watch(locals())
        # This should be noop.
        utils.watch_sources(p_original_read_cache)
        # Instrument the original pipeline to create the pipeline the user will see.
        # The instrumenter works on a copy that is registered as derived from the
        # original pipeline.
        p_copy = beam.Pipeline.from_runner_api(
            p_original_read_cache.to_runner_api(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        ie.current_env().add_derived_pipeline(p_original_read_cache, p_copy)
        instrumenter = instr.build_pipeline_instrument(p_copy)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        source_1_cache_key = self.cache_key_of('source_1', source_1)
        p_expected = beam.Pipeline()
        ie.current_env().set_cache_manager(StreamingCache(cache_dir=None),
                                           p_expected)
        test_stream = (p_expected
                       | TestStream(output_tags=[source_1_cache_key]))
        # pylint: disable=expression-not-assigned
        (test_stream[source_1_cache_key]
         | 'square1' >> beam.Map(lambda x: x * x)
         | 'reify' >> beam.Map(lambda _: _)
         | cache.WriteCache(ie.current_env().get_cache_manager(p_expected),
                            'unused'))

        # Test that the TestStream is outputting to the correct PCollection.
        # The visitor remembers the output_tags of the TestStream transform it
        # encounters (the last one seen, if there are several).
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                # Composite transforms get the same check as leaf transforms.
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([source_1_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(self, p_expected.to_runner_api(),
                                    instrumenter.instrumented_pipeline_proto())
    def test_instrument_mixed_streaming_batch(self):
        """Tests caching for both batch and streaming sources in the same pipeline.

        This ensures that cached bounded and unbounded sources are read from the
        TestStream.
        """
        # Create the pipeline that will be instrumented.
        from apache_beam.options.pipeline_options import StandardOptions
        options = StandardOptions(streaming=True)
        p_original = beam.Pipeline(interactive_runner.InteractiveRunner(),
                                   options)
        streaming_cache_manager = StreamingCache(cache_dir=None)
        ie.current_env().set_cache_manager(streaming_cache_manager, p_original)
        # source_1 is a PubSub read; source_2 is an in-memory Create.
        source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        source_2 = p_original | 'source2' >> beam.Create([1, 2, 3, 4, 5])

        # pylint: disable=possibly-unused-variable
        pcoll_1 = ((source_1, source_2)
                   | beam.Flatten()
                   | 'square1' >> beam.Map(lambda x: x * x))

        # Watch the PCollections. Only source_2 is mocked as cached (with an
        # empty cache entry) and marked as computed below.
        # NOTE: locals() is passed to watch(), so the local variable names above
        # are part of the test's behavior.
        ib.watch(locals())
        # This should be noop.
        utils.watch_sources(p_original)
        self._mock_write_cache(p_original, [],
                               self.cache_key_of('source_2', source_2))
        ie.current_env().mark_pcollection_computed([source_2])

        # Instrument the original pipeline to create the pipeline the user will see.
        # The instrumenter works on a copy that is registered as derived from the
        # original pipeline.
        p_copy = beam.Pipeline.from_runner_api(
            p_original.to_runner_api(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        ie.current_env().add_derived_pipeline(p_original, p_copy)
        instrumenter = instr.build_pipeline_instrument(p_copy)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        source_1_cache_key = self.cache_key_of('source_1', source_1)
        source_2_cache_key = self.cache_key_of('source_2', source_2)
        p_expected = beam.Pipeline()
        ie.current_env().set_cache_manager(streaming_cache_manager, p_expected)
        test_stream = (
            p_expected
            | TestStream(output_tags=[source_1_cache_key, source_2_cache_key]))
        # pylint: disable=expression-not-assigned
        ((test_stream[self.cache_key_of('source_1', source_1)],
          test_stream[self.cache_key_of('source_2', source_2)])
         | beam.Flatten()
         | 'square1' >> beam.Map(lambda x: x * x)
         | 'reify' >> beam.Map(lambda _: _)
         | cache.WriteCache(ie.current_env().get_cache_manager(p_expected),
                            'unused'))

        # Test that the TestStream is outputting to the correct PCollection.
        # The visitor remembers the output_tags of the TestStream transform it
        # encounters (the last one seen, if there are several).
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                # Composite transforms get the same check as leaf transforms.
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([source_1_cache_key, source_2_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(self, p_expected.to_runner_api(),
                                    instrumenter.instrumented_pipeline_proto())
    def test_able_to_cache_intermediate_unbounded_source_pcollection(self):
        """Tests being able to cache an intermediate source PCollection.

        In the following pipeline, the source doesn't have a reference and so is
        not automatically cached in the watch() command. This tests that this case
        is taken care of.
        """
        # Create the pipeline that will be instrumented.
        from apache_beam.options.pipeline_options import StandardOptions
        options = StandardOptions(streaming=True)
        streaming_cache_manager = StreamingCache(cache_dir=None)
        p_original_cache_source = beam.Pipeline(
            interactive_runner.InteractiveRunner(), options)
        ie.current_env().set_cache_manager(streaming_cache_manager,
                                           p_original_cache_source)

        # `source_1` is bound to the output of the Map, not of the PubSub read, so
        # the read's own output has no local variable referring to it.
        # pylint: disable=possibly-unused-variable
        source_1 = (
            p_original_cache_source
            | 'source1' >> beam.io.ReadFromPubSub(
                subscription='projects/fake-project/subscriptions/fake_sub')
            | beam.Map(lambda e: e))

        # Watch but do not cache the PCollections.
        ib.watch(locals())
        # Make sure that sources without a user reference are still cached.
        utils.watch_sources(p_original_cache_source)

        # Look through everything the environment is watching for a variable
        # whose name contains 'synthetic'.
        # NOTE: `break` only leaves the inner loop, so a match in a later
        # watching group would overwrite an earlier one.
        intermediate_source_pcoll = None
        for watching in ie.current_env().watching():
            watching = list(watching)
            for var, watchable in watching:
                if 'synthetic' in var:
                    intermediate_source_pcoll = watchable
                    break

        # Instrument the original pipeline to create the pipeline the user will see.
        # The instrumenter works on a copy that is registered as derived from the
        # original pipeline.
        p_copy = beam.Pipeline.from_runner_api(
            p_original_cache_source.to_runner_api(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        ie.current_env().add_derived_pipeline(p_original_cache_source, p_copy)
        instrumenter = instr.build_pipeline_instrument(p_copy)
        actual_pipeline = beam.Pipeline.from_runner_api(
            proto=instrumenter.instrumented_pipeline_proto(),
            runner=interactive_runner.InteractiveRunner(),
            options=options)
        ie.current_env().add_derived_pipeline(p_original_cache_source,
                                              actual_pipeline)

        # Now, build the expected pipeline which replaces the unbounded source with
        # a TestStream.
        # The cache key is built from the name 'synthetic_var_<id of the PCollection>'.
        intermediate_source_pcoll_cache_key = \
            self.cache_key_of('synthetic_var_' + str(id(intermediate_source_pcoll)),
                         intermediate_source_pcoll)
        p_expected = beam.Pipeline()
        ie.current_env().set_cache_manager(streaming_cache_manager, p_expected)
        test_stream = (
            p_expected
            | TestStream(output_tags=[intermediate_source_pcoll_cache_key]))
        # pylint: disable=expression-not-assigned
        (test_stream[intermediate_source_pcoll_cache_key]
         | 'square1' >> beam.Map(lambda e: e)
         | 'reify' >> beam.Map(lambda _: _)
         | cache.WriteCache(ie.current_env().get_cache_manager(p_expected),
                            'unused'))

        # Test that the TestStream is outputting to the correct PCollection.
        # The visitor remembers the output_tags of the TestStream transform it
        # encounters (the last one seen, if there are several).
        class TestStreamVisitor(PipelineVisitor):
            def __init__(self):
                self.output_tags = set()

            def enter_composite_transform(self, transform_node):
                # Composite transforms get the same check as leaf transforms.
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                transform = transform_node.transform
                if isinstance(transform, TestStream):
                    self.output_tags = transform.output_tags

        v = TestStreamVisitor()
        actual_pipeline.visit(v)
        expected_output_tags = set([intermediate_source_pcoll_cache_key])
        actual_output_tags = v.output_tags
        self.assertSetEqual(expected_output_tags, actual_output_tags)

        # Test that the pipeline is as expected.
        assert_pipeline_proto_equal(self, p_expected.to_runner_api(),
                                    instrumenter.instrumented_pipeline_proto())
Example #6
0
    def run_pipeline(self, pipeline, options):
        """Instruments `pipeline`, runs it on the underlying runner and returns
        the wrapped result.

        Args:
          pipeline: the pipeline to instrument and execute.
          options: pipeline options, forwarded to the instrumenter, to the
            background caching job and to the pipeline that is executed.

        Returns:
          A PipelineResult wrapping the result of the executed pipeline
          together with the pipeline instrument that produced it.
        """
        if not ie.current_env().options.enable_recording_replay:
            capture_control.evict_captured_data()
        if self._force_compute:
            ie.current_env().evict_computed_pcollections()

        # Make sure that sources without a user reference are still cached.
        watch_sources(pipeline)

        user_pipeline = ie.current_env().user_pipeline(pipeline)
        pipeline_instrument = inst.build_pipeline_instrument(pipeline, options)

        # The user_pipeline analyzed might be None if the pipeline given has nothing
        # to be cached and tracing back to the user defined pipeline is impossible.
        # When it's None, there is no need to cache including the background
        # caching job and no result to track since no background caching job is
        # started at all.
        if user_pipeline:
            # Should use the underlying runner and run asynchronously.
            background_caching_job.attempt_to_run_background_caching_job(
                self._underlying_runner, user_pipeline, options)
            if (background_caching_job.has_source_to_cache(user_pipeline)
                    and not background_caching_job.
                    is_a_test_stream_service_running(user_pipeline)):
                streaming_cache_manager = ie.current_env().get_cache_manager(
                    user_pipeline)

                # Only make the server if it doesn't exist already.
                if (streaming_cache_manager and not ie.current_env().
                        get_test_stream_service_controller(user_pipeline)):

                    def exception_handler(e):
                        # Log the error and report it as handled.
                        _LOGGER.error(str(e))
                        return True

                    test_stream_service = TestStreamServiceController(
                        streaming_cache_manager,
                        exception_handler=exception_handler)
                    test_stream_service.start()
                    ie.current_env().set_test_stream_service_controller(
                        user_pipeline, test_stream_service)

        pipeline_to_execute = beam.pipeline.Pipeline.from_runner_api(
            pipeline_instrument.instrumented_pipeline_proto(),
            self._underlying_runner, options)

        # Look the controller up once and reuse it for both the check and the
        # endpoint.
        test_stream_controller = (
            ie.current_env().get_test_stream_service_controller(user_pipeline))
        if test_stream_controller:
            endpoint = test_stream_controller.endpoint

            # TODO: make the StreamingCacheManager and TestStreamServiceController
            # constructed when the InteractiveEnvironment is imported.
            class TestStreamVisitor(PipelineVisitor):
                def visit_transform(self, transform_node):
                    from apache_beam.testing.test_stream import TestStream
                    # Only TestStreams without their own events are pointed
                    # at the service endpoint.
                    if (isinstance(transform_node.transform, TestStream)
                            and not transform_node.transform._events):
                        transform_node.transform._endpoint = endpoint

            pipeline_to_execute.visit(TestStreamVisitor())

        if not self._skip_display:
            a_pipeline_graph = pipeline_graph.PipelineGraph(
                pipeline_instrument.original_pipeline_proto,
                render_option=self._render_option)
            a_pipeline_graph.display_graph()

        main_job_result = PipelineResult(pipeline_to_execute.run(),
                                         pipeline_instrument)
        # In addition to this pipeline result setting, redundant result setting from
        # outer scopes are also recommended since the user_pipeline might not be
        # available from within this scope.
        if user_pipeline:
            ie.current_env().set_pipeline_result(user_pipeline,
                                                 main_job_result)

        if self._blocking:
            main_job_result.wait_until_finish()

        # Compare by value: an identity check would only succeed if the result
        # handed back the very same constant object.
        if main_job_result.state == beam.runners.runner.PipelineState.DONE:
            # pylint: disable=dict-values-not-iterating
            ie.current_env().mark_pcollection_computed(
                pipeline_instrument.cached_pcolls)

        return main_job_result