def run_pipeline(self, pipeline, options):
    """Instrument ``pipeline`` and execute it on the underlying runner.

    Before instrumenting, captured data is evicted unless the environment
    option ``enable_capture_replay`` is set, and computed PCollections are
    evicted when ``self._force_compute`` is truthy.

    Args:
        pipeline: The pipeline handed to this runner.
        options: Pipeline options forwarded to the instrument, to the
            background caching job and to the pipeline that is executed.

    Returns:
        The ``PipelineResult`` wrapping the underlying runner's result
        together with the pipeline instrument.
    """
    if not ie.current_env().options.enable_capture_replay:
        capture_control.evict_captured_data()
    if self._force_compute:
        ie.current_env().evict_computed_pcollections()

    instrument = inst.build_pipeline_instrument(pipeline, options)

    # The instrument may be unable to trace back to a user defined pipeline
    # (e.g. nothing in the given pipeline needs caching). In that case the
    # value is None: no background caching job is started and there is no
    # result to track for it.
    traced_pipeline = instrument.user_pipeline
    if traced_pipeline:
        # Should use the underlying runner and run asynchronously.
        background_caching_job.attempt_to_run_background_caching_job(
            self._underlying_runner, traced_pipeline, options)

    runnable = beam.pipeline.Pipeline.from_runner_api(
        instrument.instrumented_pipeline_proto(),
        self._underlying_runner,
        options)

    if not self._skip_display:
        pipeline_graph.PipelineGraph(
            instrument.original_pipeline,
            render_option=self._render_option).display_graph()

    job_result = PipelineResult(runnable.run(), instrument)

    # Callers in outer scopes are encouraged to set the pipeline result as
    # well, because the user pipeline might not be available in this scope.
    if traced_pipeline:
        ie.current_env().set_pipeline_result(traced_pipeline, job_result)

    if self._blocking:
        job_result.wait_until_finish()

    if job_result.state is beam.runners.runner.PipelineState.DONE:
        # pylint: disable=dict-values-not-iterating
        ie.current_env().mark_pcollection_computed(
            instrument.runner_pcoll_to_user_pcoll.values())

    return job_result
def test_decoration(self): p = beam.Pipeline(ir.InteractiveRunner()) # We are examining if literal `"` and trailing literal `\` are decorated # correctly. pcoll = p | '"Cell 1": "Create\\"' >> beam.Create(range(1000)) ib.watch(locals()) self.assertEqual( ('digraph G {\n' 'node [color=blue, fontcolor=blue, shape=box];\n' # The py string literal from `\\\\\\"` is `\\\"` in dot and will be # rendered as `\"` because they are enclosed by `"`. '"\\"Cell 1\\": \\"Create\\\\\\"";\n' 'pcoll [shape=circle];\n' '"\\"Cell 1\\": \\"Create\\\\\\"" -> pcoll;\n' '}\n'), pipeline_graph.PipelineGraph(p).get_dot())
def run_pipeline(self, pipeline, options):
    """Instrument ``pipeline``, run it on the underlying runner and wait.

    Args:
        pipeline: The pipeline handed to this runner.
        options: Pipeline options forwarded to the instrument and to the
            pipeline that is executed.

    Returns:
        A ``PipelineResult`` wrapping the underlying runner's result and the
        pipeline instrument; ``wait_until_finish()`` has already been called
        on the underlying result.
    """
    instrument = inst.pin(pipeline, options)

    runnable = beam.pipeline.Pipeline.from_runner_api(
        instrument.instrumented_pipeline_proto(),
        self._underlying_runner,
        options)

    if not self._skip_display:
        # Render the original (un-instrumented) pipeline for the user.
        pipeline_graph.PipelineGraph(
            instrument.original_pipeline,
            render_option=self._render_option).display_graph()

    underlying_result = runnable.run()
    underlying_result.wait_until_finish()
    return PipelineResult(underlying_result, instrument)
def test_get_dot(self):
    """Checks get_dot() output for a pipeline with one source and two branches.

    Builds ``Init`` feeding both ``Square`` and ``Cube`` and compares the
    dot text produced by ``PipelineGraph(p).get_dot()`` against the expected
    graph.
    """
    p = beam.Pipeline(ir.InteractiveRunner())
    init_pcoll = p | 'Init' >> beam.Create(range(10))
    squares = init_pcoll | 'Square' >> beam.Map(lambda x: x * x)
    cubes = init_pcoll | 'Cube' >> beam.Map(lambda x: x**3)
    # NOTE: the expected output below names PCollections after the local
    # variables (`init_pcoll`, `squares`, `cubes`), so the local names passed
    # here must not change.
    ib.watch(locals())

    self.assertEqual(('digraph G {\n'
                      'node [color=blue, fontcolor=blue, shape=box];\n'
                      '"Init";\n'
                      'init_pcoll [shape=circle];\n'
                      '"Square";\n'
                      'squares [shape=circle];\n'
                      '"Cube";\n'
                      'cubes [shape=circle];\n'
                      '"Init" -> init_pcoll;\n'
                      'init_pcoll -> "Square";\n'
                      'init_pcoll -> "Cube";\n'
                      '"Square" -> squares;\n'
                      '"Cube" -> cubes;\n'
                      '}\n'),
                     pipeline_graph.PipelineGraph(p).get_dot())
def run_pipeline(self, pipeline, options):
    """Instrument ``pipeline``, run it on the underlying runner and wait.

    When the instrument can trace back to a user defined pipeline, a
    background caching job is attempted first and the main job result is
    registered with the current interactive environment.

    Args:
        pipeline: The pipeline handed to this runner.
        options: Pipeline options forwarded to the instrument, to the
            background caching job and to the pipeline that is executed.

    Returns:
        The ``PipelineResult`` of the main job, after
        ``wait_until_finish()`` has been called on it.
    """
    instrument = inst.pin(pipeline, options)

    # The instrument may be unable to trace back to a user defined pipeline
    # (e.g. nothing in the given pipeline needs caching). In that case the
    # value is None: no background caching job is started and there is no
    # result to track for it.
    traced_pipeline = instrument.user_pipeline
    if traced_pipeline:
        # Should use the underlying runner and run asynchronously.
        background_caching_job.attempt_to_run_background_caching_job(
            self._underlying_runner, traced_pipeline, options)

    runnable = beam.pipeline.Pipeline.from_runner_api(
        instrument.instrumented_pipeline_proto(),
        self._underlying_runner,
        options)

    if not self._skip_display:
        pipeline_graph.PipelineGraph(
            instrument.original_pipeline,
            render_option=self._render_option).display_graph()

    job_result = PipelineResult(runnable.run(), instrument)

    # Callers in outer scopes are encouraged to set the pipeline result as
    # well, because the user pipeline might not be available in this scope.
    if traced_pipeline:
        ie.current_env().set_pipeline_result(
            traced_pipeline, job_result, is_main_job=True)

    job_result.wait_until_finish()
    return job_result
def test_get_dot_within_notebook(self, cell):
    """Checks that get_dot() labels transforms with their notebook cell number.

    Each ``with cell:`` block stands for one executed notebook cell; the
    expected dot output prefixes every transform label with the index of the
    cell it was applied in (``[2]``, ``[3]``, ``[4]``).

    Args:
        cell: Context manager supplied by the test setup; entered once per
            simulated cell. (Presumably advances the mocked ipython prompt
            number -- confirm against the fixture.)
    """
    # Assume a mocked ipython kernel and notebook frontend have been set up.
    ie.current_env()._is_in_ipython = True
    ie.current_env()._is_in_notebook = True
    with cell:  # Cell 1
        p = beam.Pipeline(ir.InteractiveRunner())
        # Immediately track this local pipeline so that ipython prompts when
        # applying transforms will be tracked and used for labels.
        ib.watch(locals())

    with cell:  # Cell 2
        init_pcoll = p | 'Init' >> beam.Create(range(10))

    with cell:  # Cell 3
        squares = init_pcoll | 'Square' >> beam.Map(lambda x: x * x)

    with cell:  # Cell 4
        cubes = init_pcoll | 'Cube' >> beam.Map(lambda x: x**3)

    # Tracks all PCollections defined so far.
    # NOTE: the expected output names PCollections after these local
    # variables, so the local names must not change.
    ib.watch(locals())
    self.assertEqual((
        'digraph G {\n'
        'node [color=blue, fontcolor=blue, shape=box];\n'
        '"[2]: Init";\n'
        'init_pcoll [shape=circle];\n'
        '"[3]: Square";\n'
        'squares [shape=circle];\n'
        '"[4]: Cube";\n'
        'cubes [shape=circle];\n'
        '"[2]: Init" -> init_pcoll;\n'
        'init_pcoll -> "[3]: Square";\n'
        'init_pcoll -> "[4]: Cube";\n'
        '"[3]: Square" -> squares;\n'
        '"[4]: Cube" -> cubes;\n'
        '}\n'),
                     pipeline_graph.PipelineGraph(p).get_dot())
def show_graph(pipeline):
    """Displays the current shape of the given Beam pipeline as a DAG.

    Args:
        pipeline: The Beam pipeline whose graph should be displayed.
    """
    graph_view = pipeline_graph.PipelineGraph(pipeline)
    graph_view.display_graph()
def run_pipeline(self, pipeline, options):
    """Instruments the given pipeline and runs it on the underlying runner.

    Steps, in order:
      1. Evict captured data unless ``enable_recording_replay`` is set, and
         evict computed PCollections when ``self._force_compute`` is truthy.
      2. Watch the pipeline's sources and build the pipeline instrument.
      3. If a user pipeline is known, attempt the background caching job and,
         when needed, start a ``TestStreamServiceController`` for it.
      4. Build the pipeline to execute from the instrumented proto and point
         event-less ``TestStream`` transforms at the service endpoint.
      5. Optionally display the pipeline graph, run the pipeline, record the
         result and, if blocking, wait for it to finish.

    Args:
      pipeline: The pipeline handed to this runner.
      options: Pipeline options forwarded to the instrument, the background
        caching job and the pipeline that is executed.

    Returns:
      The ``PipelineResult`` wrapping the underlying runner's result and the
      pipeline instrument.
    """
    if not ie.current_env().options.enable_recording_replay:
        capture_control.evict_captured_data()
    if self._force_compute:
        ie.current_env().evict_computed_pcollections()

    # Make sure that sources without a user reference are still cached.
    watch_sources(pipeline)

    user_pipeline = ie.current_env().user_pipeline(pipeline)
    pipeline_instrument = inst.build_pipeline_instrument(pipeline, options)

    # The user_pipeline analyzed might be None if the pipeline given has nothing
    # to be cached and tracing back to the user defined pipeline is impossible.
    # When it's None, there is no need to cache including the background
    # caching job and no result to track since no background caching job is
    # started at all.
    if user_pipeline:
        # Should use the underlying runner and run asynchronously.
        background_caching_job.attempt_to_run_background_caching_job(
            self._underlying_runner, user_pipeline, options)
        # A test stream service is only considered when there is a source to
        # cache and no such service is already running for this pipeline.
        if (background_caching_job.has_source_to_cache(user_pipeline) and
                not background_caching_job.
                is_a_test_stream_service_running(user_pipeline)):
            streaming_cache_manager = ie.current_env().get_cache_manager(
                user_pipeline)

            # Only make the server if it doesn't exist already.
            if (streaming_cache_manager and not ie.current_env().
                    get_test_stream_service_controller(user_pipeline)):

                def exception_handler(e):
                    # Log the error and return True.
                    # NOTE(review): True presumably tells the controller the
                    # exception was handled -- confirm against
                    # TestStreamServiceController.
                    _LOGGER.error(str(e))
                    return True

                test_stream_service = TestStreamServiceController(
                    streaming_cache_manager, exception_handler=exception_handler)
                test_stream_service.start()
                ie.current_env().set_test_stream_service_controller(
                    user_pipeline, test_stream_service)

    pipeline_to_execute = beam.pipeline.Pipeline.from_runner_api(
        pipeline_instrument.instrumented_pipeline_proto(),
        self._underlying_runner,
        options)

    # This lookup runs even when user_pipeline is None; the controller may
    # have been registered just above or by an earlier call.
    if ie.current_env().get_test_stream_service_controller(user_pipeline):
        endpoint = ie.current_env().get_test_stream_service_controller(
            user_pipeline).endpoint

        # TODO: make the StreamingCacheManager and TestStreamServiceController
        # constructed when the InteractiveEnvironment is imported.
        class TestStreamVisitor(PipelineVisitor):
            def visit_transform(self, transform_node):
                # Imported locally, inside the visitor method.
                from apache_beam.testing.test_stream import TestStream
                # Only TestStream transforms with no events of their own are
                # pointed at the service endpoint.
                if (isinstance(transform_node.transform, TestStream) and
                        not transform_node.transform._events):
                    transform_node.transform._endpoint = endpoint

        pipeline_to_execute.visit(TestStreamVisitor())

    if not self._skip_display:
        a_pipeline_graph = pipeline_graph.PipelineGraph(
            pipeline_instrument.original_pipeline_proto,
            render_option=self._render_option)
        a_pipeline_graph.display_graph()

    main_job_result = PipelineResult(pipeline_to_execute.run(),
                                     pipeline_instrument)
    # In addition to this pipeline result setting, redundant result setting from
    # outer scopes are also recommended since the user_pipeline might not be
    # available from within this scope.
    if user_pipeline:
        ie.current_env().set_pipeline_result(user_pipeline, main_job_result)

    if self._blocking:
        main_job_result.wait_until_finish()

    # Cached PCollections are marked as computed only when the job state is
    # DONE at this point.
    if main_job_result.state is beam.runners.runner.PipelineState.DONE:
        # pylint: disable=dict-values-not-iterating
        ie.current_env().mark_pcollection_computed(
            pipeline_instrument.cached_pcolls)

    return main_job_result