예제 #1
0
    def run_pipeline(self, pipeline, options):
        """Instruments the given pipeline and runs it on the underlying runner.

        Captured data is evicted first unless the environment's
        `enable_capture_replay` option is set, and computed PCollections are
        evicted when `self._force_compute` is set. When the instrumentation
        yields a user pipeline, a background caching job is attempted and the
        main job result is registered with the interactive environment.

        Args:
          pipeline: The pipeline to instrument and execute.
          options: Options forwarded to the instrumentation, to the
            background caching job and to the pipeline rebuilt from the
            instrumented proto.

        Returns:
          The PipelineResult wrapping the underlying run's result together
          with the pipeline instrument. The result has only been waited on
          when `self._blocking` is set.
        """
        if not ie.current_env().options.enable_capture_replay:
            capture_control.evict_captured_data()
        if self._force_compute:
            ie.current_env().evict_computed_pcollections()

        pipeline_instrument = inst.build_pipeline_instrument(pipeline, options)

        # The user_pipeline analyzed might be None if the pipeline given has nothing
        # to be cached and tracing back to the user defined pipeline is impossible.
        # When it's None, there is no need to cache including the background
        # caching job and no result to track since no background caching job is
        # started at all.
        user_pipeline = pipeline_instrument.user_pipeline
        if user_pipeline:
            # Should use the underlying runner and run asynchronously.
            background_caching_job.attempt_to_run_background_caching_job(
                self._underlying_runner, user_pipeline, options)

        pipeline_to_execute = beam.pipeline.Pipeline.from_runner_api(
            pipeline_instrument.instrumented_pipeline_proto(),
            self._underlying_runner, options)

        if not self._skip_display:
            a_pipeline_graph = pipeline_graph.PipelineGraph(
                pipeline_instrument.original_pipeline,
                render_option=self._render_option)
            a_pipeline_graph.display_graph()

        main_job_result = PipelineResult(pipeline_to_execute.run(),
                                         pipeline_instrument)
        # In addition to this pipeline result setting, redundant result setting from
        # outer scopes are also recommended since the user_pipeline might not be
        # available from within this scope.
        if user_pipeline:
            ie.current_env().set_pipeline_result(user_pipeline,
                                                 main_job_result)

        if self._blocking:
            main_job_result.wait_until_finish()

        # Compare by value rather than identity: Beam's PipelineState constants
        # are plain strings, and `is` is not guaranteed to hold for an equal
        # string that is a different object.
        if main_job_result.state == beam.runners.runner.PipelineState.DONE:
            # pylint: disable=dict-values-not-iterating
            ie.current_env().mark_pcollection_computed(
                pipeline_instrument.runner_pcoll_to_user_pcoll.values())

        return main_job_result
예제 #2
0
  def test_decoration(self):
    """Checks how a label with double quotes and a trailing backslash is
    rendered in the dot output of the pipeline graph.
    """
    p = beam.Pipeline(ir.InteractiveRunner())
    # The transform label below contains literal `"` characters and ends with
    # a literal `\`; both have to be decorated correctly.
    pcoll = p | '"Cell 1": "Create\\"' >> beam.Create(range(1000))
    ib.watch(locals())

    actual_dot = pipeline_graph.PipelineGraph(p).get_dot()
    expected_dot = ''.join([
        'digraph G {\n',
        'node [color=blue, fontcolor=blue, shape=box];\n',
        # The py string literal from `\\\\\\"` is `\\\"` in dot and will be
        # rendered as `\"` because they are enclosed by `"`.
        '"\\"Cell 1\\": \\"Create\\\\\\"";\n',
        'pcoll [shape=circle];\n',
        '"\\"Cell 1\\": \\"Create\\\\\\"" -> pcoll;\n',
        '}\n',
    ])
    self.assertEqual(expected_dot, actual_dot)
예제 #3
0
    def run_pipeline(self, pipeline, options):
        """Instruments the pipeline, runs it and waits for it to finish.

        Args:
          pipeline: The pipeline to instrument and execute.
          options: Options forwarded to the instrumentation and to the
            pipeline rebuilt from the instrumented proto.

        Returns:
          A PipelineResult wrapping the finished underlying result and the
          pipeline instrument.
        """
        instrument = inst.pin(pipeline, options)

        # Rebuild an executable pipeline from the instrumented proto, bound to
        # the underlying runner.
        instrumented_proto = instrument.instrumented_pipeline_proto()
        runnable = beam.pipeline.Pipeline.from_runner_api(
            instrumented_proto, self._underlying_runner, options)

        # The graph shown is built from the original pipeline held by the
        # instrument, not from the rebuilt pipeline.
        if not self._skip_display:
            pipeline_graph.PipelineGraph(
                instrument.original_pipeline,
                render_option=self._render_option).display_graph()

        underlying_result = runnable.run()
        underlying_result.wait_until_finish()

        return PipelineResult(underlying_result, instrument)
예제 #4
0
    def test_get_dot(self):
        """Checks the dot output for a pipeline with one source and two
        branches whose PCollections are watched local variables.
        """
        p = beam.Pipeline(ir.InteractiveRunner())
        init_pcoll = p | 'Init' >> beam.Create(range(10))
        squares = init_pcoll | 'Square' >> beam.Map(lambda x: x * x)
        cubes = init_pcoll | 'Cube' >> beam.Map(lambda x: x**3)
        # The expected dot below refers to the PCollections by these local
        # variable names, so they have to be watched before rendering.
        ib.watch(locals())

        actual_dot = pipeline_graph.PipelineGraph(p).get_dot()
        expected_dot = ''.join([
            'digraph G {\n',
            'node [color=blue, fontcolor=blue, shape=box];\n',
            '"Init";\n',
            'init_pcoll [shape=circle];\n',
            '"Square";\n',
            'squares [shape=circle];\n',
            '"Cube";\n',
            'cubes [shape=circle];\n',
            '"Init" -> init_pcoll;\n',
            'init_pcoll -> "Square";\n',
            'init_pcoll -> "Cube";\n',
            '"Square" -> squares;\n',
            '"Cube" -> cubes;\n',
            '}\n',
        ])
        self.assertEqual(expected_dot, actual_dot)
예제 #5
0
  def run_pipeline(self, pipeline, options):
    """Instruments the pipeline, runs it on the underlying runner and waits.

    When the instrumentation yields a user pipeline, a background caching job
    is attempted before the main job and the main job result is registered
    with the interactive environment.

    Args:
      pipeline: The pipeline to instrument and execute.
      options: Options forwarded to the instrumentation, to the background
        caching job and to the pipeline rebuilt from the instrumented proto.

    Returns:
      The PipelineResult of the main job, after waiting for it to finish.
    """
    instrument = inst.pin(pipeline, options)

    # The user pipeline may be None when the given pipeline has nothing to
    # cache and cannot be traced back to a user defined pipeline. In that case
    # no background caching job is started and there is no result to track.
    user_pipeline = instrument.user_pipeline
    if user_pipeline:
      # The caching job uses the underlying runner and runs asynchronously.
      background_caching_job.attempt_to_run_background_caching_job(
          self._underlying_runner, user_pipeline, options)

    instrumented_proto = instrument.instrumented_pipeline_proto()
    runnable = beam.pipeline.Pipeline.from_runner_api(
        instrumented_proto, self._underlying_runner, options)

    if not self._skip_display:
      pipeline_graph.PipelineGraph(
          instrument.original_pipeline,
          render_option=self._render_option).display_graph()

    underlying_result = runnable.run()
    job_result = PipelineResult(underlying_result, instrument)
    # Setting the result again from outer scopes is still recommended, since
    # the user pipeline might not be available from within this scope.
    if user_pipeline:
      ie.current_env().set_pipeline_result(
          user_pipeline, job_result, is_main_job=True)
    job_result.wait_until_finish()

    return job_result
예제 #6
0
  def test_get_dot_within_notebook(self, cell):
    """Checks that transform labels in the dot output carry the prompt number
    of the cell they were applied in when running inside a notebook.

    Args:
      cell: A context manager; each `with cell:` block stands for one
        executed notebook cell.
    """
    # Assume a mocked ipython kernel and notebook frontend have been set up.
    ie.current_env()._is_in_ipython = True
    ie.current_env()._is_in_notebook = True
    with cell:  # Cell 1
      p = beam.Pipeline(ir.InteractiveRunner())
      # Watch the pipeline right away so that the ipython prompts seen while
      # applying transforms are tracked and used for labels.
      ib.watch(locals())

    with cell:  # Cell 2
      init_pcoll = p | 'Init' >> beam.Create(range(10))

    with cell:  # Cell 3
      squares = init_pcoll | 'Square' >> beam.Map(lambda x: x * x)

    with cell:  # Cell 4
      cubes = init_pcoll | 'Cube' >> beam.Map(lambda x: x**3)

    # Tracks all PCollections defined so far.
    ib.watch(locals())

    actual_dot = pipeline_graph.PipelineGraph(p).get_dot()
    expected_dot = ''.join([
        'digraph G {\n',
        'node [color=blue, fontcolor=blue, shape=box];\n',
        '"[2]: Init";\n',
        'init_pcoll [shape=circle];\n',
        '"[3]: Square";\n',
        'squares [shape=circle];\n',
        '"[4]: Cube";\n',
        'cubes [shape=circle];\n',
        '"[2]: Init" -> init_pcoll;\n',
        'init_pcoll -> "[3]: Square";\n',
        'init_pcoll -> "[4]: Cube";\n',
        '"[3]: Square" -> squares;\n',
        '"[4]: Cube" -> cubes;\n',
        '}\n',
    ])
    self.assertEqual(expected_dot, actual_dot)
예제 #7
0
def show_graph(pipeline):
    """Shows the current pipeline shape of a given Beam pipeline as a DAG.

    Builds a `pipeline_graph.PipelineGraph` from the pipeline and calls its
    `display_graph` method.

    Args:
        pipeline: The Beam pipeline to render.
    """
    graph = pipeline_graph.PipelineGraph(pipeline)
    graph.display_graph()
예제 #8
0
    def run_pipeline(self, pipeline, options):
        """Instruments the given pipeline and runs it on the underlying runner.

        Captured data is evicted first unless the environment's
        `enable_recording_replay` option is set, and computed PCollections
        are evicted when `self._force_compute` is set. When a user pipeline
        is known to the environment, a background caching job is attempted
        and, if needed, a test stream service is started and registered for
        that user pipeline.

        Args:
          pipeline: The pipeline to instrument and execute.
          options: Options forwarded to the instrumentation, to the
            background caching job and to the pipeline rebuilt from the
            instrumented proto.

        Returns:
          The PipelineResult wrapping the underlying run's result together
          with the pipeline instrument. The result has only been waited on
          when `self._blocking` is set.
        """
        if not ie.current_env().options.enable_recording_replay:
            capture_control.evict_captured_data()
        if self._force_compute:
            ie.current_env().evict_computed_pcollections()

        # Make sure that sources without a user reference are still cached.
        watch_sources(pipeline)

        user_pipeline = ie.current_env().user_pipeline(pipeline)
        pipeline_instrument = inst.build_pipeline_instrument(pipeline, options)

        # The user_pipeline analyzed might be None if the pipeline given has nothing
        # to be cached and tracing back to the user defined pipeline is impossible.
        # When it's None, there is no need to cache including the background
        # caching job and no result to track since no background caching job is
        # started at all.
        if user_pipeline:
            # Should use the underlying runner and run asynchronously.
            background_caching_job.attempt_to_run_background_caching_job(
                self._underlying_runner, user_pipeline, options)
            if (background_caching_job.has_source_to_cache(user_pipeline)
                    and not background_caching_job.
                    is_a_test_stream_service_running(user_pipeline)):
                streaming_cache_manager = ie.current_env().get_cache_manager(
                    user_pipeline)

                # Only make the server if it doesn't exist already.
                if (streaming_cache_manager and not ie.current_env().
                        get_test_stream_service_controller(user_pipeline)):

                    def exception_handler(e):
                        # Log the error and return True to the controller.
                        _LOGGER.error(str(e))
                        return True

                    test_stream_service = TestStreamServiceController(
                        streaming_cache_manager,
                        exception_handler=exception_handler)
                    test_stream_service.start()
                    ie.current_env().set_test_stream_service_controller(
                        user_pipeline, test_stream_service)

        pipeline_to_execute = beam.pipeline.Pipeline.from_runner_api(
            pipeline_instrument.instrumented_pipeline_proto(),
            self._underlying_runner, options)

        # Look the controller up once and reuse it for both the check and the
        # endpoint.
        test_stream_controller = (
            ie.current_env().get_test_stream_service_controller(user_pipeline))
        if test_stream_controller:
            endpoint = test_stream_controller.endpoint

            # TODO: make the StreamingCacheManager and TestStreamServiceController
            # constructed when the InteractiveEnvironment is imported.
            class TestStreamVisitor(PipelineVisitor):
                def visit_transform(self, transform_node):
                    from apache_beam.testing.test_stream import TestStream
                    # Only TestStreams without events of their own are
                    # pointed at the service endpoint.
                    if (isinstance(transform_node.transform, TestStream)
                            and not transform_node.transform._events):
                        transform_node.transform._endpoint = endpoint

            pipeline_to_execute.visit(TestStreamVisitor())

        if not self._skip_display:
            a_pipeline_graph = pipeline_graph.PipelineGraph(
                pipeline_instrument.original_pipeline_proto,
                render_option=self._render_option)
            a_pipeline_graph.display_graph()

        main_job_result = PipelineResult(pipeline_to_execute.run(),
                                         pipeline_instrument)
        # In addition to this pipeline result setting, redundant result setting from
        # outer scopes are also recommended since the user_pipeline might not be
        # available from within this scope.
        if user_pipeline:
            ie.current_env().set_pipeline_result(user_pipeline,
                                                 main_job_result)

        if self._blocking:
            main_job_result.wait_until_finish()

        # Compare by value rather than identity: Beam's PipelineState constants
        # are plain strings, and `is` is not guaranteed to hold for an equal
        # string that is a different object.
        if main_job_result.state == beam.runners.runner.PipelineState.DONE:
            # pylint: disable=dict-values-not-iterating
            ie.current_env().mark_pcollection_computed(
                pipeline_instrument.cached_pcolls)

        return main_job_result