Example 1
  def record(self, pcolls, max_n, max_duration):
    # type: (List[beam.pvalue.PCollection], int, Union[int,str]) -> Recording

    """Records the given PCollections."""

    # Assert that all PCollections come from the same user_pipeline.
    for pcoll in pcolls:
      assert pcoll.pipeline is self.user_pipeline, (
        '{} belongs to a different user-defined pipeline ({}) than that of'
        ' other PCollections ({}).'.format(
            pcoll, pcoll.pipeline, self.user_pipeline))

    if isinstance(max_duration, str) and max_duration != 'inf':
      max_duration_secs = pd.to_timedelta(max_duration).total_seconds()
    else:
      max_duration_secs = max_duration

    # Make sure that all PCollections to be shown are watched. If a PCollection
    # has not been watched, make up a variable name for that PCollection and
    # watch it. No validation is needed here because the watch logic can handle
    # arbitrary variables.
    self._watch(pcolls)
    pipeline_instrument = pi.PipelineInstrument(self.user_pipeline)

    self.record_pipeline()

    # Get the subset of computed PCollections. These do not need to be
    # recomputed.
    computed_pcolls = set(
        pcoll for pcoll in pcolls
        if pcoll in ie.current_env().computed_pcollections)

    # Start a pipeline fragment to start computing the PCollections.
    uncomputed_pcolls = set(pcolls).difference(computed_pcolls)
    if uncomputed_pcolls:
      # Clear the cache of the given uncomputed PCollections because they are
      # incomplete.
      self._clear(pipeline_instrument)

      warnings.filterwarnings(
          'ignore',
          'options is deprecated since First stable release. References to '
          '<pipeline>.options will not be supported',
          category=DeprecationWarning)
      pf.PipelineFragment(list(uncomputed_pcolls),
                          self.user_pipeline.options).run()
      result = ie.current_env().pipeline_result(self.user_pipeline)
    else:
      result = None

    recording = Recording(
        self.user_pipeline,
        pcolls,
        result,
        pipeline_instrument,
        max_n,
        max_duration_secs)
    self._recordings.add(recording)

    return recording
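A minimal, hedged usage sketch of the record method above. It assumes RecordingManager lives in apache_beam.runners.interactive.recording_manager and that the snippet is one of its methods; the pipeline and variable names are illustrative only.

import apache_beam as beam
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.runners.interactive.recording_manager import RecordingManager

p = beam.Pipeline(InteractiveRunner())
squares = p | beam.Create(range(10)) | beam.Map(lambda x: x * x)

# Watch the local scope so the PCollection has a name and can be cached.
ib.watch(locals())
ie.current_env().track_user_pipelines()

rm = RecordingManager(p)
# Record at most 10 elements, waiting no longer than 60 seconds.
# Per the type comment above, max_duration also accepts a pandas-style
# duration string such as '60s'.
recording = rm.record([squares], max_n=10, max_duration=60)
recording.wait_until_finish()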
Example 2
 def __init__(self, pcoll, include_window_info=False, display_facets=False):
     assert _pcoll_visualization_ready, (
         'Dependencies for PCollection visualization are not available. Please '
         'use `pip install apache-beam[interactive]` to install necessary '
         'dependencies and make sure that you are executing code in an '
         'interactive environment such as a Jupyter notebook.')
     assert isinstance(pcoll, pvalue.PCollection), (
         'pcoll should be apache_beam.pvalue.PCollection')
     self._pcoll = pcoll
     # This allows us to access the cache key and other metadata about the
     # pipeline, whether it's the pipeline defined in user code or a copy of
     # that pipeline. Thus, this module needs no user input other than the
     # PCollection variable to be visualized. It then automatically figures out
     # the pipeline definition, the materialized data, and the pipeline result
     # for the execution, even if the user never assigned or waited on the
     # result explicitly.
     # With only the constructor of PipelineInstrument, no interactivity-related
     # preprocessing or instrumentation is triggered, for performance reasons.
     self._pin = instr.PipelineInstrument(pcoll.pipeline)
     # Variable name as the title for element value in the rendered data table.
     self._pcoll_var = self._pin.cacheable_var_by_pcoll_id(
         self._pin.pcolls_to_pcoll_id.get(str(pcoll), None))
     if not self._pcoll_var:
         self._pcoll_var = 'Value'
     self._cache_key = self._pin.cache_key(self._pcoll)
     obfuscated_id = obfuscate(self._cache_key, id(self))
     self._dive_display_id = 'facets_dive_{}'.format(obfuscated_id)
     self._overview_display_id = 'facets_overview_{}'.format(obfuscated_id)
     self._df_display_id = 'df_{}'.format(obfuscated_id)
     self._include_window_info = include_window_info
     self._display_facets = display_facets
     self._is_datatable_empty = True
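If, as seems likely, this constructor belongs to Beam's PCollectionVisualization class in apache_beam.runners.interactive.display.pcoll_visualization (the class name is not shown in the snippet), a hedged instantiation sketch could look like the following; the display() call is also an assumption about that class's API.

import apache_beam as beam
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive.display import pcoll_visualization as pv
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

p = beam.Pipeline(InteractiveRunner())
letters = p | beam.Create(['a', 'b', 'c'])
ib.watch(locals())

# Assumed class name; the snippet above only shows an unnamed __init__.
vis = pv.PCollectionVisualization(
    letters, include_window_info=True, display_facets=False)
# display() is assumed to render the facets widgets and the data table.
vis.display()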
Example 3
    def test_recording_manager_clears_cache(self):
        """Tests that the RecordingManager clears the cache before recording.

    A job may have incomplete PCollections when the job terminates. Clearing the
    cache ensures that correct results are computed every run.
    """
        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream().advance_watermark_to(
                     0).advance_processing_time(1).add_elements(list(
                         range(10))).advance_processing_time(1))
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Do the first recording to get the timestamp of the first time the fragment
        # was run.
        rm = RecordingManager(p)
        rm.record([squares], max_n=10, max_duration=2)
        first_recording_start = rm.describe()['start']
        rm.cancel()

        # Get the cache and the key used to read the PCollection from the cache.
        pipeline_instrument = pi.PipelineInstrument(p)
        cache = ie.current_env().get_cache_manager(p)
        cache_key = pipeline_instrument.cache_key(squares)

        # Set up a mock for the Cache's clear function which will be used to clear
        # uncomputed PCollections.
        cache.clear = MagicMock()

        # Rerun the fragment. If the cache was cleared correctly then the
        # starting time of the second recording will be later than the first.
        # This is because the PCollection wasn't considered to be computed and
        # was cleared from the cache. Thus the pipeline fragment was rerun for
        # that PCollection at a later time.
        rm.record([squares], max_n=10, max_duration=1)
        second_recording_start = rm.describe()['start']
        rm.cancel()
        self.assertGreater(second_recording_start, first_recording_start)

        # Assert that the cache cleared the PCollection.
        cache.clear.assert_called_with('full', cache_key)
Example 4
    def test_describe(self):
        p = beam.Pipeline(InteractiveRunner())
        numbers = p | 'numbers' >> beam.Create([0, 1, 2])
        letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

        ib.watch(locals())

        # Create a MockPipelineResult to control the state of a fake run of the
        # pipeline.
        mock_result = MockPipelineResult()
        ie.current_env().track_user_pipelines()
        ie.current_env().set_pipeline_result(p, mock_result)

        cache_manager = InMemoryCache()
        ie.current_env().set_cache_manager(cache_manager, p)

        # Create a recording with an arbitrary start time.
        start_time = 100
        recording = Recording(p, [numbers, letters],
                              mock_result,
                              pi.PipelineInstrument(p),
                              max_n=10,
                              max_duration_secs=60,
                              start_time_for_test=start_time)

        # Get the cache key of the stream and write something to cache. This is
        # so that a pipeline doesn't have to run in the test.
        numbers_stream = recording.stream(numbers)
        cache_manager.write([0, 1, 2], 'full', numbers_stream.cache_key)
        cache_manager.save_pcoder(None, 'full', numbers_stream.cache_key)

        letters_stream = recording.stream(letters)
        cache_manager.write(['a', 'b', 'c'], 'full', letters_stream.cache_key)
        cache_manager.save_pcoder(None, 'full', letters_stream.cache_key)

        # Get the description.
        description = recording.describe()
        size = description['size']
        start = description['start']

        self.assertEqual(
            size,
            cache_manager.size('full', numbers_stream.cache_key) +
            cache_manager.size('full', letters_stream.cache_key))
        self.assertEqual(start, start_time)
Example 5
    def test_recording_manager_clears_cache(self):
        """Tests that the RecordingManager clears the cache before recording.

    A job may have incomplete PCollections when the job terminates. Clearing the
    cache ensures that correct results are computed every run.
    """
        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream().advance_watermark_to(
                     0).advance_processing_time(1).add_elements(list(
                         range(10))).advance_processing_time(1))
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Create a RecordingManager to run and record the pipeline fragment.
        rm = RecordingManager(p)

        # Create a pipeline instrument to compute the cache keys of the
        # PCollections.
        pipeline_instrument = pi.PipelineInstrument(p)

        # Set up a mock for the Cache's clear function which will be used to clear
        # uncomputed PCollections.
        rm._clear_pcolls = MagicMock()
        rm.record([squares], max_n=1, max_duration=500)
        rm.cancel()

        # Assert that the cache cleared the PCollection.
        rm._clear_pcolls.assert_any_call(
            unittest.mock.ANY,
            set(pipeline_instrument.cache_key(pc) for pc in (elems, squares)))
Example 6
def head(pcoll, n=5, include_window_info=False):
    """Materializes the first n elements from a PCollection into a Dataframe.

  This reads each element from file and reads only the amount that it needs
  into memory.
  For example::

    p = beam.Pipeline(InteractiveRunner())
    init = p | 'Init' >> beam.Create(range(10))
    square = init | 'Square' >> beam.Map(lambda x: x * x)

    # Run the pipeline and bring the PCollection into memory as a Dataframe.
    in_memory_square = head(square, n=5)
  """
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    user_pipeline = pcoll.pipeline
    runner = user_pipeline.runner
    if isinstance(runner, ir.InteractiveRunner):
        runner = runner._underlying_runner

    # Make sure that sources without a user reference are still cached.
    pi.watch_sources(user_pipeline)

    # Make sure that all PCollections to be shown are watched. If a PCollection
    # has not been watched, make up a variable name for that PCollection and watch
    # it. No validation is needed here because the watch logic can handle
    # arbitrary variables.
    watched_pcollections = set()
    for watching in ie.current_env().watching():
        for _, val in watching:
            if hasattr(val, '__class__') and isinstance(
                    val, beam.pvalue.PCollection):
                watched_pcollections.add(val)
    if pcoll not in watched_pcollections:
        watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})

    warnings.filterwarnings('ignore', category=DeprecationWarning)
    # Attempt to run background caching job since we have the reference to the
    # user-defined pipeline.
    bcj.attempt_to_run_background_caching_job(runner, user_pipeline,
                                              user_pipeline.options)

    if pcoll in ie.current_env().computed_pcollections:
        # Read from pcoll cache, then convert to DF
        pipeline_instrument = pi.PipelineInstrument(pcoll.pipeline)
        key = pipeline_instrument.cache_key(pcoll)
        cache_manager = ie.current_env().cache_manager()

        coder = cache_manager.load_pcoder('full', key)
        reader, _ = cache_manager.read('full', key)
        elements = to_element_list(reader, coder, include_window_info=True)
    else:
        # Build a pipeline fragment for the PCollections and run it.
        result = pf.PipelineFragment([pcoll], user_pipeline.options).run()
        ie.current_env().set_pipeline_result(user_pipeline, result)

        # Invoke wait_until_finish to ensure the blocking nature of this API without
        # relying on the run to be blocking.
        result.wait_until_finish()

        # If the pipeline execution is successful at this stage, mark the
        # computation completeness for the given PCollections so that when further
        # `show` invocation occurs, Interactive Beam wouldn't need to re-compute.
        if result.state is beam.runners.runner.PipelineState.DONE:
            ie.current_env().mark_pcollection_computed([pcoll])

        elements = result.read(pcoll, include_window_info=True)

    results = []
    for e in elements:
        results.append(e)
        if len(results) >= n and n > 0:
            break

    return elements_to_df(results, include_window_info=include_window_info)
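A short usage sketch, restating the docstring's own example; head() returns the result of elements_to_df, so the printed value is a pandas DataFrame (assuming pandas is installed with apache-beam[interactive]).

import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

p = beam.Pipeline(InteractiveRunner())
init = p | 'Init' >> beam.Create(range(10))
square = init | 'Square' >> beam.Map(lambda x: x * x)

# Run the pipeline and bring the first 5 squares into memory as a DataFrame.
df = head(square, n=5, include_window_info=False)
print(df)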
Example 7
    def __init__(
            self,
            pipeline,  # type: Union[beam_runner_api_pb2.Pipeline, beam.Pipeline]
            # None instead of a mutable {'shape': 'box'} default: the dict is
            # mutated below, so a shared default would leak across calls.
            default_vertex_attrs=None,
            default_edge_attrs=None,
            render_option=None):
        """Constructor of PipelineGraph.

    Examples:
      graph = pipeline_graph.PipelineGraph(pipeline_proto)
      graph.get_dot()

      or

      graph = pipeline_graph.PipelineGraph(pipeline)
      graph.get_dot()

    Args:
      pipeline: (Pipeline proto) or (Pipeline) pipeline to be rendered.
      default_vertex_attrs: (Dict[str, str]) a dict of default vertex attributes
      default_edge_attrs: (Dict[str, str]) a dict of default edge attributes
      render_option: (str) this parameter decides how the pipeline graph is
          rendered. See display.pipeline_graph_renderer for available options.
    """
        self._lock = threading.Lock()
        self._graph = None  # type: pydot.Dot
        self._pipeline_instrument = None
        if isinstance(pipeline, beam.Pipeline):
            self._pipeline_instrument = inst.PipelineInstrument(pipeline)
            # Preprocessing links the user pipeline to the runner pipeline
            # through analysis, without mutating the runner pipeline.
            self._pipeline_instrument.preprocess()

        if isinstance(pipeline, beam_runner_api_pb2.Pipeline):
            self._pipeline_proto = pipeline
        elif isinstance(pipeline, beam.Pipeline):
            self._pipeline_proto = pipeline.to_runner_api()
        else:
            raise TypeError(
                'pipeline should either be a %s or %s, while %s is given' %
                (beam_runner_api_pb2.Pipeline, beam.Pipeline, type(pipeline)))

        # A dict from PCollection ID to a list of its consuming Transform IDs
        self._consumers = collections.defaultdict(
            list)  # type: DefaultDict[str, List[str]]
        # A dict from PCollection ID to its producing Transform ID
        self._producers = {}  # type: Dict[str, str]

        for transform_id, transform_proto in self._top_level_transforms():
            for pcoll_id in transform_proto.inputs.values():
                self._consumers[pcoll_id].append(transform_id)
            for pcoll_id in transform_proto.outputs.values():
                self._producers[pcoll_id] = transform_id

        default_vertex_attrs = default_vertex_attrs or {'shape': 'box'}
        if 'color' not in default_vertex_attrs:
            default_vertex_attrs['color'] = 'blue'
        if 'fontcolor' not in default_vertex_attrs:
            default_vertex_attrs['fontcolor'] = 'blue'

        vertex_dict, edge_dict = self._generate_graph_dicts()
        self._construct_graph(vertex_dict, edge_dict, default_vertex_attrs,
                              default_edge_attrs)

        self._renderer = pipeline_graph_renderer.get_renderer(render_option)
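A hedged rendering sketch based on the docstring examples above. It uses the proto form of the constructor and only exercises get_dot(), since that is the method the docstring itself demonstrates; the module path is assumed to be apache_beam.runners.interactive.display.pipeline_graph.

import apache_beam as beam
from apache_beam.runners.interactive.display import pipeline_graph

p = beam.Pipeline()
_ = (p
     | 'Create' >> beam.Create([1, 2, 3])
     | 'Double' >> beam.Map(lambda x: x * 2))

# Per the docstring, either a beam.Pipeline or its runner API proto works.
graph = pipeline_graph.PipelineGraph(p.to_runner_api())
print(graph.get_dot())  # DOT source, renderable with Graphviz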
Example 8
    def test_computed(self):
        """Tests that a PCollection is marked as computed only in a complete state.

    Because the background caching job is now long-lived, repeated runs of a
    PipelineFragment may yield different results for the same PCollection.
    """

        p = beam.Pipeline(InteractiveRunner())
        elems = p | beam.Create([0, 1, 2])

        ib.watch(locals())

        # Create a MockPipelineResult to control the state of a fake run of the
        # pipeline.
        mock_result = MockPipelineResult()
        ie.current_env().track_user_pipelines()
        ie.current_env().set_pipeline_result(p, mock_result)

        # Create a mock BackgroundCachingJob that will control whether to set the
        # PCollections as computed or not.
        bcj_mock_result = MockPipelineResult()
        background_caching_job = bcj.BackgroundCachingJob(bcj_mock_result, [])

        # Create a recording.
        recording = Recording(p, [elems],
                              mock_result,
                              pi.PipelineInstrument(p),
                              max_n=10,
                              max_duration_secs=60)

        # The background caching job and the recording aren't done yet, so there
        # may be more elements to be recorded.
        self.assertFalse(recording.is_computed())
        self.assertFalse(recording.computed())
        self.assertTrue(recording.uncomputed())

        # The recording is finished but the background caching job is not. There
        # may still be more elements to record, or the intermediate PCollection may
        # have stopped caching in an incomplete state, e.g. before a window could
        # fire.
        mock_result.set_state(PipelineState.DONE)
        recording.wait_until_finish()

        self.assertFalse(recording.is_computed())
        self.assertFalse(recording.computed())
        self.assertTrue(recording.uncomputed())

        # The background caching job finished before we started a recording which
        # is a sure signal that there will be no more elements.
        bcj_mock_result.set_state(PipelineState.DONE)
        ie.current_env().set_background_caching_job(p, background_caching_job)
        recording = Recording(p, [elems],
                              mock_result,
                              pi.PipelineInstrument(p),
                              max_n=10,
                              max_duration_secs=60)
        recording.wait_until_finish()

        # There are no more elements and the recording finished, meaning that the
        # intermediate PCollections are in a complete state. They can now be marked
        # as computed.
        self.assertTrue(recording.is_computed())
        self.assertTrue(recording.computed())
        self.assertFalse(recording.uncomputed())
Example 9
    def record(self, pcolls, max_n, max_duration_secs):
        # type: (List[beam.pvalue.PCollection], int, int) -> Recording
        """Records the given PCollections."""

        # Assert that all PCollections come from the same user_pipeline.
        for pcoll in pcolls:
            assert pcoll.pipeline is self.user_pipeline, (
                '{} belongs to a different user-defined pipeline ({}) than that of'
                ' other PCollections ({}).'.format(pcoll, pcoll.pipeline,
                                                   self.user_pipeline))

        runner = self.user_pipeline.runner
        if isinstance(runner, ir.InteractiveRunner):
            runner = runner._underlying_runner

        # Make sure that sources without a user reference are still cached.
        pi.watch_sources(self.user_pipeline)

        # Make sure that all PCollections to be shown are watched. If a PCollection
        # has not been watched, make up a variable name for that PCollection and
        # watch it. No validation is needed here because the watch logic can handle
        # arbitrary variables.
        self._watch(pcolls)
        pipeline_instrument = pi.PipelineInstrument(self.user_pipeline)

        # Attempt to run background caching job to record any sources.
        if ie.current_env().is_in_ipython:
            warnings.filterwarnings(
                'ignore',
                'options is deprecated since First stable release. References to '
                '<pipeline>.options will not be supported',
                category=DeprecationWarning)
        bcj.attempt_to_run_background_caching_job(
            runner, self.user_pipeline, options=self.user_pipeline.options)

        # Get the subset of computed PCollections. These do not need to be
        # recomputed.
        computed_pcolls = set(
            pcoll for pcoll in pcolls
            if pcoll in ie.current_env().computed_pcollections)

        # Start a pipeline fragment to start computing the PCollections.
        uncomputed_pcolls = set(pcolls).difference(computed_pcolls)
        if uncomputed_pcolls:
            # Clear the cache of the given uncomputed PCollections because they are
            # incomplete.
            self._clear(pipeline_instrument)

            warnings.filterwarnings(
                'ignore',
                'options is deprecated since First stable release. References to '
                '<pipeline>.options will not be supported',
                category=DeprecationWarning)
            pf.PipelineFragment(list(uncomputed_pcolls),
                                self.user_pipeline.options).run()
            result = ie.current_env().pipeline_result(self.user_pipeline)
        else:
            result = None

        recording = Recording(self.user_pipeline, pcolls, result,
                              pipeline_instrument, max_n, max_duration_secs)
        self._recordings.add(recording)

        return recording
Example 10
 def __init__(self, user_pipeline):
     # type: (beam.Pipeline) -> None
     self.user_pipeline = user_pipeline
     self._pipeline_instrument = pi.PipelineInstrument(self.user_pipeline)