def test_cancel_stops_recording(self):
        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)

        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream()
                     .advance_watermark_to(0)
                     .advance_processing_time(1)
                     .add_elements(list(range(10)))
                     .advance_processing_time(1))  # yapf: disable
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Get the recording then the BackgroundCachingJob.
        rm = RecordingManager(p)
        recording = rm.record([squares], max_n=10, max_duration=30)

        # The BackgroundCachingJob is still waiting for more elements, so it isn't
        # done yet.
        bcj = ie.current_env().get_background_caching_job(p)
        self.assertFalse(bcj.is_done())

        # Assert that something was read and that the BackgroundCachingJob was
        # successfully stopped.
        self.assertTrue(list(recording.stream(squares).read()))
        rm.cancel()
        self.assertTrue(bcj.is_done())
Example #2
    def test_basic_wordcount(self):
        """A wordcount to be used as a smoke test."""

        # Create the pipeline that will emit 0, 1, 2.
        p = beam.Pipeline(InteractiveRunner())
        elems = p | beam.Create([0, 1, 2])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects. By calling `record` a new PipelineFragment
        # is started to compute the given PCollections and cache to disk.
        rm = RecordingManager(p)
        recording = rm.record([elems], max_n=3, max_duration_secs=500)
        stream = recording.stream(elems)
        recording.wait_until_finish()

        # Once the pipeline fragment completes, we can read from the stream and know
        # that all elements were written to cache.
        elems = list(stream.read())
        expected_elems = [
            WindowedValue(i, MIN_TIMESTAMP, [GlobalWindow()]) for i in range(3)
        ]
        self.assertListEqual(elems, expected_elems)
Example #3
def collect(pcoll, n='inf', duration='inf', include_window_info=False):
    """Materializes the elements from a PCollection into a Dataframe.

  This reads each element from the cache files and loads only the amount that
  it needs into memory. The user can specify either the max number of elements
  to read or the maximum duration of elements to read. When a limiter is not
  supplied, it is assumed to be infinite.

  Args:
    pcoll: The PCollection to materialize.
    n: (optional) max number of elements to read. Default 'inf'.
    duration: (optional) max duration of elements to read. Default 'inf'.
    include_window_info: (optional) if True, appends the windowing information
        to each row. Default False.

  For example::

    p = beam.Pipeline(InteractiveRunner())
    init = p | 'Init' >> beam.Create(range(10))
    square = init | 'Square' >> beam.Map(lambda x: x * x)

    # Run the pipeline and bring the PCollection into memory as a Dataframe.
    in_memory_square = collect(square, n=5)
  """
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    if isinstance(n, str):
        assert n == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(n, int):
        assert n > 0, 'n needs to be positive or the string \'inf\''

    if isinstance(duration, str):
        assert duration == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(duration, int):
        assert duration > 0, 'duration needs to be positive or the string \'inf\''

    if n == 'inf':
        n = float('inf')

    if duration == 'inf':
        duration = float('inf')

    user_pipeline = pcoll.pipeline
    recording_manager = RecordingManager(user_pipeline)

    recording = recording_manager.record([pcoll],
                                         max_n=n,
                                         max_duration_secs=duration)

    try:
        elements = list(recording.stream(pcoll).read())
    except KeyboardInterrupt:
        recording.cancel()
        return pd.DataFrame()

    return elements_to_df(elements, include_window_info=include_window_info)
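
A minimal usage sketch for collect (assumptions: a local interactive
environment where ib.watch and track_user_pipelines have already been called,
as in the tests above; the variable names are illustrative):

    p = beam.Pipeline(InteractiveRunner())
    words = p | 'Words' >> beam.Create(['a', 'b', 'c'])
    ib.watch(locals())
    ie.current_env().track_user_pipelines()

    # Materializes up to 3 elements into a pandas DataFrame; window metadata
    # is appended to each row when include_window_info is True.
    df = collect(words, n=3, include_window_info=True)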
Example #4
    def test_duration_parsing(self):
        p = beam.Pipeline(InteractiveRunner())
        elems = p | beam.Create([0, 1, 2])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects.
        rm = RecordingManager(p)
        recording = rm.record([elems], max_n=3, max_duration='500s')
        recording.wait_until_finish()

        # Assert that the duration was parsed correctly to integer seconds.
        self.assertEqual(recording.describe()['duration'], 500)
Example #5
  def test_set_get_recording_manager(self):
    ie._interactive_beam_env = None
    ie.new_env()

    p = beam.Pipeline()
    rm = RecordingManager(p)
    ie.current_env().set_recording_manager(rm, p)
    self.assertIs(rm, ie.current_env().get_recording_manager(p))
Example #6
    def setUp(self):
        ie.new_env()
        # Allow unit test to run outside of ipython kernel since we don't test the
        # frontend rendering in unit tests.
        pv._pcoll_visualization_ready = True
        # Generally test the logic where notebook is connected to the assumed
        # ipython kernel by forcefully setting notebook check to True.
        ie.current_env()._is_in_notebook = True
        ib.options.display_timezone = pytz.timezone('US/Pacific')

        self._p = beam.Pipeline(ir.InteractiveRunner())
        # pylint: disable=range-builtin-not-iterating
        self._pcoll = self._p | 'Create' >> beam.Create(range(5))

        ib.watch(self)
        ie.current_env().track_user_pipelines()

        recording_manager = RecordingManager(self._p)
        recording = recording_manager.record([self._pcoll], 5, 5)
        self._stream = recording.stream(self._pcoll)
Example #7
    def test_record_pipeline(self):
        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        # pylint: disable=unused-variable
        _ = (p
             | TestStream()
                 .advance_watermark_to(0)
                 .advance_processing_time(1)
                 .add_elements(list(range(10)))
                 .advance_processing_time(1))  # yapf: disable

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Create a limiter that stops the background caching job when something
        # is written to cache. This is used to ensure that the pipeline is
        # functioning properly and that there are no data races with the test.
        class SizeLimiter(Limiter):
            def __init__(self, p):
                self.pipeline = p
                self._rm = None

            def set_recording_manager(self, rm):
                self._rm = rm

            def is_triggered(self):
                return self._rm.describe()['size'] > 0 if self._rm else False

        # Do the first recording to get the timestamp of the first time the fragment
        # was run.
        size_limiter = SizeLimiter(p)
        rm = RecordingManager(p, test_limiters=[size_limiter])
        size_limiter.set_recording_manager(rm)
        self.assertEqual(rm.describe()['state'], PipelineState.STOPPED)
        self.assertTrue(rm.record_pipeline())

        # A recording is in progress, no need to start another one.
        self.assertFalse(rm.record_pipeline())

        for _ in range(60):
            if rm.describe()['state'] == PipelineState.CANCELLED:
                break
            time.sleep(1)
        self.assertTrue(
            rm.describe()['state'] == PipelineState.CANCELLED,
            'Test timed out waiting for pipeline to be cancelled. This indicates '
            'that the BackgroundCachingJob did not cache anything.')
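
The SizeLimiter above implements the Limiter interface consumed through
RecordingManager's test_limiters argument. A minimal sketch of another limiter
(hypothetical, assuming only that is_triggered is polled by the manager):

    class ElapsedTimeLimiter(Limiter):
        """Hypothetical limiter that triggers after a wall-clock deadline."""
        def __init__(self, max_secs):
            self._deadline = time.time() + max_secs

        def is_triggered(self):
            # Polled by the RecordingManager; returning True stops the
            # background caching job.
            return time.time() >= self._deadline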
Example #8
    def test_record_detects_remote_runner(self, mock_pipeline_fragment,
                                          mock_clear_pcolls):
        """Tests that a remote runner is detected, resulting in the
    PipelineFragment instance to have blocking enabled."""

        # Create the pipeline that will emit 0, 1, 2.
        p = beam.Pipeline(InteractiveRunner())
        numbers = p | 'numbers' >> beam.Create([0, 1, 2])

        # Set the cache directory for Interactive Beam to be in a GCS bucket.
        ib.options.cache_root = 'gs://test-bucket/'

        # Create the recording objects. By calling `record` a new PipelineFragment
        # is started to compute the given PCollections and cache to disk.
        rm = RecordingManager(p)

        # Run record() and check if the PipelineFragment.run had blocking set to
        # True due to the GCS cache_root value.
        rm.record([numbers], max_n=3, max_duration=500)
        mock_pipeline_fragment.assert_called_with(blocking=True)

        # Reset cache_root value.
        ib.options.cache_root = None
Example #9
  def get_recording_manager(self, pipeline, create_if_absent=False):
    """Gets the recording manager for the given pipeline."""
    recording_manager = self._recording_managers.get(str(id(pipeline)), None)
    if not recording_manager and create_if_absent:
      # Get the pipeline variable name for the user. This is useful if the user
      # has multiple pipelines.
      pipeline_var = ''
      for w in self.watching():
        for var, val in w:
          if val is pipeline:
            pipeline_var = var
            break
      recording_manager = RecordingManager(pipeline, pipeline_var)
      self._recording_managers[str(id(pipeline))] = recording_manager
    return recording_manager
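
A short usage sketch for get_recording_manager (assumes the interactive
environment setup from the tests above):

    p = beam.Pipeline()
    ib.watch(locals())
    # Creates a manager on first access; later calls return the same instance
    # for this pipeline.
    rm = ie.current_env().get_recording_manager(p, create_if_absent=True)
    assert rm is ie.current_env().get_recording_manager(p)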
Example #10
    def test_clear(self):
        p1 = beam.Pipeline(InteractiveRunner())
        elems_1 = p1 | 'elems 1' >> beam.Create([0, 1, 2])

        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        recording_manager = RecordingManager(p1)
        recording = recording_manager.record([elems_1],
                                             max_n=3,
                                             max_duration=500)
        recording.wait_until_finish()
        record_describe = recording_manager.describe()
        self.assertGreater(record_describe['size'], 0)
        recording_manager.clear()
        self.assertEqual(recording_manager.describe()['size'], 0)
Example #11
    def test_basic_execution(self):
        """A basic pipeline to be used as a smoke test."""

        # Create the pipeline that will emit 0, 1, 2.
        p = beam.Pipeline(InteractiveRunner())
        numbers = p | 'numbers' >> beam.Create([0, 1, 2])
        letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects. By calling `record` a new PipelineFragment
        # is started to compute the given PCollections and cache to disk.
        rm = RecordingManager(p)
        numbers_recording = rm.record([numbers],
                                      max_n=3,
                                      max_duration_secs=500)
        numbers_stream = numbers_recording.stream(numbers)
        numbers_recording.wait_until_finish()

        # Once the pipeline fragment completes, we can read from the stream and know
        # that all elements were written to cache.
        elems = list(numbers_stream.read())
        expected_elems = [
            WindowedValue(i, MIN_TIMESTAMP, [GlobalWindow()]) for i in range(3)
        ]
        self.assertListEqual(elems, expected_elems)

        # Make an extra recording and test the description.
        letters_recording = rm.record([letters],
                                      max_n=3,
                                      max_duration_secs=500)
        letters_recording.wait_until_finish()

        self.assertEqual(
            rm.describe()['size'],
            numbers_recording.describe()['size'] +
            letters_recording.describe()['size'])

        rm.cancel()
Example #12
    def test_recording_manager_clears_cache(self):
        """Tests that the RecordingManager clears the cache before recording.

    A job may have incomplete PCollections when the job terminates. Clearing the
    cache ensures that correct results are computed every run.
    """
        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream()
                     .advance_watermark_to(0)
                     .advance_processing_time(1)
                     .add_elements(list(range(10)))
                     .advance_processing_time(1))  # yapf: disable
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Do the first recording to get the timestamp of the first time the fragment
        # was run.
        rm = RecordingManager(p)

        # Get the cache, key, and coder to read the PCollection from the cache.
        pipeline_instrument = pi.PipelineInstrument(p)

        # Set up a mock for the Cache's clear function which will be used to clear
        # uncomputed PCollections.
        rm._clear_pcolls = MagicMock()
        rm.record([squares], max_n=1, max_duration=500)
        rm.cancel()

        # Assert that the cache cleared the PCollection.
        rm._clear_pcolls.assert_any_call(
            unittest.mock.ANY,
            set(pipeline_instrument.cache_key(pc) for pc in (elems, squares)))
Example #13
    def test_clear_specific_pipeline(self):
        """Tests that clear can empty the cache for a specific pipeline."""

        # Create two pipelines so we can check that clearing the cache won't clear
        # all defined pipelines.
        p1 = beam.Pipeline(InteractiveRunner())
        elems_1 = p1 | 'elems 1' >> beam.Create([0, 1, 2])

        p2 = beam.Pipeline(InteractiveRunner())
        elems_2 = p2 | 'elems 2' >> beam.Create([0, 1, 2])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects. By calling `record` a new PipelineFragment
        # is started to compute the given PCollections and cache to disk.
        rm_1 = RecordingManager(p1)
        recording = rm_1.record([elems_1], max_n=3, max_duration=500)
        recording.wait_until_finish()

        rm_2 = RecordingManager(p2)
        recording = rm_2.record([elems_2], max_n=3, max_duration=500)
        recording.wait_until_finish()
        # Assert that clearing only one recording clears that recording.
        if (rm_1.describe()['state'] == PipelineState.STOPPED and
                rm_2.describe()['state'] == PipelineState.STOPPED):
            self.assertGreater(rm_1.describe()['size'], 0)
            self.assertGreater(rm_2.describe()['size'], 0)
            rm_1.clear()
            self.assertEqual(rm_1.describe()['size'], 0)
            self.assertGreater(rm_2.describe()['size'], 0)

            rm_2.clear()
            self.assertEqual(rm_2.describe()['size'], 0)
Example #14
    def test_set_get_recording_manager(self):
        p = beam.Pipeline()
        rm = RecordingManager(p)
        ie.current_env().set_recording_manager(rm, p)
        self.assertIs(rm, ie.current_env().get_recording_manager(p))
Example #15
    def test_recording_manager_clears_cache(self):
        """Tests that the RecordingManager clears the cache before recording.

    A job may have incomplete PCollections when the job terminates. Clearing the
    cache ensures that correct results are computed every run.
    """
        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)
        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream()
                     .advance_watermark_to(0)
                     .advance_processing_time(1)
                     .add_elements(list(range(10)))
                     .advance_processing_time(1))  # yapf: disable
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Do the first recording to get the timestamp of the first time the fragment
        # was run.
        rm = RecordingManager(p)
        rm.record([squares], max_n=10, max_duration=2)
        first_recording_start = rm.describe()['start']
        rm.cancel()

        # Get the cache, key, and coder to read the PCollection from the cache.
        pipeline_instrument = pi.PipelineInstrument(p)
        cache = ie.current_env().get_cache_manager(p)
        cache_key = pipeline_instrument.cache_key(squares)

        # Set up a mock for the Cache's clear function which will be used to clear
        # uncomputed PCollections.
        cache.clear = MagicMock()

        # Rerun the fragment. If the cache was cleared correctly then the
        # starting time of the second recording will be later than the first.
        # This is because the PCollection wasn't considered to be computed and
        # was cleared from cache. Thus the pipeline fragment was rerun for that
        # PCollection at a later time.
        rm.record([squares], max_n=10, max_duration=1)
        second_recording_start = rm.describe()['start']
        rm.cancel()
        self.assertGreater(second_recording_start, first_recording_start)

        # Assert that the cache cleared the PCollection.
        cache.clear.assert_called_with('full', cache_key)
Example #16
def show(*pcolls, **configs):
    # type: (*Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], **bool) -> None
    """Shows given PCollections in an interactive exploratory way if used within
  a notebook, or prints a heading sampled data if used within an ipython shell.
  Noop if used in a non-interactive environment.

  Args:
    include_window_info: (optional) if True, windowing information of the
        data will be visualized too. Default is false.
    visualize_data: (optional) by default, the visualization contains data
        tables rendering data from given pcolls separately as if they are
        converted into dataframes. If visualize_data is True, a drill-down
        widget and a statistical overview widget of the data are rendered as
        well. Otherwise, those 2 data visualization widgets will not be
        displayed.
    n: (optional) max number of elements to visualize. Default 'inf'.
    duration: (optional) max duration of elements to read. Default 'inf'.

  The given pcolls can be dictionary of PCollections (as values), or iterable
  of PCollections or plain PCollection values.

  The user can specify either the max number of elements with `n` to read
  or the maximum duration of elements to read with `duration`. When a limiter is
  not supplied, it is assumed to be infinite.

  Builds an ad hoc pipeline fragment including only the transforms necessary to
  produce data for the given PCollections pcolls, runs the pipeline fragment to
  compute data for those pcolls, and then visualizes the data.

  The function is always blocking. If used within a notebook, the data
  visualized might be dynamically updated before the function returns as more
  and more data gets processed and emitted while the pipeline fragment is being
  executed. If used within an ipython shell, there is no dynamic plotting but a
  static plot rendered at the end of the pipeline fragment's execution.

  The PCollections given must belong to the same pipeline.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(1000))
      square = init | 'Square' >> beam.Map(lambda x: x * x)
      cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)

      # Below builds a pipeline fragment from the defined pipeline `p` that
      # contains only applied transforms of `Init` and `Square`. Then the
      # interactive runner runs the pipeline fragment implicitly to compute data
      # represented by PCollection `square` and visualizes it.
      show(square)

      # This is equivalent to `show(square)` because `square` depends on `init`
      # and `init` is included in the pipeline fragment and computed anyway.
      show(init, square)

      # Below is similar to running `p.run()`. It computes data for both
      # PCollection `square` and PCollection `cube`, then visualizes them.
      show(square, cube)
  """
    flatten_pcolls = []
    for pcoll_container in pcolls:
        if isinstance(pcoll_container, dict):
            flatten_pcolls.extend(pcoll_container.values())
        elif isinstance(pcoll_container, beam.pvalue.PCollection):
            flatten_pcolls.append(pcoll_container)
        else:
            try:
                flatten_pcolls.extend(iter(pcoll_container))
            except TypeError:
                raise ValueError(
                    'The given pcoll %s is not a dict, an iterable or a PCollection.'
                    % pcoll_container)
    pcolls = flatten_pcolls
    assert len(pcolls) > 0, (
        'Need at least 1 PCollection to show data visualization.')
    for pcoll in pcolls:
        assert isinstance(pcoll, beam.pvalue.PCollection), (
            '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))
    user_pipeline = pcolls[0].pipeline

    # TODO(BEAM-8288): Remove below pops and assertion once Python 2 is
    # deprecated from Beam.
    include_window_info = configs.pop('include_window_info', False)
    visualize_data = configs.pop('visualize_data', False)
    n = configs.pop('n', 'inf')
    duration = configs.pop('duration', 'inf')

    if isinstance(n, str):
        assert n == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(n, int):
        assert n > 0, 'n needs to be positive or the string \'inf\''

    if isinstance(duration, str):
        assert duration == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(duration, int):
        assert duration > 0, 'duration needs to be positive or the string \'inf\''

    if n == 'inf':
        n = float('inf')

    if duration == 'inf':
        duration = float('inf')

    # This assertion is to protect the backward compatibility for function
    # signature change after Python 2 deprecation.
    assert not configs, (
        'The only supported arguments are include_window_info, visualize_data, '
        'n, and duration')

    recording_manager = RecordingManager(user_pipeline)
    recording = recording_manager.record(pcolls,
                                         max_n=n,
                                         max_duration_secs=duration)

    # Catch a KeyboardInterrupt to gracefully cancel the recording and
    # visualizations.
    try:
        # If in notebook, static plotting computed pcolls as computation is done.
        if ie.current_env().is_in_notebook:
            for stream in recording.computed().values():
                visualize(stream,
                          include_window_info=include_window_info,
                          display_facets=visualize_data)
        elif ie.current_env().is_in_ipython:
            for stream in recording.computed().values():
                visualize(stream, include_window_info=include_window_info)

        if recording.is_computed():
            return

        # If in notebook, dynamic plotting as computation goes.
        if ie.current_env().is_in_notebook:
            for stream in recording.uncomputed().values():
                visualize(stream,
                          dynamic_plotting_interval=1,
                          include_window_info=include_window_info,
                          display_facets=visualize_data)

        # Invoke wait_until_finish to ensure the blocking nature of this API without
        # relying on the run to be blocking.
        recording.wait_until_finish()

        # If just in ipython shell, plotting once when the computation is completed.
        if (ie.current_env().is_in_ipython and
                not ie.current_env().is_in_notebook):
            for stream in recording.computed().values():
                visualize(stream, include_window_info=include_window_info)

    except KeyboardInterrupt:
        if recording:
            recording.cancel()
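
A brief usage sketch of the container flattening that show performs
(illustrative names, mirroring the docstring's example):

    p = beam.Pipeline(InteractiveRunner())
    init = p | 'Init' >> beam.Create(range(10))
    square = init | 'Square' >> beam.Map(lambda x: x * x)
    cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)
    ib.watch(locals())

    # Dicts (values are used), iterables, and bare PCollections may be mixed,
    # as long as all PCollections belong to the same pipeline.
    show({'squares': square}, [cube], n=5, include_window_info=True)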