예제 #1
0
 def test_auto_stop_dynamic_plotting_when_job_is_terminated(
         self, mocked_timeloop):
     """Dynamic plotting keeps running until the job reaches a terminal state."""
     running_result = runner.PipelineResult(
         runner.PipelineState.RUNNING)
     ie.current_env().set_pipeline_result(self._p, running_result)
     # Kick off the non-stopping async dynamic plotting; it should only end
     # once the job is terminated.
     pv.visualize(self._pcoll, dynamic_plotting_interval=0.001)
     # Block so the async task can run a few iterations.
     time.sleep(1)
     mocked_timeloop.assert_not_called()
     done_result = runner.PipelineResult(runner.PipelineState.DONE)
     ie.current_env().set_pipeline_result(self._p, done_result)
     # Block again so the async task can observe the terminal state.
     time.sleep(1)
     # "assert_called" is new in Python 3.6.
     mocked_timeloop.assert_called()
예제 #2
0
 def test_dynamic_plotting_return_handle(self, mocked_is_in_notebook,
                                         unused):
     """In a notebook, dynamic plotting hands back a Timeloop handle."""
     mocked_is_in_notebook.return_value = True
     handle = pv.visualize(self._stream,
                           dynamic_plotting_interval=1,
                           display_facets=True)
     self.assertIsInstance(handle, timeloop.Timeloop)
     handle.stop()
예제 #3
0
 def test_no_dynamic_plotting_when_not_in_notebook(self,
                                                   mocked_is_in_notebook,
                                                   unused):
     """Outside a notebook there is no dynamic plotting and no handle."""
     mocked_is_in_notebook.return_value = False
     handle = pv.visualize(self._stream,
                           dynamic_plotting_interval=1,
                           display_facets=True)
     self.assertIsNone(handle)
예제 #4
0
 def test_dynamic_plotting_update_same_display(self, mocked_display_facets):
     """Every dynamic-plotting iteration after the first reuses one handle."""
     running_result = runner.PipelineResult(
         runner.PipelineState.RUNNING)
     ie.current_env().set_pipeline_result(self._p, running_result)
     # Start async dynamic plotting; the job never terminates in this test.
     handle = pv.visualize(self._pcoll, dynamic_plotting_interval=0.001)
     # Block so the async task can run several iterations.
     time.sleep(1)
     calls = mocked_display_facets.call_args_list
     # The first iteration doesn't provide updating_pv to display_facets.
     _, first_kwargs = calls[0]
     self.assertEqual(first_kwargs, {})
     # All later iterations pass the very same updating_pv to display_facets.
     _, second_kwargs = calls[1]
     shared_updating_pv = second_kwargs['updating_pv']
     for _, kwargs in calls[2:]:
         self.assertIs(kwargs['updating_pv'], shared_updating_pv)
     handle.stop()
예제 #5
0
def show(*pcolls,
         include_window_info=False,
         visualize_data=False,
         n='inf',
         duration='inf'):
    # type: (*Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], bool, bool, Union[int, str], Union[int, str]) -> None
    """Shows given PCollections in an interactive exploratory way if used within
  a notebook, or prints a heading sampled data if used within an ipython shell.
  Noop if used in a non-interactive environment.

  Args:
    include_window_info: (optional) if True, windowing information of the
        data will be visualized too. Default is false.
    visualize_data: (optional) by default, the visualization contains data
        tables rendering data from given pcolls separately as if they are
        converted into dataframes. If visualize_data is True, there will be a
        more dive-in widget and statistically overview widget of the data.
        Otherwise, those 2 data visualization widgets will not be displayed.
    n: (optional) max number of elements to visualize. Default 'inf'.
    duration: (optional) max duration of elements to read in integer seconds or
        a string duration. Default 'inf'.

  The given pcolls can be dictionary of PCollections (as values), or iterable
  of PCollections or plain PCollection values.

  The user can specify either the max number of elements with `n` to read
  or the maximum duration of elements to read with `duration`. When a limiter is
  not supplied, it is assumed to be infinite.

  By default, the visualization contains data tables rendering data from given
  pcolls separately as if they are converted into dataframes. If visualize_data
  is True, there will be a more dive-in widget and statistically overview widget
  of the data. Otherwise, those 2 data visualization widgets will not be
  displayed.

  Ad hoc builds a pipeline fragment including only transforms that are
  necessary to produce data for given PCollections pcolls, runs the pipeline
  fragment to compute data for those pcolls and then visualizes the data.

  The function is always blocking. If used within a notebook, the data
  visualized might be dynamically updated before the function returns as more
  and more data could getting processed and emitted when the pipeline fragment
  is being executed. If used within an ipython shell, there will be no dynamic
  plotting but a static plotting in the end of pipeline fragment execution.

  The PCollections given must belong to the same pipeline.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(1000))
      square = init | 'Square' >> beam.Map(lambda x: x * x)
      cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)

      # Below builds a pipeline fragment from the defined pipeline `p` that
      # contains only applied transforms of `Init` and `Square`. Then the
      # interactive runner runs the pipeline fragment implicitly to compute data
      # represented by PCollection `square` and visualizes it.
      show(square)

      # This is equivalent to `show(square)` because `square` depends on `init`
      # and `init` is included in the pipeline fragment and computed anyway.
      show(init, square)

      # Below is similar to running `p.run()`. It computes data for both
      # PCollection `square` and PCollection `cube`, then visualizes them.
      show(square, cube)
  """
    # Flatten dicts/iterables among the given pcolls into one flat list of
    # PCollections.
    flatten_pcolls = []
    for pcoll_container in pcolls:
        if isinstance(pcoll_container, dict):
            flatten_pcolls.extend(pcoll_container.values())
        elif isinstance(pcoll_container, beam.pvalue.PCollection):
            flatten_pcolls.append(pcoll_container)
        else:
            try:
                flatten_pcolls.extend(iter(pcoll_container))
            except TypeError:
                raise ValueError(
                    'The given pcoll %s is not a dict, an iterable or a PCollection.'
                    % pcoll_container)
    pcolls = flatten_pcolls
    assert len(pcolls) > 0, (
        'Need at least 1 PCollection to show data visualization.')
    for pcoll in pcolls:
        assert isinstance(pcoll, beam.pvalue.PCollection), (
            '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))
    user_pipeline = pcolls[0].pipeline
    # All PCollections must come from the same user pipeline: the recording
    # below is created for a single pipeline, so a mixed set would silently
    # ignore PCollections from other pipelines.
    for pcoll in pcolls:
        assert pcoll.pipeline is user_pipeline, (
            '{} belongs to a different user-defined pipeline ({}) than that of'
            ' other PCollections ({}).'.format(pcoll, pcoll.pipeline,
                                               user_pipeline))

    # Validate the element-count limiter.
    if isinstance(n, str):
        assert n == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(n, int):
        assert n > 0, 'n needs to be positive or the string \'inf\''

    # Validate the duration limiter (duration strings pass through as-is).
    if isinstance(duration, int):
        assert duration > 0, (
            'duration needs to be positive, a duration string, '
            'or the string \'inf\'')

    if n == 'inf':
        n = float('inf')

    if duration == 'inf':
        duration = float('inf')

    recording_manager = ie.current_env().get_recording_manager(
        user_pipeline, create_if_absent=True)
    recording = recording_manager.record(pcolls,
                                         max_n=n,
                                         max_duration=duration)

    # Catch a KeyboardInterrupt to gracefully cancel the recording and
    # visualizations.
    try:
        # If in notebook, static plotting computed pcolls as computation is done.
        if ie.current_env().is_in_notebook:
            for stream in recording.computed().values():
                visualize(stream,
                          include_window_info=include_window_info,
                          display_facets=visualize_data)
        elif ie.current_env().is_in_ipython:
            for stream in recording.computed().values():
                visualize(stream, include_window_info=include_window_info)

        if recording.is_computed():
            return

        # If in notebook, dynamic plotting as computation goes.
        if ie.current_env().is_in_notebook:
            for stream in recording.uncomputed().values():
                visualize(stream,
                          dynamic_plotting_interval=1,
                          include_window_info=include_window_info,
                          display_facets=visualize_data)

        # Invoke wait_until_finish to ensure the blocking nature of this API without
        # relying on the run to be blocking.
        recording.wait_until_finish()

        # If just in ipython shell, plotting once when the computation is completed.
        if ie.current_env(
        ).is_in_ipython and not ie.current_env().is_in_notebook:
            for stream in recording.computed().values():
                visualize(stream, include_window_info=include_window_info)

    except KeyboardInterrupt:
        if recording:
            recording.cancel()
예제 #6
0
 def test_one_shot_visualization_not_return_handle(self,
                                                   mocked_is_in_notebook,
                                                   unused):
     """A one-shot visualization returns no dynamic-plotting handle."""
     mocked_is_in_notebook.return_value = True
     handle = pv.visualize(self._stream, display_facets=True)
     self.assertIsNone(handle)
예제 #7
0
 def test_display_plain_text_when_kernel_has_no_frontend(
         self, _mocked_head):
     """Falls back to plain-text output when no notebook frontend exists."""
     # Force the notebook check to False.
     ie.current_env()._is_in_notebook = False
     handle = pv.visualize(self._stream, display_facets=True)
     self.assertIsNone(handle)
     _mocked_head.assert_called_once()
예제 #8
0
def show(*pcolls, **configs):
    # type: (*Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], **bool) -> None
    """Shows given PCollections in an interactive exploratory way if used within
  a notebook, or prints a heading sampled data if used within an ipython shell.
  Noop if used in a non-interactive environment.

  The given pcolls can be dictionary of PCollections (as values), or iterable
  of PCollections or plain PCollection values.

  There are 2 boolean configurations:

    #. include_window_info=<True/False>. If True, windowing information of the
       data will be visualized too. Default is false.
    #. visualize_data=<True/False>. By default, the visualization contains data
       tables rendering data from given pcolls separately as if they are
       converted into dataframes. If visualize_data is True, there will be a
       more dive-in widget and statistically overview widget of the data.
       Otherwise, those 2 data visualization widgets will not be displayed.

  By default, the visualization contains data tables rendering data from given
  pcolls separately as if they are converted into dataframes. If visualize_data
  is True, there will be a more dive-in widget and statistically overview widget
  of the data. Otherwise, those 2 data visualization widgets will not be
  displayed.

  Ad hoc builds a pipeline fragment including only transforms that are
  necessary to produce data for given PCollections pcolls, runs the pipeline
  fragment to compute data for those pcolls and then visualizes the data.

  The function is always blocking. If used within a notebook, the data
  visualized might be dynamically updated before the function returns as more
  and more data could getting processed and emitted when the pipeline fragment
  is being executed. If used within an ipython shell, there will be no dynamic
  plotting but a static plotting in the end of pipeline fragment execution.

  The PCollections given must belong to the same pipeline.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(1000))
      square = init | 'Square' >> beam.Map(lambda x: x * x)
      cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)

      # Below builds a pipeline fragment from the defined pipeline `p` that
      # contains only applied transforms of `Init` and `Square`. Then the
      # interactive runner runs the pipeline fragment implicitly to compute data
      # represented by PCollection `square` and visualizes it.
      show(square)

      # This is equivalent to `show(square)` because `square` depends on `init`
      # and `init` is included in the pipeline fragment and computed anyway.
      show(init, square)

      # Below is similar to running `p.run()`. It computes data for both
      # PCollection `square` and PCollection `cube`, then visualizes them.
      show(square, cube)
  """
    # Flatten dicts/iterables among the given pcolls into one flat list of
    # PCollections; reject anything that is none of the supported shapes.
    flatten_pcolls = []
    for pcoll_container in pcolls:
        if isinstance(pcoll_container, dict):
            flatten_pcolls.extend(pcoll_container.values())
        elif isinstance(pcoll_container, beam.pvalue.PCollection):
            flatten_pcolls.append(pcoll_container)
        else:
            try:
                flatten_pcolls.extend(iter(pcoll_container))
            except TypeError:
                raise ValueError(
                    'The given pcoll %s is not a dict, an iterable or a PCollection.'
                    % pcoll_container)
    pcolls = flatten_pcolls
    assert len(pcolls) > 0, (
        'Need at least 1 PCollection to show data visualization.')
    for pcoll in pcolls:
        assert isinstance(pcoll, beam.pvalue.PCollection), (
            '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))
    # All PCollections must share one user pipeline; the first one's pipeline
    # is the reference everything else is checked against.
    user_pipeline = pcolls[0].pipeline
    for pcoll in pcolls:
        assert pcoll.pipeline is user_pipeline, (
            '{} belongs to a different user-defined pipeline ({}) than that of'
            ' other PCollections ({}).'.format(pcoll, pcoll.pipeline,
                                               user_pipeline))
    # TODO(BEAM-8288): Remove below pops and assertion once Python 2 is
    # deprecated from Beam.
    include_window_info = configs.pop('include_window_info', False)
    visualize_data = configs.pop('visualize_data', False)
    # This assertion is to protect the backward compatibility for function
    # signature change after Python 2 deprecation.
    assert not configs, (
        'The only configs supported are include_window_info and '
        'visualize_data.')
    # Unwrap the interactive runner so the underlying runner can be handed to
    # the background caching job below.
    runner = user_pipeline.runner
    if isinstance(runner, ir.InteractiveRunner):
        runner = runner._underlying_runner

    # Make sure that sources without a user reference are still cached.
    pi.watch_sources(user_pipeline)

    # Make sure that all PCollections to be shown are watched. If a PCollection
    # has not been watched, make up a variable name for that PCollection and watch
    # it. No validation is needed here because the watch logic can handle
    # arbitrary variables.
    watched_pcollections = set()
    for watching in ie.current_env().watching():
        for _, val in watching:
            if hasattr(val, '__class__') and isinstance(
                    val, beam.pvalue.PCollection):
                watched_pcollections.add(val)
    for pcoll in pcolls:
        if pcoll not in watched_pcollections:
            # id(pcoll) makes the generated variable name unique per object.
            watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})

    # Accessing <pipeline>.options below triggers a deprecation warning in
    # ipython; silence it so it doesn't clutter the interactive session.
    if ie.current_env().is_in_ipython:
        warnings.filterwarnings(
            'ignore',
            'options is deprecated since First stable release. References to '
            '<pipeline>.options will not be supported',
            category=DeprecationWarning)
    # Attempt to run background caching job since we have the reference to the
    # user-defined pipeline.
    bcj.attempt_to_run_background_caching_job(runner, user_pipeline,
                                              user_pipeline.options)

    # Split off PCollections that are already computed so they are only
    # re-visualized, not re-computed, leaving the rest for the fragment run.
    pcolls = set(pcolls)
    computed_pcolls = set()
    for pcoll in pcolls:
        if pcoll in ie.current_env().computed_pcollections:
            computed_pcolls.add(pcoll)
    pcolls = pcolls.difference(computed_pcolls)
    # If in notebook, static plotting computed pcolls as computation is done.
    if ie.current_env().is_in_notebook:
        for pcoll in computed_pcolls:
            visualize(pcoll,
                      include_window_info=include_window_info,
                      display_facets=visualize_data)
    elif ie.current_env().is_in_ipython:
        for pcoll in computed_pcolls:
            visualize(pcoll, include_window_info=include_window_info)

    # Everything requested was already computed; nothing left to run.
    if not pcolls:
        return

    # Build a pipeline fragment for the PCollections and run it.
    result = pf.PipelineFragment(list(pcolls), user_pipeline.options).run()
    ie.current_env().set_pipeline_result(user_pipeline, result)

    # If in notebook, dynamic plotting as computation goes.
    if ie.current_env().is_in_notebook:
        for pcoll in pcolls:
            visualize(pcoll,
                      dynamic_plotting_interval=1,
                      include_window_info=include_window_info,
                      display_facets=visualize_data)

    # Invoke wait_until_finish to ensure the blocking nature of this API without
    # relying on the run to be blocking.
    result.wait_until_finish()

    # If just in ipython shell, plotting once when the computation is completed.
    if ie.current_env().is_in_ipython and not ie.current_env().is_in_notebook:
        for pcoll in pcolls:
            visualize(pcoll, include_window_info=include_window_info)

    # If the pipeline execution is successful at this stage, mark the computation
    # completeness for the given PCollections so that when further `show`
    # invocation occurs, Interactive Beam wouldn't need to re-compute them.
    if result.state is beam.runners.runner.PipelineState.DONE:
        ie.current_env().mark_pcollection_computed(pcolls)
예제 #9
0
 def test_dynamic_plotting_return_handle(self):
     """Dynamic plotting returns a Timeloop handle that can be stopped."""
     plotting_handle = pv.visualize(self._pcoll, dynamic_plotting_interval=1)
     self.assertIsInstance(plotting_handle, timeloop.Timeloop)
     plotting_handle.stop()
예제 #10
0
 def test_one_shot_visualization_not_return_handle(self):
     """One-shot visualization (no dynamic interval) returns None."""
     handle = pv.visualize(self._pcoll)
     self.assertIsNone(handle)
예제 #11
0
 def test_display_plain_text_when_kernel_has_no_frontend(
         self, _mocked_sample):
     """Samples plain text when the kernel has no notebook frontend."""
     # A fresh environment resets the notebook check, which is False in
     # unit tests.
     ie.new_env()
     self.assertIsNone(pv.visualize(self._pcoll))
     _mocked_sample.assert_called_once()
예제 #12
0
 def test_display_plain_text_when_kernel_has_no_frontend(
         self, _mocked_sample):
     """Samples plain text when not running inside a notebook."""
     # Force the notebook check to False.
     ie.current_env()._is_in_notebook = False
     result = pv.visualize(self._pcoll)
     self.assertIsNone(result)
     _mocked_sample.assert_called_once()
예제 #13
0
def show(*pcolls):
  """Visualizes given PCollections in an interactive exploratory way if used
  within a notebook, or prints a heading sampled data if used within an ipython
  shell. Noop if used in a non-interactive environment.

  Ad hoc builds a pipeline fragment including only transforms that are
  necessary to produce data for given PCollections pcolls, runs the pipeline
  fragment to compute data for those pcolls and then visualizes the data.

  The function is always blocking. If used within a notebook, the data
  visualized might be dynamically updated before the function returns as more
  and more data could getting processed and emitted when the pipeline fragment
  is being executed. If used within an ipython shell, there will be no dynamic
  plotting but a static plotting in the end of pipeline fragment execution.

  The PCollections given must belong to the same pipeline.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(1000))
      square = init | 'Square' >> beam.Map(lambda x: x * x)
      cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)

      # Below builds a pipeline fragment from the defined pipeline `p` that
      # contains only applied transforms of `Init` and `Square`. Then the
      # interactive runner runs the pipeline fragment implicitly to compute data
      # represented by PCollection `square` and visualizes it.
      show(square)

      # This is equivalent to `show(square)` because `square` depends on `init`
      # and `init` is included in the pipeline fragment and computed anyway.
      show(init, square)

      # Below is similar to running `p.run()`. It computes data for both
      # PCollection `square` and PCollection `cube`, then visualizes them.
      show(square, cube)
  """
  # Validate inputs: at least one PCollection, all of them real PCollections,
  # and all from the same user-defined pipeline.
  assert len(pcolls) > 0, (
      'Need at least 1 PCollection to show data visualization.')
  for pcoll in pcolls:
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))
  user_pipeline = pcolls[0].pipeline
  for pcoll in pcolls:
    assert pcoll.pipeline is user_pipeline, (
        '{} belongs to a different user-defined pipeline ({}) than that of'
        ' other PCollections ({}).'.format(
            pcoll, pcoll.pipeline, user_pipeline))
  # Unwrap the interactive runner so the underlying runner can be handed to
  # the background caching job below.
  runner = user_pipeline.runner
  if isinstance(runner, ir.InteractiveRunner):
    runner = runner._underlying_runner

  # Make sure that all PCollections to be shown are watched. If a PCollection
  # has not been watched, make up a variable name for that PCollection and watch
  # it. No validation is needed here because the watch logic can handle
  # arbitrary variables.
  watched_pcollections = set()
  for watching in ie.current_env().watching():
    for _, val in watching:
      if hasattr(val, '__class__') and isinstance(val, beam.pvalue.PCollection):
        watched_pcollections.add(val)
  for pcoll in pcolls:
    if pcoll not in watched_pcollections:
      # Derive a watchable variable name from the PCollection's str(),
      # replacing bracket/paren characters that are invalid in identifiers.
      watch({re.sub(r'[\[\]\(\)]', '_', str(pcoll)): pcoll})

  # Attempt to run background caching job since we have the reference to the
  # user-defined pipeline.
  bcj.attempt_to_run_background_caching_job(runner, user_pipeline)

  # Build a pipeline fragment for the PCollections and run it.
  result = pf.PipelineFragment(list(pcolls)).run()
  ie.current_env().set_pipeline_result(user_pipeline, result)

  # If in notebook, dynamic plotting as computation goes.
  if ie.current_env().is_in_notebook:
    for pcoll in pcolls:
      visualize(pcoll, dynamic_plotting_interval=1)

  # Invoke wait_until_finish to ensure the blocking nature of this API without
  # relying on the run to be blocking.
  result.wait_until_finish()

  # If just in ipython shell, plotting once when the computation is completed.
  if ie.current_env().is_in_ipython and not ie.current_env().is_in_notebook:
    for pcoll in pcolls:
      visualize(pcoll)

  # If the pipeline execution is successful at this stage, mark the computation
  # completeness for the given PCollections so that when further `show`
  # invocation occurs, Interactive Beam wouldn't need to re-compute them.
  if result.state is beam.runners.runner.PipelineState.DONE:
    ie.current_env().mark_pcollection_computed(pcolls)
 def test_one_shot_visualization_not_return_handle(self):
     """One-shot visualization with facets still returns None."""
     result = pv.visualize(self._pcoll, display_facets=True)
     self.assertIsNone(result)
 def test_dynamic_plotting_return_handle(self):
     """Dynamic plotting over a stream returns a stoppable Timeloop."""
     plotting_handle = pv.visualize(self._stream,
                                    dynamic_plotting_interval=1,
                                    display_facets=True)
     self.assertIsInstance(plotting_handle, timeloop.Timeloop)
     plotting_handle.stop()