Example 1
  def _write_cache(
      self,
      pipeline,
      pcoll,
      output_as_extended_target=True,
      ignore_unbounded_reads=False,
      is_capture=False):
    """Caches a cacheable PCollection.

    For the given PCollection, appends a sub transform that materializes the
    PCollection into the cache implementation through a sink. The cache write
    is not immediate: it happens when the runner runs the transformed pipeline,
    so the cached data is not usable within the current run. This function
    always writes the cache for the given PCollection as long as the
    PCollection belongs to the pipeline being instrumented and the keyed cache
    is absent.

    Modifies:
      pipeline
    """
    # Makes sure the pcoll belongs to the pipeline being instrumented.
    if pcoll.pipeline is not pipeline:
      return

    # Ignore the unbounded reads from capturable sources as these will be pruned
    # out using the PipelineFragment later on.
    if ignore_unbounded_reads:
      ignore = False
      producer = pcoll.producer
      while producer:
        if isinstance(producer.transform,
                      tuple(ie.current_env().options.capturable_sources)):
          ignore = True
          break
        producer = producer.parent
      if ignore:
        self._ignored_targets.add(pcoll)
        return

    # The keyed cache is always valid within this instrumentation.
    key = self.cache_key(pcoll)
    # Only need to write when the cache with expected key doesn't exist.
    if not self._cache_manager.exists('full', key):
      label = '{}{}'.format(WRITE_CACHE, key)

      # Read the windowing information and cache it along with the element. This
      # caches the arguments to a WindowedValue object because Python has logic
      # that detects if a DoFn returns a WindowedValue. When it detects one, it
      # puts the element into the correct window and then emits the value to
      # downstream transforms.
      class Reify(beam.DoFn):
        def process(
            self,
            e,
            w=beam.DoFn.WindowParam,
            p=beam.DoFn.PaneInfoParam,
            t=beam.DoFn.TimestampParam):
          yield test_stream.WindowedValueHolder(WindowedValue(e, t, [w], p))

      extended_target = (
          pcoll
          | label + 'reify' >> beam.ParDo(Reify())
          | label >> cache.WriteCache(
              self._cache_manager, key, is_capture=is_capture))
      if output_as_extended_target:
        self._extended_targets.add(extended_target)
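The Reify DoFn above preserves the window, pane, and timestamp metadata as plain data before the cache write. A minimal standalone sketch of the same pattern (using only the Beam DoFn params shown in this snippet; the pipeline itself is illustrative):

import apache_beam as beam
from apache_beam.utils.windowed_value import WindowedValue


class ReifyWindows(beam.DoFn):
  # Capture the window, pane info and timestamp so they survive as plain data.
  def process(
      self,
      e,
      w=beam.DoFn.WindowParam,
      p=beam.DoFn.PaneInfoParam,
      t=beam.DoFn.TimestampParam):
    yield WindowedValue(e, t, [w], p)


with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([1, 2, 3])
      | beam.ParDo(ReifyWindows())
      | beam.Map(print))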
 def clear(self):
     # type: () -> None
     """Clears all cached PCollections for this RecordingManager."""
     cache_manager = ie.current_env().get_cache_manager(self.user_pipeline)
     if cache_manager:
         cache_manager.cleanup()
Example 3
                                         coder,
                                         include_window_info=False,
                                         n=5)
        self.assertSequenceEqual(list(elements), list(range(5)))

    def test_element_limit_count(self):
        """Tests that the to_element_list can limit the count."""

        elements = utils.to_element_list(iter(range(10)),
                                         None,
                                         include_window_info=False,
                                         n=5)
        self.assertSequenceEqual(list(elements), list(range(5)))


@unittest.skipIf(not ie.current_env().is_interactive_ready,
                 '[interactive] dependency is not installed.')
class IPythonLogHandlerTest(unittest.TestCase):
    def setUp(self):
        utils.register_ipython_log_handler()
        self._interactive_root_logger = logging.getLogger(
            'apache_beam.runners.interactive')

    def test_ipython_log_handler_not_double_registered(self):
        utils.register_ipython_log_handler()
        ipython_log_handlers = list(
            filter(lambda x: isinstance(x, utils.IPythonLogHandler), [
                handler for handler in self._interactive_root_logger.handlers
            ]))
        self.assertEqual(1, len(ipython_log_handlers))
    def test_streaming_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(),
                          options=StandardOptions(streaming=True))

        data = (
            p
            | TestStream()
                .advance_watermark_to(0)
                .advance_processing_time(1)
                .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
                .advance_watermark_to(20)
                .advance_processing_time(1)
                .add_elements(['that', 'is', 'the', 'question'])
                .advance_watermark_to(30)
                .advance_processing_time(1)
                .advance_watermark_to(40)
                .advance_processing_time(1)
                .advance_watermark_to(50)
                .advance_processing_time(1)
            | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

        counts = (data
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # This tests that the data was correctly cached.
        pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
        expected_data_df = pd.DataFrame([
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('or', 0, [IntervalWindow(0, 10)], pane_info),
            ('not', 0, [IntervalWindow(0, 10)], pane_info),
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
        ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

        data_df = ib.collect(data, n=10, include_window_info=True)
        pd.testing.assert_frame_equal(expected_data_df, data_df)

        # This tests that the windowing was passed correctly so that all the data
        # is also aggregated correctly.
        pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
        expected_counts_df = pd.DataFrame([
            ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

        counts_df = ib.collect(counts, n=8, include_window_info=True)

        # The group by key has no guarantee of order. So we post-process the DF by
        # sorting so we can test equality.
        sorted_counts_df = (counts_df
                            .sort_values(['event_time', 0], ascending=True)
                            .reset_index(drop=True)) # yapf: disable
        pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
    def test_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        # Count the occurrences of each word.
        counts = (p
                  | beam.Create(['to be or not to be that is the question'])
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that counts will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        result = p.run()
        result.wait_until_finish()

        actual = list(result.get(counts))
        self.assertSetEqual(
            set(actual),
            set([
                ('or', 1),
                ('that', 1),
                ('be', 2),
                ('is', 1),
                ('question', 1),
                ('to', 2),
                ('the', 1),
                ('not', 1),
            ]))

        # Truncate the precision to millis because the window coder uses millis
        # as units then gets upcast to micros.
        end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
        df_counts = ib.collect(counts, include_window_info=True, n=10)
        df_expected = pd.DataFrame(
            {
                0: [e[0] for e in actual],
                1: [e[1] for e in actual],
                'event_time': [end_of_window for _ in actual],
                'windows': [[GlobalWindow()] for _ in actual],
                'pane_info': [
                    PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                    for _ in actual
                ]
            },
            columns=[0, 1, 'event_time', 'windows', 'pane_info'])

        pd.testing.assert_frame_equal(df_expected, df_counts)

        actual_reified = result.get(counts, include_window_info=True)
        expected_reified = [
            WindowedValue(e, Timestamp(micros=end_of_window), [GlobalWindow()],
                          PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
            for e in actual
        ]
        self.assertEqual(actual_reified, expected_reified)
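The millis truncation used for end_of_window above can be checked in isolation; a small sketch of the arithmetic (same GlobalWindow API as in the test):

from apache_beam.transforms.window import GlobalWindow

micros = GlobalWindow().max_timestamp().micros
end_of_window = (micros // 1000) * 1000  # drop sub-millisecond precision
assert end_of_window % 1000 == 0
assert end_of_window <= micros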
Example 6
  def test_instrument_example_unbounded_pipeline_to_read_cache_not_cached(self):
    """Tests that the instrumenter works when the PCollection is not cached.
    """
    # Create the pipeline that will be instrumented.
    from apache_beam.options.pipeline_options import StandardOptions
    options = StandardOptions(streaming=True)
    p_original = beam.Pipeline(interactive_runner.InteractiveRunner(), options)
    ie.current_env().set_cache_manager(
        StreamingCache(cache_dir=None), p_original)
    source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    # pylint: disable=possibly-unused-variable
    pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)

    # Watch but do not cache the PCollections.
    ib.watch(locals())

    # Instrument the original pipeline to create the pipeline the user will see.
    p_copy = beam.Pipeline.from_runner_api(
        p_original.to_runner_api(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)
    instrumenter = instr.build_pipeline_instrument(p_copy)
    actual_pipeline = beam.Pipeline.from_runner_api(
        proto=instrumenter.instrumented_pipeline_proto(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)

    # Now, build the expected pipeline which replaces the unbounded source with
    # a TestStream.
    source_1_cache_key = self.cache_key_of('source_1', source_1)
    p_expected = beam.Pipeline()
    ie.current_env().set_cache_manager(
        StreamingCache(cache_dir=None), p_expected)
    test_stream = (p_expected | TestStream(output_tags=[source_1_cache_key]))
    # pylint: disable=expression-not-assigned
    (
        test_stream[source_1_cache_key]
        | 'square1' >> beam.Map(lambda x: x * x)
        | 'reify' >> beam.Map(lambda _: _)
        | cache.WriteCache(
            ie.current_env().get_cache_manager(p_expected), 'unused'))

    # Test that the TestStream is outputting to the correct PCollection.
    class TestStreamVisitor(PipelineVisitor):
      def __init__(self):
        self.output_tags = set()

      def enter_composite_transform(self, transform_node):
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        transform = transform_node.transform
        if isinstance(transform, TestStream):
          self.output_tags = transform.output_tags

    v = TestStreamVisitor()
    actual_pipeline.visit(v)
    expected_output_tags = set([source_1_cache_key])
    actual_output_tags = v.output_tags
    self.assertSetEqual(expected_output_tags, actual_output_tags)

    # Test that the pipeline is as expected.
    assert_pipeline_proto_equal(
        self,
        p_expected.to_runner_api(use_fake_coders=True),
        instrumenter.instrumented_pipeline_proto())
    def test_computed(self):
        """Tests that a PCollection is marked as computed only in a complete state.

    Because the background caching job is now long-lived, repeated runs of a
    PipelineFragment may yield different results for the same PCollection.
    """

        p = beam.Pipeline(InteractiveRunner())
        elems = p | beam.Create([0, 1, 2])

        ib.watch(locals())

        # Create a MockPipelineResult to control the state of a fake run of the
        # pipeline.
        mock_result = MockPipelineResult()
        ie.current_env().track_user_pipelines()
        ie.current_env().set_pipeline_result(p, mock_result)

        # Create a mock BackgroundCachingJob that will control whether to set the
        # PCollections as computed or not.
        bcj_mock_result = MockPipelineResult()
        background_caching_job = bcj.BackgroundCachingJob(bcj_mock_result, [])

        # Create a recording.
        recording = Recording(p, [elems],
                              mock_result,
                              pi.PipelineInstrument(p),
                              max_n=10,
                              max_duration_secs=60)

        # The background caching job and the recording aren't done yet, so there
        # may be more elements to be recorded.
        self.assertFalse(recording.is_computed())
        self.assertFalse(recording.computed())
        self.assertTrue(recording.uncomputed())

        # The recording is finished but the background caching job is not. There
        # may still be more elements to record, or the intermediate PCollection may
        # have stopped caching in an incomplete state, e.g. before a window could
        # fire.
        mock_result.set_state(PipelineState.DONE)
        recording.wait_until_finish()

        self.assertFalse(recording.is_computed())
        self.assertFalse(recording.computed())
        self.assertTrue(recording.uncomputed())

        # The background caching job finished before we started a recording which
        # is a sure signal that there will be no more elements.
        bcj_mock_result.set_state(PipelineState.DONE)
        ie.current_env().set_background_caching_job(p, background_caching_job)
        recording = Recording(p, [elems],
                              mock_result,
                              pi.PipelineInstrument(p),
                              max_n=10,
                              max_duration_secs=60)
        recording.wait_until_finish()

        # There are no more elements and the recording finished, meaning that the
        # intermediate PCollections are in a complete state. They can now be marked
        # as computed.
        self.assertTrue(recording.is_computed())
        self.assertTrue(recording.computed())
        self.assertFalse(recording.uncomputed())
Example 8
 def is_cache_complete():
   job = ie.current_env().get_background_caching_job(user_pipeline)
   is_done = job and job.is_done()
   cache_changed = is_source_to_cache_changed(
       user_pipeline, update_cached_source_signature=False)
   return is_done and not cache_changed
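is_cache_complete above is a plain predicate; a hedged sketch of how such a predicate might be polled (the helper name, loop, and sleep interval are illustrative, not part of the original code):

import time


def wait_for_cache_complete(timeout_secs=60):
  # Poll until the background caching job is done and the source signature
  # has not changed, or until the timeout expires.
  for _ in range(timeout_secs):
    if is_cache_complete():
      return True
    time.sleep(1)
  return False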
Example 9
def _setup_test_streaming_cache(pipeline):
  cache_manager = StreamingCache(cache_dir=None)
  ie.current_env().set_cache_manager(cache_manager, pipeline)
  builder = FileRecordsBuilder(tag=_TEST_CACHE_KEY)
  (builder
      .advance_watermark(watermark_secs=0)
      .advance_processing_time(5)
      .add_element(element='a', event_time_secs=1)
      .advance_watermark(watermark_secs=100)
      .advance_processing_time(10)) # yapf: disable
  cache_manager.write(builder.build(), _TEST_CACHE_KEY)


@unittest.skipIf(
    not ie.current_env().is_interactive_ready,
    '[interactive] dependency is not installed.')
@unittest.skipIf(
    sys.version_info < (3, 6), 'The tests require at least Python 3.6 to work.')
class BackgroundCachingJobTest(unittest.TestCase):
  def tearDown(self):
    ie.new_env()

  # TODO(BEAM-8335): remove the patches when there are appropriate test sources
  # that meet the boundedness checks.
  @patch(
      'apache_beam.runners.interactive.background_caching_job'
      '.has_source_to_cache',
      lambda x: True)
  # Disable the clean up so that we can keep the test streaming cache.
  @patch(
Example 10
 def tearDown(self):
     for _, job in ie.current_env()._background_caching_jobs.items():
         job.cancel()
     ie.new_env()
Example 11
 def test_background_caching_job_not_start_for_batch_pipeline(self):
     p = _build_a_test_stream_pipeline()
     p.run()
     self.assertIsNone(ie.current_env().get_background_caching_job(p))
Example 12
 def test_determine_a_test_stream_service_running(self):
     pipeline = _build_an_empty_stream_pipeline()
     test_stream_service = TestStreamServiceController(reader=None)
     ie.current_env().set_test_stream_service_controller(
         pipeline, test_stream_service)
     self.assertTrue(bcj.is_a_test_stream_service_running(pipeline))
Example 13
 def visit_transform(self, transform_node):
   if isinstance(transform_node.transform,
                 tuple(ie.current_env().options.capturable_sources)):
     for pcoll in transform_node.outputs.values():
       ie.current_env().watch({'synthetic_var_' + str(id(pcoll)): pcoll})
Example 14
 def visit_transform(self, transform_node):
   if isinstance(transform_node.transform,
                 tuple(ie.current_env().options.capturable_sources)):
     self.unbounded_sources.append(transform_node)
Example 15
 def test_watch_class_instance(self):
   test_env = ie.InteractiveEnvironment()
   ib.watch(self)
   test_env.watch(self)
   self.assertEqual(ie.current_env().watching(), test_env.watching())
Example 16
def show(*pcolls,
         include_window_info=False,
         visualize_data=False,
         n='inf',
         duration='inf'):
    # type: (*Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], bool, bool, Union[int, str], Union[int, str]) -> None
    """Shows given PCollections in an interactive exploratory way if used within
  a notebook, or prints a heading sampled data if used within an ipython shell.
  Noop if used in a non-interactive environment.

  Args:
    include_window_info: (optional) if True, windowing information of the
        data will be visualized too. Default is False.
    visualize_data: (optional) by default, the visualization contains data
        tables rendering data from given pcolls separately as if they are
        converted into dataframes. If visualize_data is True, a drill-down
        widget and a statistical overview widget of the data will also be
        displayed. Otherwise, those 2 data visualization widgets will not be
        displayed.
    n: (optional) max number of elements to visualize. Default 'inf'.
    duration: (optional) max duration of elements to read in integer seconds or
        a string duration. Default 'inf'.

  The given pcolls can be a dictionary of PCollections (as values), an iterable
  of PCollections, or plain PCollection values.

  The user can specify either the max number of elements with `n` to read
  or the maximum duration of elements to read with `duration`. When a limiter is
  not supplied, it is assumed to be infinite.

  By default, the visualization contains data tables rendering data from given
  pcolls separately as if they are converted into dataframes. If visualize_data
  is True, a drill-down widget and a statistical overview widget of the data
  will also be displayed. Otherwise, those 2 data visualization widgets will
  not be displayed.

  This ad hoc builds a pipeline fragment that includes only the transforms
  necessary to produce data for the given PCollections pcolls, runs the
  pipeline fragment to compute data for those pcolls, and then visualizes the
  data.

  The function is always blocking. If used within a notebook, the visualized
  data might be dynamically updated before the function returns as more and
  more data is processed and emitted while the pipeline fragment is being
  executed. If used within an ipython shell, there is no dynamic plotting but
  a static plot at the end of pipeline fragment execution.

  The PCollections given must belong to the same pipeline.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(1000))
      square = init | 'Square' >> beam.Map(lambda x: x * x)
      cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)

      # Below builds a pipeline fragment from the defined pipeline `p` that
      # contains only applied transforms of `Init` and `Square`. Then the
      # interactive runner runs the pipeline fragment implicitly to compute data
      # represented by PCollection `square` and visualizes it.
      show(square)

      # This is equivalent to `show(square)` because `square` depends on `init`
      # and `init` is included in the pipeline fragment and computed anyway.
      show(init, square)

      # Below is similar to running `p.run()`. It computes data for both
      # PCollection `square` and PCollection `cube`, then visualizes them.
      show(square, cube)
  """
    flatten_pcolls = []
    for pcoll_container in pcolls:
        if isinstance(pcoll_container, dict):
            flatten_pcolls.extend(pcoll_container.values())
        elif isinstance(pcoll_container,
                        (beam.pvalue.PCollection, DeferredBase)):
            flatten_pcolls.append(pcoll_container)
        else:
            try:
                flatten_pcolls.extend(iter(pcoll_container))
            except TypeError:
                raise ValueError(
                    'The given pcoll %s is not a dict, an iterable or a PCollection.'
                    % pcoll_container)

    # Iterate through the given PCollections and convert any deferred DataFrames
    # or Series into PCollections.
    pcolls = set()

    # The element type is used to help visualize the given PCollection. For the
    # deferred DataFrame/Series case it is the proxy of the frame.
    element_types = {}
    for pcoll in flatten_pcolls:
        if isinstance(pcoll, DeferredBase):
            pcoll, element_type = deferred_df_to_pcollection(pcoll)
            watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})
        else:
            element_type = pcoll.element_type

        element_types[pcoll] = element_type

        pcolls.add(pcoll)
        assert isinstance(pcoll, beam.pvalue.PCollection), (
            '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    assert len(pcolls) > 0, (
        'Need at least 1 PCollection to show data visualization.')

    pcoll_pipeline = next(iter(pcolls)).pipeline
    user_pipeline = ie.current_env().user_pipeline(pcoll_pipeline)
    # Possibly showing a PCollection defined in a local scope that is not
    # explicitly watched. Ad hoc watch it though it's a little late.
    if not user_pipeline:
        watch({
            'anonymous_pipeline_{}'.format(id(pcoll_pipeline)):
            pcoll_pipeline
        })
        user_pipeline = pcoll_pipeline

    if isinstance(n, str):
        assert n == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(n, int):
        assert n > 0, 'n needs to be positive or the string \'inf\''

    if isinstance(duration, int):
        assert duration > 0, (
            'duration needs to be positive, a duration string, '
            'or the string \'inf\'')

    if n == 'inf':
        n = float('inf')

    if duration == 'inf':
        duration = float('inf')

    previously_computed_pcolls = {
        pcoll
        for pcoll in pcolls if pcoll in ie.current_env().computed_pcollections
    }
    for pcoll in previously_computed_pcolls:
        visualize_computed_pcoll(find_pcoll_name(pcoll),
                                 pcoll,
                                 n,
                                 duration,
                                 include_window_info=include_window_info,
                                 display_facets=visualize_data)
    pcolls = pcolls - previously_computed_pcolls

    recording_manager = ie.current_env().get_recording_manager(
        user_pipeline, create_if_absent=True)
    recording = recording_manager.record(pcolls,
                                         max_n=n,
                                         max_duration=duration)

    # Catch a KeyboardInterrupt to gracefully cancel the recording and
    # visualizations.
    try:
        # If in notebook, static plotting computed pcolls as computation is done.
        if ie.current_env().is_in_notebook:
            for stream in recording.computed().values():
                visualize(stream,
                          include_window_info=include_window_info,
                          display_facets=visualize_data,
                          element_type=element_types[stream.pcoll])
        elif ie.current_env().is_in_ipython:
            for stream in recording.computed().values():
                visualize(stream,
                          include_window_info=include_window_info,
                          element_type=element_types[stream.pcoll])
        if recording.is_computed():
            return

        # If in notebook, dynamic plotting as computation goes.
        if ie.current_env().is_in_notebook:
            for stream in recording.uncomputed().values():
                visualize(stream,
                          dynamic_plotting_interval=1,
                          include_window_info=include_window_info,
                          display_facets=visualize_data,
                          element_type=element_types[stream.pcoll])

        # Invoke wait_until_finish to ensure the blocking nature of this API without
        # relying on the run to be blocking.
        recording.wait_until_finish()

        # If just in ipython shell, plotting once when the computation is completed.
        if ie.current_env(
        ).is_in_ipython and not ie.current_env().is_in_notebook:
            for stream in recording.computed().values():
                visualize(stream, include_window_info=include_window_info)

    except KeyboardInterrupt:
        if recording:
            recording.cancel()
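The container handling at the top of show() accepts dicts, iterables, or single PCollections. A hedged usage sketch (the pipeline and variable names are illustrative):

import apache_beam as beam
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

p = beam.Pipeline(InteractiveRunner())
init = p | 'Init' >> beam.Create(range(10))
square = init | 'Square' >> beam.Map(lambda x: x * x)
ib.watch(locals())

# Each call passes PCollections to show() in an accepted container form.
ib.show(square)                        # a single PCollection
ib.show(init, square)                  # multiple PCollections
ib.show({'init': init, 'sq': square})  # a dict; its values are visualized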
Example 17
  def test_able_to_cache_intermediate_unbounded_source_pcollection(self):
    """Tests being able to cache an intermediate source PCollection.

    In the following pipeline, the source doesn't have a reference and so is
    not automatically cached in the watch() command. This tests that this case
    is taken care of.
    """
    # Create the pipeline that will be instrumented.
    from apache_beam.options.pipeline_options import StandardOptions
    options = StandardOptions(streaming=True)
    streaming_cache_manager = StreamingCache(cache_dir=None)
    p_original = beam.Pipeline(interactive_runner.InteractiveRunner(), options)
    ie.current_env().set_cache_manager(streaming_cache_manager, p_original)

    # pylint: disable=possibly-unused-variable
    source_1 = (
        p_original
        | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        | beam.Map(lambda e: e))

    # Watch but do not cache the PCollections.
    ib.watch(locals())

    # Make sure that sources without a user reference are still cached.
    instr.watch_sources(p_original)

    intermediate_source_pcoll = None
    for watching in ie.current_env().watching():
      watching = list(watching)
      for var, watchable in watching:
        if 'synthetic' in var:
          intermediate_source_pcoll = watchable
          break

    # Instrument the original pipeline to create the pipeline the user will see.
    p_copy = beam.Pipeline.from_runner_api(
        p_original.to_runner_api(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)
    instrumenter = instr.build_pipeline_instrument(p_copy)
    actual_pipeline = beam.Pipeline.from_runner_api(
        proto=instrumenter.instrumented_pipeline_proto(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)

    # Now, build the expected pipeline which replaces the unbounded source with
    # a TestStream.
    intermediate_source_pcoll_cache_key = \
        self.cache_key_of('synthetic_var_' + str(id(intermediate_source_pcoll)),
                     intermediate_source_pcoll)
    p_expected = beam.Pipeline()
    ie.current_env().set_cache_manager(streaming_cache_manager, p_expected)
    test_stream = (
        p_expected
        | TestStream(output_tags=[intermediate_source_pcoll_cache_key]))
    # pylint: disable=expression-not-assigned
    (
        test_stream[intermediate_source_pcoll_cache_key]
        | 'square1' >> beam.Map(lambda e: e)
        | 'reify' >> beam.Map(lambda _: _)
        | cache.WriteCache(
            ie.current_env().get_cache_manager(p_expected), 'unused'))

    # Test that the TestStream is outputting to the correct PCollection.
    class TestStreamVisitor(PipelineVisitor):
      def __init__(self):
        self.output_tags = set()

      def enter_composite_transform(self, transform_node):
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        transform = transform_node.transform
        if isinstance(transform, TestStream):
          self.output_tags = transform.output_tags

    v = TestStreamVisitor()
    actual_pipeline.visit(v)
    expected_output_tags = set([intermediate_source_pcoll_cache_key])
    actual_output_tags = v.output_tags
    self.assertSetEqual(expected_output_tags, actual_output_tags)

    # Test that the pipeline is as expected.
    assert_pipeline_proto_equal(
        self,
        p_expected.to_runner_api(use_fake_coders=True),
        instrumenter.instrumented_pipeline_proto())
Example 18
def collect(pcoll, n='inf', duration='inf', include_window_info=False):
    """Materializes the elements from a PCollection into a Dataframe.

  This reads each element from file and brings only the amount that it needs
  into memory. The user can specify either the max number of elements to read
  or the maximum duration of elements to read. When a limiter is not supplied,
  it is assumed to be infinite.

  Args:
    n: (optional) max number of elements to visualize. Default 'inf'.
    duration: (optional) max duration of elements to read in integer seconds or
        a string duration. Default 'inf'.
    include_window_info: (optional) if True, appends the windowing information
        to each row. Default False.

  For example::

    p = beam.Pipeline(InteractiveRunner())
    init = p | 'Init' >> beam.Create(range(10))
    square = init | 'Square' >> beam.Map(lambda x: x * x)

    # Run the pipeline and bring the PCollection into memory as a Dataframe.
    in_memory_square = collect(square, n=5)
  """
    # Remember the element type so we can make an informed decision on how to
    # collect the result in elements_to_df.
    if isinstance(pcoll, DeferredBase):
        # Get the proxy so we can get the output shape of the DataFrame.
        pcoll, element_type = deferred_df_to_pcollection(pcoll)
        watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})
    else:
        element_type = pcoll.element_type

    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    if isinstance(n, str):
        assert n == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(n, int):
        assert n > 0, 'n needs to be positive or the string \'inf\''

    if isinstance(duration, int):
        assert duration > 0, (
            'duration needs to be positive, a duration string, '
            'or the string \'inf\'')

    if n == 'inf':
        n = float('inf')

    if duration == 'inf':
        duration = float('inf')

    user_pipeline = ie.current_env().user_pipeline(pcoll.pipeline)
    # Possibly collecting a PCollection defined in a local scope that is not
    # explicitly watched. Ad hoc watch it though it's a little late.
    if not user_pipeline:
        watch({
            'anonymous_pipeline_{}'.format(id(pcoll.pipeline)):
            pcoll.pipeline
        })
        user_pipeline = pcoll.pipeline
    recording_manager = ie.current_env().get_recording_manager(
        user_pipeline, create_if_absent=True)

    # If already computed, directly read the stream and return.
    if pcoll in ie.current_env().computed_pcollections:
        pcoll_name = find_pcoll_name(pcoll)
        elements = list(
            recording_manager.read(pcoll_name, pcoll, n, duration).read())
        return elements_to_df(elements,
                              include_window_info=include_window_info,
                              element_type=element_type)

    recording = recording_manager.record([pcoll],
                                         max_n=n,
                                         max_duration=duration)

    try:
        elements = list(recording.stream(pcoll).read())
    except KeyboardInterrupt:
        recording.cancel()
        return pd.DataFrame()

    if n == float('inf'):
        n = None

    # Collecting DataFrames may have a length > n, so slice again to be sure. Note
    # that array[:None] returns everything.
    return elements_to_df(elements,
                          include_window_info=include_window_info,
                          element_type=element_type)[:n]
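A short usage sketch of collect() with the limiters described in the docstring (the pipeline and variable names are illustrative):

import apache_beam as beam
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

p = beam.Pipeline(InteractiveRunner())
squares = p | beam.Create(range(10)) | beam.Map(lambda x: x * x)
ib.watch(locals())

# Read at most 5 elements and keep the windowing columns in the DataFrame.
df = ib.collect(squares, n=5, include_window_info=True)
print(df)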
Example 19
  def test_instrument_example_unbounded_pipeline_to_multiple_read_cache(self):
    """Tests that the instrumenter works for multiple unbounded sources.
    """
    # Create the pipeline that will be instrumented.
    p_original = beam.Pipeline(interactive_runner.InteractiveRunner())
    ie.current_env().set_cache_manager(
        StreamingCache(cache_dir=None), p_original)
    source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    source_2 = p_original | 'source2' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    # pylint: disable=possibly-unused-variable
    pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)
    # pylint: disable=possibly-unused-variable
    pcoll_2 = source_2 | 'square2' >> beam.Map(lambda x: x * x)

    # Mock as if cacheable PCollections are cached.
    ib.watch(locals())

    for name, pcoll in locals().items():
      if not isinstance(pcoll, beam.pvalue.PCollection):
        continue
      cache_key = self.cache_key_of(name, pcoll)
      self._mock_write_cache(p_original, [b''], cache_key)

    # Instrument the original pipeline to create the pipeline the user will see.
    instrumenter = instr.build_pipeline_instrument(p_original)
    actual_pipeline = beam.Pipeline.from_runner_api(
        proto=instrumenter.instrumented_pipeline_proto(),
        runner=interactive_runner.InteractiveRunner(),
        options=None)

    # Now, build the expected pipeline which replaces the unbounded source with
    # a TestStream.
    source_1_cache_key = self.cache_key_of('source_1', source_1)
    source_2_cache_key = self.cache_key_of('source_2', source_2)
    p_expected = beam.Pipeline()
    test_stream = (
        p_expected
        | TestStream(
            output_tags=[
                self.cache_key_of('source_1', source_1),
                self.cache_key_of('source_2', source_2)
            ]))
    # pylint: disable=expression-not-assigned
    test_stream[source_1_cache_key] | 'square1' >> beam.Map(lambda x: x * x)
    # pylint: disable=expression-not-assigned
    test_stream[source_2_cache_key] | 'square2' >> beam.Map(lambda x: x * x)

    # Test that the TestStream is outputting to the correct PCollection.
    class TestStreamVisitor(PipelineVisitor):
      def __init__(self):
        self.output_tags = set()

      def enter_composite_transform(self, transform_node):
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        transform = transform_node.transform
        if isinstance(transform, TestStream):
          self.output_tags = transform.output_tags

    v = TestStreamVisitor()
    actual_pipeline.visit(v)
    expected_output_tags = set([source_1_cache_key, source_2_cache_key])
    actual_output_tags = v.output_tags
    self.assertSetEqual(expected_output_tags, actual_output_tags)

    # Test that the pipeline is as expected.
    assert_pipeline_proto_equal(
        self,
        p_expected.to_runner_api(),
        instrumenter.instrumented_pipeline_proto())
Example 20
  def test_recordings_record(self):
    """Tests that recording pipeline succeeds."""

    # Add the TestStream so that it can be cached.
    ib.options.recordable_sources.add(TestStream)

    # Create a pipeline with an arbitrary amount of elements.
    p = beam.Pipeline(
        ir.InteractiveRunner(), options=PipelineOptions(streaming=True))
    # pylint: disable=unused-variable
    _ = (p
         | TestStream()
             .advance_watermark_to(0)
             .advance_processing_time(1)
             .add_elements(list(range(10)))
             .advance_processing_time(1))  # yapf: disable
    ib.watch(locals())
    ie.current_env().track_user_pipelines()

    # Assert that the pipeline starts in a good state.
    self.assertEqual(ib.recordings.describe(p)['state'], PipelineState.STOPPED)
    self.assertEqual(ib.recordings.describe(p)['size'], 0)

    # Create a limiter that stops the background caching job when something is
    # written to cache. This is used to ensure that the pipeline is
    # functioning properly and that there are no data races with the test.
    class SizeLimiter(Limiter):
      def __init__(self, pipeline):
        self.pipeline = pipeline
        self.should_trigger = False

      def is_triggered(self):
        return (
            ib.recordings.describe(self.pipeline)['size'] > 0 and
            self.should_trigger)

    limiter = SizeLimiter(p)
    ib.options.capture_control.set_limiters_for_test([limiter])

    # Assert that a recording can be started only once.
    self.assertTrue(ib.recordings.record(p))
    self.assertFalse(ib.recordings.record(p))
    self.assertEqual(ib.recordings.describe(p)['state'], PipelineState.RUNNING)

    # Wait for the pipeline to start and write something to cache.
    limiter.should_trigger = True
    for _ in range(60):
      if limiter.is_triggered():
        break
      time.sleep(1)
    self.assertTrue(
        limiter.is_triggered(),
        'Test timed out waiting for limiter to be triggered. This indicates '
        'that the BackgroundCachingJob did not cache anything.')

    # Assert that a recording can be stopped and can't be started again until
    # after the cache is cleared.
    ib.recordings.stop(p)
    self.assertEqual(ib.recordings.describe(p)['state'], PipelineState.STOPPED)
    self.assertFalse(ib.recordings.record(p))
    ib.recordings.clear(p)
    self.assertTrue(ib.recordings.record(p))
    ib.recordings.stop(p)
Example 21
def is_a_test_stream_service_running(user_pipeline):
    """Checks to see if there is a gPRC server/service running that serves the
  test stream to any job started from the given user_pipeline.
  """
    return ie.current_env().get_test_stream_service_controller(
        user_pipeline) is not None
Example 22
 def test_watch_main_by_default(self):
   test_env = ie.InteractiveEnvironment()
   # The current Interactive Beam env and the test env are 2 distinct instances.
   self.assertNotEqual(id(ie.current_env()), id(test_env))
   self.assertEqual(ie.current_env().watching(), test_env.watching())
class InteractiveRunnerTest(unittest.TestCase):
    @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
    def test_basic(self):
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
        ib.watch({'p': p})
        p.run().wait_until_finish()
        pc0 = (p | 'read' >> beam.Create([1, 2, 3])
               | 'Print1.1' >> beam.Map(print_with_message('Run1.1')))
        pc = pc0 | 'Print1.2' >> beam.Map(print_with_message('Run1.2'))
        ib.watch(locals())
        p.run().wait_until_finish()
        _ = pc | 'Print2' >> beam.Map(print_with_message('Run2'))
        p.run().wait_until_finish()
        _ = pc0 | 'Print3' >> beam.Map(print_with_message('Run3'))
        p.run().wait_until_finish()

    @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
    def test_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        # Count the occurrences of each word.
        counts = (p
                  | beam.Create(['to be or not to be that is the question'])
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that counts will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        result = p.run()
        result.wait_until_finish()

        actual = list(result.get(counts))
        self.assertSetEqual(
            set(actual),
            set([
                ('or', 1),
                ('that', 1),
                ('be', 2),
                ('is', 1),
                ('question', 1),
                ('to', 2),
                ('the', 1),
                ('not', 1),
            ]))

        # Truncate the precision to millis because the window coder uses millis
        # as units then gets upcast to micros.
        end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
        df_counts = ib.collect(counts, include_window_info=True, n=10)
        df_expected = pd.DataFrame(
            {
                0: [e[0] for e in actual],
                1: [e[1] for e in actual],
                'event_time': [end_of_window for _ in actual],
                'windows': [[GlobalWindow()] for _ in actual],
                'pane_info': [
                    PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                    for _ in actual
                ]
            },
            columns=[0, 1, 'event_time', 'windows', 'pane_info'])

        pd.testing.assert_frame_equal(df_expected, df_counts)

        actual_reified = result.get(counts, include_window_info=True)
        expected_reified = [
            WindowedValue(e, Timestamp(micros=end_of_window), [GlobalWindow()],
                          PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
            for e in actual
        ]
        self.assertEqual(actual_reified, expected_reified)

    def test_streaming_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        # Add the TestStream so that it can be cached.
        ib.options.recordable_sources.add(TestStream)

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(),
                          options=StandardOptions(streaming=True))

        data = (
            p
            | TestStream()
                .advance_watermark_to(0)
                .advance_processing_time(1)
                .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
                .advance_watermark_to(20)
                .advance_processing_time(1)
                .add_elements(['that', 'is', 'the', 'question'])
                .advance_watermark_to(30)
                .advance_processing_time(1)
                .advance_watermark_to(40)
                .advance_processing_time(1)
                .advance_watermark_to(50)
                .advance_processing_time(1)
            | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

        counts = (data
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # This tests that the data was correctly cached.
        pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
        expected_data_df = pd.DataFrame([
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('or', 0, [IntervalWindow(0, 10)], pane_info),
            ('not', 0, [IntervalWindow(0, 10)], pane_info),
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
        ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

        data_df = ib.collect(data, n=10, include_window_info=True)
        pd.testing.assert_frame_equal(expected_data_df, data_df)

        # This tests that the windowing was passed correctly so that all the data
        # is also aggregated correctly.
        pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
        expected_counts_df = pd.DataFrame([
            ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

        counts_df = ib.collect(counts, n=8, include_window_info=True)

        # The group by key has no guarantee of order. So we post-process the DF by
        # sorting so we can test equality.
        sorted_counts_df = (counts_df
                            .sort_values(['event_time', 0], ascending=True)
                            .reset_index(drop=True)) # yapf: disable
        pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)

    def test_session(self):
        class MockPipelineRunner(object):
            def __init__(self):
                self._in_session = False

            def __enter__(self):
                self._in_session = True

            def __exit__(self, exc_type, exc_val, exc_tb):
                self._in_session = False

        underlying_runner = MockPipelineRunner()
        runner = interactive_runner.InteractiveRunner(underlying_runner)
        runner.start_session()
        self.assertTrue(underlying_runner._in_session)
        runner.end_session()
        self.assertFalse(underlying_runner._in_session)

    @unittest.skipIf(not ie.current_env().is_interactive_ready,
                     '[interactive] dependency is not installed.')
    @patch('IPython.get_ipython', new_callable=mock_get_ipython)
    def test_mark_pcollection_completed_after_successful_run(self, cell):
        with cell:  # Cell 1
            p = beam.Pipeline(interactive_runner.InteractiveRunner())
            ib.watch({'p': p})

        with cell:  # Cell 2
            # pylint: disable=range-builtin-not-iterating
            init = p | 'Init' >> beam.Create(range(5))

        with cell:  # Cell 3
            square = init | 'Square' >> beam.Map(lambda x: x * x)
            cube = init | 'Cube' >> beam.Map(lambda x: x**3)

        ib.watch(locals())
        result = p.run()
        self.assertTrue(init in ie.current_env().computed_pcollections)
        self.assertEqual({0, 1, 2, 3, 4}, set(result.get(init)))
        self.assertTrue(square in ie.current_env().computed_pcollections)
        self.assertEqual({0, 1, 4, 9, 16}, set(result.get(square)))
        self.assertTrue(cube in ie.current_env().computed_pcollections)
        self.assertEqual({0, 1, 8, 27, 64}, set(result.get(cube)))

    @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
    def test_dataframes(self):
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
        data = p | beam.Create([
            1, 2, 3
        ]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
        df = to_dataframe(data)

        # Watch the local scope for Interactive Beam so that values will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
        pd.testing.assert_frame_equal(
            df_expected,
            ib.collect(df, n=10).reset_index(drop=True))

    @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
    def test_dataframes_with_grouped_index(self):
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        data = [
            Record('a', 20, 170),
            Record('a', 30, 170),
            Record('b', 22, 180),
            Record('c', 18, 150)
        ]

        aggregate = lambda df: df.groupby('height').mean()

        deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
        df_expected = aggregate(pd.DataFrame(data))

        # Watch the local scope for Interactive Beam so that values will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        pd.testing.assert_frame_equal(df_expected, ib.collect(deferred_df,
                                                              n=10))

    @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
    def test_dataframes_with_multi_index(self):
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        data = [
            Record('a', 20, 170),
            Record('a', 30, 170),
            Record('b', 22, 180),
            Record('c', 18, 150)
        ]

        aggregate = lambda df: df.groupby(['name', 'height']).mean()

        deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
        df_expected = aggregate(pd.DataFrame(data))

        # Watch the local scope for Interactive Beam so that values will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        pd.testing.assert_frame_equal(df_expected, ib.collect(deferred_df,
                                                              n=10))

    @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
    def test_dataframes_with_multi_index_get_result(self):
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        data = [
            Record('a', 20, 170),
            Record('a', 30, 170),
            Record('b', 22, 180),
            Record('c', 18, 150)
        ]

        aggregate = lambda df: df.groupby(['name', 'height']).mean()['age']

        deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
        df_expected = aggregate(pd.DataFrame(data))

        # Watch the local scope for Interactive Beam so that values will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        pd.testing.assert_series_equal(df_expected,
                                       ib.collect(deferred_df, n=10))

    @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
    def test_dataframes_same_cell_twice(self):
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
        data = p | beam.Create([
            1, 2, 3
        ]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
        df = to_dataframe(data)

        # Watch the local scope for Interactive Beam so that values will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
        pd.testing.assert_series_equal(
            df_expected['square'],
            ib.collect(df['square'], n=10).reset_index(drop=True))
        pd.testing.assert_series_equal(
            df_expected['cube'],
            ib.collect(df['cube'], n=10).reset_index(drop=True))

    @unittest.skipIf(not ie.current_env().is_interactive_ready,
                     '[interactive] dependency is not installed.')
    @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
    @patch('IPython.get_ipython', new_callable=mock_get_ipython)
    def test_dataframe_caching(self, cell):

        # Create a pipeline that exercises the DataFrame API. This will also use
        # caching in the background.
        with cell:  # Cell 1
            p = beam.Pipeline(interactive_runner.InteractiveRunner())
            ib.watch({'p': p})

        with cell:  # Cell 2
            data = p | beam.Create([
                1, 2, 3
            ]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))

            with beam.dataframe.allow_non_parallel_operations():
                df = to_dataframe(data).reset_index(drop=True)

            ib.collect(df)

        with cell:  # Cell 3
            df['output'] = df['square'] * df['cube']
            ib.collect(df)

        with cell:  # Cell 4
            df['output'] = 0
            ib.collect(df)

        # We use a trace through the graph to perform an isomorphism test. The end
        # output should look like a linear graph. This indicates that the dataframe
        # transform was correctly broken into separate pieces to cache. If caching
        # isn't enabled, all the dataframe computation nodes are connected to a
        # single shared node.
        trace = []

        # Only look at the top-level transforms for the isomorphism. The test
        # doesn't care about the transform implementations, just the overall shape.
        class TopLevelTracer(beam.pipeline.PipelineVisitor):
            def _find_root_producer(self,
                                    node: beam.pipeline.AppliedPTransform):
                if node is None or not node.full_label:
                    return None

                parent = self._find_root_producer(node.parent)
                if parent is None:
                    return node

                return parent

            def _add_to_trace(self, node, trace):
                if '/' not in str(node):
                    if node.inputs:
                        producer = self._find_root_producer(
                            node.inputs[0].producer)
                        producer_name = producer.full_label if producer else ''
                        trace.append((producer_name, node.full_label))

            def visit_transform(self, node: beam.pipeline.AppliedPTransform):
                self._add_to_trace(node, trace)

            def enter_composite_transform(
                    self, node: beam.pipeline.AppliedPTransform):
                self._add_to_trace(node, trace)

        p.visit(TopLevelTracer())

        # Do the isomorphism test, which asserts that the topological sort of
        # the graph yields a linear chain.
        trace_string = '\n'.join(str(t) for t in trace)
        prev_producer = ''
        for producer, consumer in trace:
            self.assertEqual(producer, prev_producer, trace_string)
            prev_producer = consumer
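
The final loop above is just a linear-chain check over (producer, consumer) pairs; a tiny standalone sketch of the same check on a hand-written trace (the transform labels here are illustrative):

def assert_linear_chain(trace):
    # Each edge's producer must be the consumer of the previous edge.
    prev_consumer = ''
    for producer, consumer in trace:
        assert producer == prev_consumer, (producer, prev_consumer)
        prev_consumer = consumer

assert_linear_chain([
    ('', 'Create'),
    ('Create', 'Square'),
    ('Square', 'WriteCache'),
])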
Esempio n. 24
0
 def test_watch_a_module_by_name(self):
   test_env = ie.InteractiveEnvironment()
   ib.watch(_module_name)
   test_env.watch(_module_name)
   self.assertEqual(ie.current_env().watching(), test_env.watching())
Esempio n. 25
0
 def test_has_unbounded_source(self):
     p = beam.Pipeline()
     ie.current_env().set_cache_manager(InMemoryCache(), p)
     _ = p | 'ReadUnboundedSource' >> beam.io.ReadFromPubSub(
         subscription='projects/fake-project/subscriptions/fake_sub')
     self.assertTrue(utils.has_unbounded_sources(p))
Esempio n. 26
0
 def test_watch_a_module_by_module_object(self):
   test_env = ie.InteractiveEnvironment()
   module = importlib.import_module(_module_name)
   ib.watch(module)
   test_env.watch(module)
   self.assertEqual(ie.current_env().watching(), test_env.watching())
 def is_computed(self):
     # type: () -> bool
     """Returns True if no more elements will be recorded."""
     return self._pcoll in ie.current_env().computed_pcollections
Esempio n. 28
0
 def test_watch_locals(self):
   # test_env serves as local var too.
   test_env = ie.InteractiveEnvironment()
   ib.watch(locals())
   test_env.watch(locals())
   self.assertEqual(ie.current_env().watching(), test_env.watching())
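
Taken together, the watch tests above cover the argument forms that ib.watch accepts: a module name, a module object, and a dict of variables such as locals(). A compact sketch of those call forms (watching the math module here is purely illustrative):

import importlib

from apache_beam.runners.interactive import interactive_beam as ib

ib.watch('math')                            # by module name
ib.watch(importlib.import_module('math'))   # by module object
ib.watch(locals())                          # by a dict of variables
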
def visualize(pcoll,
              dynamic_plotting_interval=None,
              include_window_info=False,
              display_facets=False):
    """Visualizes the data of a given PCollection. Optionally enables dynamic
  plotting with interval in seconds if the PCollection is being produced by a
  running pipeline or the pipeline is streaming indefinitely. The function
  always returns immediately and is asynchronous when dynamic plotting is on.

  If dynamic plotting enabled, the visualization is updated continuously until
  the pipeline producing the PCollection is in an end state. The visualization
  would be anchored to the notebook cell output area. The function
  asynchronously returns a handle to the visualization job immediately. The user
  could manually do::

    # In one notebook cell, enable dynamic plotting every 1 second:
    handle = visualize(pcoll, dynamic_plotting_interval=1)
    # Visualization anchored to the cell's output area.
    # In a different cell:
    handle.stop()
    # Will stop the dynamic plotting of the above visualization manually.
    # Otherwise, dynamic plotting ends when pipeline is not running anymore.

  If dynamic_plotting is not enabled (by default), None is returned.

  If include_window_info is True, the data will include window information,
  which consists of the event timestamps, windows, and pane info.

  If display_facets is True, the facets widgets will be rendered. Otherwise, the
  facets widgets will not be rendered.

  The function is experimental. For internal use only; no
  backwards-compatibility guarantees.
  """
    if not _pcoll_visualization_ready:
        return None
    pv = PCollectionVisualization(pcoll,
                                  include_window_info=include_window_info,
                                  display_facets=display_facets)
    if ie.current_env().is_in_notebook:
        pv.display()
    else:
        pv.display_plain_text()
        # We don't want to do dynamic plotting if there is no notebook frontend.
        return None

    if dynamic_plotting_interval:
        # Disables the verbose logging from timeloop.
        logging.getLogger('timeloop').disabled = True
        tl = Timeloop()

        def dynamic_plotting(pcoll, pv, tl, include_window_info,
                             display_facets):
            @tl.job(interval=timedelta(seconds=dynamic_plotting_interval))
            def continuous_update_display():  # pylint: disable=unused-variable
                # Always creates a new PCollectionVisualization instance when
                # the PCollection materialization is updated and dynamic
                # plotting is in progress.
                # The PCollectionVisualization created at this level doesn't
                # need the dynamic plotting interval when instantiated because
                # it's already inside the dynamic plotting logic.
                updated_pv = PCollectionVisualization(
                    pcoll,
                    include_window_info=include_window_info,
                    display_facets=display_facets)
                updated_pv.display(updating_pv=pv)
                if ie.current_env().is_terminated(pcoll.pipeline):
                    try:
                        tl.stop()
                    except RuntimeError:
                        # The job can only be stopped once. Ignore excessive stops.
                        pass

            tl.start()
            return tl

        return dynamic_plotting(pcoll, pv, tl, include_window_info,
                                display_facets)
    return None
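
The dynamic-plotting branch above is built on the timeloop library: a function decorated as a job with an interval runs on a background thread until the loop is stopped. A minimal standalone sketch of that idiom (the job body is illustrative; the real code re-renders a PCollectionVisualization instead):

import time
from datetime import timedelta

from timeloop import Timeloop

tl = Timeloop()

@tl.job(interval=timedelta(seconds=1))
def refresh():
    print('refresh display')

tl.start()     # jobs run on background threads
time.sleep(3)  # let the job fire a few times
tl.stop()      # a second stop() raises RuntimeError, hence the try/except above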
Esempio n. 30
0
  def __init__(self, pipeline, options=None):
    self._pipeline = pipeline
    # The global cache manager is lazily initialized outside of this module by
    # any interactive runner so that its lifespan can cover multiple runs in
    # the interactive environment. Owned by the interactive_environment module,
    # not by this module.
    self._cache_manager = ie.current_env().cache_manager()

    # Invoke a round trip through the runner API. This makes sure the Pipeline
    # proto is stable. The snapshot of the pipeline will not be mutated within
    # this module and can be used to recover the original pipeline if needed.
    self._pipeline_snap = beam.pipeline.Pipeline.from_runner_api(
        pipeline.to_runner_api(use_fake_coders=True), pipeline.runner, options)

    self._background_caching_pipeline = beam.pipeline.Pipeline.from_runner_api(
        pipeline.to_runner_api(use_fake_coders=True), pipeline.runner, options)

    # Snapshot of original pipeline information.
    (self._original_pipeline_proto,
     self._original_context) = self._pipeline_snap.to_runner_api(
         return_context=True, use_fake_coders=True)

    # All compute-once-against-original-pipeline fields.
    self._unbounded_sources = unbounded_sources(
        self._background_caching_pipeline)
    # TODO(BEAM-7760): once the cache scope is changed, this is no longer
    # needed to manage relationships across pipelines, runners, and jobs.
    self._pcolls_to_pcoll_id = pcolls_to_pcoll_id(
        self._pipeline_snap, self._original_context)

    # A mapping from PCollection id to the Python id() value in the
    # user-defined pipeline instance.
    (
        self._pcoll_version_map,
        self._cacheables,
        # A dict from pcoll_id to variable name of the referenced PCollection.
        # (Dict[str, str])
        self._cacheable_var_by_pcoll_id) = cacheables(self.pcolls_to_pcoll_id)

    # A dict from cache key to the PCollection that is read from cache. If one
    # exists, the caller should reuse that PCollection; if not, the caller
    # should create a new transform and track the PCollection read from cache.
    # (Dict[str, AppliedPTransform]).
    self._cached_pcoll_read = {}

    # Reference to the user defined pipeline instance based on the given
    # pipeline. The class never mutates it.
    # Note: the original pipeline is not the user pipeline.
    self._user_pipeline = None

    # A dict from PCollections in the runner pipeline instance to their
    # corresponding PCollections in the user pipeline instance. Populated
    # after preprocess().
    self._runner_pcoll_to_user_pcoll = {}
    self._pruned_pipeline_proto = None

    # Refers to target pcolls output by instrumented write-cache transforms;
    # used by the pruning logic as supplemental targets from which to build the
    # pipeline fragment.
    self._extended_targets = set()

    # Refers to pcolls that were used as inputs but got replaced by the outputs
    # of instrumented read-cache transforms; used by the pruning logic as
    # targets that no longer need to be produced during pipeline runs.
    self._ignored_targets = set()
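
The snapshotting in this constructor is the standard runner-API round trip; a minimal sketch of that idiom on a throwaway pipeline, using default options (the use_fake_coders flag above is specific to this module and omitted here):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

p = beam.Pipeline()
_ = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)

# Serialize the pipeline to its portable proto and rebuild an equivalent but
# independent Pipeline from it; instrumenting the copy cannot mutate `p`.
proto = p.to_runner_api()
snapshot = beam.pipeline.Pipeline.from_runner_api(proto, p.runner, PipelineOptions())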