コード例 #1
0
    def test_cancel_stops_recording(self):
        """Tests that cancelling the RecordingManager stops the background job.

        A streaming pipeline fed by a TestStream is recorded. The test checks
        that the BackgroundCachingJob is not done while the recording is being
        read, and that it is done once `cancel` has been called.
        """
        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)

        p = beam.Pipeline(InteractiveRunner(),
                          options=PipelineOptions(streaming=True))
        elems = (p
                 | TestStream().advance_watermark_to(
                     0).advance_processing_time(1).add_elements(list(
                         range(10))).advance_processing_time(1))
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Get the recording then the BackgroundCachingJob.
        rm = RecordingManager(p)
        recording = rm.record([squares], max_n=10, max_duration=30)

        # Everything between `record` and `cancel` is guarded so that a failing
        # assertion cannot leave the recording running after this test.
        try:
            # The BackgroundCachingJob is still waiting for more elements, so it
            # isn't done yet.
            bcj = ie.current_env().get_background_caching_job(p)
            self.assertFalse(bcj.is_done())

            # Assert that something was read from the recording.
            self.assertTrue(list(recording.stream(squares).read()))
        finally:
            rm.cancel()

        # Assert that the BackgroundCachingJob was successfully stopped.
        self.assertTrue(bcj.is_done())
コード例 #2
0
    def test_recording_manager_clears_cache(self):
        """Tests that the RecordingManager clears the cache before recording.

        A job may have incomplete PCollections when the job terminates. Clearing
        the cache ensures that correct results are computed every run.
        """
        # Register the TestStream as a source so that it can be cached.
        ib.options.capturable_sources.add(TestStream)
        p = beam.Pipeline(
            InteractiveRunner(), options=PipelineOptions(streaming=True))
        elems = (
            p
            | TestStream()
            .advance_watermark_to(0)
            .advance_processing_time(1)
            .add_elements(list(range(10)))
            .advance_processing_time(1))
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope so that the PCollections referenced above will
        # be cached by Interactive Beam.
        ib.watch(locals())

        # Normally done by interactive_utils when a transform is applied, but
        # that needs an IPython environment, so it is run manually here.
        ie.current_env().track_user_pipelines()

        # First recording: remember when the fragment was started.
        manager = RecordingManager(p)
        manager.record([squares], max_n=10, max_duration=2)
        first_start = manager.describe()['start']
        manager.cancel()

        # Look up the cache manager of the pipeline and the cache key of the
        # recorded PCollection.
        squares_key = pi.PipelineInstrument(p).cache_key(squares)
        cache_manager = ie.current_env().get_cache_manager(p)

        # Swap the cache's clear function for a mock; it is the function used
        # to clear uncomputed PCollections.
        cache_manager.clear = MagicMock()

        # Rerun the fragment. If the cache was cleared correctly then the
        # starting time of the second recording will be later than the first.
        # This is because the PCollection wasn't considered to be computed and
        # was cleared from cache. Thus the pipeline fragment was rerun for that
        # PCollection at a later time.
        manager.record([squares], max_n=10, max_duration=1)
        second_start = manager.describe()['start']
        manager.cancel()
        self.assertGreater(second_start, first_start)

        # Assert that the cache cleared the PCollection.
        cache_manager.clear.assert_called_with('full', squares_key)
コード例 #3
0
    def test_basic_execution(self):
        """A basic pipeline to be used as a smoke test."""

        # One pipeline with two sources: one emits 0, 1, 2 and the other
        # emits a, b, c.
        p = beam.Pipeline(InteractiveRunner())
        numbers = p | 'numbers' >> beam.Create([0, 1, 2])
        letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

        # A notebook environment would watch the pipeline and PCollections
        # automatically; here it has to be done by hand.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Calling `record` starts a new PipelineFragment that computes the
        # given PCollections and caches them to disk.
        manager = RecordingManager(p)
        recorded_numbers = manager.record(
            [numbers], max_n=3, max_duration_secs=500)
        stream = recorded_numbers.stream(numbers)
        recorded_numbers.wait_until_finish()

        # The fragment has completed, so every element has been written to
        # cache and can be read back from the stream.
        actual = list(stream.read())
        expected = [
            WindowedValue(value, MIN_TIMESTAMP, [GlobalWindow()])
            for value in range(3)
        ]
        self.assertListEqual(actual, expected)

        # Record a second PCollection so that the manager's description can be
        # compared against the descriptions of the individual recordings.
        recorded_letters = manager.record(
            [letters], max_n=3, max_duration_secs=500)
        recorded_letters.wait_until_finish()

        total_size = manager.describe()['size']
        numbers_size = recorded_numbers.describe()['size']
        letters_size = recorded_letters.describe()['size']
        self.assertEqual(total_size, numbers_size + letters_size)

        manager.cancel()
コード例 #4
0
    def test_recording_manager_clears_cache(self):
        """Tests that the RecordingManager clears the cache before recording.

        A job may have incomplete PCollections when the job terminates. Clearing
        the cache ensures that correct results are computed every run.
        """
        # Register the TestStream as a source so that it can be cached.
        ib.options.recordable_sources.add(TestStream)
        p = beam.Pipeline(
            InteractiveRunner(), options=PipelineOptions(streaming=True))
        elems = (
            p
            | TestStream()
            .advance_watermark_to(0)
            .advance_processing_time(1)
            .add_elements(list(range(10)))
            .advance_processing_time(1))
        squares = elems | beam.Map(lambda x: x**2)

        # Watch the local scope so that the PCollections referenced above will
        # be cached by Interactive Beam.
        ib.watch(locals())

        # Normally done by interactive_utils when a transform is applied, but
        # that needs an IPython environment, so it is run manually here.
        ie.current_env().track_user_pipelines()

        manager = RecordingManager(p)

        # The instrument is only used to compute the expected cache keys.
        instrument = pi.PipelineInstrument(p)

        # Replace the manager's `_clear_pcolls` with a mock before recording so
        # that the arguments it is called with can be inspected afterwards.
        manager._clear_pcolls = MagicMock()
        manager.record([squares], max_n=1, max_duration=500)
        manager.cancel()

        # Assert that clearing was requested for the cache keys of both
        # PCollections; the first argument is not checked.
        expected_keys = {instrument.cache_key(pc) for pc in (elems, squares)}
        manager._clear_pcolls.assert_any_call(unittest.mock.ANY, expected_keys)