def test_clear(self):
    """Check that `clear` empties one pipeline's cache and leaves others alone."""
    # NOTE(review): a second `test_clear` is defined further down in this
    # file; if both live in the same class, the later one shadows this one.
    # Confirm which definition is meant to run.

    # Two independent pipelines are built so the assertions can show that
    # clearing is scoped to a single pipeline.
    p1 = beam.Pipeline(InteractiveRunner())
    elems_1 = p1 | 'elems 1' >> beam.Create([0, 1, 2])

    p2 = beam.Pipeline(InteractiveRunner())
    elems_2 = p2 | 'elems 2' >> beam.Create([0, 1, 2])

    # A notebook environment would register the pipelines and PCollections
    # automatically; outside of one it has to be done by hand. The local
    # names above are handed to the watcher through `locals()`.
    ib.watch(locals())
    ie.current_env().track_user_pipelines()

    # Record each pipeline in turn and block until its recording is done.
    first_manager = RecordingManager(p1)
    first_manager.record(
        [elems_1], max_n=3, max_duration=500).wait_until_finish()

    second_manager = RecordingManager(p2)
    second_manager.record(
        [elems_2], max_n=3, max_duration=500).wait_until_finish()

    # Both managers start out reporting a non-empty size.
    self.assertGreater(first_manager.describe()['size'], 0)
    self.assertGreater(second_manager.describe()['size'], 0)

    # Clearing the first manager must not touch the second one.
    first_manager.clear()
    self.assertEqual(first_manager.describe()['size'], 0)
    self.assertGreater(second_manager.describe()['size'], 0)

    # Clearing the second manager empties it as well.
    second_manager.clear()
    self.assertEqual(second_manager.describe()['size'], 0)
def test_record_pipeline(self):
    """Check that `record_pipeline` starts exactly one background recording.

    The first `record_pipeline` call is expected to return a truthy value
    and the second a falsy one while the first is still in progress. The
    test then polls for up to 60 seconds until the manager reports the
    CANCELLED state.
    """
    # Add the TestStream so that it can be cached.
    ib.options.recordable_sources.add(TestStream)
    p = beam.Pipeline(
        InteractiveRunner(), options=PipelineOptions(streaming=True))
    # pylint: disable=unused-variable
    _ = (p
         | TestStream()
             .advance_watermark_to(0)
             .advance_processing_time(1)
             .add_elements(list(range(10)))
             .advance_processing_time(1))  # yapf: disable

    # Watch the local scope for Interactive Beam so that referenced
    # PCollections will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    # Create a limiter that stops the background caching job when something
    # is written to cache. This is used to ensure that the pipeline is
    # functioning properly and that there are no data races with the test.
    class SizeLimiter(Limiter):
        """Limiter that triggers once its manager reports a non-zero size."""
        def __init__(self, p):
            self.pipeline = p
            self._rm = None

        def set_recording_manager(self, rm):
            # The manager is attached after construction because the manager
            # itself is built with this limiter as an argument.
            self._rm = rm

        def is_triggered(self):
            return self._rm.describe()['size'] > 0 if self._rm else False

    # Wire the limiter and the manager together before recording starts.
    size_limiter = SizeLimiter(p)
    rm = RecordingManager(p, test_limiters=[size_limiter])
    size_limiter.set_recording_manager(rm)

    self.assertEqual(rm.describe()['state'], PipelineState.STOPPED)
    self.assertTrue(rm.record_pipeline())

    # A recording is in progress, no need to start another one.
    self.assertFalse(rm.record_pipeline())

    # Poll once per second, for at most 60 seconds, for the cancellation.
    for _ in range(60):
        if rm.describe()['state'] == PipelineState.CANCELLED:
            break
        time.sleep(1)

    # assertEqual (rather than assertTrue on an `==` expression) reports the
    # state that was actually observed when the check fails.
    self.assertEqual(
        rm.describe()['state'],
        PipelineState.CANCELLED,
        'Test timed out waiting for pipeline to be cancelled. This indicates '
        'that the BackgroundCachingJob did not cache anything.')
def test_recording_manager_clears_cache(self):
    """Verify that the RecordingManager clears the cache before recording.

    A job can terminate while some PCollections are still incomplete.
    Clearing the cache ensures correct results are computed on every run.
    """
    # Register TestStream as a source that may be cached.
    # NOTE(review): another test in this file registers it through
    # `ib.options.recordable_sources`; confirm which attribute name the
    # targeted Beam version exposes.
    ib.options.capturable_sources.add(TestStream)
    p = beam.Pipeline(
        InteractiveRunner(), options=PipelineOptions(streaming=True))
    elems = (
        p
        | TestStream()
        .advance_watermark_to(0)
        .advance_processing_time(1)
        .add_elements(list(range(10)))
        .advance_processing_time(1))
    squares = elems | beam.Map(lambda x: x**2)

    # Hand the local scope to Interactive Beam so the PCollections referenced
    # above are cached.
    ib.watch(locals())

    # interactive_utils normally does this when a transform is applied, but
    # that path needs an IPython environment, so it is triggered by hand.
    ie.current_env().track_user_pipelines()

    # First recording: remember when the fragment was first started.
    manager = RecordingManager(p)
    manager.record([squares], max_n=10, max_duration=2)
    first_start = manager.describe()['start']
    manager.cancel()

    # Look up the cache manager and the cache key of the recorded
    # PCollection.
    instrument = pi.PipelineInstrument(p)
    cache_manager = ie.current_env().get_cache_manager(p)
    squares_key = instrument.cache_key(squares)

    # Replace the cache's `clear` with a mock so the call that clears
    # uncomputed PCollections can be inspected.
    cache_manager.clear = MagicMock()

    # Second recording: if the cache was cleared correctly, the PCollection
    # was not considered computed, so the fragment is rerun and the second
    # start time is later than the first.
    manager.record([squares], max_n=10, max_duration=1)
    second_start = manager.describe()['start']
    manager.cancel()
    self.assertGreater(second_start, first_start)

    # The cache must have been asked to clear the PCollection.
    cache_manager.clear.assert_called_with('full', squares_key)
def test_clear(self):
    """Check that `clear` drops everything a finished recording cached."""
    # NOTE(review): an earlier `test_clear` is also defined in this file; if
    # both live in the same class, this definition shadows the earlier one.
    # Confirm which definition is meant to run.
    p1 = beam.Pipeline(InteractiveRunner())
    elems_1 = p1 | 'elems 1' >> beam.Create([0, 1, 2])

    # Register the pipeline and PCollection by hand; the local names above
    # are handed to the watcher through `locals()`.
    ib.watch(locals())
    ie.current_env().track_user_pipelines()

    # Record the PCollection and block until the recording is done.
    manager = RecordingManager(p1)
    manager.record([elems_1], max_n=3, max_duration=500).wait_until_finish()

    # The reported size is non-zero before clearing and zero afterwards.
    self.assertGreater(manager.describe()['size'], 0)
    manager.clear()
    self.assertEqual(manager.describe()['size'], 0)
def test_basic_execution(self):
    """Smoke test: record a simple pipeline and read the results back."""
    # One pipeline with two sources: the numbers 0, 1, 2 and three letters.
    p = beam.Pipeline(InteractiveRunner())
    numbers = p | 'numbers' >> beam.Create([0, 1, 2])
    letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

    # A notebook environment would register the pipeline and PCollections
    # automatically; outside of one it has to be done by hand. The local
    # names above are handed to the watcher through `locals()`.
    ib.watch(locals())
    ie.current_env().track_user_pipelines()

    # Record `numbers` and open a stream on it before waiting for the
    # recording to finish.
    # NOTE(review): other tests in this file pass `max_duration=` rather than
    # `max_duration_secs=`; confirm which keyword the targeted
    # RecordingManager.record accepts.
    manager = RecordingManager(p)
    first_recording = manager.record(
        [numbers], max_n=3, max_duration_secs=500)
    stream = first_recording.stream(numbers)
    first_recording.wait_until_finish()

    # With the recording finished, the stream is read in full and compared
    # against the expected windowed values.
    actual = list(stream.read())
    expected = [
        WindowedValue(value, MIN_TIMESTAMP, [GlobalWindow()])
        for value in range(3)
    ]
    self.assertListEqual(actual, expected)

    # Record `letters` too, then check that the manager's reported size is
    # the sum of the sizes reported by the two recordings.
    second_recording = manager.record(
        [letters], max_n=3, max_duration_secs=500)
    second_recording.wait_until_finish()

    total_size = manager.describe()['size']
    summed_size = (
        first_recording.describe()['size'] +
        second_recording.describe()['size'])
    self.assertEqual(total_size, summed_size)

    manager.cancel()