def test_capture_control_evict_captured_data(
    self, mocked_test_stream_service_stop, mocked_background_caching_job_cancel):
  """Evicting captured data tears down every piece of capture state.

  After eviction: the caching job is cancelled, the test stream service is
  stopped and deregistered, and the computed PCollections / cached source
  signature are cleared.
  """
  pipeline = _build_an_empty_streaming_pipeline()
  ie.current_env().track_user_pipelines()
  self.assertFalse(ie.current_env().tracked_user_pipelines == set())

  caching_job = bcj.BackgroundCachingJob(
      runner.PipelineResult(runner.PipelineState.RUNNING), limiters=[])
  ie.current_env().set_background_caching_job(pipeline, caching_job)
  _fake_a_running_test_stream_service(pipeline)
  # Fake the canceling state of the main job.
  caching_job._pipeline_result = runner.PipelineResult(
      runner.PipelineState.CANCELLING)
  self.assertIsNotNone(
      ie.current_env().get_test_stream_service_controller(pipeline))
  ie.current_env().set_cached_source_signature(pipeline, 'a signature')
  ie.current_env().mark_pcollection_computed(['fake_pcoll'])

  capture_control.evict_captured_data()

  mocked_background_caching_job_cancel.assert_called()
  mocked_test_stream_service_stop.assert_called_once()
  # Neither timer nor capture size limit is reached, thus, the cancelling
  # main job's background caching job is not considered as done.
  self.assertFalse(caching_job.is_done())
  self.assertIsNone(
      ie.current_env().get_test_stream_service_controller(pipeline))
  self.assertTrue(ie.current_env().computed_pcollections == set())
  self.assertTrue(
      ie.current_env().get_cached_source_signature(pipeline) == set())
def _fake_a_running_background_caching_job(pipeline):
  """Registers a RUNNING background caching job for the given pipeline.

  Returns the created job so tests can inspect or mutate it later.
  """
  job = bcj.BackgroundCachingJob(
      runner.PipelineResult(runner.PipelineState.RUNNING),
      # Do not start multithreaded checkers in tests.
      start_limit_checkers=False)
  ie.current_env().set_background_caching_job(pipeline, job)
  return job
def test_background_caching_job_not_start_when_such_job_is_done(self):
  """A finished background caching job is not replaced by a new one."""
  pipeline = _build_a_test_stream_pipeline()
  done_caching_job = bcj.BackgroundCachingJob(
      runner.PipelineResult(runner.PipelineState.DONE))
  ie.current_env().set_background_caching_job(pipeline, done_caching_job)

  main_job_result = pipeline.run()

  # No new background caching job is started, so the recorded job is still
  # the pre-existing done one.
  self.assertIs(
      done_caching_job, ie.current_env().get_background_caching_job(pipeline))
  # A new main job is started so result of the main job is set.
  self.assertIs(main_job_result, ie.current_env().pipeline_result(pipeline))
def test_background_caching_job_not_start_when_such_job_exists(self):
  """An already-running background caching job is not replaced by a new one."""
  pipeline = _build_a_test_stream_pipeline()
  _setup_test_streaming_cache(pipeline)
  running_caching_job = bcj.BackgroundCachingJob(
      runner.PipelineResult(runner.PipelineState.RUNNING), limiters=[])
  ie.current_env().set_background_caching_job(pipeline, running_caching_job)

  main_job_result = pipeline.run()

  # No background caching job is started so result is still the running one.
  self.assertIs(
      running_caching_job,
      ie.current_env().get_background_caching_job(pipeline))
  # A new main job is started so result of the main job is set.
  self.assertIs(main_job_result, ie.current_env().pipeline_result(pipeline))
def test_timer_terminates_capture_size_checker(self):
  """A cancelling caching job becomes done once any limiter triggers."""
  pipeline = _build_an_empty_streaming_pipeline()

  class _ManualLimiter(capture_limiters.Limiter):
    """Limiter whose trigger state is flipped directly by the test."""
    def __init__(self):
      self.trigger = False

    def is_triggered(self):
      return self.trigger

  limiter = _ManualLimiter()
  caching_job = bcj.BackgroundCachingJob(
      runner.PipelineResult(runner.PipelineState.CANCELLING),
      limiters=[limiter])
  ie.current_env().set_background_caching_job(pipeline, caching_job)

  # Not done while the limiter has not fired...
  self.assertFalse(caching_job.is_done())
  # ...and done as soon as it has.
  limiter.trigger = True
  self.assertTrue(caching_job.is_done())
def test_computed(self):
  """Tests that a PCollection is marked as computed only in a complete state.

  Because the background caching job is now long-lived, repeated runs of a
  PipelineFragment may yield different results for the same PCollection.
  """
  p = beam.Pipeline(InteractiveRunner())
  elems = p | beam.Create([0, 1, 2])
  # NOTE: `p` and `elems` are registered by their local names here, so those
  # names must not change.
  ib.watch(locals())

  # A MockPipelineResult controls the state of a fake run of the pipeline.
  fake_main_result = MockPipelineResult()
  ie.current_env().track_user_pipelines()
  ie.current_env().set_pipeline_result(p, fake_main_result)

  # A mock BackgroundCachingJob controls whether the PCollections may be
  # marked as computed.
  fake_bcj_result = MockPipelineResult()
  caching_job = bcj.BackgroundCachingJob(fake_bcj_result, [])

  recording = Recording(
      p, [elems], fake_main_result, max_n=10, max_duration_secs=60)

  # The background caching job and the recording isn't done yet so there may
  # be more elements to be recorded.
  self.assertFalse(recording.is_computed())
  self.assertFalse(recording.computed())
  self.assertTrue(recording.uncomputed())

  # The recording is finished but the background caching job is not. There
  # may still be more elements to record, or the intermediate PCollection may
  # have stopped caching in an incomplete state, e.g. before a window could
  # fire.
  fake_main_result.set_state(PipelineState.DONE)
  recording.wait_until_finish()
  self.assertFalse(recording.is_computed())
  self.assertFalse(recording.computed())
  self.assertTrue(recording.uncomputed())

  # The background caching job finished before we started a recording which
  # is a sure signal that there will be no more elements.
  fake_bcj_result.set_state(PipelineState.DONE)
  ie.current_env().set_background_caching_job(p, caching_job)
  recording = Recording(
      p, [elems], fake_main_result, max_n=10, max_duration_secs=60)
  recording.wait_until_finish()

  # There are no more elements and the recording finished, meaning that the
  # intermediate PCollections are in a complete state. They can now be marked
  # as computed.
  self.assertTrue(recording.is_computed())
  self.assertTrue(recording.computed())
  self.assertFalse(recording.uncomputed())