def test_cleanup_invoked_when_new_env_replace_not_none_env(
        self, mocked_cleanup):
    """Cleanup must run only when new_env replaces an existing environment."""
    # Start from a blank slate so the first new_env has nothing to clean up.
    ie._interactive_beam_env = None

    first_manager = cache.FileBasedCacheManager()
    ie.new_env(first_manager)
    mocked_cleanup.assert_not_called()

    # The second new_env replaces the environment created just above.
    second_manager = cache.FileBasedCacheManager()
    ie.new_env(second_manager)
    mocked_cleanup.assert_called_once()
def test_cleanup_reregistered_when_cm_changed(self, mocked_unreg, mocked_reg):
    """Changing the cache manager unregisters once and registers again."""
    ie.new_env(cache.FileBasedCacheManager())
    mocked_unreg.assert_not_called()

    replacement = cache.FileBasedCacheManager()
    ie.current_env().set_cache_manager(replacement)
    mocked_unreg.assert_called_once()

    # One registration from new_env, one from the cache manager swap.
    expected_registrations = [
        call(ie.current_env().cleanup),
        call(ie.current_env().cleanup),
    ]
    mocked_reg.assert_has_calls(expected_registrations)
def test_cleanup_invoked_when_not_none_cm_changed(self):
    """Replacing an already-set cache manager for a pipeline calls cleanup."""
    cleanup_path = (
        'apache_beam.runners.interactive.interactive_environment'
        '.InteractiveEnvironment.cleanup')
    env = ie.InteractiveEnvironment()
    with patch(cleanup_path) as mocked_cleanup:
        dummy_pipeline = 'dummy'

        # First assignment: nothing was set before, so no cleanup expected.
        initial_manager = cache.FileBasedCacheManager()
        env.set_cache_manager(initial_manager, dummy_pipeline)
        mocked_cleanup.assert_not_called()

        # Second assignment replaces the manager set above.
        replacement_manager = cache.FileBasedCacheManager()
        env.set_cache_manager(replacement_manager, dummy_pipeline)
        mocked_cleanup.assert_called_once()
def test_cleanup_invoked_when_not_none_cm_changed(self, mocked_cleanup):
    """Replacing an already-set cache manager for a pipeline calls cleanup."""
    # Reset the module-level environment before creating a fresh one.
    ie._interactive_beam_env = None
    ie.new_env()
    dummy_pipeline = 'dummy'

    # First assignment: nothing was set before, so no cleanup expected.
    initial_manager = cache.FileBasedCacheManager()
    ie.current_env().set_cache_manager(initial_manager, dummy_pipeline)
    mocked_cleanup.assert_not_called()

    # Second assignment replaces the manager set above.
    replacement_manager = cache.FileBasedCacheManager()
    ie.current_env().set_cache_manager(replacement_manager, dummy_pipeline)
    mocked_cleanup.assert_called_once()
def test_cache_manager_uses_local_ib_cache_root(self):
    """Checks FileBasedCacheManager honours a local Interactive Beam cache_root.

    Verifies that FileBasedCacheManager._cache_dir is set to the cache_root
    set under Interactive Beam for a local directory and that the cached
    values are the same as the values of a cache using default settings.
    """
    prefix = 'full'
    cache_label = 'some-cache-label'
    cached_values = [1, 2, 3]

    self.mock_write_cache(cached_values, prefix, cache_label)
    reader_one, _ = self.cache_manager.read(prefix, cache_label)
    pcoll_list_one = list(reader_one)

    # Set Interactive Beam specified cache dir to local directory
    ib.options.cache_root = '/tmp/it-test/'
    try:
        cache_manager_with_ib_option = cache.FileBasedCacheManager(
            cache_dir=ib.options.cache_root)
        self.assertEqual(
            ib.options.cache_root, cache_manager_with_ib_option._cache_dir)

        cache_manager_with_ib_option.write(cached_values, prefix, cache_label)
        reader_two, _ = self.cache_manager.read(prefix, cache_label)
        pcoll_list_two = list(reader_two)

        # Writing to a different directory should not impact the cached values
        self.assertEqual(pcoll_list_one, pcoll_list_two)
    finally:
        # Reset Interactive Beam setting even when an assertion above fails,
        # so the global option does not leak into other tests.
        ib.options.cache_root = None
def __init__(
        self,
        underlying_runner=None,
        cache_dir=None,
        cache_format='text',
        render_option=None,
        skip_display=False):
    """Constructor of InteractiveRunner.

    Args:
      underlying_runner: (runner.PipelineRunner) runner to delegate to; a
        direct_runner.DirectRunner is created when none is supplied.
      cache_dir: (str) the directory where PCollection caches are kept
      cache_format: (str) the file format that should be used for saving
        PCollection caches. Available options are 'text' and 'tfrecord'.
      render_option: (str) this parameter decides how the pipeline graph is
        rendered. See display.pipeline_graph_renderer for available options.
      skip_display: (bool) whether to skip display operations when running
        the pipeline. Useful if running large pipelines when display is not
        needed.
    """
    if not underlying_runner:
        underlying_runner = direct_runner.DirectRunner()
    self._underlying_runner = underlying_runner

    self._cache_manager = cache.FileBasedCacheManager(cache_dir, cache_format)
    self._renderer = pipeline_graph_renderer.get_renderer(render_option)
    self._in_session = False
    self._skip_display = skip_display
def test_track_user_pipeline_cleanup_non_inspectable_pipeline(self):
    """Checks that track_user_pipelines() calls cleanup exactly once.

    Several dummy pipelines are attached to different pieces of environment
    state (background caching job, test stream service controller, cache
    manager, computed PCollection, cached source signature, pipeline result)
    before track_user_pipelines() is invoked with cleanup patched out.
    """
    dummy_pipeline_1 = beam.Pipeline()
    dummy_pipeline_2 = beam.Pipeline()
    dummy_pipeline_3 = beam.Pipeline()
    dummy_pipeline_4 = beam.Pipeline()
    dummy_pcoll = dummy_pipeline_4 | beam.Create([1])
    dummy_pipeline_5 = beam.Pipeline()
    # A plain string stands in for a pipeline object that is not a
    # beam.Pipeline.
    dummy_non_inspectable_pipeline = 'dummy'
    # locals() is handed to the environment, so the local variable names
    # defined above are part of what this test exercises.
    ie.current_env().watch(locals())
    from apache_beam.runners.interactive.background_caching_job import BackgroundCachingJob
    ie.current_env().set_background_caching_job(
        dummy_pipeline_1,
        BackgroundCachingJob(runner.PipelineResult(
            runner.PipelineState.DONE), limiters=[]))
    ie.current_env().set_test_stream_service_controller(
        dummy_pipeline_2, None)
    ie.current_env().set_cache_manager(cache.FileBasedCacheManager(),
                                       dummy_pipeline_3)
    ie.current_env().mark_pcollection_computed([dummy_pcoll])
    ie.current_env().set_cached_source_signature(
        dummy_non_inspectable_pipeline, None)
    ie.current_env().set_pipeline_result(
        dummy_pipeline_5, runner.PipelineResult(runner.PipelineState.RUNNING))
    # Patch cleanup so only the number of invocations is observed.
    with patch('apache_beam.runners.interactive.interactive_environment'
               '.InteractiveEnvironment.cleanup') as mocked_cleanup:
        ie.current_env().track_user_pipelines()
        mocked_cleanup.assert_called_once()
def __init__(self, underlying_runner=None, cache_dir=None):
    """Sets up the runner with its underlying runner and cache manager.

    Args:
      underlying_runner: runner to delegate to; a
        direct_runner.BundleBasedDirectRunner is created when none is given.
      cache_dir: directory passed to cache.FileBasedCacheManager.
    """
    # TODO(qinyeli, BEAM-4755) remove explicitly overriding underlying runner
    # once interactive_runner works with FnAPI mode
    if underlying_runner:
        self._underlying_runner = underlying_runner
    else:
        self._underlying_runner = direct_runner.BundleBasedDirectRunner()

    self._cache_manager = cache.FileBasedCacheManager(cache_dir)
    self._in_session = False
def test_cleanup_unregistered_when_not_none_cm_cleared(
        self, mocked_unreg, mocked_reg):
    """Clearing the cache manager unregisters without registering again."""
    initial_manager = cache.FileBasedCacheManager()
    ie.new_env(initial_manager)
    mocked_reg.assert_called_once()
    mocked_unreg.assert_not_called()

    # Clearing the manager must not add a second registration.
    ie.current_env().set_cache_manager(None)
    mocked_reg.assert_called_once()
    mocked_unreg.assert_called_once()
def test_noop_when_cm_is_not_changed(self, mocked_unreg, mocked_reg):
    """Setting the same cache manager again changes no registrations."""
    cache_manager = cache.FileBasedCacheManager()

    ie.new_env(cache_manager)
    mocked_unreg.assert_not_called()
    mocked_reg.assert_called_once()

    # Re-assigning the identical manager must leave both mocks untouched.
    ie.current_env().set_cache_manager(cache_manager)
    mocked_unreg.assert_not_called()
    mocked_reg.assert_called_once()
def test_noop_when_cm_is_not_changed(self, mocked_cleanup):
    """Setting the cache manager a pipeline already has triggers no cleanup."""
    # Reset the module-level environment before creating a fresh one.
    ie._interactive_beam_env = None
    cache_manager = cache.FileBasedCacheManager()
    dummy_pipeline = 'dummy'
    ie.new_env()

    # Seed the internal mapping directly so the manager counts as already set.
    pipeline_key = str(id(dummy_pipeline))
    ie.current_env()._cache_managers[pipeline_key] = cache_manager
    mocked_cleanup.assert_not_called()

    ie.current_env().set_cache_manager(cache_manager, dummy_pipeline)
    mocked_cleanup.assert_not_called()
def test_cleanup_not_invoked_when_cm_changed_from_none(self, mocked_cleanup):
    """Setting a cache manager where none existed must not call cleanup."""
    # Reset the module-level environment before creating a fresh one.
    ie._interactive_beam_env = None
    ie.new_env()
    dummy_pipeline = 'dummy'

    manager_before = ie.current_env().get_cache_manager(dummy_pipeline)
    self.assertIsNone(manager_before)

    cache_manager = cache.FileBasedCacheManager()
    ie.current_env().set_cache_manager(cache_manager, dummy_pipeline)
    mocked_cleanup.assert_not_called()

    manager_after = ie.current_env().get_cache_manager(dummy_pipeline)
    self.assertIs(manager_after, cache_manager)
def test_noop_when_cm_is_not_changed(self):
    """Setting the cache manager a pipeline already has triggers no cleanup."""
    cache_manager = cache.FileBasedCacheManager()
    dummy_pipeline = 'dummy'
    env = ie.InteractiveEnvironment()
    cleanup_path = (
        'apache_beam.runners.interactive.interactive_environment'
        '.InteractiveEnvironment.cleanup')
    with patch(cleanup_path) as mocked_cleanup:
        # Seed the internal mapping directly so the manager counts as set.
        pipeline_key = str(id(dummy_pipeline))
        env._cache_managers[pipeline_key] = cache_manager
        mocked_cleanup.assert_not_called()

        env.set_cache_manager(cache_manager, dummy_pipeline)
        mocked_cleanup.assert_not_called()
def test_cleanup_not_invoked_when_cm_changed_from_none(self):
    """Setting a cache manager where none existed must not call cleanup."""
    env = ie.InteractiveEnvironment()
    cleanup_path = (
        'apache_beam.runners.interactive.interactive_environment'
        '.InteractiveEnvironment.cleanup')
    with patch(cleanup_path) as mocked_cleanup:
        dummy_pipeline = 'dummy'
        manager_before = env.get_cache_manager(dummy_pipeline)
        self.assertIsNone(manager_before)

        cache_manager = cache.FileBasedCacheManager()
        env.set_cache_manager(cache_manager, dummy_pipeline)
        mocked_cleanup.assert_not_called()

        manager_after = env.get_cache_manager(dummy_pipeline)
        self.assertIs(manager_after, cache_manager)
def get_cache_manager(self, pipeline, create_if_absent=False):
    """Gets the cache manager held by current Interactive Environment for the
    given pipeline.

    If the pipeline is absent from the environment while create_if_absent is
    True, creates and returns a new file based cache manager for the pipeline.

    Args:
      pipeline: the object whose id() keys the cache manager lookup. It does
        not have to be a Pipeline instance.
      create_if_absent: (bool) whether to create and store a new
        cache.FileBasedCacheManager when none is held for the pipeline.

    Returns:
      The cache manager held for the pipeline, the newly created one, or
      None when nothing is held and create_if_absent is False.
    """
    if self._is_in_ipython:
        warnings.filterwarnings(
            'ignore',
            'options is deprecated since First stable release. References to '
            '<pipeline>.options will not be supported',
            category=DeprecationWarning)

    pipeline_key = str(id(pipeline))
    cache_manager = self._cache_managers.get(pipeline_key, None)
    # Plain lookups return here, before any runner inspection or local import.
    if cache_manager or not create_if_absent:
        return cache_manager

    is_beam_pipeline = isinstance(pipeline, Pipeline)
    if is_beam_pipeline:
        from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
        if isinstance(pipeline.runner, InteractiveRunner):
            pipeline_runner = pipeline.runner._underlying_runner
        else:
            pipeline_runner = pipeline.runner
    else:
        pipeline_runner = None

    cache_root = self.options.cache_root
    if cache_root:
        if cache_root.startswith('gs://'):
            cache_dir = self._get_gcs_cache_dir(pipeline, cache_root)
        else:
            cache_dir = tempfile.mkdtemp(dir=cache_root)
            if not isinstance(pipeline_runner, direct_runner.DirectRunner):
                _LOGGER.warning(
                    'A local cache directory has been specified while '
                    'not using DirectRunner. It is recommended to cache into a '
                    'GCS bucket instead.')
    else:
        # Only a real Pipeline carries options; anything else has no staging
        # location and falls through to a local temporary directory.
        if is_beam_pipeline:
            staging_location = pipeline.options.get_all_options(
            )['staging_location']
        else:
            staging_location = None
        if isinstance(pipeline_runner, DataflowRunner) and staging_location:
            cache_dir = self._get_gcs_cache_dir(pipeline, staging_location)
            _LOGGER.info(
                'No cache_root detected. '
                'Defaulting to staging_location %s for cache location.',
                staging_location)
        else:
            cache_dir = tempfile.mkdtemp(
                suffix=pipeline_key,
                prefix='it-',
                dir=os.environ.get('TEST_TMPDIR', None))

    cache_manager = cache.FileBasedCacheManager(cache_dir)
    self._cache_managers[pipeline_key] = cache_manager
    return cache_manager
def get_cache_manager(self, pipeline, create_if_absent=False):
    """Returns the cache manager this environment holds for the pipeline.

    When nothing is held for the pipeline and create_if_absent is True, a new
    file based cache manager backed by a fresh temporary directory is created,
    stored for the pipeline and returned.
    """
    pipeline_key = str(id(pipeline))
    held_manager = self._cache_managers.get(pipeline_key, None)
    if held_manager or not create_if_absent:
        return held_manager

    new_cache_dir = tempfile.mkdtemp(
        suffix=pipeline_key,
        prefix='it-',
        dir=os.environ.get('TEST_TMPDIR', None))
    new_manager = cache.FileBasedCacheManager(new_cache_dir)
    self._cache_managers[pipeline_key] = new_manager
    return new_manager
def __init__(self, underlying_runner=None, cache_dir=None, render_option=None):
    """Constructor of InteractiveRunner.

    Args:
      underlying_runner: (runner.PipelineRunner) runner to delegate to; a
        direct_runner.DirectRunner is created when none is supplied.
      cache_dir: (str) the directory where PCollection caches are kept
      render_option: (str) this parameter decides how the pipeline graph is
        rendered. See display.pipeline_graph_renderer for available options.
    """
    if not underlying_runner:
        underlying_runner = direct_runner.DirectRunner()
    self._underlying_runner = underlying_runner

    self._cache_manager = cache.FileBasedCacheManager(cache_dir)
    self._renderer = pipeline_graph_renderer.get_renderer(render_option)
    self._in_session = False
def test_cache_manager_uses_gcs_ib_cache_root(self):
    """Checks FileBasedCacheManager honours a GCS Interactive Beam cache_root.

    Verifies that FileBasedCacheManager._cache_dir is set to the cache_root
    set under Interactive Beam for a GCS directory.
    """
    # Set Interactive Beam specified cache dir to cloud storage
    ib.options.cache_root = 'gs://'
    try:
        cache_manager_with_ib_option = cache.FileBasedCacheManager(
            cache_dir=ib.options.cache_root)
        self.assertEqual(
            ib.options.cache_root, cache_manager_with_ib_option._cache_dir)
    finally:
        # Reset Interactive Beam setting even when the assertion above fails,
        # so the global option does not leak into other tests.
        ib.options.cache_root = None
def get_cache_manager(self, pipeline, create_if_absent=False):
    """Returns the cache manager this environment holds for the pipeline.

    When nothing is held for the pipeline and create_if_absent is True, a new
    file based cache manager is created, stored for the pipeline and returned.
    The new cache directory is placed under the Interactive Beam cache_root
    when one is configured, otherwise in a fresh temporary directory.

    Raises:
      ValueError: if a manager has to be created and the configured
        cache_root is a GCS path.
    """
    pipeline_key = str(id(pipeline))
    held_manager = self._cache_managers.get(pipeline_key, None)
    if held_manager or not create_if_absent:
        return held_manager

    from apache_beam.runners.interactive import interactive_beam as ib
    configured_root = ib.options.cache_root
    if configured_root:
        #TODO(victorhc): Handle the case when the path starts with "gs://"
        if configured_root.startswith("gs://"):
            raise ValueError("GCS paths are not currently supported.")
        new_cache_dir = tempfile.mkdtemp(dir=configured_root)
    else:
        new_cache_dir = tempfile.mkdtemp(
            suffix=pipeline_key,
            prefix='it-',
            dir=os.environ.get('TEST_TMPDIR', None))

    new_manager = cache.FileBasedCacheManager(new_cache_dir)
    self._cache_managers[pipeline_key] = new_manager
    return new_manager
def __init__(
        self,
        underlying_runner=None,
        cache_dir=None,
        cache_format='text',
        render_option=None,
        skip_display=True,
        force_compute=True,
        blocking=True):
    """Constructor of InteractiveRunner.

    Args:
      underlying_runner: (runner.PipelineRunner) runner to delegate to; a
        direct_runner.DirectRunner is created when none is supplied.
      cache_dir: (str) the directory where PCollection caches are kept
      cache_format: (str) the file format that should be used for saving
        PCollection caches. Available options are 'text' and 'tfrecord'.
      render_option: (str) this parameter decides how the pipeline graph is
        rendered. See display.pipeline_graph_renderer for available options.
      skip_display: (bool) whether to skip display operations when running
        the pipeline. Useful if running large pipelines when display is not
        needed.
      force_compute: (bool) whether sequential pipeline runs can use cached
        data of PCollections computed from the previous runs including show
        API invocation from interactive_beam module. If True, always run the
        whole pipeline and compute data for PCollections forcefully. If
        False, use available data and run minimum pipeline fragment to only
        compute data not available.
      blocking: (bool) whether the pipeline run should be blocking or not.
    """
    if underlying_runner:
        self._underlying_runner = underlying_runner
    else:
        self._underlying_runner = direct_runner.DirectRunner()

    # Install a file based cache manager only when the current environment
    # does not already provide one, then use whatever the environment holds.
    if not ie.current_env().cache_manager():
        ie.current_env().set_cache_manager(
            cache.FileBasedCacheManager(cache_dir, cache_format))
    self._cache_manager = ie.current_env().cache_manager()

    self._render_option = render_option
    self._in_session = False
    self._skip_display = skip_display
    self._force_compute = force_compute
    self._blocking = blocking
def setUp(self):
    """Creates a file based cache manager using this test's cache_format."""
    selected_format = self.cache_format
    self.cache_manager = cache.FileBasedCacheManager(
        cache_format=selected_format)
def setUp(self):
    """Creates a temporary directory and a cache manager rooted in it."""
    scratch_dir = tempfile.mkdtemp()
    self.test_dir = scratch_dir
    self.cache_manager = cache.FileBasedCacheManager(scratch_dir)
def __init__(self, underlying_runner=None, cache_dir=None):
    """Sets up the runner with its underlying runner and cache manager.

    Args:
      underlying_runner: runner to delegate to; a direct_runner.DirectRunner
        is created when none is given.
      cache_dir: directory passed to cache.FileBasedCacheManager.
    """
    if underlying_runner:
        self._underlying_runner = underlying_runner
    else:
        self._underlying_runner = direct_runner.DirectRunner()

    self._cache_manager = cache.FileBasedCacheManager(cache_dir)
    self._in_session = False
def test_cleanup_when_cm_not_none(self, mocked_atexit):
    """Creating an environment with a cache manager calls the hook once."""
    initial_manager = cache.FileBasedCacheManager()
    ie.new_env(initial_manager)
    mocked_atexit.assert_called_once()
def setUp(self):
    """Creates the direct runner and file based cache manager for each test."""
    direct = direct_runner.DirectRunner()
    self.runner = direct
    self.cache_manager = cache.FileBasedCacheManager()
def test_cleanup_invoked_when_cm_changed(self, mocked_cleanup):
    """Swapping the environment's cache manager calls cleanup exactly once."""
    # Reset the module-level environment so new_env has nothing to replace.
    ie._interactive_beam_env = None

    initial_manager = cache.FileBasedCacheManager()
    ie.new_env(initial_manager)

    replacement_manager = cache.FileBasedCacheManager()
    ie.current_env().set_cache_manager(replacement_manager)
    mocked_cleanup.assert_called_once()
def setUp(self):
    """Starts each test from a new environment with its own cache manager."""
    fresh_manager = cache.FileBasedCacheManager()
    ie.new_env(cache_manager=fresh_manager)