def test_cacheables(self):
  p2 = beam.Pipeline()
  pcoll2 = p2 | beam.Create([2])
  ib.watch({'p2': p2, 'pcoll2': pcoll2})

  cacheables = utils.cacheables()
  cacheable_key = Cacheable.from_pcoll('pcoll2', pcoll2).to_key()
  self.assertIn(cacheable_key, cacheables)
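
# A minimal sketch of the imports the test above presumably relies on; they
# are not shown in the original snippet. The module paths follow the
# interactive runner's layout in recent Beam releases and are an assumption
# here; they may differ across versions.
import unittest

import apache_beam as beam
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive import utils
from apache_beam.runners.interactive.caching.cacheable import Cacheable
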
def _clear(self):
  # type: () -> None

  """Clears the recording of all non-source PCollections."""
  cache_manager = ie.current_env().get_cache_manager(self.user_pipeline)

  # Only clear the PCollections that aren't being populated from the
  # BackgroundCachingJob.
  computed = ie.current_env().computed_pcollections
  cacheables = [
      c for c in utils.cacheables().values()
      if c.pcoll.pipeline is self.user_pipeline and c.pcoll not in computed
  ]
  all_cached = set(str(c.to_key()) for c in cacheables)
  source_pcolls = getattr(cache_manager, 'capture_keys', set())
  to_clear = all_cached - source_pcolls
  self._clear_pcolls(cache_manager, set(to_clear))
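
# A hedged illustration of the set arithmetic in _clear above, using
# hypothetical key strings (real keys are stringified CacheKey objects).
# Source recordings still fed by the BackgroundCachingJob are excluded
# from clearing; only derived recordings are dropped.
all_cached = {'squares-1234-5678', 'read_pcoll-9876-5432'}  # all recordings
source_pcolls = {'read_pcoll-9876-5432'}  # capture keys from the background job
to_clear = all_cached - source_pcolls  # derived recordings only
assert to_clear == {'squares-1234-5678'}
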
def find_cacheables(self) -> Dict[str, Cacheable]:
  """Finds PCollections that need to be cached for the analyzed pipeline.

  There might be multiple pipelines defined and watched; this will only
  find cacheables belonging to the analyzed pipeline.
  """
  result = {}
  cacheables = utils.cacheables()
  for cacheable in cacheables.values():
    if cacheable.pcoll.pipeline is not self._user_pipeline:
      # Ignore all cacheables from other pipelines.
      continue
    pcoll_id = self.pcoll_id(cacheable.pcoll)
    if not pcoll_id:
      _LOGGER.debug(
          'Unable to retrieve PCollection id for %s. Ignored.',
          cacheable.pcoll)
      continue
    # Reuse the id computed above instead of looking it up a second time.
    result[pcoll_id] = cacheable
  return result
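
# A hedged end-to-end sketch of find_cacheables in use. It assumes the same
# interactive-runner modules as above and that build_pipeline_instrument is
# the factory for the instrument object; the exact entry point and the
# Cacheable.var field are assumptions that may vary by Beam version.
import apache_beam as beam
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive import pipeline_instrument as instr

p = beam.Pipeline()
squares = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)
ib.watch(locals())  # register the pipeline and its PCollections

pi = instr.build_pipeline_instrument(p)
for pcoll_id, cacheable in pi.find_cacheables().items():
  # Maps a runner-API PCollection id to its Cacheable, e.g.
  # 'ref_PCollection_...' -> variable name 'squares'.
  print(pcoll_id, '->', cacheable.var)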