Example 1
    def test_cacheables(self):
        """A watched pipeline's PCollection should appear in utils.cacheables()."""
        pipeline = beam.Pipeline()
        pcoll = pipeline | beam.Create([2])
        # The dict keys given to watch() are the names cacheables are built from.
        ib.watch({'p2': pipeline, 'pcoll2': pcoll})

        expected_key = Cacheable.from_pcoll('pcoll2', pcoll).to_key()
        self.assertIn(expected_key, utils.cacheables())
Example 2
    def _clear(self):
        # type: () -> None
        """Clears the recording of all non-source PCollections."""

        cache_manager = ie.current_env().get_cache_manager(self.user_pipeline)

        # Skip PCollections that the BackgroundCachingJob is still populating.
        computed = ie.current_env().computed_pcollections
        cached_keys = {
            str(c.to_key())
            for c in utils.cacheables().values()
            if c.pcoll.pipeline is self.user_pipeline and c.pcoll not in computed
        }
        # Source (capture) PCollections are exempt from clearing.
        source_pcolls = getattr(cache_manager, 'capture_keys', set())

        self._clear_pcolls(cache_manager, cached_keys - source_pcolls)
Example 3
    def find_cacheables(self) -> Dict[str, Cacheable]:
        """Finds PCollections that need to be cached for the analyzed pipeline.

    There might be multiple pipelines defined and watched; this will only find
    cacheables belonging to the analyzed pipeline.

    Returns:
      A dict mapping PCollection id (as resolved by self.pcoll_id) to its
      Cacheable, restricted to PCollections of self._user_pipeline.
    """
        result = {}
        # Only the values are needed; iterate .values() instead of .items().
        for cacheable in utils.cacheables().values():
            if cacheable.pcoll.pipeline is not self._user_pipeline:
                # Ignore all cacheables from other pipelines.
                continue
            pcoll_id = self.pcoll_id(cacheable.pcoll)
            if not pcoll_id:
                _LOGGER.debug(
                    'Unable to retrieve PCollection id for %s. Ignored.',
                    cacheable.pcoll)
                continue
            # Reuse the id computed above instead of resolving it a second time.
            result[pcoll_id] = cacheable
        return result