def test_cacheables(self):
        """Checks the cacheables recorded by a freshly built pipeline instrument.

        A pipeline with one source PCollection and two PCollections derived
        from it is built, the local variables are watched, and the
        instrument's `_cacheables` is compared against one expected
        `Cacheable` per PCollection.
        """
        # Pipeline on the interactive runner, with an in-memory cache manager
        # registered for that specific pipeline.
        p_cacheables = beam.Pipeline(interactive_runner.InteractiveRunner())
        ie.current_env().set_cache_manager(InMemoryCache(), p_cacheables)
        # pylint: disable=bad-option-value
        # One source PCollection fanning out into two mapped PCollections.
        init_pcoll = p_cacheables | 'Init Create' >> beam.Create(range(10))
        squares = init_pcoll | 'Square' >> beam.Map(lambda x: x * x)
        cubes = init_pcoll | 'Cube' >> beam.Map(lambda x: x**3)
        # Watch this frame's locals. The expected `var` values below are
        # exactly these local variable names, so the names must not change.
        ib.watch(locals())

        pipeline_instrument = instr.build_pipeline_instrument(p_cacheables)

        # Expected mapping: pcoll_id -> Cacheable. `version` is the str of
        # the PCollection's id() and `producer_version` is the str of the
        # id() of its producer.
        self.assertEqual(
            pipeline_instrument._cacheables, {
                pipeline_instrument.pcoll_id(init_pcoll):
                Cacheable(var='init_pcoll',
                          version=str(id(init_pcoll)),
                          producer_version=str(id(init_pcoll.producer)),
                          pcoll=init_pcoll),
                pipeline_instrument.pcoll_id(squares):
                Cacheable(var='squares',
                          version=str(id(squares)),
                          producer_version=str(id(squares.producer)),
                          pcoll=squares),
                pipeline_instrument.pcoll_id(cubes):
                Cacheable(var='cubes',
                          version=str(id(cubes)),
                          producer_version=str(id(cubes.producer)),
                          pcoll=cubes)
            })
Example #2
0
def cacheables(pcolls_to_pcoll_id):
    """Collects cache metadata for watched PCollections that were analyzed.

    Every watched variable bound to a PCollection is looked up in
    `pcolls_to_pcoll_id` by its `str()` form; variables without a (truthy)
    pcoll_id are skipped. The results are only *candidates* for caching:
    nothing here checks whether a PCollection has actually been cached.

    A PCollection is worth caching when it is bound to a user defined
    variable, since otherwise it can be neither reused nor introspected.
    Several pipelines may be defined and watched at once, and the `str()` of
    a PCollection is not guaranteed to be unique across them, so this lookup
    is not strict and callers need an additional check during instrument.

    Args:
      pcolls_to_pcoll_id: mapping from `str(pcoll)` to the pcoll_id of each
        analyzed PCollection.

    Returns:
      A 3-tuple of dicts:
        * pcoll_id -> version of the Cacheable,
        * `cacheable_key(pcoll, pcolls_to_pcoll_id)` -> Cacheable,
        * pcoll_id -> name of the variable the PCollection is bound to.
    """
    version_by_pcoll_id = {}
    cacheable_by_key = {}
    var_by_pcoll_id = {}

    for watched_items in ie.current_env().watching():
        for var_name, candidate in watched_items:
            # Only variables bound to PCollections are of interest.
            if not isinstance(candidate, beam.pvalue.PCollection):
                continue

            # The str of a PCollection may collide across pipelines; a
            # further check during instrument is needed.
            pcoll_id = pcolls_to_pcoll_id.get(str(candidate))
            if not pcoll_id:
                continue

            entry = Cacheable(
                pcoll_id=pcoll_id,
                var=var_name,
                version=str(id(candidate)),
                pcoll=candidate,
                producer_version=str(id(candidate.producer)))
            version_by_pcoll_id[entry.pcoll_id] = entry.version
            cacheable_by_key[cacheable_key(candidate,
                                           pcolls_to_pcoll_id)] = entry
            var_by_pcoll_id[entry.pcoll_id] = var_name

    return version_by_pcoll_id, cacheable_by_key, var_by_pcoll_id
Example #3
0
    def test_cacheables(self):
        """A watched PCollection's cache key shows up in utils.cacheables()."""
        pipeline = beam.Pipeline()
        pcoll = pipeline | beam.Create([2])
        # The watched names are the string keys, not the local names here.
        ib.watch(dict(p2=pipeline, pcoll2=pcoll))

        found = utils.cacheables()
        expected_key = Cacheable.from_pcoll('pcoll2', pcoll).to_key()
        self.assertIn(expected_key, found)
Example #4
0
 def __init__(
     self,
     pipeline: beam_runner_api_pb2.Pipeline,
     context: PipelineContext,
     cache_manager: cache.CacheManager,
     cacheable: Cacheable,
 ):
     """Stores the collaborators and derives the string key of the cacheable.

     Args:
       pipeline: the pipeline proto, kept as `_pipeline`.
       context: the pipeline context, kept as `_context`.
       cache_manager: the cache manager, kept as `_cache_manager`.
       cacheable: the Cacheable, kept as `_cacheable`; the `repr()` of its
         `to_key()` result is kept as `_key`.
     """
     self._pipeline = pipeline
     self._context = context
     self._cache_manager = cache_manager
     self._cacheable = cacheable
     # The key is stored in its repr() form rather than as the key object.
     cache_key = cacheable.to_key()
     self._key = repr(cache_key)
Example #5
0
File: utils.py Project: mszb/beam
def cacheables() -> Dict[CacheKey, Cacheable]:
  """Maps the CacheKey of every inspectable PCollection to its Cacheable.

  Inspectables are read from the current interactive environment's
  `inspector_with_synthetic`; only entries whose metadata type is
  'pcollection' are turned into Cacheables.
  """
  # Imported here rather than at module level, as in the original code.
  from apache_beam.runners.interactive import interactive_environment as ie

  inspectables = ie.current_env().inspector_with_synthetic.inspectables
  # Lazy generator: each Cacheable is built right before it is keyed.
  pcoll_cacheables = (
      Cacheable.from_pcoll(entry['metadata']['name'], entry['value'])
      for entry in inspectables.values()
      if entry['metadata']['type'] == 'pcollection')
  return {found.to_key(): found for found in pcoll_cacheables}
Example #6
0
 def cacheables(self) -> Dict[beam.pvalue.PCollection, Cacheable]:
     """Maps watched PCollections of the user pipeline to their Cacheables.

     A watched value is included when it is a PCollection, its pipeline is
     `self._user_pipeline`, and either `self._pcolls` is empty or the value
     is in `self._pcolls`.
     """
     found = {}
     for watched_items in ie.current_env().watching():
         for var_name, candidate in watched_items:
             if not isinstance(candidate, beam.pvalue.PCollection):
                 continue
             # Identity check: PCollections of other pipelines are ignored.
             if candidate.pipeline is not self._user_pipeline:
                 continue
             # A non-empty `_pcolls` restricts the result to its members.
             if self._pcolls and candidate not in self._pcolls:
                 continue
             found[candidate] = Cacheable(
                 var=var_name,
                 pcoll=candidate,
                 version=str(id(candidate)),
                 producer_version=str(id(candidate.producer)))
     return found