def test_pcolls_to_pcoll_id(self):
  """Maps the user PCollection's string repr to its runner-api PCollection id.

  Builds a one-transform pipeline and checks that pcolls_to_pcoll_id
  resolves the single output PCollection to the first runner-api
  PCollection reference.
  """
  p = beam.Pipeline(interactive_runner.InteractiveRunner())
  # Note: beam.Impulse() takes no range() argument, so the previously
  # attached `pylint: disable=range-builtin-not-iterating` suppression was
  # stale and has been removed.
  init_pcoll = p | 'Init Create' >> beam.Impulse()
  _, ctx = p.to_runner_api(use_fake_coders=True, return_context=True)
  self.assertEqual(
      instr.pcolls_to_pcoll_id(p, ctx),
      {str(init_pcoll): 'ref_PCollection_PCollection_1'})
def test_cacheable_key_with_version_map(self):
  """A supplied version map overrides which instance's id() prefixes the key."""
  user_pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
  ie.current_env().set_cache_manager(InMemoryCache(), user_pipeline)
  # pylint: disable=range-builtin-not-iterating
  user_pcoll = user_pipeline | 'Init Create' >> beam.Create(range(10))
  # During execution the pipeline object is normally a different but
  # equivalent instance from the one the user built. The pipeline instrument
  # must be able to tell whether the original instance changed in the
  # interactive env while the copy is mutated for execution. The version map
  # recovers which PCollection instances belong to the original pipeline and
  # whether the evaluation changed since the last execution.
  executing_pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
  ie.current_env().set_cache_manager(InMemoryCache(), executing_pipeline)
  # pylint: disable=range-builtin-not-iterating
  executing_pcoll = (
      executing_pipeline | 'Init Create' >> beam.Create(range(10)))
  _, context = executing_pipeline.to_runner_api(return_context=True)
  # Even though executing_pcoll is passed in, the version map forces the key
  # to be prefixed with id(user_pcoll).
  version_map = {'ref_PCollection_PCollection_8': str(id(user_pcoll))}
  pcoll_ids = instr.pcolls_to_pcoll_id(executing_pipeline, context)
  expected = str(id(user_pcoll)) + '_ref_PCollection_PCollection_8'
  self.assertEqual(
      instr.cacheable_key(executing_pcoll, pcoll_ids, version_map), expected)
def __init__(self, pcolls, options=None):
  """Constructor of PipelineFragment.

  Args:
    pcolls: (List[PCollection]) a list of PCollections to build pipeline
      fragment for.
    options: (PipelineOptions) the pipeline options for the implicit
      pipeline run.
  """
  # Validate before touching pcolls[0] below; an empty list would raise an
  # IndexError with a far less useful message.
  assert len(pcolls) > 0, (
      'Need at least 1 PCollection as the target data to build a pipeline '
      'fragment that produces it.')
  for pcoll in pcolls:
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))
  # No modification to self._user_pipeline is allowed.
  self._user_pipeline = pcolls[0].pipeline
  # These are user PCollections. Do not use them to deduce anything that
  # will be executed by any runner. Instead, use
  # `self._runner_pcolls_to_user_pcolls.keys()` to get copied PCollections.
  self._pcolls = set(pcolls)
  # All targets must live on one pipeline: the fragment is carved out of a
  # single user pipeline instance.
  for pcoll in self._pcolls:
    assert pcoll.pipeline is self._user_pipeline, (
        '{} belongs to a different user pipeline than other PCollections '
        'given and cannot be used to build a pipeline fragment that produces '
        'the given PCollections.'.format(pcoll))
  self._options = options
  # A copied pipeline instance for modification without changing the user
  # pipeline instance held by the end user. This instance can be processed
  # into a pipeline fragment that later run by the underlying runner.
  self._runner_pipeline = self._build_runner_pipeline()
  _, self._context = self._runner_pipeline.to_runner_api(
      return_context=True, use_fake_coders=True)
  # Imported locally — presumably to avoid a circular import between this
  # module and pipeline_instrument; TODO confirm.
  from apache_beam.runners.interactive import pipeline_instrument as instr
  self._runner_pcoll_to_id = instr.pcolls_to_pcoll_id(
      self._runner_pipeline, self._context)
  # Correlate components in the runner pipeline to components in the user
  # pipeline. The target pcolls are the pcolls given and defined in the user
  # pipeline.
  self._id_to_target_pcoll = self._calculate_target_pcoll_ids()
  self._label_to_user_transform = self._calculate_user_transform_labels()
  # Below will give us the 1:1 correlation between
  # PCollections/AppliedPTransforms from the copied runner pipeline and
  # PCollections/AppliedPTransforms from the user pipeline.
  # (Dict[PCollection, PCollection])
  (
      self._runner_pcolls_to_user_pcolls,
      # (Dict[AppliedPTransform, AppliedPTransform])
      self._runner_transforms_to_user_transforms
  ) = self._build_correlation_between_pipelines(
      self._runner_pcoll_to_id, self._id_to_target_pcoll,
      self._label_to_user_transform)
  # Below are operated on the runner pipeline: first mark what is needed to
  # produce the targets, then prune everything else away. Order matters —
  # pruning relies on the necessary-transform set computed just above.
  (self._necessary_transforms,
   self._necessary_pcollections
  ) = self._mark_necessary_transforms_and_pcolls(
      self._runner_pcolls_to_user_pcolls)
  self._runner_pipeline = self._prune_runner_pipeline_to_fragment(
      self._runner_pipeline, self._necessary_transforms)
def test_cacheable_key_without_version_map(self):
  """Without a version map, the key is prefixed by the PCollection's own id()."""
  pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
  # pylint: disable=range-builtin-not-iterating
  source_pcoll = pipeline | 'Init Create' >> beam.Create(range(10))
  _, context = pipeline.to_runner_api(
      use_fake_coders=True, return_context=True)
  pcoll_ids = instr.pcolls_to_pcoll_id(pipeline, context)
  expected = str(id(source_pcoll)) + '_ref_PCollection_PCollection_10'
  self.assertEqual(instr.cacheable_key(source_pcoll, pcoll_ids), expected)
def test_pcolls_to_pcoll_id(self):
  """Maps the user PCollection's string repr to its runner-api PCollection id.

  Registers an in-memory cache manager for the pipeline, then checks that
  pcolls_to_pcoll_id resolves the Impulse output to the first runner-api
  PCollection reference.
  """
  p = beam.Pipeline(interactive_runner.InteractiveRunner())
  ie.current_env().set_cache_manager(InMemoryCache(), p)
  # Note: beam.Impulse() takes no range() argument, so the previously
  # attached `pylint: disable=range-builtin-not-iterating` suppression was
  # stale and has been removed.
  init_pcoll = p | 'Init Create' >> beam.Impulse()
  _, ctx = p.to_runner_api(return_context=True)
  self.assertEqual(
      instr.pcolls_to_pcoll_id(p, ctx),
      {str(init_pcoll): 'ref_PCollection_PCollection_1'})
def test_cacheable_key_without_version_map(self):
  """Without a version map, the key is prefixed by the PCollection's own id()."""
  pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
  ie.current_env().set_cache_manager(InMemoryCache(), pipeline)
  # pylint: disable=range-builtin-not-iterating
  source_pcoll = pipeline | 'Init Create' >> beam.Create(range(10))
  _, context = pipeline.to_runner_api(return_context=True)
  pcoll_ids = instr.pcolls_to_pcoll_id(pipeline, context)
  expected = str(id(source_pcoll)) + '_ref_PCollection_PCollection_8'
  self.assertEqual(instr.cacheable_key(source_pcoll, pcoll_ids), expected)