Esempio n. 1
0
 def test_pcolls_to_pcoll_id(self):
     """Checks the id that pcolls_to_pcoll_id reports for a lone Impulse."""
     pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
     impulse_pcoll = pipeline | 'Init Create' >> beam.Impulse()
     _, context = pipeline.to_runner_api(
         use_fake_coders=True, return_context=True)
     # The mapping is keyed by the string form of each PCollection.
     actual = instr.pcolls_to_pcoll_id(pipeline, context)
     expected = {str(impulse_pcoll): 'ref_PCollection_PCollection_1'}
     self.assertEqual(actual, expected)
Esempio n. 2
0
    def test_cacheable_key_with_version_map(self):
        """Checks that cacheable_key takes its prefix from the version map.

        Two equivalent pipelines are built. The key is computed for the
        PCollection of the second one, yet it is expected to carry the id of
        the first pipeline's PCollection because the version map says so.
        """
        user_pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
        ie.current_env().set_cache_manager(InMemoryCache(), user_pipeline)
        # pylint: disable=range-builtin-not-iterating
        user_pcoll = user_pipeline | 'Init Create' >> beam.Create(range(10))

        # At execution time the pipeline object is normally a different but
        # equivalent instance from the one the user built. The pipeline
        # instrument should still be able to tell whether the original
        # instance changed in an interactive env while the other instance is
        # mutated for execution. The version map links PCollections back to
        # the original instance and shows whether the evaluation has changed
        # since the last execution.
        exec_pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
        ie.current_env().set_cache_manager(InMemoryCache(), exec_pipeline)
        # pylint: disable=range-builtin-not-iterating
        exec_pcoll = exec_pipeline | 'Init Create' >> beam.Create(range(10))
        _, context = exec_pipeline.to_runner_api(return_context=True)

        # With a version map supplied, the key must be prefixed with
        # id(user_pcoll) even though exec_pcoll is the PCollection passed in.
        pcoll_to_id = instr.pcolls_to_pcoll_id(exec_pipeline, context)
        version_map = {'ref_PCollection_PCollection_8': str(id(user_pcoll))}
        actual_key = instr.cacheable_key(exec_pcoll, pcoll_to_id, version_map)
        expected_key = str(id(user_pcoll)) + '_ref_PCollection_PCollection_8'
        self.assertEqual(actual_key, expected_key)
Esempio n. 3
0
    def __init__(self, pcolls, options=None):
        """Builds a pipeline fragment that produces the given PCollections.

        Args:
          pcolls: (List[PCollection]) a list of PCollections to build pipeline
              fragment for. All of them must belong to the same pipeline.
          options: (PipelineOptions) the pipeline options for the implicit
              pipeline run.

        Raises:
          AssertionError: if pcolls is empty, contains a non-PCollection, or
              mixes PCollections from different pipelines.
        """
        assert len(pcolls) > 0, (
            'Need at least 1 PCollection as the target data to build a pipeline '
            'fragment that produces it.')
        for candidate in pcolls:
            assert isinstance(candidate, beam.pvalue.PCollection), (
                '{} is not an apache_beam.pvalue.PCollection.'.format(candidate))

        # The user pipeline is read from the first PCollection; it must never
        # be modified through self._user_pipeline.
        self._user_pipeline = pcolls[0].pipeline
        # User-side PCollections. They must not drive anything a runner will
        # execute; the copied PCollections are available through
        # `self._runner_pcolls_to_user_pcolls.keys()` instead.
        self._pcolls = set(pcolls)
        for target in self._pcolls:
            assert target.pipeline is self._user_pipeline, (
                '{} belongs to a different user pipeline than other PCollections '
                'given and cannot be used to build a pipeline fragment that produces '
                'the given PCollections.'.format(target))
        self._options = options

        # Work on a copy of the pipeline so that the instance held by the end
        # user stays untouched. The copy is later turned into the fragment
        # that the underlying runner executes.
        self._runner_pipeline = self._build_runner_pipeline()
        _, self._context = self._runner_pipeline.to_runner_api(
            return_context=True, use_fake_coders=True)
        # Imported locally rather than at module level.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from apache_beam.runners.interactive import pipeline_instrument
        self._runner_pcoll_to_id = pipeline_instrument.pcolls_to_pcoll_id(
            self._runner_pipeline, self._context)

        # Tie components of the runner pipeline back to the user pipeline. The
        # target pcolls are the ones given, which live in the user pipeline.
        self._id_to_target_pcoll = self._calculate_target_pcoll_ids()
        self._label_to_user_transform = self._calculate_user_transform_labels()

        # 1:1 correlation between the copied runner pipeline and the user
        # pipeline:
        #   first item:  (Dict[PCollection, PCollection])
        #   second item: (Dict[AppliedPTransform, AppliedPTransform])
        correlation = self._build_correlation_between_pipelines(
            self._runner_pcoll_to_id,
            self._id_to_target_pcoll,
            self._label_to_user_transform)
        (self._runner_pcolls_to_user_pcolls,
         self._runner_transforms_to_user_transforms) = correlation

        # Everything from here on operates on the runner pipeline.
        necessary = self._mark_necessary_transforms_and_pcolls(
            self._runner_pcolls_to_user_pcolls)
        self._necessary_transforms, self._necessary_pcollections = necessary
        self._runner_pipeline = self._prune_runner_pipeline_to_fragment(
            self._runner_pipeline, self._necessary_transforms)
Esempio n. 4
0
 def test_cacheable_key_without_version_map(self):
     """Checks the cacheable key built when no version map is passed."""
     pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
     # pylint: disable=range-builtin-not-iterating
     created_pcoll = pipeline | 'Init Create' >> beam.Create(range(10))
     _, context = pipeline.to_runner_api(
         use_fake_coders=True, return_context=True)
     pcoll_to_id = instr.pcolls_to_pcoll_id(pipeline, context)
     actual_key = instr.cacheable_key(created_pcoll, pcoll_to_id)
     # The expected key is the PCollection's own id() followed by its
     # pipeline-level id.
     expected_key = str(id(created_pcoll)) + '_ref_PCollection_PCollection_10'
     self.assertEqual(actual_key, expected_key)
Esempio n. 5
0
 def test_pcolls_to_pcoll_id(self):
     """Checks the id that pcolls_to_pcoll_id reports for a lone Impulse.

     An in-memory cache manager is registered for the pipeline before the
     PCollection is created.
     """
     pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
     ie.current_env().set_cache_manager(InMemoryCache(), pipeline)
     impulse_pcoll = pipeline | 'Init Create' >> beam.Impulse()
     _, context = pipeline.to_runner_api(return_context=True)
     # The mapping is keyed by the string form of each PCollection.
     actual = instr.pcolls_to_pcoll_id(pipeline, context)
     expected = {str(impulse_pcoll): 'ref_PCollection_PCollection_1'}
     self.assertEqual(actual, expected)
Esempio n. 6
0
 def test_cacheable_key_without_version_map(self):
     """Checks the cacheable key built when no version map is passed.

     An in-memory cache manager is registered for the pipeline before the
     PCollection is created.
     """
     pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
     ie.current_env().set_cache_manager(InMemoryCache(), pipeline)
     # pylint: disable=range-builtin-not-iterating
     created_pcoll = pipeline | 'Init Create' >> beam.Create(range(10))
     _, context = pipeline.to_runner_api(return_context=True)
     pcoll_to_id = instr.pcolls_to_pcoll_id(pipeline, context)
     actual_key = instr.cacheable_key(created_pcoll, pcoll_to_id)
     # The expected key is the PCollection's own id() followed by its
     # pipeline-level id.
     expected_key = str(id(created_pcoll)) + '_ref_PCollection_PCollection_8'
     self.assertEqual(actual_key, expected_key)