Example #1
    def __init__(self, pipeline, options=None):
        self._pipeline = pipeline

        self._user_pipeline = ie.current_env().user_pipeline(pipeline)
        if not self._user_pipeline:
            self._user_pipeline = pipeline
        self._cache_manager = ie.current_env().get_cache_manager(
            self._user_pipeline, create_if_absent=True)
        # Check if the user-defined pipeline contains any source to cache. If so,
        # that check converts the cache manager into a streaming cache manager,
        # so it is re-assigned here.
        if background_caching_job.has_source_to_cache(self._user_pipeline):
            self._cache_manager = ie.current_env().get_cache_manager(
                self._user_pipeline)

        self._background_caching_pipeline = beam.pipeline.Pipeline.from_runner_api(
            pipeline.to_runner_api(), pipeline.runner, options)
        ie.current_env().add_derived_pipeline(
            self._pipeline, self._background_caching_pipeline)

        # Snapshot of original pipeline information.
        (self._original_pipeline_proto,
         context) = self._pipeline.to_runner_api(return_context=True)

        # All compute-once-against-original-pipeline fields.
        self._unbounded_sources = utils.unbounded_sources(
            self._background_caching_pipeline)
        self._pcoll_to_pcoll_id = pcoll_to_pcoll_id(self._pipeline, context)

        # A Dict[str, Cacheable] from a PCollection id to a Cacheable that belongs
        # to the analyzed pipeline.
        self._cacheables = self.find_cacheables()

        # A dict from cache key to the PCollection read from cache. If an entry
        # exists, the caller should reuse that PCollection; if not, the caller
        # should create a new read transform and track the PCollection read from
        # cache. (Dict[str, AppliedPTransform]).
        self._cached_pcoll_read = {}

        # A dict from PCollections in the runner pipeline instance to their
        # corresponding PCollections in the user pipeline instance. Populated
        # after preprocess().
        self._runner_pcoll_to_user_pcoll = {}
        self._pruned_pipeline_proto = None

        # Target PCollections output by the instrumented write-cache transforms.
        # The pruning logic uses them as supplemental targets to build the
        # pipeline fragment from.
        self._extended_targets = set()

        # PCollections that were used as inputs but have been replaced by the
        # outputs of instrumented read-cache transforms. The pruning logic treats
        # them as targets that no longer need to be produced during pipeline runs.
        self._ignored_targets = set()

        # Set of PCollections that are written to cache.
        self.cached_pcolls = set()
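
The background caching pipeline above is derived by serializing the original pipeline to its runner API proto and rebuilding a Pipeline from it. A minimal sketch of that cloning idiom, where the single Create transform is a hypothetical stand-in for the user's pipeline:

import apache_beam as beam

# Build a throwaway pipeline; the Create transform is illustrative only.
original = beam.Pipeline()
_ = original | 'CreateHypotheticalInput' >> beam.Create([1, 2, 3])

# Round-trip through the portable proto to obtain an equivalent, independent
# Pipeline object, mirroring how the background caching pipeline is derived.
derived = beam.pipeline.Pipeline.from_runner_api(
    original.to_runner_api(), original.runner, original.options)
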
def extract_source_to_cache_signature(user_pipeline):
    """Extracts a set of signature for sources that need to be cached in the
  user-defined pipeline.

  A signature is a str representation of urn and payload of a source.
  """
    # TODO(BEAM-8335): we temporarily only cache replaceable unbounded sources.
    # Add logic for other cacheable sources here when they are available.
    unbounded_sources_as_applied_transforms = utils.unbounded_sources(
        user_pipeline)
    unbounded_sources_as_ptransforms = set(
        map(lambda x: x.transform, unbounded_sources_as_applied_transforms))
    _, context = user_pipeline.to_runner_api(return_context=True)
    signature = set(
        map(lambda transform: str(transform.to_runner_api(context)),
            unbounded_sources_as_ptransforms))
    return signature
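
One natural use of these signatures is change detection: if the signature set differs between two versions of the user pipeline, previously cached unbounded-source data can no longer be trusted. A hedged sketch, where source_signature_changed is a hypothetical helper rather than part of the module above:

def source_signature_changed(previous_pipeline, user_pipeline):
    # Hypothetical helper: returns True when the cacheable unbounded sources
    # differ between the two pipelines, i.e. cached source data may be stale.
    return (extract_source_to_cache_signature(previous_pipeline) !=
            extract_source_to_cache_signature(user_pipeline))
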
Example #3
    def background_caching_pipeline_proto(self):
        """Returns the background caching pipeline.

    This method creates a background caching pipeline by: adding writes to cache
    from each unbounded source (done in the instrument method), and cutting out
    all components (transform, PCollections, coders, windowing strategies) that
    are not the unbounded sources or writes to cache (or subtransforms thereof).
    """
        # Get the pipeline proto to read all the components from. A new pipeline
        # proto will later be assembled from the cut-out components.
        pipeline_proto, context = self._background_caching_pipeline.to_runner_api(
            return_context=True)

        # Get all the sources we want to cache.
        sources = utils.unbounded_sources(self._background_caching_pipeline)

        # Get all the root transforms. The caching transforms will be subtransforms
        # of one of these roots.
        roots = [root for root in pipeline_proto.root_transform_ids]

        # Get the transform IDs of the caching transforms. These caching operations
        # are added to the _background_caching_pipeline in the instrument() method.
        # It's added there so that multiple calls to this method won't add multiple
        # caching operations (idempotent).
        transforms = pipeline_proto.components.transforms
        caching_transform_ids = [
            t_id for root in roots for t_id in transforms[root].subtransforms
            if WRITE_CACHE in t_id
        ]

        # Get the IDs of the unbounded sources.
        required_transform_labels = [src.full_label for src in sources]
        unbounded_source_ids = [
            k for k, v in transforms.items()
            if v.unique_name in required_transform_labels
        ]

        # The required transforms are the transforms that we want to cut out of
        # the pipeline_proto and insert into the new pipeline to return.
        required_transform_ids = (roots + caching_transform_ids +
                                  unbounded_source_ids)
        (t, p) = self._required_components(pipeline_proto,
                                           required_transform_ids, set())

        def set_proto_map(proto_map, new_value):
            proto_map.clear()
            for key, value in new_value.items():
                proto_map[key].CopyFrom(value)

        # Copy the transforms into the new pipeline.
        pipeline_to_execute = beam_runner_api_pb2.Pipeline()
        pipeline_to_execute.root_transform_ids[:] = roots
        set_proto_map(pipeline_to_execute.components.transforms, t)
        set_proto_map(pipeline_to_execute.components.pcollections, p)
        set_proto_map(pipeline_to_execute.components.coders,
                      context.to_runner_api().coders)
        set_proto_map(pipeline_to_execute.components.windowing_strategies,
                      context.to_runner_api().windowing_strategies)

        # Cut out all subtransforms in the root that aren't the required transforms.
        for root_id in roots:
            root = pipeline_to_execute.components.transforms[root_id]
            root.subtransforms[:] = [
                transform_id for transform_id in root.subtransforms
                if transform_id in pipeline_to_execute.components.transforms
            ]

        return pipeline_to_execute
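
A hedged sketch of inspecting the pruned proto returned by this method; instr is a hypothetical name standing in for an already-constructed instance of the class that owns background_caching_pipeline_proto():

def print_kept_transforms(instr):
    # Hypothetical inspection helper: lists the transforms that survived the
    # cut, i.e. the roots, the unbounded sources and the write-to-cache steps.
    proto = instr.background_caching_pipeline_proto()
    for transform_id, transform in proto.components.transforms.items():
        print(transform_id, transform.unique_name)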