Code example #1
    def __init__(self,
                 user_pipeline: beam.Pipeline,
                 pcolls: Optional[Set[beam.pvalue.PCollection]] = None):
        """
    Initializes a pipelilne for augmenting interactive flavor.

    Args:
      user_pipeline: a beam.Pipeline instance defined by the user.
      pcolls: cacheable pcolls to be computed/retrieved. If the set is
        empty, all intermediate pcolls assigned to variables are applicable.
    """
        assert not pcolls or all([
            pcoll.pipeline is user_pipeline for pcoll in pcolls
        ]), 'All %s need to belong to %s' % (pcolls, user_pipeline)
        self._user_pipeline = user_pipeline
        self._pcolls = pcolls
        self._cache_manager = ie.current_env().get_cache_manager(
            self._user_pipeline, create_if_absent=True)
        if background_caching_job.has_source_to_cache(self._user_pipeline):
            self._cache_manager = ie.current_env().get_cache_manager(
                self._user_pipeline)
        _, self._context = self._user_pipeline.to_runner_api(
            return_context=True)
        self._context.component_id_map = copy.copy(
            self._user_pipeline.component_id_map)
        self._cacheables = self.cacheables()
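The constructor above snapshots the pipeline via beam.Pipeline.to_runner_api(return_context=True). A minimal sketch of that call against a plain Beam install (the pipeline contents here are placeholders):

import apache_beam as beam

p = beam.Pipeline()
_ = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)

# return_context=True returns the runner API proto together with the
# PipelineContext that maps components (transforms, pcolls, coders) to ids.
proto, context = p.to_runner_api(return_context=True)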
Code example #2
def _process(self, pcoll):
    pcoll_id = self._pin.pcolls_to_pcoll_id.get(str(pcoll), '')
    if pcoll_id in self._pin._pcoll_version_map:
        cacheable_key = self._pin._cacheable_key(pcoll)
        # Check membership before indexing so a missing key cannot raise.
        if cacheable_key in self._pin.cacheables:
            user_pcoll = self._pin.cacheables[cacheable_key].pcoll
            if user_pcoll != pcoll:
                if not self._pin._user_pipeline:
                    # Retrieve a reference to the user defined pipeline instance.
                    self._pin._user_pipeline = user_pcoll.pipeline
                    # Retrieve a reference to the cache manager for the user
                    # defined pipeline instance.
                    self._pin._cache_manager = ie.current_env().get_cache_manager(
                        self._pin._user_pipeline, create_if_absent=True)
                    # Check if the user defined pipeline contains any source to
                    # cache. If so, the check converts the cache manager into a
                    # streaming cache manager, so re-assign the reference.
                    if background_caching_job.has_source_to_cache(
                            self._pin._user_pipeline):
                        self._pin._cache_manager = ie.current_env().get_cache_manager(
                            self._pin._user_pipeline)
                self._pin._runner_pcoll_to_user_pcoll[pcoll] = user_pcoll
                self._pin.cacheables[cacheable_key].pcoll = pcoll
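The visitor above recovers the user pipeline through the back-reference every PCollection keeps to the pipeline that created it. A minimal illustration of that back-reference (placeholder pipeline contents):

import apache_beam as beam

p = beam.Pipeline()
pc = p | beam.Create([1])

# Each PCollection holds a reference to its defining pipeline; this is
# exactly what `user_pcoll.pipeline` relies on above.
assert pc.pipeline is p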
Code example #3
def _build_query_components(
    query: str,
    found: Dict[str, beam.PCollection],
    output_name: str,
    run: bool = True
) -> Tuple[str,
           Union[Dict[str, beam.PCollection], beam.PCollection, beam.Pipeline],
           SqlChain]:
  """Builds necessary components needed to apply the SqlTransform.

  Args:
    query: The SQL query to be executed by the magic.
    found: The PCollections with variable names found to be used by the query.
    output_name: The output variable name in __main__ module.
    run: Whether to prepare components for a local run or not.

  Returns:
    The processed query to be executed by the magic; a source to apply the
    SqlTransform to: a dictionary of tagged PCollections, or a single
    PCollection, or the pipeline to execute the query; the chain of applied
    beam_sql magics this one belongs to.
  """
  if found:
    user_pipeline = ie.current_env().user_pipeline(
        next(iter(found.values())).pipeline)
    sql_pipeline = beam.Pipeline(options=user_pipeline._options)
    ie.current_env().add_derived_pipeline(user_pipeline, sql_pipeline)
    sql_source = {}
    if run:
      if has_source_to_cache(user_pipeline):
        sql_source = pcolls_from_streaming_cache(
            user_pipeline, sql_pipeline, found)
      else:
        cache_manager = ie.current_env().get_cache_manager(
            user_pipeline, create_if_absent=True)
        for pcoll_name, pcoll in found.items():
          cache_key = CacheKey.from_pcoll(pcoll_name, pcoll).to_str()
          sql_source[pcoll_name] = unreify_from_cache(
              pipeline=sql_pipeline,
              cache_key=cache_key,
              cache_manager=cache_manager,
              element_type=pcoll.element_type)
    else:
      sql_source = found
    if len(sql_source) == 1:
      query = replace_single_pcoll_token(query, next(iter(sql_source.keys())))
      sql_source = next(iter(sql_source.values()))

    node = SqlNode(
        output_name=output_name, source=set(found.keys()), query=query)
    chain = ie.current_env().get_sql_chain(
        user_pipeline, set_user_pipeline=True).append(node)
  else:  # does not query any existing PCollection
    sql_source = beam.Pipeline()
    ie.current_env().add_user_pipeline(sql_source)

    # The node should be the root node of the chain created below.
    node = SqlNode(output_name=output_name, source=sql_source, query=query)
    chain = ie.current_env().get_sql_chain(sql_source).append(node)
  return query, sql_source, chain
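A hedged sketch of the no-source branch: with an empty `found`, the helper returns the query unchanged, a brand-new pipeline as the source, and a chain rooted at the new node. This assumes `_build_query_components` is importable from the module above:

import apache_beam as beam

query, source, chain = _build_query_components(
    'SELECT 1 AS x', found={}, output_name='out', run=False)
# No found PCollections, so the source is a fresh pipeline registered as a
# user pipeline in the interactive environment.
assert isinstance(source, beam.Pipeline)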
Code example #4
File: pipeline_instrument.py Project: KevinGG/beam
    def __init__(self, pipeline, options=None):
        self._pipeline = pipeline

        self._user_pipeline = ie.current_env().user_pipeline(pipeline)
        if not self._user_pipeline:
            self._user_pipeline = pipeline
        self._cache_manager = ie.current_env().get_cache_manager(
            self._user_pipeline, create_if_absent=True)
        # Check if the user defined pipeline contains any source to cache. If
        # so, the check converts the cache manager into a streaming cache
        # manager, so re-assign the reference.
        if background_caching_job.has_source_to_cache(self._user_pipeline):
            self._cache_manager = ie.current_env().get_cache_manager(
                self._user_pipeline)

        self._background_caching_pipeline = beam.pipeline.Pipeline.from_runner_api(
            pipeline.to_runner_api(), pipeline.runner, options)
        ie.current_env().add_derived_pipeline(
            self._pipeline, self._background_caching_pipeline)

        # Snapshot of original pipeline information.
        (self._original_pipeline_proto,
         context) = self._pipeline.to_runner_api(return_context=True)

        # All compute-once-against-original-pipeline fields.
        self._unbounded_sources = utils.unbounded_sources(
            self._background_caching_pipeline)
        self._pcoll_to_pcoll_id = pcoll_to_pcoll_id(self._pipeline, context)

        # A Dict[str, Cacheable] from a PCollection id to a Cacheable that belongs
        # to the analyzed pipeline.
        self._cacheables = self.find_cacheables()

        # A dict from cache key to the AppliedPTransform that reads from the
        # cache. If an entry exists, the caller should reuse the PCollection
        # read; if not, the caller should create a new transform and track the
        # PCollection read from cache. (Dict[str, AppliedPTransform]).
        self._cached_pcoll_read = {}

        # A dict from PCollections in the runner pipeline instance to their
        # corresponding PCollections in the user pipeline instance. Populated
        # after preprocess().
        self._runner_pcoll_to_user_pcoll = {}
        self._pruned_pipeline_proto = None

        # Refers to target pcolls output by instrumented write-cache
        # transforms, used by the pruning logic as supplemental targets to
        # build the pipeline fragment from.
        self._extended_targets = set()

        # Refers to pcolls used as inputs but replaced by the outputs of
        # instrumented read-cache transforms, used by the pruning logic as
        # targets that no longer need to be produced during pipeline runs.

        # Set of PCollections that are written to cache.
        self.cached_pcolls = set()
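The background caching pipeline above is created by round-tripping the user pipeline through its runner API proto. A minimal sketch of that cloning trick (placeholder pipeline contents):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

p = beam.Pipeline()
_ = p | beam.Create(['a', 'b'])

# from_runner_api(to_runner_api(...)) yields an equivalent but distinct
# pipeline object, leaving the original untouched.
clone = beam.pipeline.Pipeline.from_runner_api(
    p.to_runner_api(), p.runner, PipelineOptions())
assert clone is not p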
Code example #5
def _process(self, pcoll):
    pcoll_id = self._pin.pcolls_to_pcoll_id.get(str(pcoll), '')
    if pcoll_id in self._pin._pcoll_version_map:
        cacheable_key = self._pin._cacheable_key(pcoll)
        # Check membership before indexing so a missing key cannot raise.
        if cacheable_key in self._pin.cacheables:
            user_pcoll = self._pin.cacheables[cacheable_key]['pcoll']
            if user_pcoll != pcoll:
                if not self._pin._user_pipeline:
                    # Retrieve a reference to the user defined pipeline instance.
                    self._pin._user_pipeline = user_pcoll.pipeline
                    # Once user_pipeline is retrieved, check if the user pipeline
                    # contains any source to cache. If so, the cache manager held
                    # by the current interactive environment might get wrapped
                    # into a streaming cache, so re-assign the reference to that
                    # cache manager.
                    if background_caching_job.has_source_to_cache(
                            self._pin._user_pipeline):
                        self._pin._cache_manager = ie.current_env().cache_manager()
                self._pin._runner_pcoll_to_user_pcoll[pcoll] = user_pcoll
                self._pin.cacheables[cacheable_key]['pcoll'] = pcoll
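Note this older revision stores each cacheable as a plain dict (cacheable['pcoll']) where code example #2 uses attribute access (cacheable.pcoll). A minimal sketch of the attribute-style shape the newer snippet implies; only the pcoll field is visible in this section, so the class below is an assumption rather than Beam's actual definition:

from dataclasses import dataclass

import apache_beam as beam

@dataclass
class Cacheable:
    # Assumed minimal shape; the real class likely carries more metadata
    # (variable name, version, producer info).
    pcoll: beam.pvalue.PCollection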
Code example #6
File: beam_sql_magics.py Project: mszb/beam
def _build_query_components(
    query: str, found: Dict[str, beam.PCollection]
) -> Tuple[str, Union[Dict[str, beam.PCollection], beam.PCollection,
                      beam.Pipeline]]:
    """Builds necessary components needed to apply the SqlTransform.

  Args:
    query: The SQL query to be executed by the magic.
    found: The PCollections with variable names found to be used by the query.

  Returns:
    The processed query to be executed by the magic and a source to apply the
    SqlTransform to: a dictionary of tagged PCollections, or a single
    PCollection, or the pipeline to execute the query.
  """
    if found:
        user_pipeline = ie.current_env().user_pipeline(
            next(iter(found.values())).pipeline)
        sql_pipeline = beam.Pipeline(options=user_pipeline._options)
        ie.current_env().add_derived_pipeline(user_pipeline, sql_pipeline)
        sql_source = {}
        if has_source_to_cache(user_pipeline):
            sql_source = pcolls_from_streaming_cache(user_pipeline,
                                                     sql_pipeline, found)
        else:
            cache_manager = ie.current_env().get_cache_manager(
                user_pipeline, create_if_absent=True)
            for pcoll_name, pcoll in found.items():
                cache_key = CacheKey.from_pcoll(pcoll_name, pcoll).to_str()
                sql_source[pcoll_name] = unreify_from_cache(
                    pipeline=sql_pipeline,
                    cache_key=cache_key,
                    cache_manager=cache_manager,
                    element_type=pcoll.element_type)
        if len(sql_source) == 1:
            query = replace_single_pcoll_token(query,
                                               next(iter(sql_source.keys())))
            sql_source = next(iter(sql_source.values()))
    else:
        sql_source = beam.Pipeline()
        ie.current_env().add_user_pipeline(sql_source)
    return query, sql_source
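The single-source rewrite above follows SqlTransform's naming convention: a lone, untagged input is addressed as PCOLLECTION inside the query, while a dict of inputs is addressed by its tag names. A hedged sketch using the public SqlTransform API (running it requires a Java expansion service available at execution time):

import apache_beam as beam
from apache_beam.transforms.sql import SqlTransform

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([beam.Row(x=1), beam.Row(x=2), beam.Row(x=3)])
        | SqlTransform('SELECT x FROM PCOLLECTION WHERE x > 1'))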
Code example #7
File: interactive_runner.py Project: mszb/beam
    def run_pipeline(self, pipeline, options):
        if not ie.current_env().options.enable_recording_replay:
            capture_control.evict_captured_data()
        if self._force_compute:
            ie.current_env().evict_computed_pcollections()

        # Make sure that sources without a user reference are still cached.
        watch_sources(pipeline)

        user_pipeline = ie.current_env().user_pipeline(pipeline)
        pipeline_instrument = inst.build_pipeline_instrument(pipeline, options)

        # The user_pipeline analyzed might be None if the given pipeline has
        # nothing to be cached and tracing back to the user defined pipeline
        # is impossible. When it's None, there is nothing to cache (including
        # a background caching job) and no result to track, since no
        # background caching job is started at all.
        if user_pipeline:
            # Should use the underlying runner and run asynchronously.
            background_caching_job.attempt_to_run_background_caching_job(
                self._underlying_runner, user_pipeline, options)
            if (background_caching_job.has_source_to_cache(user_pipeline)
                    and not background_caching_job.
                    is_a_test_stream_service_running(user_pipeline)):
                streaming_cache_manager = ie.current_env().get_cache_manager(
                    user_pipeline)

                # Only make the server if it doesn't exist already.
                if (streaming_cache_manager and not ie.current_env().
                        get_test_stream_service_controller(user_pipeline)):

                    def exception_handler(e):
                        _LOGGER.error(str(e))
                        return True

                    test_stream_service = TestStreamServiceController(
                        streaming_cache_manager,
                        exception_handler=exception_handler)
                    test_stream_service.start()
                    ie.current_env().set_test_stream_service_controller(
                        user_pipeline, test_stream_service)

        pipeline_to_execute = beam.pipeline.Pipeline.from_runner_api(
            pipeline_instrument.instrumented_pipeline_proto(),
            self._underlying_runner, options)

        if ie.current_env().get_test_stream_service_controller(user_pipeline):
            endpoint = ie.current_env().get_test_stream_service_controller(
                user_pipeline).endpoint

            # TODO: make the StreamingCacheManager and TestStreamServiceController
            # constructed when the InteractiveEnvironment is imported.
            class TestStreamVisitor(PipelineVisitor):
                def visit_transform(self, transform_node):
                    from apache_beam.testing.test_stream import TestStream
                    if (isinstance(transform_node.transform, TestStream)
                            and not transform_node.transform._events):
                        transform_node.transform._endpoint = endpoint

            pipeline_to_execute.visit(TestStreamVisitor())

        if not self._skip_display:
            a_pipeline_graph = pipeline_graph.PipelineGraph(
                pipeline_instrument.original_pipeline_proto,
                render_option=self._render_option)
            a_pipeline_graph.display_graph()

        main_job_result = PipelineResult(pipeline_to_execute.run(),
                                         pipeline_instrument)
        # In addition to setting the pipeline result here, setting it
        # redundantly from outer scopes is also recommended, since the
        # user_pipeline might not be available from within this scope.
        if user_pipeline:
            ie.current_env().set_pipeline_result(user_pipeline,
                                                 main_job_result)

        if self._blocking:
            main_job_result.wait_until_finish()

        if main_job_result.state is beam.runners.runner.PipelineState.DONE:
            # pylint: disable=dict-values-not-iterating
            ie.current_env().mark_pcollection_computed(
                pipeline_instrument.cached_pcolls)

        return main_job_result
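A minimal sketch of how run_pipeline above gets invoked: build the pipeline with InteractiveRunner, watch the variables so their PCollections become cacheable, then run. Pipeline contents are placeholders; an IPython/notebook environment is assumed for full interactivity:

import apache_beam as beam
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

p = beam.Pipeline(InteractiveRunner())
squares = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)
ib.watch(locals())  # lets the runner track `squares` by its variable name
result = p.run()    # dispatches into run_pipeline above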