def __init__(
    self,
    user_pipeline: beam.Pipeline,
    pcolls: Optional[Set[beam.pvalue.PCollection]] = None):
  """Initializes a pipeline for augmenting interactive flavor.

  Args:
    user_pipeline: a beam.Pipeline instance defined by the user.
    pcolls: cacheable pcolls to be computed/retrieved. If the set is
      empty, all intermediate pcolls assigned to variables are applicable.
  """
  assert not pcolls or all(
      pcoll.pipeline is user_pipeline for pcoll in pcolls), (
          'All %s need to belong to %s' % (pcolls, user_pipeline))
  self._user_pipeline = user_pipeline
  self._pcolls = pcolls
  self._cache_manager = ie.current_env().get_cache_manager(
      self._user_pipeline, create_if_absent=True)
  # If the user defined pipeline contains any source to cache, the cache
  # manager is converted into a streaming cache manager during the check,
  # thus re-assign the reference.
  if background_caching_job.has_source_to_cache(self._user_pipeline):
    self._cache_manager = ie.current_env().get_cache_manager(
        self._user_pipeline)
  _, self._context = self._user_pipeline.to_runner_api(return_context=True)
  self._context.component_id_map = copy.copy(
      self._user_pipeline.component_id_map)
  self._cacheables = self.cacheables()
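# A minimal usage sketch of the constructor above, assuming the enclosing
# class is named AugmentedPipeline (a hypothetical name for illustration).
# The calling pattern follows the docstring: every pcoll passed in must
# belong to user_pipeline, or the assertion fires.
import apache_beam as beam

p = beam.Pipeline()
squares = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)

# Augment only `squares`; passing pcolls=None instead would make every
# intermediate pcoll assigned to a variable applicable.
aug = AugmentedPipeline(p, pcolls={squares})  # hypothetical class name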
def _process(self, pcoll):
  pcoll_id = self._pin.pcolls_to_pcoll_id.get(str(pcoll), '')
  if pcoll_id in self._pin._pcoll_version_map:
    cacheable_key = self._pin._cacheable_key(pcoll)
    user_pcoll = self._pin.cacheables[cacheable_key].pcoll
    if (cacheable_key in self._pin.cacheables and user_pcoll != pcoll):
      if not self._pin._user_pipeline:
        # Retrieve a reference to the user defined pipeline instance.
        self._pin._user_pipeline = user_pcoll.pipeline
        # Retrieve a reference to the cache manager for the user defined
        # pipeline instance.
        self._pin._cache_manager = ie.current_env().get_cache_manager(
            self._pin._user_pipeline, create_if_absent=True)
        # Check if the user defined pipeline contains any source to cache.
        # If so, during the check, the cache manager is converted into a
        # streaming cache manager, thus re-assign the reference.
        if background_caching_job.has_source_to_cache(
            self._pin._user_pipeline):
          self._pin._cache_manager = ie.current_env().get_cache_manager(
              self._pin._user_pipeline)
      self._pin._runner_pcoll_to_user_pcoll[pcoll] = user_pcoll
      self._pin.cacheables[cacheable_key].pcoll = pcoll
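# Context sketch (hedged): _process above is typically driven by a
# PipelineVisitor walking every value in the runner pipeline. The visitor
# below is an illustrative assumption, not the exact class in the codebase;
# it assumes a _process method like the one above is defined on the visitor.
import apache_beam as beam
from apache_beam.pipeline import PipelineVisitor

class _UserPCollFixupVisitor(PipelineVisitor):  # hypothetical name
  def __init__(self, pin):
    # `pin` is the pipeline instrument owning the maps _process consults.
    self._pin = pin

  def visit_value(self, value, producer_node):
    # Feed each visited PCollection into the logic shown in _process.
    if isinstance(value, beam.pvalue.PCollection):
      self._process(value)

# Walked as: runner_pipeline.visit(_UserPCollFixupVisitor(pipeline_instrument))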
def _build_query_components(
    query: str,
    found: Dict[str, beam.PCollection],
    output_name: str,
    run: bool = True
) -> Tuple[str,
           Union[Dict[str, beam.PCollection], beam.PCollection,
                 beam.Pipeline],
           SqlChain]:
  """Builds necessary components needed to apply the SqlTransform.

  Args:
    query: The SQL query to be executed by the magic.
    found: The PCollections with variable names found to be used by the
      query.
    output_name: The output variable name in __main__ module.
    run: Whether to prepare components for a local run or not.

  Returns:
    The processed query to be executed by the magic; a source to apply the
    SqlTransform to: a dictionary of tagged PCollections, or a single
    PCollection, or the pipeline to execute the query; the chain of applied
    beam_sql magics this one belongs to.
  """
  if found:
    user_pipeline = ie.current_env().user_pipeline(
        next(iter(found.values())).pipeline)
    sql_pipeline = beam.Pipeline(options=user_pipeline._options)
    ie.current_env().add_derived_pipeline(user_pipeline, sql_pipeline)
    sql_source = {}
    if run:
      if has_source_to_cache(user_pipeline):
        sql_source = pcolls_from_streaming_cache(
            user_pipeline, sql_pipeline, found)
      else:
        cache_manager = ie.current_env().get_cache_manager(
            user_pipeline, create_if_absent=True)
        for pcoll_name, pcoll in found.items():
          cache_key = CacheKey.from_pcoll(pcoll_name, pcoll).to_str()
          sql_source[pcoll_name] = unreify_from_cache(
              pipeline=sql_pipeline,
              cache_key=cache_key,
              cache_manager=cache_manager,
              element_type=pcoll.element_type)
    else:
      sql_source = found
    if len(sql_source) == 1:
      query = replace_single_pcoll_token(query, next(iter(sql_source.keys())))
      sql_source = next(iter(sql_source.values()))
    node = SqlNode(
        output_name=output_name, source=set(found.keys()), query=query)
    chain = ie.current_env().get_sql_chain(
        user_pipeline, set_user_pipeline=True).append(node)
  else:  # does not query any existing PCollection
    sql_source = beam.Pipeline()
    ie.current_env().add_user_pipeline(sql_source)
    # The node should be the root node of the chain created below.
    node = SqlNode(output_name=output_name, source=sql_source, query=query)
    chain = ie.current_env().get_sql_chain(sql_source).append(node)
  return query, sql_source, chain
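# Usage sketch (hedged): the beam_sql magic that calls this helper is invoked
# from a notebook cell; `pcoll` below is an assumed schema-aware PCollection
# already watched by the interactive environment:
#
#   %%beam_sql -o filtered
#   SELECT * FROM pcoll WHERE value > 0
#
# which reaches this helper roughly as:
#
#   query, sql_source, chain = _build_query_components(
#       query='SELECT * FROM pcoll WHERE value > 0',
#       found={'pcoll': pcoll},
#       output_name='filtered')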
def __init__(self, pipeline, options=None):
  self._pipeline = pipeline
  self._user_pipeline = ie.current_env().user_pipeline(pipeline)
  if not self._user_pipeline:
    self._user_pipeline = pipeline
  self._cache_manager = ie.current_env().get_cache_manager(
      self._user_pipeline, create_if_absent=True)
  # Check if the user defined pipeline contains any source to cache.
  # If so, during the check, the cache manager is converted into a
  # streaming cache manager, thus re-assign.
  if background_caching_job.has_source_to_cache(self._user_pipeline):
    self._cache_manager = ie.current_env().get_cache_manager(
        self._user_pipeline)
  self._background_caching_pipeline = beam.pipeline.Pipeline.from_runner_api(
      pipeline.to_runner_api(), pipeline.runner, options)
  ie.current_env().add_derived_pipeline(
      self._pipeline, self._background_caching_pipeline)

  # Snapshot of original pipeline information.
  (self._original_pipeline_proto,
   context) = self._pipeline.to_runner_api(return_context=True)

  # All compute-once-against-original-pipeline fields.
  self._unbounded_sources = utils.unbounded_sources(
      self._background_caching_pipeline)
  self._pcoll_to_pcoll_id = pcoll_to_pcoll_id(self._pipeline, context)

  # A Dict[str, Cacheable] from a PCollection id to a Cacheable that belongs
  # to the analyzed pipeline.
  self._cacheables = self.find_cacheables()

  # A dict from cache key to the PCollection read from cache. If the key
  # exists, the caller should reuse the PCollection read; otherwise, the
  # caller should create a new transform and track the PCollection read from
  # cache. (Dict[str, AppliedPTransform]).
  self._cached_pcoll_read = {}

  # A dict from PCollections in the runner pipeline instance to their
  # corresponding PCollections in the user pipeline instance. Populated
  # after preprocess().
  self._runner_pcoll_to_user_pcoll = {}
  self._pruned_pipeline_proto = None

  # Refers to target pcolls output by instrumented write-cache transforms;
  # used by the pruning logic as supplemental targets to build the pipeline
  # fragment up from.
  self._extended_targets = set()

  # Refers to pcolls used as inputs but replaced by outputs of instrumented
  # read-cache transforms; used by the pruning logic as targets that no
  # longer need to be produced during pipeline runs.
  self._ignored_targets = set()

  # Set of PCollections that are written to cache.
  self.cached_pcolls = set()
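# Illustration (hedged): the to_runner_api/from_runner_api round trip used
# above is the standard way to deep-clone a Beam pipeline. The clone shares
# no object identity with the original, which is why the derived-pipeline
# bookkeeping (add_derived_pipeline) is needed to relate the two.
import apache_beam as beam

original = beam.Pipeline()
_ = original | beam.Create([1, 2, 3])

clone = beam.pipeline.Pipeline.from_runner_api(
    original.to_runner_api(), original.runner, original._options)
assert clone is not original  # distinct instance, identical topology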
def _process(self, pcoll):
  pcoll_id = self._pin.pcolls_to_pcoll_id.get(str(pcoll), '')
  if pcoll_id in self._pin._pcoll_version_map:
    cacheable_key = self._pin._cacheable_key(pcoll)
    user_pcoll = self._pin.cacheables[cacheable_key]['pcoll']
    if (cacheable_key in self._pin.cacheables and user_pcoll != pcoll):
      if not self._pin._user_pipeline:
        # Retrieve a reference to the user defined pipeline instance.
        self._pin._user_pipeline = user_pcoll.pipeline
        # Once user_pipeline is retrieved, check if the user pipeline
        # contains any source to cache. If so, the cache manager held by the
        # current interactive environment might get wrapped into a streaming
        # cache, thus re-assign the reference to that cache manager.
        if background_caching_job.has_source_to_cache(
            self._pin._user_pipeline):
          self._pin._cache_manager = ie.current_env().cache_manager()
      self._pin._runner_pcoll_to_user_pcoll[pcoll] = user_pcoll
      self._pin.cacheables[cacheable_key]['pcoll'] = pcoll
def _build_query_components(
    query: str, found: Dict[str, beam.PCollection]
) -> Tuple[str,
           Union[Dict[str, beam.PCollection], beam.PCollection,
                 beam.Pipeline]]:
  """Builds necessary components needed to apply the SqlTransform.

  Args:
    query: The SQL query to be executed by the magic.
    found: The PCollections with variable names found to be used by the
      query.

  Returns:
    The processed query to be executed by the magic and a source to apply
    the SqlTransform to: a dictionary of tagged PCollections, or a single
    PCollection, or the pipeline to execute the query.
  """
  if found:
    user_pipeline = ie.current_env().user_pipeline(
        next(iter(found.values())).pipeline)
    sql_pipeline = beam.Pipeline(options=user_pipeline._options)
    ie.current_env().add_derived_pipeline(user_pipeline, sql_pipeline)
    sql_source = {}
    if has_source_to_cache(user_pipeline):
      sql_source = pcolls_from_streaming_cache(
          user_pipeline, sql_pipeline, found)
    else:
      cache_manager = ie.current_env().get_cache_manager(
          user_pipeline, create_if_absent=True)
      for pcoll_name, pcoll in found.items():
        cache_key = CacheKey.from_pcoll(pcoll_name, pcoll).to_str()
        sql_source[pcoll_name] = unreify_from_cache(
            pipeline=sql_pipeline,
            cache_key=cache_key,
            cache_manager=cache_manager,
            element_type=pcoll.element_type)
    if len(sql_source) == 1:
      query = replace_single_pcoll_token(query, next(iter(sql_source.keys())))
      sql_source = next(iter(sql_source.values()))
  else:
    sql_source = beam.Pipeline()
    ie.current_env().add_user_pipeline(sql_source)
  return query, sql_source
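# Follow-on sketch (hedged): the (query, sql_source) pair returned above is
# meant to feed SqlTransform. With a single PCollection source, the query
# refers to it as PCOLLECTION; with a dict source, by its tag. Names below
# are illustrative; note that applying SqlTransform contacts the Java
# expansion service Beam SQL is built on, so Java is needed even to construct
# the pipeline.
import apache_beam as beam
from apache_beam.transforms.sql import SqlTransform

p = beam.Pipeline()
rows = p | beam.Create([beam.Row(id=1, value=10), beam.Row(id=2, value=-3)])
positive = rows | SqlTransform(
    'SELECT id, value FROM PCOLLECTION WHERE value > 0')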
def run_pipeline(self, pipeline, options):
  if not ie.current_env().options.enable_recording_replay:
    capture_control.evict_captured_data()
  if self._force_compute:
    ie.current_env().evict_computed_pcollections()

  # Make sure that sources without a user reference are still cached.
  watch_sources(pipeline)

  user_pipeline = ie.current_env().user_pipeline(pipeline)
  pipeline_instrument = inst.build_pipeline_instrument(pipeline, options)

  # The user_pipeline analyzed might be None if the given pipeline has
  # nothing to be cached and tracing back to the user defined pipeline is
  # impossible. When it's None, there is nothing to cache (including via the
  # background caching job) and no result to track since no background
  # caching job is started at all.
  if user_pipeline:
    # Should use the underlying runner and run asynchronously.
    background_caching_job.attempt_to_run_background_caching_job(
        self._underlying_runner, user_pipeline, options)
    if (background_caching_job.has_source_to_cache(user_pipeline) and
        not background_caching_job.is_a_test_stream_service_running(
            user_pipeline)):
      streaming_cache_manager = ie.current_env().get_cache_manager(
          user_pipeline)

      # Only make the server if it doesn't exist already.
      if (streaming_cache_manager and
          not ie.current_env().get_test_stream_service_controller(
              user_pipeline)):

        def exception_handler(e):
          _LOGGER.error(str(e))
          return True

        test_stream_service = TestStreamServiceController(
            streaming_cache_manager, exception_handler=exception_handler)
        test_stream_service.start()
        ie.current_env().set_test_stream_service_controller(
            user_pipeline, test_stream_service)

  pipeline_to_execute = beam.pipeline.Pipeline.from_runner_api(
      pipeline_instrument.instrumented_pipeline_proto(),
      self._underlying_runner,
      options)

  if ie.current_env().get_test_stream_service_controller(user_pipeline):
    endpoint = ie.current_env().get_test_stream_service_controller(
        user_pipeline).endpoint

    # TODO: make the StreamingCacheManager and TestStreamServiceController
    # constructed when the InteractiveEnvironment is imported.
    class TestStreamVisitor(PipelineVisitor):
      def visit_transform(self, transform_node):
        from apache_beam.testing.test_stream import TestStream
        if (isinstance(transform_node.transform, TestStream) and
            not transform_node.transform._events):
          transform_node.transform._endpoint = endpoint

    pipeline_to_execute.visit(TestStreamVisitor())

  if not self._skip_display:
    a_pipeline_graph = pipeline_graph.PipelineGraph(
        pipeline_instrument.original_pipeline_proto,
        render_option=self._render_option)
    a_pipeline_graph.display_graph()

  main_job_result = PipelineResult(
      pipeline_to_execute.run(), pipeline_instrument)
  # In addition to this pipeline result setting, redundant result setting
  # from outer scopes is also recommended since the user_pipeline might not
  # be available from within this scope.
  if user_pipeline:
    ie.current_env().set_pipeline_result(user_pipeline, main_job_result)

  if self._blocking:
    main_job_result.wait_until_finish()

  if main_job_result.state is beam.runners.runner.PipelineState.DONE:
    # pylint: disable=dict-values-not-iterating
    ie.current_env().mark_pcollection_computed(
        pipeline_instrument.cached_pcolls)

  return main_job_result
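# Usage sketch: run_pipeline above backs InteractiveRunner's execution path;
# from user code it is reached indirectly, e.g. in a notebook session:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.runners.interactive import interactive_beam as ib

p = beam.Pipeline(InteractiveRunner())
words = p | beam.Create(['to', 'be', 'or', 'not', 'to', 'be'])
counts = words | beam.combiners.Count.PerElement()

ib.watch(locals())           # let the environment track user-defined pcolls
result = ib.collect(counts)  # triggers run_pipeline via the runner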