def record(self, pcolls, max_n, max_duration):
  # type: (List[beam.pvalue.PCollection], int, Union[int,str]) -> Recording

  """Records the given PCollections.

  Args:
    pcolls: the PCollections to record. Each one must belong to
      `self.user_pipeline`.
    max_n: forwarded unchanged to the created `Recording`.
    max_duration: the duration limit. A string other than 'inf' is converted
      to seconds with `pd.to_timedelta(...).total_seconds()`; any other value
      (including the string 'inf') is forwarded unchanged.

  Returns:
    The newly created `Recording`, which is also added to `self._recordings`.

  Raises:
    AssertionError: if any PCollection belongs to a pipeline other than
      `self.user_pipeline`.
  """
  # Assert that all PCollection come from the same user_pipeline.
  for pcoll in pcolls:
    assert pcoll.pipeline is self.user_pipeline, (
        '{} belongs to a different user-defined pipeline ({}) than that of'
        ' other PCollections ({}).'.format(
            pcoll, pcoll.pipeline, self.user_pipeline))

  if isinstance(max_duration, str) and max_duration != 'inf':
    max_duration_secs = pd.to_timedelta(max_duration).total_seconds()
  else:
    max_duration_secs = max_duration

  # Make sure that all PCollections to be shown are watched. If a PCollection
  # has not been watched, make up a variable name for that PCollection and
  # watch it. No validation is needed here because the watch logic can handle
  # arbitrary variables.
  self._watch(pcolls)

  # Build the instrument once; it is reused for clearing the cache and is
  # handed to the Recording below.
  pipeline_instrument = pi.PipelineInstrument(self.user_pipeline)
  self.record_pipeline()

  # Get the subset of computed PCollections. These do not need to be
  # recomputed.
  computed_pcolls = set(
      pcoll for pcoll in pcolls
      if pcoll in ie.current_env().computed_pcollections)

  # Start a pipeline fragment to start computing the PCollections.
  uncomputed_pcolls = set(pcolls).difference(computed_pcolls)
  if uncomputed_pcolls:
    # Clear the cache of the given uncomputed PCollections because they are
    # incomplete.
    self._clear(pipeline_instrument)

    warnings.filterwarnings(
        'ignore',
        'options is deprecated since First stable release. References to '
        '<pipeline>.options will not be supported',
        category=DeprecationWarning)
    pf.PipelineFragment(list(uncomputed_pcolls),
                        self.user_pipeline.options).run()
    result = ie.current_env().pipeline_result(self.user_pipeline)
  else:
    # Everything requested is already computed, so nothing is run.
    result = None

  recording = Recording(
      self.user_pipeline,
      pcolls,
      result,
      pipeline_instrument,
      max_n,
      max_duration_secs)
  self._recordings.add(recording)

  return recording
def __init__(self, pcoll, include_window_info=False, display_facets=False):
  """Prepares the state needed to visualize a single PCollection.

  Args:
    pcoll: the apache_beam.pvalue.PCollection to visualize.
    include_window_info: stored on the instance as `_include_window_info`.
    display_facets: stored on the instance as `_display_facets`.

  Raises:
    AssertionError: if the visualization dependencies are not available or
      `pcoll` is not a PCollection.
  """
  assert _pcoll_visualization_ready, (
      'Dependencies for PCollection visualization are not available. Please '
      'use `pip install apache-beam[interactive]` to install necessary '
      'dependencies and make sure that you are executing code in an '
      'interactive environment such as a Jupyter notebook.')
  assert isinstance(pcoll, pvalue.PCollection), (
      'pcoll should be apache_beam.pvalue.PCollection')
  self._pcoll = pcoll

  # The PipelineInstrument gives access to the cache key and other metadata
  # of the pipeline, whether it is the pipeline defined in user code or a
  # copy of it. The PCollection variable is therefore the only user input
  # this module needs: the pipeline definition, materialized data and
  # pipeline result are figured out from it, even if the user never assigned
  # or waited for the result explicitly.
  # Only the constructor is invoked here; for performance reasons no
  # interactivity related pre-process or instrumentation is triggered.
  self._pin = instr.PipelineInstrument(pcoll.pipeline)

  # The variable name is used as the title for element values in the rendered
  # data table; fall back to a generic title when no name is found.
  pcoll_id = self._pin.pcolls_to_pcoll_id.get(str(pcoll), None)
  self._pcoll_var = self._pin.cacheable_var_by_pcoll_id(pcoll_id) or 'Value'

  self._cache_key = self._pin.cache_key(pcoll)

  # All display ids share one obfuscated suffix derived from the cache key
  # and this instance's id.
  display_suffix = obfuscate(self._cache_key, id(self))
  self._dive_display_id = 'facets_dive_{}'.format(display_suffix)
  self._overview_display_id = 'facets_overview_{}'.format(display_suffix)
  self._df_display_id = 'df_{}'.format(display_suffix)

  self._include_window_info = include_window_info
  self._display_facets = display_facets
  self._is_datatable_empty = True
def test_recording_manager_clears_cache(self):
  """Tests that the RecordingManager clears the cache before recording.

  A job may have incomplete PCollections when the job terminates. Clearing the
  cache ensures that correct results are computed every run.

  The test records the same PCollection twice and checks that the second
  recording starts later than the first and that the cache's `clear` was
  called with the PCollection's cache key.
  """
  # Add the TestStream so that it can be cached.
  ib.options.capturable_sources.add(TestStream)
  p = beam.Pipeline(
      InteractiveRunner(), options=PipelineOptions(streaming=True))
  elems = (
      p
      | TestStream().advance_watermark_to(0).advance_processing_time(
          1).add_elements(list(range(10))).advance_processing_time(1))
  squares = elems | beam.Map(lambda x: x**2)

  # Watch the local scope for Interactive Beam so that referenced PCollections
  # will be cached.
  ib.watch(locals())

  # This is normally done in the interactive_utils when a transform is
  # applied but needs an IPython environment. So we manually run this here.
  ie.current_env().track_user_pipelines()

  # Do the first recording to get the timestamp of the first time the fragment
  # was run.
  rm = RecordingManager(p)
  rm.record([squares], max_n=10, max_duration=2)
  first_recording_start = rm.describe()['start']
  rm.cancel()

  # Get the cache and the cache key of the recorded PCollection.
  pipeline_instrument = pi.PipelineInstrument(p)
  cache = ie.current_env().get_cache_manager(p)
  cache_key = pipeline_instrument.cache_key(squares)

  # Set up a mock for the Cache's clear function which will be used to clear
  # uncomputed PCollections.
  cache.clear = MagicMock()

  # Rerun the fragment. If the cache was cleared correctly then the starting
  # time of the second recording will be later than the first. This is because
  # the PCollection wasn't considered to be computed and was cleared from
  # cache. Thus the pipeline fragment was rerun for that PCollection at a
  # later time.
  rm.record([squares], max_n=10, max_duration=1)
  second_recording_start = rm.describe()['start']
  rm.cancel()
  self.assertGreater(second_recording_start, first_recording_start)

  # Assert that the cache cleared the PCollection.
  cache.clear.assert_called_with('full', cache_key)
def test_describe(self):
  """Tests that Recording.describe() reports the cached size and start time.

  Data is written straight into an in-memory cache so that no pipeline has to
  run. The reported size is compared against the sum of the cache sizes of
  both recorded PCollections, and the reported start against the injected
  start time.
  """
  p = beam.Pipeline(InteractiveRunner())
  numbers = p | 'numbers' >> beam.Create([0, 1, 2])
  letters = p | 'letters' >> beam.Create(['a', 'b', 'c'])

  # Watch the local scope; this captures the local variable names above.
  ib.watch(locals())

  # Create a MockPipelineResult to control the state of a fake run of the
  # pipeline.
  mock_result = MockPipelineResult()
  ie.current_env().track_user_pipelines()
  ie.current_env().set_pipeline_result(p, mock_result)

  # Use an in-memory cache for this pipeline so the test can write to it
  # directly.
  cache_manager = InMemoryCache()
  ie.current_env().set_cache_manager(cache_manager, p)

  # Create a recording with an arbitrary start time.
  start_time = 100
  recording = Recording(
      p, [numbers, letters],
      mock_result,
      pi.PipelineInstrument(p),
      max_n=10,
      max_duration_secs=60,
      start_time_for_test=start_time)

  # Get the cache key of the stream and write something to cache. This is
  # so that a pipeline doesn't have to run in the test.
  numbers_stream = recording.stream(numbers)
  cache_manager.write([0, 1, 2], 'full', numbers_stream.cache_key)
  cache_manager.save_pcoder(None, 'full', numbers_stream.cache_key)

  letters_stream = recording.stream(letters)
  cache_manager.write(['a', 'b', 'c'], 'full', letters_stream.cache_key)
  cache_manager.save_pcoder(None, 'full', letters_stream.cache_key)

  # Get the description.
  description = recording.describe()
  size = description['size']
  start = description['start']

  # The size must equal the combined cache size of both streams.
  self.assertEqual(
      size,
      cache_manager.size('full', numbers_stream.cache_key) +
      cache_manager.size('full', letters_stream.cache_key))
  self.assertEqual(start, start_time)
def test_recording_manager_clears_cache(self): """Tests that the RecordingManager clears the cache before recording. A job may have incomplete PCollections when the job terminates. Clearing the cache ensures that correct results are computed every run. """ # Add the TestStream so that it can be cached. ib.options.recordable_sources.add(TestStream) p = beam.Pipeline(InteractiveRunner(), options=PipelineOptions(streaming=True)) elems = (p | TestStream().advance_watermark_to( 0).advance_processing_time(1).add_elements(list( range(10))).advance_processing_time(1)) squares = elems | beam.Map(lambda x: x**2) # Watch the local scope for Interactive Beam so that referenced PCollections # will be cached. ib.watch(locals()) # This is normally done in the interactive_utils when a transform is # applied but needs an IPython environment. So we manually run this here. ie.current_env().track_user_pipelines() # Do the first recording to get the timestamp of the first time the fragment # was run. rm = RecordingManager(p) # Get the cache, key, and coder to read the PCollection from the cache. pipeline_instrument = pi.PipelineInstrument(p) # Set up a mock for the Cache's clear function which will be used to clear # uncomputed PCollections. rm._clear_pcolls = MagicMock() rm.record([squares], max_n=1, max_duration=500) rm.cancel() # Assert that the cache cleared the PCollection. rm._clear_pcolls.assert_any_call( unittest.mock.ANY, set(pipeline_instrument.cache_key(pc) for pc in (elems, squares)))
def head(pcoll, n=5, include_window_info=False):
  """Materializes the first n elements from a PCollection into a Dataframe.

  This reads each element from file and reads only the amount that it needs
  into memory.

  For example::

    p = beam.Pipeline(InteractiveRunner())
    init = p | 'Init' >> beam.Create(range(10))
    square = init | 'Square' >> beam.Map(lambda x: x * x)

    # Run the pipeline and bring the PCollection into memory as a Dataframe.
    in_memory_square = head(square, n=5)

  Args:
    pcoll: the apache_beam.pvalue.PCollection to materialize.
    n: the maximum number of elements to read. When `n` is not positive, no
      limit is applied and every element is read.
    include_window_info: forwarded to `elements_to_df` when building the
      result.

  Returns:
    The result of `elements_to_df` for the collected elements.

  Raises:
    AssertionError: if `pcoll` is not a PCollection.
  """
  assert isinstance(pcoll, beam.pvalue.PCollection), (
      '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

  user_pipeline = pcoll.pipeline
  runner = user_pipeline.runner
  if isinstance(runner, ir.InteractiveRunner):
    runner = runner._underlying_runner

  # Make sure that sources without a user reference are still cached.
  pi.watch_sources(user_pipeline)

  # Make sure that all PCollections to be shown are watched. If a PCollection
  # has not been watched, make up a variable name for that PCollection and
  # watch it. No validation is needed here because the watch logic can handle
  # arbitrary variables.
  watched_pcollections = set()
  for watching in ie.current_env().watching():
    for _, val in watching:
      # Every object has `__class__`, so isinstance alone is sufficient.
      if isinstance(val, beam.pvalue.PCollection):
        watched_pcollections.add(val)
  if pcoll not in watched_pcollections:
    watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})

  # NOTE: this installs a process-wide filter for every DeprecationWarning.
  warnings.filterwarnings('ignore', category=DeprecationWarning)
  # Attempt to run background caching job since we have the reference to the
  # user-defined pipeline.
  bcj.attempt_to_run_background_caching_job(
      runner, user_pipeline, user_pipeline.options)

  if pcoll in ie.current_env().computed_pcollections:
    # Read from pcoll cache, then convert to DF
    pipeline_instrument = pi.PipelineInstrument(pcoll.pipeline)
    key = pipeline_instrument.cache_key(pcoll)
    cache_manager = ie.current_env().cache_manager()

    coder = cache_manager.load_pcoder('full', key)
    reader, _ = cache_manager.read('full', key)
    elements = to_element_list(reader, coder, include_window_info=True)
  else:
    # Build a pipeline fragment for the PCollections and run it.
    result = pf.PipelineFragment([pcoll], user_pipeline.options).run()
    ie.current_env().set_pipeline_result(user_pipeline, result)

    # Invoke wait_until_finish to ensure the blocking nature of this API
    # without relying on the run to be blocking.
    result.wait_until_finish()

    # If the pipeline execution is successful at this stage, mark the
    # computation completeness for the given PCollections so that when further
    # `show` invocation occurs, Interactive Beam wouldn't need to re-compute.
    # Compare by value: the state does not have to be the very same object as
    # the constant to denote a finished pipeline.
    if result.state == beam.runners.runner.PipelineState.DONE:
      ie.current_env().mark_pcollection_computed([pcoll])

    elements = result.read(pcoll, include_window_info=True)

  # Stop pulling from `elements` as soon as n items were collected; a
  # non-positive n never triggers the break, so everything is collected.
  results = []
  for element in elements:
    results.append(element)
    if 0 < n <= len(results):
      break

  return elements_to_df(results, include_window_info=include_window_info)
def __init__(
    self,
    pipeline,  # type: Union[beam_runner_api_pb2.Pipeline, beam.Pipeline]
    default_vertex_attrs=None,
    default_edge_attrs=None,
    render_option=None):
  """Constructor of PipelineGraph.

  Examples:
    graph = pipeline_graph.PipelineGraph(pipeline_proto)
    graph.get_dot()

    or

    graph = pipeline_graph.PipelineGraph(pipeline)
    graph.get_dot()

  Args:
    pipeline: (Pipeline proto) or (Pipeline) pipeline to be rendered.
    default_vertex_attrs: (Dict[str, str]) a dict of default vertex
      attributes. When omitted or empty, {'shape': 'box'} is used. The 'color'
      and 'fontcolor' attributes default to 'blue' when absent. The given dict
      is copied and never modified.
    default_edge_attrs: (Dict[str, str]) a dict of default edge attributes
    render_option: (str) this parameter decides how the pipeline graph is
      rendered. See display.pipeline_graph_renderer for available options.

  Raises:
    TypeError: if `pipeline` is neither a Pipeline proto nor a Pipeline.
  """
  self._lock = threading.Lock()
  self._graph = None  # type: pydot.Dot
  self._pipeline_instrument = None
  if isinstance(pipeline, beam.Pipeline):
    self._pipeline_instrument = inst.PipelineInstrument(pipeline)
    # The pre-process links user pipeline to runner pipeline through analysis
    # but without mutating runner pipeline.
    self._pipeline_instrument.preprocess()

  if isinstance(pipeline, beam_runner_api_pb2.Pipeline):
    self._pipeline_proto = pipeline
  elif isinstance(pipeline, beam.Pipeline):
    self._pipeline_proto = pipeline.to_runner_api()
  else:
    raise TypeError(
        'pipeline should either be a %s or %s, while %s is given' %
        (beam_runner_api_pb2.Pipeline, beam.Pipeline, type(pipeline)))

  # A dict from PCollection ID to a list of its consuming Transform IDs
  self._consumers = collections.defaultdict(
      list)  # type: DefaultDict[str, List[str]]
  # A dict from PCollection ID to its producing Transform ID
  self._producers = {}  # type: Dict[str, str]

  for transform_id, transform_proto in self._top_level_transforms():
    for pcoll_id in transform_proto.inputs.values():
      self._consumers[pcoll_id].append(transform_id)
    for pcoll_id in transform_proto.outputs.values():
      self._producers[pcoll_id] = transform_id

  # Work on a private copy so that filling in the defaults below mutates
  # neither the caller's dict nor a default shared between calls.
  default_vertex_attrs = dict(default_vertex_attrs or {'shape': 'box'})
  default_vertex_attrs.setdefault('color', 'blue')
  default_vertex_attrs.setdefault('fontcolor', 'blue')

  vertex_dict, edge_dict = self._generate_graph_dicts()
  self._construct_graph(
      vertex_dict, edge_dict, default_vertex_attrs, default_edge_attrs)

  self._renderer = pipeline_graph_renderer.get_renderer(render_option)
def test_computed(self):
  """Tests that a PCollection is marked as computed only in a complete state.

  Because the background caching job is now long-lived, repeated runs of a
  PipelineFragment may yield different results for the same PCollection.

  The test walks through three situations: nothing finished, only the
  recording's pipeline finished, and the background caching job finished
  before a new recording was started.
  """
  p = beam.Pipeline(InteractiveRunner())
  elems = p | beam.Create([0, 1, 2])

  # Watch the local scope; this captures the local variable names above.
  ib.watch(locals())

  # Create a MockPipelineResult to control the state of a fake run of the
  # pipeline.
  mock_result = MockPipelineResult()
  ie.current_env().track_user_pipelines()
  ie.current_env().set_pipeline_result(p, mock_result)

  # Create a mock BackgroundCachingJob that will control whether to set the
  # PCollections as computed or not.
  bcj_mock_result = MockPipelineResult()
  background_caching_job = bcj.BackgroundCachingJob(bcj_mock_result, [])

  # Create a recording.
  recording = Recording(
      p, [elems],
      mock_result,
      pi.PipelineInstrument(p),
      max_n=10,
      max_duration_secs=60)

  # The background caching job and the recording aren't done yet so there may
  # be more elements to be recorded.
  self.assertFalse(recording.is_computed())
  self.assertFalse(recording.computed())
  self.assertTrue(recording.uncomputed())

  # The recording is finished but the background caching job is not. There
  # may still be more elements to record, or the intermediate PCollection may
  # have stopped caching in an incomplete state, e.g. before a window could
  # fire.
  mock_result.set_state(PipelineState.DONE)
  recording.wait_until_finish()

  self.assertFalse(recording.is_computed())
  self.assertFalse(recording.computed())
  self.assertTrue(recording.uncomputed())

  # The background caching job finished before we started a recording which
  # is a sure signal that there will be no more elements.
  bcj_mock_result.set_state(PipelineState.DONE)
  ie.current_env().set_background_caching_job(p, background_caching_job)
  recording = Recording(
      p, [elems],
      mock_result,
      pi.PipelineInstrument(p),
      max_n=10,
      max_duration_secs=60)
  recording.wait_until_finish()

  # There are no more elements and the recording finished, meaning that the
  # intermediate PCollections are in a complete state. They can now be marked
  # as computed.
  self.assertTrue(recording.is_computed())
  self.assertTrue(recording.computed())
  self.assertFalse(recording.uncomputed())
def record(self, pcolls, max_n, max_duration_secs):
  # type: (List[beam.pvalue.PCollection], int, int) -> Recording

  """Records the given PCollections.

  Args:
    pcolls: the PCollections to record. Each one must belong to
      `self.user_pipeline`.
    max_n: forwarded unchanged to the created `Recording`.
    max_duration_secs: forwarded unchanged to the created `Recording`.

  Returns:
    The newly created `Recording`, which is also added to `self._recordings`.

  Raises:
    AssertionError: if any PCollection belongs to a pipeline other than
      `self.user_pipeline`.
  """
  # Message of the deprecation warning that is silenced before the user
  # pipeline's options are accessed.
  options_deprecation_message = (
      'options is deprecated since First stable release. References to '
      '<pipeline>.options will not be supported')

  # Every PCollection has to come from the same user pipeline.
  for candidate in pcolls:
    assert candidate.pipeline is self.user_pipeline, (
        '{} belongs to a different user-defined pipeline ({}) than that of'
        ' other PCollections ({}).'.format(
            candidate, candidate.pipeline, self.user_pipeline))

  # Unwrap the InteractiveRunner to get at the runner it delegates to.
  underlying_runner = self.user_pipeline.runner
  if isinstance(underlying_runner, ir.InteractiveRunner):
    underlying_runner = underlying_runner._underlying_runner

  # Sources without a user reference still have to be cached.
  pi.watch_sources(self.user_pipeline)

  # Watch all PCollections to be shown. An unwatched PCollection gets a
  # made-up variable name; the watch logic can handle arbitrary variables, so
  # no validation is needed here.
  self._watch(pcolls)

  instrument = pi.PipelineInstrument(self.user_pipeline)

  # Try to start the background caching job so that sources get recorded.
  if ie.current_env().is_in_ipython:
    warnings.filterwarnings(
        'ignore',
        options_deprecation_message,
        category=DeprecationWarning)
  bcj.attempt_to_run_background_caching_job(
      underlying_runner,
      self.user_pipeline,
      options=self.user_pipeline.options)

  # PCollections that are already computed do not need to be recomputed.
  already_computed = {
      candidate
      for candidate in pcolls
      if candidate in ie.current_env().computed_pcollections
  }
  pending = set(pcolls) - already_computed

  # Only run a pipeline fragment when something is left to compute.
  result = None
  if pending:
    # The cached data of uncomputed PCollections is incomplete, so clear it.
    self._clear(instrument)

    warnings.filterwarnings(
        'ignore',
        options_deprecation_message,
        category=DeprecationWarning)
    fragment = pf.PipelineFragment(list(pending), self.user_pipeline.options)
    fragment.run()
    result = ie.current_env().pipeline_result(self.user_pipeline)

  new_recording = Recording(
      self.user_pipeline,
      pcolls,
      result,
      instrument,
      max_n,
      max_duration_secs)
  self._recordings.add(new_recording)
  return new_recording
def __init__(self, user_pipeline):
  # type: (beam.Pipeline) -> None

  """Stores the user pipeline and builds a PipelineInstrument for it.

  Args:
    user_pipeline: the pipeline kept as `self.user_pipeline` and passed to
      `pi.PipelineInstrument`.
  """
  self.user_pipeline = user_pipeline
  # Instrument of the user pipeline, kept on the instance.
  self._pipeline_instrument = pi.PipelineInstrument(self.user_pipeline)