def test_user_pipeline_intact_after_deducing_pipeline_fragment(self, cell):
    """Checks that deducing a fragment leaves the user pipeline proto as is."""

    def snapshot(pipeline):
        # Serialize the pipeline the same way before and after the deduction
        # so that the two protos are directly comparable.
        return pipeline.to_runner_api(
            return_context=False, use_fake_coders=True)

    with cell:  # Cell 1
        p = beam.Pipeline(ir.InteractiveRunner())
        # Register the pipeline explicitly instead of going through locals().
        ib.watch({'p': p})

    with cell:  # Cell 2
        # pylint: disable=range-builtin-not-iterating
        init = p | 'Init' >> beam.Create(range(10))

    with cell:  # Cell 3
        square = init | 'Square' >> beam.Map(lambda x: x * x)

    with cell:  # Cell 4
        cube = init | 'Cube' >> beam.Map(lambda x: x**3)

    # Register every PCollection defined so far, again without locals().
    ib.watch({'init': init, 'square': square, 'cube': cube})

    proto_before = snapshot(p)
    # Only the side effects (if any) of the deduction matter here; the
    # deduced fragment itself is discarded.
    pf.PipelineFragment([square]).deduce_fragment()
    proto_after = snapshot(p)

    assert_pipeline_proto_equal(self, proto_before, proto_after)
def test_pipeline_composites(self, cell):
    """Verifies that a fragment can be run for the output of a composite."""
    with cell:  # Cell 1
        p = beam.Pipeline(ir.InteractiveRunner())
        ib.watch({'p': p})

    with cell:  # Cell 2
        # pylint: disable=bad-option-value
        init = p | 'Init' >> beam.Create(range(5))

    with cell:  # Cell 3
        # Foo applies Bar internally, i.e. a composite nested in a composite,
        # so that every transform underneath a composite has to be picked up.

        @beam.ptransform_fn
        def Bar(pcoll):
            return pcoll | beam.Map(lambda n: 2 * n)

        @beam.ptransform_fn
        def Foo(pcoll):
            tripled = pcoll | beam.Map(lambda n: 3 * n)
            stringified = pcoll | beam.Map(str)
            doubled = tripled | Bar()
            return {'pc1': tripled, 'pc2': stringified, 'bar': doubled}

        outputs = init | Foo()

        ib.watch(outputs)

    target = outputs['bar']
    fragment_result = pf.PipelineFragment([target]).run()

    # Each input n in range(5) is tripled by Foo and then doubled by Bar.
    produced = list(fragment_result.get(target))
    self.assertEqual([0, 6, 12, 18, 24], produced)
def record(self, pcolls, max_n, max_duration):
    # type: (List[beam.pvalue.PCollection], int, Union[int,str]) -> Recording

    """Records the given PCollections.

    Args:
      pcolls: the PCollections to record. Each of them must belong to
        `self.user_pipeline`.
      max_n: forwarded unchanged to the created `Recording`.
      max_duration: a string other than 'inf' is parsed with
        `pd.to_timedelta` and converted to seconds; any other value
        (including the string 'inf') is forwarded unchanged.

    Returns:
      The `Recording` created for `pcolls`. It is also added to
      `self._recordings` before being returned.

    Raises:
      AssertionError: if a PCollection belongs to a pipeline other than
        `self.user_pipeline`.
    """

    # Assert that all PCollection come from the same user_pipeline.
    for pcoll in pcolls:
        assert pcoll.pipeline is self.user_pipeline, (
            '{} belongs to a different user-defined pipeline ({}) than that of'
            ' other PCollections ({}).'.format(
                pcoll, pcoll.pipeline, self.user_pipeline))

    if isinstance(max_duration, str) and max_duration != 'inf':
        max_duration_secs = pd.to_timedelta(max_duration).total_seconds()
    else:
        max_duration_secs = max_duration

    # Make sure that all PCollections to be shown are watched. If a PCollection
    # has not been watched, make up a variable name for that PCollection and
    # watch it. No validation is needed here because the watch logic can handle
    # arbitrary variables.
    self._watch(pcolls)

    # Built exactly once: the instance is shared by the cache clearing below
    # and by the Recording that is returned.
    pipeline_instrument = pi.PipelineInstrument(self.user_pipeline)
    self.record_pipeline()

    # Get the subset of computed PCollections. These do not need to be
    # recomputed.
    computed_pcolls = set(
        pcoll for pcoll in pcolls
        if pcoll in ie.current_env().computed_pcollections)

    # Start a pipeline fragment to start computing the PCollections.
    uncomputed_pcolls = set(pcolls).difference(computed_pcolls)
    if uncomputed_pcolls:
        # Clear the cache of the given uncomputed PCollections because they are
        # incomplete.
        self._clear(pipeline_instrument)

        warnings.filterwarnings(
            'ignore',
            'options is deprecated since First stable release. References to '
            '<pipeline>.options will not be supported',
            category=DeprecationWarning)
        pf.PipelineFragment(
            list(uncomputed_pcolls), self.user_pipeline.options).run()
        result = ie.current_env().pipeline_result(self.user_pipeline)
    else:
        result = None

    recording = Recording(
        self.user_pipeline,
        pcolls,
        result,
        pipeline_instrument,
        max_n,
        max_duration_secs)
    self._recordings.add(recording)

    return recording
def record(self, pcolls, max_n, max_duration_secs):
    # type: (List[beam.pvalue.PCollection], int, int) -> Recording

    """Records the given PCollections.

    Every PCollection must belong to `self.user_pipeline`. PCollections not
    yet marked as computed in the current interactive environment are computed
    by running a pipeline fragment; the result of that run (or None when
    nothing had to run) is handed to the returned `Recording`.
    """
    owner = self.user_pipeline

    # All PCollections have to come from the one user pipeline.
    for candidate in pcolls:
        assert candidate.pipeline is owner, (
            '{} belongs to a different user-defined pipeline ({}) than that of'
            ' other PCollections ({}).'.format(
                candidate, candidate.pipeline, self.user_pipeline))

    runner = self.user_pipeline.runner
    runner = (
        runner._underlying_runner
        if isinstance(runner, ir.InteractiveRunner) else runner)

    # Sources the user holds no reference to still need to be cached.
    pi.watch_sources(self.user_pipeline)

    # Anything not watched yet gets watched under a made-up name; the watch
    # logic copes with arbitrary variables, so no validation is done here.
    self._watch(pcolls)

    # Try to start the background caching job so that sources get recorded.
    if ie.current_env().is_in_ipython:
        warnings.filterwarnings(
            'ignore',
            'options is deprecated since First stable release. References to '
            '<pipeline>.options will not be supported',
            category=DeprecationWarning)
    bcj.attempt_to_run_background_caching_job(
        runner, self.user_pipeline, options=self.user_pipeline.options)

    # PCollections that are already computed do not need to be recomputed.
    already_computed = set()
    for candidate in pcolls:
        if candidate in ie.current_env().computed_pcollections:
            already_computed.add(candidate)
    pending = set(pcolls).difference(already_computed)

    result = None
    if pending:
        # Whatever is cached for the pending PCollections is incomplete.
        self.clear(pending)

        warnings.filterwarnings(
            'ignore',
            'options is deprecated since First stable release. References to '
            '<pipeline>.options will not be supported',
            category=DeprecationWarning)
        fragment = pf.PipelineFragment(
            list(pending), self.user_pipeline.options)
        result = fragment.run()
        ie.current_env().set_pipeline_result(self.user_pipeline, result)

    return Recording(
        self.user_pipeline,
        pcolls,
        result,
        self._pipeline_instrument,
        max_n,
        max_duration_secs)
def instrumented_pipeline_proto(self):
    """Always returns a new instance of portable instrumented proto.

    The targets are the keys of `self._runner_pcoll_to_user_pcoll` plus
    `self._extended_targets`, minus `self._ignored_targets`. When no target
    is left, the whole `self._pipeline` is converted instead.
    """
    wanted = set(self._runner_pcoll_to_user_pcoll.keys())
    wanted.update(self._extended_targets)
    remaining = wanted.difference(self._ignored_targets)

    if not remaining:
        return self._pipeline.to_runner_api()

    # Deducing a fragment prunes upstream transforms that do not contribute
    # to the targets the instrumented pipeline run cares about.
    fragment = pf.PipelineFragment(list(remaining)).deduce_fragment()
    return fragment.to_runner_api()
def test_pipeline_fragment_produces_correct_data(self, cell):
    """Runs a fragment built for `square` and checks the data read back.

    The pipeline also has a 'Cube' branch that is not passed to the fragment;
    only the values read for `square` are asserted.
    """
    with cell:  # Cell 1
        p = beam.Pipeline(ir.InteractiveRunner())
        ib.watch({'p': p})

    with cell:  # Cell 2
        # pylint: disable=range-builtin-not-iterating
        init = p | 'Init' >> beam.Create(range(5))

    with cell:  # Cell 3
        square = init | 'Square' >> beam.Map(lambda x: x * x)
        _ = init | 'Cube' >> beam.Map(lambda x: x**3)

    # locals() hands every local name bound so far (including `_`) to
    # ib.watch, so renaming or adding locals above changes what gets watched.
    ib.watch(locals())
    result = pf.PipelineFragment([square]).run()
    # Squares of range(5).
    self.assertEqual([0, 1, 4, 9, 16], list(result.get(square)))
def test_fragment_does_not_prune_teststream(self):
    """Checks that deducing a fragment keeps the TestStream composite parts."""
    streaming_options = StandardOptions(streaming=True)
    pipeline = beam.Pipeline(ir.InteractiveRunner(), streaming_options)
    stream = pipeline | TestStream(output_tags=['a', 'b'])

    # Branch 'a' is applied to the pipeline but left out of the fragment.
    # pylint: disable=unused-variable
    branch_a = stream['a'] | 'a' >> beam.Map(lambda elem: elem)
    branch_b = stream['b'] | 'b' >> beam.Map(lambda elem: elem)

    # Had the fragment pruned the TestStream composite parts, the resulting
    # graph would be invalid and the conversion below would raise.
    pf.PipelineFragment([branch_b]).deduce_fragment().to_runner_api()
def test_build_pipeline_fragment(self, cell):
    """Compares the fragment deduced for `square` with a hand-built pipeline.

    `p` holds 'Init', 'Square' and 'Cube'; `p_expected` holds only 'Init' and
    'Square'. The fragment deduced from `p` for `square` is then compared
    against `p_expected` through `assert_pipeline_equal`.
    """
    with cell:  # Cell 1
        p = beam.Pipeline(ir.InteractiveRunner())
        p_expected = beam.Pipeline(ir.InteractiveRunner())
        # Watch local scope now to allow interactive beam to track the
        # pipelines.
        ib.watch(locals())

    with cell:  # Cell 2
        # pylint: disable=range-builtin-not-iterating
        init = p | 'Init' >> beam.Create(range(10))
        init_expected = p_expected | 'Init' >> beam.Create(range(10))

    with cell:  # Cell 3
        square = init | 'Square' >> beam.Map(lambda x: x * x)
        _ = init | 'Cube' >> beam.Map(lambda x: x**3)
        # `_` is rebound here, so the locals() snapshot below sees the
        # expected pipeline's 'Square' output under that name.
        _ = init_expected | 'Square' >> beam.Map(lambda x: x * x)

    # Watch every PCollection that has been defined so far in local scope.
    # The local names are what locals() passes on, so they are part of the
    # test setup and should not be renamed casually.
    ib.watch(locals())
    fragment = pf.PipelineFragment([square]).deduce_fragment()
    assert_pipeline_equal(self, p_expected, fragment)
def head(pcoll, n=5, include_window_info=False):
    """Materializes the first n elements from a PCollection into a Dataframe.

    Elements are pulled one at a time and reading stops as soon as `n` of
    them have been collected; a non-positive `n` reads everything.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(10))
      square = init | 'Square' >> beam.Map(lambda x: x * x)

      # Run the pipeline and bring the PCollection into memory as a Dataframe.
      in_memory_square = head(square, n=5)
    """
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    user_pipeline = pcoll.pipeline
    runner = user_pipeline.runner
    runner = (
        runner._underlying_runner
        if isinstance(runner, ir.InteractiveRunner) else runner)

    # Sources the user holds no reference to still need to be cached.
    pi.watch_sources(user_pipeline)

    # Collect every PCollection that is currently watched. If the requested
    # one is not among them, watch it under a made-up name; the watch logic
    # copes with arbitrary variables, so no validation is needed.
    already_watched = {
        val
        for watching in ie.current_env().watching()
        for _, val in watching
        if hasattr(val, '__class__') and
        isinstance(val, beam.pvalue.PCollection)
    }
    if pcoll not in already_watched:
        watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})

    warnings.filterwarnings('ignore', category=DeprecationWarning)
    # With the user-defined pipeline at hand, try to start the background
    # caching job.
    bcj.attempt_to_run_background_caching_job(
        runner, user_pipeline, user_pipeline.options)

    if pcoll in ie.current_env().computed_pcollections:
        # Already computed: read the elements back from the PCollection cache.
        instrument = pi.PipelineInstrument(pcoll.pipeline)
        cache_key = instrument.cache_key(pcoll)
        cache_manager = ie.current_env().cache_manager()
        coder = cache_manager.load_pcoder('full', cache_key)
        reader, _ = cache_manager.read('full', cache_key)
        elements = to_element_list(reader, coder, include_window_info=True)
    else:
        # Not computed yet: build a pipeline fragment for it and run that.
        result = pf.PipelineFragment([pcoll], user_pipeline.options).run()
        ie.current_env().set_pipeline_result(user_pipeline, result)

        # Block explicitly instead of relying on run() being blocking.
        result.wait_until_finish()

        # On success remember that the PCollection is computed so that later
        # invocations do not need to compute it again.
        if result.state is beam.runners.runner.PipelineState.DONE:
            ie.current_env().mark_pcollection_computed([pcoll])

        elements = result.read(pcoll, include_window_info=True)

    # Take at most n elements; the limit only applies when n is positive.
    collected = []
    for element in elements:
        collected.append(element)
        if n > 0 and len(collected) >= n:
            break

    return elements_to_df(collected, include_window_info=include_window_info)
def show(*pcolls, **configs):
    # type: (*Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], **bool) -> None

    """Shows given PCollections in an interactive exploratory way if used within
    a notebook, or prints a heading sampled data if used within an ipython shell.
    Noop if used in a non-interactive environment.

    Each positional argument can be a dictionary of PCollections (as values),
    an iterable of PCollections or a plain PCollection.

    There are 2 boolean configurations:

      #. include_window_info=<True/False>. If True, windowing information of
         the data will be visualized too. Default is false.
      #. visualize_data=<True/False>. By default, the visualization contains
         data tables rendering data from given pcolls separately as if they
         are converted into dataframes. If visualize_data is True, there will
         be a more dive-in widget and statistically overview widget of the
         data. Otherwise, those 2 data visualization widgets will not be
         displayed.

    Ad hoc builds a pipeline fragment including only transforms that are
    necessary to produce data for given PCollections pcolls, runs the pipeline
    fragment to compute data for those pcolls and then visualizes the data.

    The function is always blocking. If used within a notebook, the data
    visualized might be dynamically updated before the function returns as
    more and more data could be getting processed and emitted while the
    pipeline fragment is being executed. If used within an ipython shell,
    there will be no dynamic plotting but a static plotting at the end of the
    pipeline fragment execution.

    The PCollections given must belong to the same pipeline.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(1000))
      square = init | 'Square' >> beam.Map(lambda x: x * x)
      cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)

      # Below builds a pipeline fragment from the defined pipeline `p` that
      # contains only applied transforms of `Init` and `Square`. Then the
      # interactive runner runs the pipeline fragment implicitly to compute
      # data represented by PCollection `square` and visualizes it.
      show(square)

      # This is equivalent to `show(square)` because `square` depends on
      # `init` and `init` is included in the pipeline fragment and computed
      # anyway.
      show(init, square)

      # Below is similar to running `p.run()`. It computes data for both
      # PCollection `square` and PCollection `cube`, then visualizes them.
      show(square, cube)
    """
    # Flatten dicts, iterables and plain PCollections into one list.
    flattened = []
    for container in pcolls:
        if isinstance(container, dict):
            flattened.extend(container.values())
        elif isinstance(container, beam.pvalue.PCollection):
            flattened.append(container)
        else:
            try:
                flattened.extend(iter(container))
            except TypeError:
                raise ValueError(
                    'The given pcoll %s is not a dict, an iterable or a '
                    'PCollection.' % container)
    pcolls = flattened

    assert len(pcolls) > 0, (
        'Need at least 1 PCollection to show data visualization.')
    for item in pcolls:
        assert isinstance(item, beam.pvalue.PCollection), (
            '{} is not an apache_beam.pvalue.PCollection.'.format(item))

    user_pipeline = pcolls[0].pipeline
    for item in pcolls:
        assert item.pipeline is user_pipeline, (
            '{} belongs to a different user-defined pipeline ({}) than that of'
            ' other PCollections ({}).'.format(
                item, item.pipeline, user_pipeline))

    # TODO(BEAM-8288): Remove below pops and assertion once Python 2 is
    # deprecated from Beam.
    include_window_info = configs.pop('include_window_info', False)
    visualize_data = configs.pop('visualize_data', False)
    # Anything left over is an unsupported config. The assertion protects
    # backward compatibility for the signature change planned after the
    # Python 2 deprecation.
    assert not configs, (
        'The only configs supported are include_window_info and '
        'visualize_data.')

    runner = user_pipeline.runner
    runner = (
        runner._underlying_runner
        if isinstance(runner, ir.InteractiveRunner) else runner)

    # Sources the user holds no reference to still need to be cached.
    pi.watch_sources(user_pipeline)

    # Collect every watched PCollection, then watch whichever of the given
    # ones is missing under a made-up name. The watch logic copes with
    # arbitrary variables, so no validation is needed.
    already_watched = {
        val
        for watching in ie.current_env().watching()
        for _, val in watching
        if hasattr(val, '__class__') and
        isinstance(val, beam.pvalue.PCollection)
    }
    for item in pcolls:
        if item not in already_watched:
            watch({'anonymous_pcollection_{}'.format(id(item)): item})

    if ie.current_env().is_in_ipython:
        warnings.filterwarnings(
            'ignore',
            'options is deprecated since First stable release. References to '
            '<pipeline>.options will not be supported',
            category=DeprecationWarning)
    # With the user-defined pipeline at hand, try to start the background
    # caching job.
    bcj.attempt_to_run_background_caching_job(
        runner, user_pipeline, user_pipeline.options)

    # Split the request into what is already computed and what still has to
    # be computed.
    pcolls = set(pcolls)
    computed_pcolls = {
        item for item in pcolls
        if item in ie.current_env().computed_pcollections
    }
    pcolls = pcolls.difference(computed_pcolls)

    # Computed PCollections are plotted statically since nothing will change.
    if ie.current_env().is_in_notebook:
        for item in computed_pcolls:
            visualize(
                item,
                include_window_info=include_window_info,
                display_facets=visualize_data)
    elif ie.current_env().is_in_ipython:
        for item in computed_pcolls:
            visualize(item, include_window_info=include_window_info)

    if not pcolls:
        return

    # Build a pipeline fragment for the remaining PCollections and run it.
    result = pf.PipelineFragment(list(pcolls), user_pipeline.options).run()
    ie.current_env().set_pipeline_result(user_pipeline, result)

    # In a notebook, plot dynamically while the computation goes on.
    if ie.current_env().is_in_notebook:
        for item in pcolls:
            visualize(
                item,
                dynamic_plotting_interval=1,
                include_window_info=include_window_info,
                display_facets=visualize_data)

    # Block explicitly instead of relying on run() being blocking.
    result.wait_until_finish()

    # In a plain ipython shell, plot once after the computation completed.
    if ie.current_env().is_in_ipython and not ie.current_env().is_in_notebook:
        for item in pcolls:
            visualize(item, include_window_info=include_window_info)

    # On success remember that the PCollections are computed so that further
    # `show` invocations do not need to compute them again.
    if result.state is beam.runners.runner.PipelineState.DONE:
        ie.current_env().mark_pcollection_computed(pcolls)
def show(*pcolls):
    """Visualizes given PCollections in an interactive exploratory way if used
    within a notebook, or prints a heading sampled data if used within an
    ipython shell. Noop if used in a non-interactive environment.

    Ad hoc builds a pipeline fragment including only transforms that are
    necessary to produce data for given PCollections pcolls, runs the pipeline
    fragment to compute data for those pcolls and then visualizes the data.

    The function is always blocking. If used within a notebook, the data
    visualized might be dynamically updated before the function returns as
    more and more data could be getting processed and emitted while the
    pipeline fragment is being executed. If used within an ipython shell,
    there will be no dynamic plotting but a static plotting at the end of the
    pipeline fragment execution.

    The PCollections given must belong to the same pipeline.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(1000))
      square = init | 'Square' >> beam.Map(lambda x: x * x)
      cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)

      # Below builds a pipeline fragment from the defined pipeline `p` that
      # contains only applied transforms of `Init` and `Square`. Then the
      # interactive runner runs the pipeline fragment implicitly to compute
      # data represented by PCollection `square` and visualizes it.
      show(square)

      # This is equivalent to `show(square)` because `square` depends on
      # `init` and `init` is included in the pipeline fragment and computed
      # anyway.
      show(init, square)

      # Below is similar to running `p.run()`. It computes data for both
      # PCollection `square` and PCollection `cube`, then visualizes them.
      show(square, cube)
    """
    assert len(pcolls) > 0, (
        'Need at least 1 PCollection to show data visualization.')
    for item in pcolls:
        assert isinstance(item, beam.pvalue.PCollection), (
            '{} is not an apache_beam.pvalue.PCollection.'.format(item))

    user_pipeline = pcolls[0].pipeline
    for item in pcolls:
        assert item.pipeline is user_pipeline, (
            '{} belongs to a different user-defined pipeline ({}) than that of'
            ' other PCollections ({}).'.format(
                item, item.pipeline, user_pipeline))

    runner = user_pipeline.runner
    runner = (
        runner._underlying_runner
        if isinstance(runner, ir.InteractiveRunner) else runner)

    # Collect every watched PCollection, then watch whichever of the given
    # ones is missing under a name derived from its string form. The watch
    # logic copes with arbitrary variables, so no validation is needed.
    already_watched = {
        val
        for watching in ie.current_env().watching()
        for _, val in watching
        if hasattr(val, '__class__') and
        isinstance(val, beam.pvalue.PCollection)
    }
    for item in pcolls:
        if item not in already_watched:
            made_up_name = re.sub(r'[\[\]\(\)]', '_', str(item))
            watch({made_up_name: item})

    # With the user-defined pipeline at hand, try to start the background
    # caching job.
    bcj.attempt_to_run_background_caching_job(runner, user_pipeline)

    # Build a pipeline fragment for the PCollections and run it.
    result = pf.PipelineFragment(list(pcolls)).run()
    ie.current_env().set_pipeline_result(user_pipeline, result)

    # In a notebook, plot dynamically while the computation goes on.
    if ie.current_env().is_in_notebook:
        for item in pcolls:
            visualize(item, dynamic_plotting_interval=1)

    # Block explicitly instead of relying on run() being blocking.
    result.wait_until_finish()

    # In a plain ipython shell, plot once after the computation completed.
    if ie.current_env().is_in_ipython and not ie.current_env().is_in_notebook:
        for item in pcolls:
            visualize(item)

    # On success remember that the PCollections are computed so that further
    # `show` invocations do not need to compute them again.
    if result.state is beam.runners.runner.PipelineState.DONE:
        ie.current_env().mark_pcollection_computed(pcolls)