def cache_output(output_name: str, output: PValue) -> None:
  """Caches the data of the given output PCollection and visualizes it.

  Runs the output's pipeline to compute and cache the data, marks the
  PCollection as computed, then visualizes the computed data.
  """
  user_pipeline = ie.current_env().user_pipeline(output.pipeline)
  if user_pipeline:
    cache_manager = ie.current_env().get_cache_manager(
        user_pipeline, create_if_absent=True)
  else:
    _LOGGER.warning(
        'Something is wrong with %s. Cannot introspect its data.', output)
    return
  key = CacheKey.from_pcoll(output_name, output).to_str()
  _ = reify_to_cache(pcoll=output, cache_key=key, cache_manager=cache_manager)
  try:
    output.pipeline.run().wait_until_finish()
  except (KeyboardInterrupt, SystemExit):
    raise
  except Exception as e:
    _LOGGER.warning(_NOT_SUPPORTED_MSG, e, output.pipeline.runner)
    return
  ie.current_env().mark_pcollection_computed([output])
  visualize_computed_pcoll(
      output_name, output, max_n=float('inf'), max_duration_secs=float('inf'))
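
# A minimal, illustrative sketch of calling cache_output on a PCollection from
# an interactive pipeline. The names `p` and `squares` are hypothetical, and
# this assumes a notebook/ipython session using InteractiveRunner:
#
#   import apache_beam as beam
#   from apache_beam.runners.interactive.interactive_runner import (
#       InteractiveRunner)
#
#   p = beam.Pipeline(InteractiveRunner())
#   squares = p | beam.Create(range(10)) | beam.Map(lambda x: x * x)
#   # Cache and compute `squares`, mark it computed, and visualize the data.
#   cache_output('squares', squares)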


def show(
    *pcolls,
    include_window_info=False,
    visualize_data=False,
    n='inf',
    duration='inf'):
  # type: (*Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], bool, bool, Union[int, str], Union[int, str]) -> None

  """Shows given PCollections in an interactive exploratory way if used within
  a notebook, or prints a heading and sampled data if used within an ipython
  shell. Noop if used in a non-interactive environment.

  Args:
    include_window_info: (optional) if True, windowing information of the
        data will be visualized too. Default is False.
    visualize_data: (optional) by default, the visualization contains data
        tables rendering data from the given pcolls separately as if they were
        converted into dataframes. If visualize_data is True, a more detailed
        dive-in widget and a statistical overview widget of the data are also
        displayed. Otherwise, those 2 data visualization widgets are not
        displayed.
    n: (optional) max number of elements to visualize. Default 'inf'.
    duration: (optional) max duration of elements to read in integer seconds
        or a string duration. Default 'inf'.

  The given pcolls can be a dictionary of PCollections (as values), an
  iterable of PCollections, or plain PCollection values.

  The user can specify either the max number of elements to read with `n` or
  the maximum duration of elements to read with `duration`. When a limiter is
  not supplied, it is assumed to be infinite.

  By default, the visualization contains data tables rendering data from the
  given pcolls separately as if they were converted into dataframes. If
  visualize_data is True, a more detailed dive-in widget and a statistical
  overview widget of the data are also displayed. Otherwise, those 2 data
  visualization widgets are not displayed.

  Builds an ad hoc pipeline fragment including only the transforms necessary
  to produce data for the given PCollections pcolls, runs the pipeline
  fragment to compute data for those pcolls, and then visualizes the data.

  The function is always blocking. If used within a notebook, the data
  visualized might be dynamically updated before the function returns as more
  and more data gets processed and emitted while the pipeline fragment is
  being executed. If used within an ipython shell, there is no dynamic
  plotting; a static plot is rendered at the end of the pipeline fragment
  execution.

  The PCollections given must belong to the same pipeline.

  For example::

    p = beam.Pipeline(InteractiveRunner())
    init = p | 'Init' >> beam.Create(range(1000))
    square = init | 'Square' >> beam.Map(lambda x: x * x)
    cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)

    # Below builds a pipeline fragment from the defined pipeline `p` that
    # contains only applied transforms of `Init` and `Square`. Then the
    # interactive runner runs the pipeline fragment implicitly to compute data
    # represented by PCollection `square` and visualizes it.
    show(square)

    # This is equivalent to `show(square)` because `square` depends on `init`
    # and `init` is included in the pipeline fragment and computed anyway.
    show(init, square)

    # Below is similar to running `p.run()`. It computes data for both
    # PCollection `square` and PCollection `cube`, then visualizes them.
    show(square, cube)
  """
  flatten_pcolls = []
  for pcoll_container in pcolls:
    if isinstance(pcoll_container, dict):
      flatten_pcolls.extend(pcoll_container.values())
    elif isinstance(pcoll_container, (beam.pvalue.PCollection, DeferredBase)):
      flatten_pcolls.append(pcoll_container)
    else:
      try:
        flatten_pcolls.extend(iter(pcoll_container))
      except TypeError:
        raise ValueError(
            'The given pcoll %s is not a dict, an iterable or a PCollection.' %
            pcoll_container)

  # Iterate through the given PCollections and convert any deferred DataFrames
  # or Series into PCollections.
  pcolls = set()

  # The element type is used to help visualize the given PCollection. For the
  # deferred DataFrame/Series case it is the proxy of the frame.
  element_types = {}
  for pcoll in flatten_pcolls:
    if isinstance(pcoll, DeferredBase):
      pcoll, element_type = deferred_df_to_pcollection(pcoll)
      watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})
    else:
      element_type = pcoll.element_type

    element_types[pcoll] = element_type

    pcolls.add(pcoll)
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

  assert len(pcolls) > 0, (
      'Need at least 1 PCollection to show data visualization.')

  pcoll_pipeline = next(iter(pcolls)).pipeline
  user_pipeline = ie.current_env().user_pipeline(pcoll_pipeline)
  # Possibly showing a PCollection defined in a local scope that is not
  # explicitly watched. Ad hoc watch it though it's a little late.
  if not user_pipeline:
    watch({'anonymous_pipeline_{}'.format(id(pcoll_pipeline)): pcoll_pipeline})
    user_pipeline = pcoll_pipeline

  if isinstance(n, str):
    assert n == 'inf', (
        'Currently only the string \'inf\' is supported. This denotes reading '
        'elements until the recording is stopped via a kernel interrupt.')
  elif isinstance(n, int):
    assert n > 0, 'n needs to be positive or the string \'inf\''

  if isinstance(duration, int):
    assert duration > 0, (
        'duration needs to be positive, a duration string, '
        'or the string \'inf\'')

  if n == 'inf':
    n = float('inf')

  if duration == 'inf':
    duration = float('inf')

  previously_computed_pcolls = {
      pcoll
      for pcoll in pcolls if pcoll in ie.current_env().computed_pcollections
  }
  for pcoll in previously_computed_pcolls:
    visualize_computed_pcoll(
        find_pcoll_name(pcoll),
        pcoll,
        n,
        duration,
        include_window_info=include_window_info,
        display_facets=visualize_data)
  pcolls = pcolls - previously_computed_pcolls

  recording_manager = ie.current_env().get_recording_manager(
      user_pipeline, create_if_absent=True)
  recording = recording_manager.record(pcolls, max_n=n, max_duration=duration)

  # Catch a KeyboardInterrupt to gracefully cancel the recording and
  # visualizations.
  try:
    # If in notebook, static plotting computed pcolls as computation is done.
    if ie.current_env().is_in_notebook:
      for stream in recording.computed().values():
        visualize(
            stream,
            include_window_info=include_window_info,
            display_facets=visualize_data,
            element_type=element_types[stream.pcoll])
    elif ie.current_env().is_in_ipython:
      for stream in recording.computed().values():
        visualize(
            stream,
            include_window_info=include_window_info,
            element_type=element_types[stream.pcoll])

    if recording.is_computed():
      return

    # If in notebook, dynamic plotting as computation goes.
    if ie.current_env().is_in_notebook:
      for stream in recording.uncomputed().values():
        visualize(
            stream,
            dynamic_plotting_interval=1,
            include_window_info=include_window_info,
            display_facets=visualize_data,
            element_type=element_types[stream.pcoll])

    # Invoke wait_until_finish to ensure the blocking nature of this API
    # without relying on the run to be blocking.
    recording.wait_until_finish()

    # If just in ipython shell, plotting once when the computation is
    # completed.
    if ie.current_env().is_in_ipython and not ie.current_env().is_in_notebook:
      for stream in recording.computed().values():
        visualize(stream, include_window_info=include_window_info)
  except KeyboardInterrupt:
    if recording:
      recording.cancel()
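
# A minimal sketch of the `n` and `duration` limiters documented in the
# docstring above. The names `p` and `words` are illustrative only, and the
# same interactive-session and import assumptions as the earlier sketch apply:
#
#   p = beam.Pipeline(InteractiveRunner())
#   words = p | 'MakeWords' >> beam.Create(['a', 'b', 'c'])
#
#   # Read and visualize at most 100 elements.
#   show(words, n=100)
#
#   # Read for at most 60 seconds; the docstring also allows a duration
#   # string, whose exact format is not shown here.
#   show(words, duration=60)
#
#   # PCollections may also be passed as a dict or an iterable.
#   show({'words': words}, include_window_info=True)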