def test_test_stream_payload_events(self):
    """Verify that to_element_list caps the count inside a single bundle."""
    coder = coders.FastPrimitivesCoder()

    def reader():
        # Pack ten encoded elements into one AddElements event.
        timestamped = []
        for value in range(10):
            holder = WindowedValueHolder(WindowedValue(value, 0, []))
            timestamped.append(
                TestStreamPayload.TimestampedElement(
                    encoded_element=coder.encode(holder),
                    timestamp=Timestamp.of(0).micros))
        add_elements = TestStreamPayload.Event.AddElements(
            elements=timestamped)
        yield TestStreamPayload.Event(element_event=add_elements)

    # The single payload carries 10 elements but only 5 are requested, so the
    # limit has to be enforced within one bundle.
    limited = utils.to_element_list(
        reader(), coder, include_window_info=False, n=5)
    self.assertSequenceEqual(list(limited), list(range(5)))
def test_element_limit_count(self):
    """Verify that to_element_list stops after the requested count."""
    source = iter(range(10))
    # No coder is passed; only the first 5 of the 10 values are expected back.
    limited = utils.to_element_list(
        source, None, include_window_info=False, n=5)
    actual = list(limited)
    self.assertSequenceEqual(actual, list(range(5)))
def _to_dataframe(self):
    """Convert the cached elements for ``self._cache_key`` via elements_to_df.

    When nothing is cached under the key, an empty list is converted instead.
    """
    cache = ie.current_env().cache_manager()
    if cache.exists('full', self._cache_key):
        coder = cache.load_pcoder('full', self._cache_key)
        reader, _ = cache.read('full', self._cache_key)
        # Window info is always decoded here; whether it is kept is decided by
        # the flag handed to elements_to_df below.
        rows = list(to_element_list(reader, coder, include_window_info=True))
    else:
        rows = []
    return elements_to_df(rows, self._include_window_info)
def read(self, tail=True):
    # type: (bool) -> Any

    """Yields the items currently recorded in the cache for this PCollection.

    Iteration stops when either limiter triggers or the cache reader is
    exhausted. ``self._done`` is set once a limiter has triggered or the
    pipeline is reported as terminated.

    Args:
      tail: forwarded unchanged to the cache manager's ``read`` call.
    """
    cache_manager = ie.current_env().get_cache_manager(self._pipeline)

    # The coder stored for this PCollection decodes what is read from cache.
    coder = cache_manager.load_pcoder('full', self._cache_key)

    # Imported locally to prevent a circular import.
    from apache_beam.runners.interactive.options.capture_limiters import CountLimiter
    from apache_beam.runners.interactive.options.capture_limiters import ProcessingTimeLimiter

    reader, _ = cache_manager.read('full', self._cache_key, tail=tail)

    count_limiter = CountLimiter(self._n)
    time_limiter = ProcessingTimeLimiter(self._duration_secs)
    limiters = (count_limiter, time_limiter)

    def limit_reached():
        return any(limiter.is_triggered() for limiter in limiters)

    # A single TestStreamFileRecord can yield multiple elements, so the count
    # is limited again inside to_element_list through ``n``.
    #
    # The loop ends either because a limiter triggered or because everything
    # in the cache was read. In the latter case the pipeline may still be
    # running, so a later call to `read` can yield new elements.
    records = utils.to_element_list(
        reader,
        coder,
        include_window_info=True,
        n=self._n,
        include_time_events=True)
    for record in records:
        # With include_time_events the stream mixes TestStreamPayload.Events
        # with decoded elements; only decoded elements count toward the
        # element limit, while events feed the time limiter.
        if isinstance(record, TestStreamPayload.Event):
            limiter = time_limiter
        else:
            limiter = count_limiter
        limiter.update(record)

        yield record

        if limit_reached():
            break

    # A triggered limiter means the user's request has been fulfilled, so
    # reading the cache again won't yield new elements. WLOG, the same holds
    # when the user pipeline has been terminated.
    if limit_reached() or ie.current_env().is_terminated(self._pipeline):
        self._done = True
def read(self, pcoll, include_window_info=False):
    """Reads the PCollection one element at a time from cache.

    If include_window_info is True, the elements are returned as
    WindowedValues; otherwise each element is returned as itself.

    Raises:
      ValueError: if nothing is cached for the given PCollection.
    """
    cache_key = self._pipeline_instrument.cache_key(pcoll)
    cache = ie.current_env().cache_manager()

    # Guard: without a cache entry there is nothing to read.
    if not cache.exists('full', cache_key):
        raise ValueError('PCollection not available, please run the pipeline.')

    coder = cache.load_pcoder('full', cache_key)
    reader, _ = cache.read('full', cache_key)
    return to_element_list(reader, coder, include_window_info)
def read(self, tail=True):
    # type: (bool) -> Any

    """Yields the elements currently recorded in the cache.

    Iteration stops when any limiter triggers or the cache reader is
    exhausted. ``self._done`` is set once a limiter has triggered or the
    pipeline is reported as terminated.

    Args:
      tail: forwarded unchanged to the cache manager's ``read`` call.
    """
    cache_manager = ie.current_env().get_cache_manager(self._pipeline)

    # The coder stored for this PCollection decodes what is read from cache.
    coder = cache_manager.load_pcoder('full', self._cache_key)

    count_limiter = CountLimiter(self._n)
    time_limiter = ProcessingTimeLimiter(self._duration_secs)
    limiters = [count_limiter, time_limiter]

    reader, _ = cache_manager.read('full', self._cache_key, tail=tail)

    def limit_reached():
        return any(limiter.is_triggered() for limiter in limiters)

    # A single TestStreamFileRecord can yield multiple elements, so the count
    # is limited again inside to_element_list through ``n``.
    #
    # The loop ends either because a limiter triggered or because everything
    # in the cache was read. In the latter case the pipeline may still be
    # running, so a later call to `read` can yield new elements.
    records = utils.to_element_list(
        reader, coder, include_window_info=True, n=self._n)
    for record in records:
        # Every limiter observes every record before it is handed out.
        for limiter in limiters:
            limiter.update(record)

        yield record

        if limit_reached():
            break

    # A triggered limiter means the user's request has been fulfilled, so
    # reading the cache again won't yield new elements. WLOG, the same holds
    # when the user pipeline has been terminated.
    if limit_reached() or ie.current_env().is_terminated(self._pipeline):
        self._done = True
def head(pcoll, n=5, include_window_info=False):
    """Materializes the first n elements from a PCollection into a Dataframe.

    This reads each element from file and reads only the amount that it needs
    into memory.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(10))
      square = init | 'Square' >> beam.Map(lambda x: x * x)

      # Run the pipeline and bring the PCollection into memory as a Dataframe.
      in_memory_square = head(square, n=5)

    Args:
      pcoll: the PCollection to materialize.
      n: maximum number of elements to read. A value of zero or less disables
        the limit, so every available element is read.
      include_window_info: forwarded to ``elements_to_df``.

    Returns:
      Whatever ``elements_to_df`` builds from the collected elements.

    Raises:
      AssertionError: if ``pcoll`` is not a ``beam.pvalue.PCollection``.
    """
    assert isinstance(pcoll, beam.pvalue.PCollection), (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    user_pipeline = pcoll.pipeline
    runner = user_pipeline.runner
    if isinstance(runner, ir.InteractiveRunner):
        runner = runner._underlying_runner

    # Make sure that sources without a user reference are still cached.
    pi.watch_sources(user_pipeline)

    # Make sure that all PCollections to be shown are watched. If a PCollection
    # has not been watched, make up a variable name for that PCollection and
    # watch it. No validation is needed here because the watch logic can handle
    # arbitrary variables.
    watched_pcollections = set()
    for watching in ie.current_env().watching():
        for _, val in watching:
            # isinstance alone is sufficient; the former
            # hasattr(val, '__class__') guard was redundant.
            if isinstance(val, beam.pvalue.PCollection):
                watched_pcollections.add(val)

    if pcoll not in watched_pcollections:
        watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})

    # NOTE: this installs a process-wide filter, so DeprecationWarnings stay
    # suppressed after this call returns.
    warnings.filterwarnings('ignore', category=DeprecationWarning)

    # Attempt to run background caching job since we have the reference to the
    # user-defined pipeline.
    bcj.attempt_to_run_background_caching_job(
        runner, user_pipeline, user_pipeline.options)

    if pcoll in ie.current_env().computed_pcollections:
        # Read from pcoll cache, then convert to DF.
        pipeline_instrument = pi.PipelineInstrument(pcoll.pipeline)
        key = pipeline_instrument.cache_key(pcoll)
        cache_manager = ie.current_env().cache_manager()

        coder = cache_manager.load_pcoder('full', key)
        reader, _ = cache_manager.read('full', key)
        elements = to_element_list(reader, coder, include_window_info=True)
    else:
        # Build a pipeline fragment for the PCollections and run it.
        result = pf.PipelineFragment([pcoll], user_pipeline.options).run()
        ie.current_env().set_pipeline_result(user_pipeline, result)

        # Invoke wait_until_finish to ensure the blocking nature of this API
        # without relying on the run to be blocking.
        result.wait_until_finish()

        # If the pipeline execution is successful at this stage, mark the
        # computation completeness for the given PCollections so that when
        # further `show` invocation occurs, Interactive Beam wouldn't need to
        # re-compute.
        # Compare by equality rather than identity: an equal state value that
        # is a distinct object must still count as DONE.
        if result.state == beam.runners.runner.PipelineState.DONE:
            ie.current_env().mark_pcollection_computed([pcoll])

        elements = result.read(pcoll, include_window_info=True)

    # Pull elements one at a time and stop as soon as n have been collected,
    # so nothing beyond the requested amount is read. A non-positive n never
    # triggers the break, i.e. everything is read.
    results = []
    for element in elements:
        results.append(element)
        if 0 < n <= len(results):
            break

    return elements_to_df(results, include_window_info=include_window_info)