def test_parse_windowedvalue_with_dicts(self):
    """Checks that dict-valued WindowedValues become one column per key.

    The expected frame holds NaN where an element lacks a key, followed by
    the 'event_time', 'windows' and 'pane_info' columns requested through
    include_window_info=True.
    """
    from apache_beam.transforms.window import GlobalWindow

    first = WindowedValue({'b': 2, 'd': 4}, 1, [GlobalWindow()])
    second = WindowedValue({'a': 1, 'b': 2, 'c': 3}, 1, [GlobalWindow()])
    els = [first, second]

    actual_df = utils.elements_to_df(els, include_window_info=True)

    expected_rows = [
        [np.nan, 2, np.nan, 4, int(1e6), first.windows, first.pane_info],
        [1, 2, 3, np.nan, int(1e6), second.windows, second.pane_info],
    ]
    expected_columns = [
        'a', 'b', 'c', 'd', 'event_time', 'windows', 'pane_info'
    ]
    expected_df = pd.DataFrame(expected_rows, columns=expected_columns)
    pd.testing.assert_frame_equal(actual_df, expected_df)
def create_split_across_windows(self, primary_windows, residual_windows):
    """Builds the (primary, residual) split results for the given windows.

    Each side wraps the same element payload in a WindowedValue with
    timestamp 57. A side whose window collection is falsy is returned as
    None instead.
    """
    def _payload():
        # A fresh payload per side, as each side builds its own OffsetRange.
        restriction = (OffsetRange(0, 100), self.watermark_estimator_state)
        return (('a', restriction), 100)

    primary = None
    if primary_windows:
        primary = SplitResultPrimary(
            primary_value=WindowedValue(_payload(), 57, primary_windows))

    residual = None
    if residual_windows:
        residual = SplitResultResidual(
            residual_value=WindowedValue(_payload(), 57, residual_windows),
            current_watermark=None,
            deferred_timestamp=None)

    return primary, residual
def _output_as_events(self): """Outputs buffered elements as TestStream events. """ if self.timing_events: yield WindowedValue( self.timing_events, timestamp=0, windows=[beam.window.GlobalWindow()]) if self.elements: yield WindowedValue([ElementEvent(self.elements)], timestamp=0, windows=[beam.window.GlobalWindow()])
def test_count_limiter_with_dataframes(self):
    """Checks that CountLimiter(5) ignores empty frames and trips on 10 rows."""
    limiter = CountLimiter(5)

    # Empty dataframes must not contribute to the count.
    for _ in range(10):
        limiter.update(WindowedValue(pd.DataFrame(), 0, []))
    self.assertFalse(limiter.is_triggered())

    # A single ten-row dataframe is enough to exceed the limit of five.
    ten_rows = pd.DataFrame({'col': list(range(10))})
    limiter.update(WindowedValue(ten_rows, 0, []))
    self.assertTrue(limiter.is_triggered())
def create_split_in_window(self, offset_index, windows):
    """Builds the (primary, residual) split results for a split at an offset.

    The primary carries OffsetRange(0, offset_index) together with the stored
    watermark estimator state. The residual carries
    OffsetRange(offset_index, 100) together with the estimator's current
    state and watermark. Both use timestamp 57 and the given windows.
    """
    primary_restriction = (
        OffsetRange(0, offset_index), self.watermark_estimator_state)
    primary = SplitResultPrimary(
        primary_value=WindowedValue(
            (('a', primary_restriction), offset_index), 57, windows))

    residual_restriction = (
        OffsetRange(offset_index, 100),
        self.watermark_estimator.get_estimator_state())
    residual = SplitResultResidual(
        residual_value=WindowedValue(
            (('a', residual_restriction), 100 - offset_index), 57, windows),
        current_watermark=self.watermark_estimator.current_watermark(),
        deferred_timestamp=None)

    return (primary, residual)
def test_parse_windowedvalue(self):
    """Checks that WindowedValues are unwrapped without window columns.

    With include_window_info=False the expected frame contains only the
    element values, under the positional column labels 0 and 1.
    """
    from apache_beam.transforms.window import GlobalWindow

    els = [
        WindowedValue(pair, 1, [GlobalWindow()])
        for pair in (('a', 2), ('b', 3))
    ]

    actual_df = utils.elements_to_df(els, include_window_info=False)

    expected_rows = [['a', 2], ['b', 3]]
    expected_df = pd.DataFrame(expected_rows, columns=[0, 1])
    pd.testing.assert_frame_equal(actual_df, expected_df)
def _process_outputs(self, windowed_input_element, results): """Dispatch the result of computation to the appropriate receivers. A value wrapped in a SideOutputValue object will be unwrapped and then dispatched to the appropriate indexed output. """ if results is None: return for result in results: tag = None if isinstance(result, SideOutputValue): tag = result.tag if not isinstance(tag, basestring): raise TypeError('In %s, tag %s is not a string' % (self, tag)) result = result.value if isinstance(result, WindowedValue): windowed_value = result if (windowed_input_element is not None and len(windowed_input_element.windows) != 1): windowed_value.windows *= len( windowed_input_element.windows) elif windowed_input_element is None: # Start and finish have no element from which to grab context, # but may emit elements. if isinstance(result, TimestampedValue): value = result.value timestamp = result.timestamp assign_context = NoContext(value, timestamp) else: value = result timestamp = -1 assign_context = NoContext(value) windowed_value = WindowedValue( value, timestamp, self.window_fn.assign(assign_context)) elif isinstance(result, TimestampedValue): assign_context = WindowFn.AssignContext( result.timestamp, result.value) windowed_value = WindowedValue( result.value, result.timestamp, self.window_fn.assign(assign_context)) if len(windowed_input_element.windows) != 1: windowed_value.windows *= len( windowed_input_element.windows) else: windowed_value = windowed_input_element.with_value(result) if tag is None: self.main_receivers.receive(windowed_value) else: self.tagged_receivers[tag].output(windowed_value)
def finish_bundle(self):
    """Yields every record of the files matching self.file_to_read.

    Each line has its trailing newline stripped, is decoded with self.coder
    and is emitted as a WindowedValue with timestamp -1 in a GlobalWindow.
    Files are opened with gzip when self.compression_type is not None.
    """
    from apache_beam.transforms import window

    assert self.file_to_read
    for path in glob.glob(self.file_to_read):
        if self.compression_type is None:
            handle = open(path)
        else:
            handle = gzip.open(path, 'r')
        with handle:
            for line in handle:
                decoded = self.coder.decode(line.rstrip('\n'))
                yield WindowedValue(decoded, -1, [window.GlobalWindow()])
def invoke_process(self,
                   windowed_value,
                   restriction_tracker=None,
                   output_processor=None):
    """Invokes the DoFn's process method for one windowed element.

    Args:
      windowed_value: the element to process.
      restriction_tracker: optional tracker; when given it is passed to the
        process method under the parameter name found via
        _find_param_with_default.
      output_processor: optional override for self.output_processor.

    Raises:
      ValueError: if a restriction tracker is supplied but the process method
        declares no parameter to receive it.
    """
    output_processor = output_processor or self.output_processor
    self.context.set_element(windowed_value)

    additional_kwargs = {}
    if restriction_tracker:
        restriction_tracker_param = _find_param_with_default(
            self.signature.process_method,
            default_as_type=core.RestrictionProvider)[0]
        if not restriction_tracker_param:
            # Format the message explicitly. The tracker used to be passed
            # as a second exception argument, which left '%r' unsubstituted.
            raise ValueError(
                'A RestrictionTracker %r was provided but DoFn does not have a '
                'RestrictionTrackerParam defined' % (restriction_tracker, ))
        additional_kwargs[restriction_tracker_param] = restriction_tracker

    # Call for the process function for each window if has windowed side inputs
    # or if the process accesses the window parameter. We can just call it once
    # otherwise as none of the arguments are changing
    if self.has_windowed_inputs and len(windowed_value.windows) != 1:
        for w in windowed_value.windows:
            self._invoke_per_window(
                WindowedValue(
                    windowed_value.value, windowed_value.timestamp, (w, )),
                additional_kwargs,
                output_processor)
    else:
        self._invoke_per_window(
            windowed_value, additional_kwargs, output_processor)
def test_basic_wordcount(self):
    """Smoke test: records a three-element PCollection and reads it back."""
    # Build the pipeline that will emit 0, 1, 2.
    p = beam.Pipeline(InteractiveRunner())
    elems = p | beam.Create([0, 1, 2])

    # A notebook environment would watch the pipeline and PCollections
    # automatically; here it has to be done by hand.
    ib.watch(locals())
    ie.current_env().track_user_pipelines()

    # Calling `record` starts a new PipelineFragment that computes the given
    # PCollections and caches them to disk.
    recording_manager = RecordingManager(p)
    recording = recording_manager.record(
        [elems], max_n=3, max_duration_secs=500)
    stream = recording.stream(elems)
    recording.wait_until_finish()

    # With the fragment finished, every element has been written to cache and
    # can be read back from the stream.
    recorded_elems = list(stream.read())
    expected_elems = []
    for i in range(3):
        expected_elems.append(
            WindowedValue(i, MIN_TIMESTAMP, [GlobalWindow()]))
    self.assertListEqual(recorded_elems, expected_elems)
def test_basic_test_stream(self):
    """Checks the event list a TestStream builds from a chain of calls.

    The expected events show plain and windowed inputs stored as
    TimestampedValues, with plain values stamped with the watermark that was
    last set (0 for 'a', 8 for 'd').
    """
    first_batch = ['a', WindowedValue('b', 3, []), TimestampedValue('c', 6)]
    test_stream = (TestStream()
                   .advance_watermark_to(0)
                   .add_elements(first_batch)
                   .advance_processing_time(10)
                   .advance_watermark_to(8)
                   .add_elements(['d'])
                   .advance_watermark_to_infinity())  # yapf: disable

    expected_events = [
        WatermarkEvent(0),
        ElementEvent([
            TimestampedValue('a', 0),
            TimestampedValue('b', 3),
            TimestampedValue('c', 6),
        ]),
        ProcessingTimeEvent(10),
        WatermarkEvent(8),
        ElementEvent([
            TimestampedValue('d', 8),
        ]),
        WatermarkEvent(timestamp.MAX_TIMESTAMP),
    ]
    self.assertEqual(test_stream._events, expected_events)
def test_windowed_values_interpreted_correctly(self):
    """Checks that a WindowedValueHolder keeps its timestamp and window.

    The element is added to a TestStream after the watermark has advanced to
    10, yet the downstream DoFn is expected to observe timestamp 5 and
    IntervalWindow(5, 10) as carried by the held WindowedValue.
    """
    class RecordFn(beam.DoFn):
        def process(
            self,
            element=beam.DoFn.ElementParam,
            timestamp=beam.DoFn.TimestampParam,
            window=beam.DoFn.WindowParam):
            yield (element, timestamp, window)

    pane = PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
    windowed_value = WindowedValueHolder(
        WindowedValue(
            'a', Timestamp(5), [beam.window.IntervalWindow(5, 10)], pane))

    test_stream = (TestStream()
                   .advance_processing_time(10)
                   .advance_watermark_to(10)
                   .add_elements([windowed_value])
                   .advance_watermark_to_infinity())  # yapf: disable

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True

    expected_records = [
        ('a', timestamp.Timestamp(5), beam.window.IntervalWindow(5, 10)),
    ]
    with TestPipeline(options=options) as p:
        my_record_fn = RecordFn()
        records = p | test_stream | beam.ParDo(my_record_fn)
        assert_that(records, equal_to(expected_records))
def process_outputs(self, windowed_input_element, results):
    """Routes each result of process() to the main or a tagged receiver.

    A TaggedOutput is unwrapped first; its tag selects the tagged receiver
    and must be a string. A WindowedValue result is forwarded as is, a
    TimestampedValue is re-windowed through self.window_fn, and any other
    value inherits the input element's timestamp and windows via with_value.

    Args:
      windowed_input_element: the element that was processed.
      results: iterable of produced values, or None when nothing was produced.

    Raises:
      TypeError: if a TaggedOutput carries a non-string tag.
    """
    if results is None:
        return

    for emitted in results:
        if isinstance(emitted, TaggedOutput):
            tag = emitted.tag
            if not isinstance(tag, basestring):
                raise TypeError('In %s, tag %s is not a string' % (self, tag))
            emitted = emitted.value
        else:
            tag = None

        if isinstance(emitted, WindowedValue):
            windowed_value = emitted
            if (windowed_input_element is not None and
                    len(windowed_input_element.windows) != 1):
                windowed_value.windows *= len(windowed_input_element.windows)
        elif isinstance(emitted, TimestampedValue):
            assign_context = WindowFn.AssignContext(
                emitted.timestamp, emitted.value)
            windowed_value = WindowedValue(
                emitted.value,
                emitted.timestamp,
                self.window_fn.assign(assign_context))
            if len(windowed_input_element.windows) != 1:
                windowed_value.windows *= len(windowed_input_element.windows)
        else:
            windowed_value = windowed_input_element.with_value(emitted)

        receiver = (
            self.main_receivers if tag is None else self.tagged_receivers[tag])
        receiver.receive(windowed_value)
def windowed_values(self):
    """Yields the initial WindowedValue, then one per appended value.

    Appended values are wrapped on the fly, reusing the timestamp and windows
    of the initial WindowedValue.
    """
    yield self._initial_windowed_value
    for appended in self._appended_values:
        template = self._initial_windowed_value
        yield WindowedValue(appended, template.timestamp, template.windows)
def test_parse_windowedvalue_with_window_info(self):
    """Checks that window metadata is added as extra DataFrame columns.

    With include_window_info=True the expected frame holds the element
    values followed by 'event_time', 'windows' and 'pane_info'.
    """
    from apache_beam.transforms.window import GlobalWindow

    first = WindowedValue(('a', 2), 1, [GlobalWindow()])
    second = WindowedValue(('b', 3), 1, [GlobalWindow()])
    els = [first, second]

    actual_df = utils.elements_to_df(els, include_window_info=True)

    expected_rows = [
        ['a', 2, int(1e6), first.windows, first.pane_info],
        ['b', 3, int(1e6), second.windows, second.pane_info],
    ]
    expected_columns = [0, 1, 'event_time', 'windows', 'pane_info']
    expected_df = pd.DataFrame(expected_rows, columns=expected_columns)
    pd.testing.assert_frame_equal(actual_df, expected_df)
def _output_as_records(self): """Outputs buffered elements as TestStreamFileRecords. """ if self.header: yield WindowedValue( self.header, timestamp=0, windows=[beam.window.GlobalWindow()]) if self.timing_events: timing_events = self._timing_events_to_records(self.timing_events) for r in timing_events: yield WindowedValue( r, timestamp=0, windows=[beam.window.GlobalWindow()]) if self.elements: elements = self._elements_to_record(self.elements) yield WindowedValue( elements, timestamp=0, windows=[beam.window.GlobalWindow()])
def windowed_value(
    cls,
    value,  # type: Any
    timestamp=MIN_TIMESTAMP,  # type: Timestamp
    pane_info=windowed_value.PANE_INFO_UNKNOWN  # type: windowed_value.PaneInfo
):
    # type: (...) -> WindowedValue
    """Wraps value in a WindowedValue assigned to a single GlobalWindow.

    Args:
      value: the payload to wrap.
      timestamp: element timestamp; defaults to MIN_TIMESTAMP.
      pane_info: pane information; defaults to PANE_INFO_UNKNOWN.
    """
    global_window_only = (GlobalWindow(), )
    return WindowedValue(value, timestamp, global_window_only, pane_info)
def invoke_process(self,
                   windowed_value,
                   restriction_tracker=None,
                   output_processor=None,
                   additional_args=None,
                   additional_kwargs=None):
    """Invokes the DoFn's process method for one windowed element.

    For a splittable DoFn called without a tracker, an initial restriction
    and tracker are created first. With a tracker, process is invoked once
    and its result is returned. Without one, process is invoked once per
    window or once for the whole element, and None is returned.

    Args:
      windowed_value: the element to process.
      restriction_tracker: optional tracker handed to the process method.
      output_processor: optional override for self.output_processor.
      additional_args: extra positional arguments for the process method.
      additional_kwargs: extra keyword arguments for the process method; a
        non-empty dict passed in receives the tracker entry in place.

    Raises:
      NotImplementedError: if a tracker is used with a multiply-windowed
        element while windowed inputs are in play.
      ValueError: if a tracker is available but the process method declares
        no parameter to receive it.
    """
    additional_args = additional_args or []
    additional_kwargs = additional_kwargs or {}
    output_processor = output_processor or self.output_processor
    self.context.set_element(windowed_value)

    if self.is_splittable and not restriction_tracker:
        restriction = self.invoke_initial_restriction(windowed_value.value)
        restriction_tracker = self.invoke_create_tracker(restriction)

    if not restriction_tracker:
        # Call for the process function for each window if has windowed side
        # inputs or if the process accesses the window parameter. We can just
        # call it once otherwise as none of the arguments are changing
        if self.has_windowed_inputs and len(windowed_value.windows) != 1:
            for w in windowed_value.windows:
                self._invoke_process_per_window(
                    WindowedValue(
                        windowed_value.value, windowed_value.timestamp, (w, )),
                    additional_args,
                    additional_kwargs,
                    output_processor)
        else:
            self._invoke_process_per_window(
                windowed_value,
                additional_args,
                additional_kwargs,
                output_processor)
        return None

    if len(windowed_value.windows) > 1 and self.has_windowed_inputs:
        # Should never get here due to window explosion in
        # the upstream pair-with-restriction.
        raise NotImplementedError(
            'SDFs in multiply-windowed values with windowed arguments.')

    restriction_tracker_param = _find_param_with_default(
        self.signature.process_method,
        default_as_type=core.RestrictionProvider)[0]
    if not restriction_tracker_param:
        raise ValueError(
            'A RestrictionTracker %r was provided but DoFn does not have a '
            'RestrictionTrackerParam defined' % restriction_tracker)
    additional_kwargs[restriction_tracker_param] = restriction_tracker

    try:
        self.current_windowed_value = windowed_value
        self.restriction_tracker = restriction_tracker
        return self._invoke_process_per_window(
            windowed_value, additional_args, additional_kwargs,
            output_processor)
    finally:
        self.restriction_tracker = None
        # NOTE(review): the tracker is cleared but the windowed value is
        # re-assigned rather than reset - confirm this is intentional.
        self.current_windowed_value = windowed_value
def old_dofn_process(self, element):
    """Runs the DoFn's process over element and dispatches its outputs.

    When windowed side inputs are in use and the element sits in more than
    one window, the context is set and process is called once per window;
    otherwise once for the whole element. Outputs are always dispatched
    against the original (un-exploded) element.
    """
    if self.has_windowed_side_inputs and len(element.windows) > 1:
        # Lazy on purpose: each per-window value is built right before use.
        context_elements = (
            WindowedValue(element.value, element.timestamp, (w, ))
            for w in element.windows)
    else:
        context_elements = (element, )

    for context_element in context_elements:
        self.context.set_element(context_element)
        self._process_outputs(element, self.dofn_process(self.context))
def output_key(self, wkey, accumulator):
    """Emits the combined value for one (windows, key) table entry.

    Args:
      wkey: a (windows, key) pair. A windows value of 0 selects the
        _globally_windowed_value template (presumably a marker for the
        global window - confirm against the code that builds wkey).
      accumulator: the accumulated state, passed through
        self.combine_fn_compact when one is set.
    """
    windows, key = wkey
    if self.combine_fn_compact is None:
        value = accumulator
    else:
        value = self.combine_fn_compact(accumulator)

    # Compare by value: `is 0` depends on int caching, which is an
    # implementation detail, and raises a SyntaxWarning on Python 3.8+.
    if windows == 0:
        self.output(_globally_windowed_value.with_value((key, value)))
    else:
        self.output(WindowedValue((key, value), windows[0].end, windows))
def finish_bundle(self):
    """Runs predictions on the elements still batched at the end of a bundle.

    Every prediction is emitted in a GlobalWindow, stamped with the current
    wall-clock time in whole seconds. self.batches is reset afterwards.
    """
    logging.info("Run predictions on all intermediate elements.")
    for pending_elements in self.batches.values():
        for prediction in self.make_predictions(pending_elements):
            global_window_only = (window.GlobalWindow(),)
            yield WindowedValue(
                value=prediction,
                timestamp=int(time.time()),
                windows=global_window_only)
    self.batches = {}
def flush(self, target):
    """Outputs and removes table entries, at most self.size - target of them.

    Each removed entry is keyed by (key, windows) and is emitted as one
    WindowedValue of (key, [second item of each buffered value]), stamped
    with the first buffered value's timestamp.

    Args:
      target: subtracted from self.size to obtain the number of entries to
        flush; nothing is flushed when the difference is not positive.
    """
    limit = self.size - target
    # Iterate a snapshot: entries are deleted inside the loop, and removing
    # keys while iterating the live items() view raises RuntimeError on
    # Python 3.
    for ix, (kw, vs) in enumerate(list(self.table.items())):
        if ix >= limit:
            break
        del self.table[kw]
        key, windows = kw
        output_value = [v.value[1] for v in vs]
        windowed_value = WindowedValue(
            (key, output_value), vs[0].timestamp, windows)
        self.output(windowed_value)
def invoke_process(self, windowed_value):
    """Invokes the per-window process call for one windowed element.

    With windowed inputs and an element that is not in exactly one window,
    the element is exploded into one single-window WindowedValue per window.
    Otherwise the element is processed once, unchanged.
    """
    self.context.set_element(windowed_value)

    # Nothing differs between windows unless windowed side inputs or the
    # window parameter are used, so one call is enough in that case.
    explode = self.has_windowed_inputs and len(windowed_value.windows) != 1
    if not explode:
        self._invoke_per_window(windowed_value)
        return

    for w in windowed_value.windows:
        single_window_value = WindowedValue(
            windowed_value.value, windowed_value.timestamp, (w,))
        self._invoke_per_window(single_window_value)
def setUp(self):
    """Creates the windows, element and watermark fixtures for each test.

    The shared element 'a' has timestamp 57 and sits in three consecutive
    ten-unit IntervalWindows covering 0 to 30.
    """
    self.window1, self.window2, self.window3 = (
        IntervalWindow(start, start + 10) for start in (0, 10, 20))
    all_windows = (self.window1, self.window2, self.window3)
    self.windowed_value = WindowedValue('a', 57, all_windows)

    self.restriction = OffsetRange(0, 100)
    self.watermark_estimator_state = Timestamp(21)
    self.restriction_provider = TestOffsetRestrictionProvider()
    self.watermark_estimator = ManualWatermarkEstimator(Timestamp(42))

    self.maxDiff = None
def _dofn_invoker(self, element): self.context.set_element(element) # Call for the process function for each window if has windowed side inputs # or if the process accesses the window parameter. We can just call it once # otherwise as none of the arguments are changing if self.has_windowed_inputs and len(element.windows) != 1: for w in element.windows: self._dofn_per_window_invoker( WindowedValue(element.value, element.timestamp, (w, ))) else: self._dofn_per_window_invoker(element)
def process_outputs(
    self, windowed_input_element, results, watermark_estimator=None):
    # type: (WindowedValue, Iterable[Any]) -> None
    """Routes each result of process() to the main or a tagged receiver.

    A TaggedOutput is unwrapped first; its tag selects the tagged receiver
    and must be a string. A WindowedValue result is forwarded as is, a
    TimestampedValue is re-windowed through self.window_fn, and any other
    value inherits the input element's timestamp and windows via with_value.
    When a watermark estimator is given it observes every output timestamp.
    The number of outputs is reported to the per-element output counter
    when that counter exists and is cythonized.

    Raises:
      TypeError: if a TaggedOutput carries a non-string tag.
    """
    def _record_output_count(count):
        # TODO(BEAM-3937): Remove if block after output counter released.
        # Only enable per_element_output_counter when counter cythonized.
        if (self.per_element_output_counter is not None and
                self.per_element_output_counter.is_cythonized):
            self.per_element_output_counter.add_input(count)

    if results is None:
        _record_output_count(0)
        return

    # results may be a generator, so count while iterating instead of len().
    output_element_count = 0
    for output_element_count, emitted in enumerate(results, 1):
        if isinstance(emitted, TaggedOutput):
            tag = emitted.tag
            if not isinstance(tag, (str, unicode)):
                raise TypeError('In %s, tag %s is not a string' % (self, tag))
            emitted = emitted.value
        else:
            tag = None

        if isinstance(emitted, WindowedValue):
            windowed_value = emitted
            if (windowed_input_element is not None and
                    len(windowed_input_element.windows) != 1):
                windowed_value.windows *= len(windowed_input_element.windows)
        elif isinstance(emitted, TimestampedValue):
            assign_context = WindowFn.AssignContext(
                emitted.timestamp, emitted.value)
            windowed_value = WindowedValue(
                emitted.value,
                emitted.timestamp,
                self.window_fn.assign(assign_context))
            if len(windowed_input_element.windows) != 1:
                windowed_value.windows *= len(windowed_input_element.windows)
        else:
            windowed_value = windowed_input_element.with_value(emitted)

        if watermark_estimator is not None:
            watermark_estimator.observe_timestamp(windowed_value.timestamp)

        receiver = (
            self.main_receivers if tag is None else self.tagged_receivers[tag])
        receiver.receive(windowed_value)

    _record_output_count(output_element_count)
def reader():
    """Yields one AddElements event holding the encoded integers 0 to 9.

    Each integer is wrapped in a WindowedValueHolder (timestamp 0, no
    windows), encoded with `coder`, and stamped with Timestamp 0 in micros.
    """
    element_payload = []
    for e in range(10):
        encoded = coder.encode(WindowedValueHolder(WindowedValue(e, 0, [])))
        element_payload.append(
            TestStreamPayload.TimestampedElement(
                encoded_element=encoded,
                timestamp=Timestamp.of(0).micros))

    add_elements = TestStreamPayload.Event.AddElements(
        elements=element_payload)
    yield TestStreamPayload.Event(element_event=add_elements)
def output_key(self, wkey, accumulator, timestamp):
    """Emits the combined value for one table entry.

    Under default windowing wkey is the key itself and the output reuses the
    _globally_windowed_value template. Otherwise wkey is a (windows, key)
    pair, and the first window's max_timestamp() replaces the given
    timestamp when no timestamp combiner is set.
    """
    compact = self.combine_fn_compact
    value = accumulator if compact is None else compact(accumulator)

    if self.is_default_windowing:
        self.output(_globally_windowed_value.with_value((wkey, value)))
        return

    windows, key = wkey
    if self.timestamp_combiner is None:
        timestamp = windows[0].max_timestamp()
    self.output(WindowedValue((key, value), timestamp, windows))
def process(
    self,
    element: prediction_log_pb2.PredictionLog,
    window=beam.DoFn.WindowParam,
    timestamp=beam.DoFn.TimestampParam):
    """Buffers element and flushes the batch once it has grown too large.

    When the batch already holds more than self.batching_size elements, the
    results of self.process_result for that batch are yielded and the batch
    is cleared. The incoming element is then buffered together with its
    timestamp and window.

    Raises:
      Exception: if the request carries more than one input string.
    """
    if len(element.predict_log.request.inputs['examples'].string_val) > 1:
        raise Exception("Only support single input string.")

    if len(self.batch) > self.batching_size:
        for k in self.process_result(self.batch):
            yield k
        self.batch.clear()

    # Buffer the element on every call. It used to be appended only when no
    # flush happened, so the element that triggered a flush was lost.
    self.batch.append(WindowedValue(element, timestamp, [window]))
def finish_bundle_outputs(self, results):
    """Routes each result of finish_bundle to the main or a tagged receiver.

    An OutputValue is unwrapped first; its tag selects the tagged receiver
    and must be a string. A WindowedValue result is forwarded as is. Any
    other result is windowed through self.window_fn using a NoContext,
    with the TimestampedValue's own timestamp or -1 for a plain value.

    Raises:
      TypeError: if an OutputValue carries a non-string tag.
    """
    if results is None:
        return

    for emitted in results:
        if isinstance(emitted, OutputValue):
            tag = emitted.tag
            if not isinstance(tag, basestring):
                raise TypeError('In %s, tag %s is not a string' % (self, tag))
            emitted = emitted.value
        else:
            tag = None

        if isinstance(emitted, WindowedValue):
            windowed_value = emitted
        else:
            if isinstance(emitted, TimestampedValue):
                value, timestamp = emitted.value, emitted.timestamp
                assign_context = NoContext(value, timestamp)
            else:
                value, timestamp = emitted, -1
                assign_context = NoContext(value)
            windowed_value = WindowedValue(
                value, timestamp, self.window_fn.assign(assign_context))

        if tag is None:
            self.main_receivers.receive(windowed_value)
        else:
            self.tagged_receivers[tag].output(windowed_value)