class BatchGlobalTriggerDriver(TriggerDriver):
  """Trigger driver that emits everything it is given as a single pane.

  ``process_elements`` yields exactly one ``WindowedValue`` that wraps all of
  the supplied values, stamped with ``MIN_TIMESTAMP``, the class-level window
  tuple and the class-level pane info. ``process_timer`` always raises.
  """
  # One-element window tuple attached to the emitted value.
  GLOBAL_WINDOW_TUPLE = (GlobalWindow(), )
  # Pane info of the sole firing: first and last pane, ON_TIME, both indices 0.
  ONLY_FIRING = windowed_value.PaneInfo(
      True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)

  def process_elements(
      self,
      state,
      windowed_values,
      unused_output_watermark,
      unused_input_watermark=MIN_TIMESTAMP):
    """Yield one WindowedValue holding all of ``windowed_values``.

    ``state`` and both watermark arguments are not used.
    """
    grouped = _UnwindowedValues(windowed_values)
    yield WindowedValue(
        grouped, MIN_TIMESTAMP, self.GLOBAL_WINDOW_TUPLE, self.ONLY_FIRING)

  def process_timer(
      self,
      window_id,
      name,
      time_domain,
      timestamp,
      state,
      input_watermark=None):
    """Always raise TypeError; this driver does not handle timers."""
    message = 'Triggers never set or called for batch default windowing.'
    raise TypeError(message)
def test_homogeneous_from_windowed_values(self):
  """from_windowed_values should yield one batch per timestamp.

  Nine values carrying timestamps 3, 6 and 9 (same empty windows, same pane)
  are fed in interleaved order; the expected result is three homogeneous
  batches, for timestamps 3, 6 and 9, each listing its values in input order.
  """
  pane_info = windowed_value.PaneInfo(
      True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)

  def wv(value, timestamp):
    return windowed_value.WindowedValue(value, timestamp, (), pane_info)

  def batch(values, timestamp):
    return windowed_value.HomogeneousWindowedBatch.of(
        values, timestamp, (), pane_info)

  inputs = [
      wv('foofoo', 3),
      wv('foobar', 6),
      wv('foobaz', 9),
      wv('barfoo', 3),
      wv('barbar', 6),
      wv('barbaz', 9),
      wv('bazfoo', 3),
      wv('bazbar', 6),
      wv('bazbaz', 9),
  ]
  expected = [
      batch(['foofoo', 'barfoo', 'bazfoo'], 3),
      batch(['foobar', 'barbar', 'bazbar'], 6),
      batch(['foobaz', 'barbaz', 'bazbaz'], 9),
  ]

  actual = list(
      windowed_value.WindowedBatch.from_windowed_values(
          inputs, produce_fn=list))
  self.assertEqual(actual, expected)
def test_homogeneous_windowed_batch_with_values(self):
  """with_values should swap the payload and keep the other three fields."""
  make_batch = windowed_value.HomogeneousWindowedBatch.of
  pane_info = windowed_value.PaneInfo(
      True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)

  original = make_batch(['foo', 'bar'], 6, (), pane_info)
  expected = make_batch(['baz', 'foo'], 6, (), pane_info)

  self.assertEqual(original.with_values(['baz', 'foo']), expected)
def test_homogeneous_windowed_batch_as_windowed_values(self):
  """as_windowed_values(iter) should give one WindowedValue per element."""
  pane_info = windowed_value.PaneInfo(
      True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
  batch = windowed_value.HomogeneousWindowedBatch.of(['foo', 'bar'],
                                                     3, (),
                                                     pane_info)

  # Each element is expected with the batch's timestamp, windows and pane.
  expected = [
      windowed_value.WindowedValue(element, 3, (), pane_info)
      for element in ('foo', 'bar')
  ]

  self.assertEqual(list(batch.as_windowed_values(iter)), expected)
def test_windowedvalue_coder_paneinfo(self):
  """Run check_coder on WindowedValues carrying a range of PaneInfo values.

  Every value is checked on its own and then as each ordered pair inside a
  two-element TupleCoder.
  """
  pane_info_cls = windowed_value.PaneInfo
  timing = windowed_value.PaneInfoTiming
  coder = coders.WindowedValueCoder(
      coders.VarIntCoder(), coders.GlobalWindowCoder())

  pane_infos = [
      windowed_value.PANE_INFO_UNKNOWN,
      pane_info_cls(True, True, timing.EARLY, 0, -1),
      pane_info_cls(True, False, timing.ON_TIME, 0, 0),
      pane_info_cls(True, False, timing.ON_TIME, 10, 0),
      pane_info_cls(False, True, timing.ON_TIME, 0, 23),
      pane_info_cls(False, True, timing.ON_TIME, 12, 23),
      pane_info_cls(False, False, timing.LATE, 0, 123),
  ]

  def wrap(pane_info):
    # Same payload, timestamp and window every time; only the pane varies.
    return windowed_value.WindowedValue(
        123, 234, (GlobalWindow(), ), pane_info)

  values = [wrap(pane_info) for pane_info in pane_infos]

  # Test unnested.
  self.check_coder(coder, wrap(windowed_value.PANE_INFO_UNKNOWN))
  for value in values:
    self.check_coder(coder, value)

  # Test nested.
  for first in values:
    for second in values:
      self.check_coder(coders.TupleCoder((coder, coder)), (first, second))
def _output(
    self,
    window,
    finished,
    state,
    input_watermark,
    output_watermark,
    maybe_ontime):
  """Output window and clean up if appropriate.

  Builds the PaneInfo for this firing of ``window``, reads the buffered
  elements, updates the per-window state and returns the resulting pane.

  Args:
    window: the window being fired; used as the state key and as the only
      window of the returned value.
    finished: passed to PaneInfo as its second argument; when true the
      ELEMENTS state is cleared and 1 is added to the TOMBSTONE state.
    state: state accessor exposing get_state / add_state / clear_state.
    input_watermark: compared with ``window.end`` when deciding whether the
      watermark hold is kept.
    output_watermark: compared with ``window.max_timestamp()`` to choose
      between an EARLY pane and an ON_TIME / LATE pane.
    maybe_ontime: the pane can only be ON_TIME when this is true and the
      nonspeculative index read from state equals 0.

  Returns:
    A WindowedValue of the buffered elements, the chosen timestamp,
    ``(window, )`` and the computed PaneInfo.
  """
  # Read this firing's index first, then add 1 to the INDEX state.
  index = state.get_state(window, self.INDEX)
  state.add_state(window, self.INDEX, 1)
  if output_watermark <= window.max_timestamp():
    # Output watermark has not passed the end of the window: EARLY pane,
    # which gets nonspeculative index -1 unless overridden below.
    nonspeculative_index = -1
    timing = windowed_value.PaneInfoTiming.EARLY
    if state.get_state(window, self.NONSPECULATIVE_INDEX):
      # The NONSPECULATIVE_INDEX state is already truthy although this pane
      # is EARLY; use and advance that index, and log the anomaly.
      nonspeculative_index = state.get_state(
          window, self.NONSPECULATIVE_INDEX)
      state.add_state(window, self.NONSPECULATIVE_INDEX, 1)
      _LOGGER.warning(
          'Watermark moved backwards in time '
          'or late data moved window end forward.')
  else:
    # Read the nonspeculative index, then add 1 to its state.
    nonspeculative_index = state.get_state(window, self.NONSPECULATIVE_INDEX)
    state.add_state(window, self.NONSPECULATIVE_INDEX, 1)
    # ON_TIME only when maybe_ontime is set and the index just read is 0;
    # otherwise the pane is LATE.
    timing = (
        windowed_value.PaneInfoTiming.ON_TIME if maybe_ontime and
        nonspeculative_index == 0 else windowed_value.PaneInfoTiming.LATE)
  pane_info = windowed_value.PaneInfo(
      index == 0, finished, timing, index, nonspeculative_index)

  # Capture the buffered elements before any clearing below.
  values = state.get_state(window, self.ELEMENTS)
  if finished:
    # TODO(robertwb): allowed lateness
    state.clear_state(window, self.ELEMENTS)
    state.add_state(window, self.TOMBSTONE, 1)
  elif self.accumulation_mode == AccumulationMode.DISCARDING:
    # Discarding mode: clear the elements once they have been read.
    state.clear_state(window, self.ELEMENTS)

  timestamp = state.get_state(window, self.WATERMARK_HOLD)
  if timestamp is None:
    # If no watermark hold was set, output at end of window.
    timestamp = window.max_timestamp()
  elif input_watermark < window.end and self.trigger_fn.has_ontime_pane():
    # Hold the watermark in case there is an empty pane that needs to be fired
    # at the end of the window.
    pass
  else:
    # Otherwise the hold is cleared; its value is still used as the timestamp.
    state.clear_state(window, self.WATERMARK_HOLD)

  return WindowedValue(values, timestamp, (window, ), pane_info)
def test_pane_info_param(self):
  """Check what DoFn.PaneInfoParam yields before and after a GroupByKey.

  Mapped directly over the created element the pane is expected to be
  PANE_INFO_UNKNOWN; after a GroupByKey it is expected to be a first-and-last
  ON_TIME pane with both indices 0.
  """
  expected_grouped_pane = windowed_value.PaneInfo(
      is_first=True,
      is_last=True,
      timing=windowed_value.PaneInfoTiming.ON_TIME,
      index=0,
      nonspeculative_index=0)

  with TestPipeline() as pipeline:
    pc = pipeline | Create([(None, None)])

    # The two lambdas are deliberately kept as separate expressions.
    ungrouped_panes = pc | beam.Map(lambda _, p=DoFn.PaneInfoParam: p)
    assert_that(
        ungrouped_panes,
        equal_to([windowed_value.PANE_INFO_UNKNOWN]),
        label='CheckUngrouped')

    grouped_panes = (
        pc
        | beam.GroupByKey()
        | beam.Map(lambda _, p=DoFn.PaneInfoParam: p))
    assert_that(
        grouped_panes, equal_to([expected_grouped_pane]), label='CheckGrouped')
def partition(self, n):
  # type: (int) -> List[List[bytes]]

  """Split the buffered table into ``n`` encoded parts.

  The first call groups and encodes every entry of ``self._table`` and caches
  the result in ``self._grouped_output``. While that cache is non-empty, later
  calls return it unchanged whatever ``n`` is passed: re-partitioning with a
  different ``n`` is not supported.
  """
  if self._grouped_output:
    # Already partitioned; hand back the cached result.
    return self._grouped_output

  if self._windowing.is_default():
    wrap_globally = GlobalWindows.windowed_value(
        None,
        timestamp=GlobalWindow().max_timestamp(),
        pane_info=windowed_value.PaneInfo(
            is_first=True,
            is_last=True,
            timing=windowed_value.PaneInfoTiming.ON_TIME,
            index=0,
            nonspeculative_index=0)).with_value

    def windowed_key_values(key, values):
      # Default windowing: one globally-windowed (key, values) pair per key.
      return [wrap_globally((key, values))]
  else:
    # TODO(pabloem, BEAM-7514): Trigger driver needs access to the clock
    #   note that this only comes through if windowing is default - but what
    #   about having multiple firings on the global window.
    #   May need to revise.
    driver = trigger.create_trigger_driver(self._windowing, True)
    windowed_key_values = driver.process_entire_key

  value_coder = self._post_grouped_coder.get_impl()
  key_coder = self._key_coder.get_impl()
  self._grouped_output = [[] for _ in range(n)]
  streams = [create_OutputStream() for _ in range(n)]

  # Entry number ``position`` of the table goes to stream ``position % n``.
  for position, (encoded_key, windowed_values) in enumerate(
      self._table.items()):
    decoded_key = key_coder.decode(encoded_key)
    for wkv in windowed_key_values(decoded_key, windowed_values):
      value_coder.encode_to_stream(wkv, streams[position % n], True)

  # Each part is a one-element list holding that stream's output.
  for slot, stream in enumerate(streams):
    self._grouped_output[slot] = [stream.get()]

  self._table.clear()
  return self._grouped_output
def decode_from_stream(self, in_stream, nested):
  """Read one PaneInfo from ``in_stream``.

  The first byte is split in two: its low four bits index
  ``windowed_value._BYTE_TO_PANE_INFO`` to get a base PaneInfo, and the
  remaining high bits select the encoding, which decides how many varint
  indices follow (none, one or two). ``nested`` is not used.

  Raises:
    NotImplementedError: if the encoding bits match no known encoding.
  """
  first_byte = in_stream.read_byte()
  template = windowed_value._BYTE_TO_PANE_INFO[first_byte & 0xF]
  assert template is not None
  encoding = first_byte >> 4

  # NOTE(review): PaneInfoEncoding_FIRST is a module-level name not visible
  # here; presumably an alias of PaneInfoEncoding.FIRST - confirm.
  if encoding == PaneInfoEncoding_FIRST:
    # No indices follow; the table entry is the result.
    return template

  if encoding == PaneInfoEncoding.ONE_INDEX:
    index = in_stream.read_var_int64()
    # With a single index, EARLY panes get -1 as nonspeculative index;
    # any other timing reuses the index that was read.
    is_early = template.timing == windowed_value.PaneInfoTiming.EARLY
    nonspeculative_index = -1 if is_early else index
  elif encoding == PaneInfoEncoding.TWO_INDICES:
    index = in_stream.read_var_int64()
    nonspeculative_index = in_stream.read_var_int64()
  else:
    raise NotImplementedError('Invalid PaneInfoEncoding: %s' % encoding)

  return windowed_value.PaneInfo(
      template.is_first,
      template.is_last,
      template.timing,
      index,
      nonspeculative_index)
def test_pickle(self):
  """A WindowedValue should compare equal to its pickle round-trip copy."""
  pane_info = windowed_value.PaneInfo(
      True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
  wv = windowed_value.WindowedValue(1, 3, (), pane_info)

  restored = pickle.loads(pickle.dumps(wv))

  # assertEqual performs the same ``restored == wv`` comparison as the former
  # assertTrue(... == ...), but reports both operands when the check fails.
  self.assertEqual(restored, wv)
def test_with_value(self):
  """with_value should swap the payload and keep the other three fields."""
  pane_info = windowed_value.PaneInfo(
      True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)

  original = windowed_value.WindowedValue(1, 3, (), pane_info)
  expected = windowed_value.WindowedValue(10, 3, (), pane_info)

  self.assertEqual(original.with_value(10), expected)
self.assertFalse(wv is wv_copy) self.assertEqual({wv: 100}.get(wv_copy), 100) def test_pickle(self): pane_info = windowed_value.PaneInfo( True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0) wv = windowed_value.WindowedValue(1, 3, (), pane_info) self.assertTrue(pickle.loads(pickle.dumps(wv)) == wv) WINDOWED_BATCH_INSTANCES = [ windowed_value.HomogeneousWindowedBatch.of( None, 3, (), windowed_value.PANE_INFO_UNKNOWN), windowed_value.HomogeneousWindowedBatch.of( None, 3, (), windowed_value.PaneInfo(True, False, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)), ] class WindowedBatchTest(unittest.TestCase): def test_homogeneous_windowed_batch_with_values(self): pane_info = windowed_value.PaneInfo( True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0) wb = windowed_value.HomogeneousWindowedBatch.of(['foo', 'bar'], 6, (), pane_info) self.assertEqual( wb.with_values(['baz', 'foo']), windowed_value.HomogeneousWindowedBatch.of(['baz', 'foo'], 6, (), pane_info)) def test_homogeneous_windowed_batch_as_windowed_values(self):