예제 #1
0
class BatchGlobalTriggerDriver(TriggerDriver):
  """Trigger driver that emits everything it receives as one grouped value.

  All incoming values are wrapped together and output once, in the global
  window, with a pane marked as both the first and the last firing.
  """
  # The single window every output is assigned to.
  GLOBAL_WINDOW_TUPLE = (GlobalWindow(), )
  # Pane metadata describing the one and only firing.
  ONLY_FIRING = windowed_value.PaneInfo(
      is_first=True,
      is_last=True,
      timing=windowed_value.PaneInfoTiming.ON_TIME,
      index=0,
      nonspeculative_index=0)

  def process_elements(
      self,
      state,
      windowed_values,
      unused_output_watermark,
      unused_input_watermark=MIN_TIMESTAMP):
    """Yield a single WindowedValue wrapping all of ``windowed_values``.

    ``state`` and both watermark arguments are not consulted.
    """
    grouped = _UnwindowedValues(windowed_values)
    yield WindowedValue(
        grouped, MIN_TIMESTAMP, self.GLOBAL_WINDOW_TUPLE, self.ONLY_FIRING)

  def process_timer(
      self,
      window_id,
      name,
      time_domain,
      timestamp,
      state,
      input_watermark=None):
    """Always raise TypeError; this driver does not handle timers."""
    message = 'Triggers never set or called for batch default windowing.'
    raise TypeError(message)
예제 #2
0
    def test_homogeneous_from_windowed_values(self):
        """from_windowed_values yields one batch per distinct timestamp.

        Windows and pane info are identical for every input, so only the
        timestamp (3, 6 or 9) distinguishes the expected batches.
        """
        pane_info = windowed_value.PaneInfo(
            True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
        prefixes = ('foo', 'bar', 'baz')
        # Each suffix is always paired with the same timestamp.
        suffix_timestamps = (('foo', 3), ('bar', 6), ('baz', 9))

        windowed_values = [
            windowed_value.WindowedValue(
                prefix + suffix, timestamp, (), pane_info)
            for prefix in prefixes
            for suffix, timestamp in suffix_timestamps
        ]

        actual_batches = list(
            windowed_value.WindowedBatch.from_windowed_values(
                windowed_values, produce_fn=list))
        expected_batches = [
            windowed_value.HomogeneousWindowedBatch.of(
                [prefix + suffix for prefix in prefixes],
                timestamp,
                (),
                pane_info)
            for suffix, timestamp in suffix_timestamps
        ]

        self.assertEqual(actual_batches, expected_batches)
예제 #3
0
 def test_homogeneous_windowed_batch_with_values(self):
     """with_values equals a batch built directly from the new values."""
     pane_info = windowed_value.PaneInfo(
         True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
     source_batch = windowed_value.HomogeneousWindowedBatch.of(
         ['foo', 'bar'], 6, (), pane_info)

     replaced_batch = source_batch.with_values(['baz', 'foo'])
     expected_batch = windowed_value.HomogeneousWindowedBatch.of(
         ['baz', 'foo'], 6, (), pane_info)

     self.assertEqual(replaced_batch, expected_batch)
예제 #4
0
    def test_homogeneous_windowed_batch_as_windowed_values(self):
        """as_windowed_values yields one WindowedValue per batched element."""
        pane_info = windowed_value.PaneInfo(
            True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
        batch = windowed_value.HomogeneousWindowedBatch.of(
            ['foo', 'bar'], 3, (), pane_info)

        exploded = list(batch.as_windowed_values(iter))
        expected = [
            windowed_value.WindowedValue(element, 3, (), pane_info)
            for element in ('foo', 'bar')
        ]

        self.assertEqual(exploded, expected)
예제 #5
0
  def test_windowedvalue_coder_paneinfo(self):
    """Run check_coder on WindowedValues carrying a variety of PaneInfos."""
    timing = windowed_value.PaneInfoTiming
    make_pane = windowed_value.PaneInfo
    coder = coders.WindowedValueCoder(
        coders.VarIntCoder(), coders.GlobalWindowCoder())

    # Mix of first/last flags, timings, and index combinations.
    pane_infos = [
        windowed_value.PANE_INFO_UNKNOWN,
        make_pane(True, True, timing.EARLY, 0, -1),
        make_pane(True, False, timing.ON_TIME, 0, 0),
        make_pane(True, False, timing.ON_TIME, 10, 0),
        make_pane(False, True, timing.ON_TIME, 0, 23),
        make_pane(False, True, timing.ON_TIME, 12, 23),
        make_pane(False, False, timing.LATE, 0, 123),
    ]

    samples = [
        windowed_value.WindowedValue(123, 234, (GlobalWindow(), ), pane)
        for pane in pane_infos
    ]

    # Test unnested.
    baseline = windowed_value.WindowedValue(
        123, 234, (GlobalWindow(), ), windowed_value.PANE_INFO_UNKNOWN)
    self.check_coder(coder, baseline)
    for sample in samples:
      self.check_coder(coder, sample)

    # Test nested.
    for left in samples:
      for right in samples:
        pair_coder = coders.TupleCoder((coder, coder))
        self.check_coder(pair_coder, (left, right))
예제 #6
0
  def _output(
      self,
      window,
      finished,
      state,
      input_watermark,
      output_watermark,
      maybe_ontime):
    """Build the pane to emit for ``window`` and clean up its state.

    Args:
      window: the window being fired.
      finished: passed as the second PaneInfo argument (presumably is_last --
        confirm); when true, the buffered elements are cleared and a
        tombstone is recorded for the window.
      state: state accessor exposing get_state / add_state / clear_state.
      input_watermark: compared against ``window.end`` when deciding whether
        the watermark hold is kept.
      output_watermark: compared against ``window.max_timestamp()`` to choose
        between an EARLY pane and an ON_TIME / LATE pane.
      maybe_ontime: when true, the pane whose non-speculative index is 0 is
        labelled ON_TIME; every other non-EARLY pane is labelled LATE.

    Returns:
      A WindowedValue holding the buffered elements, the chosen output
      timestamp, a one-element window tuple and the computed PaneInfo.
    """
    # Read the pane index for this firing before updating the stored value.
    # NOTE(review): add_state(..., 1) presumably increments a counter-style
    # state -- confirm against the INDEX tag definition.
    index = state.get_state(window, self.INDEX)
    state.add_state(window, self.INDEX, 1)
    if output_watermark <= window.max_timestamp():
      # The output watermark has not passed the window's max timestamp, so
      # this pane is EARLY and normally carries no non-speculative index.
      nonspeculative_index = -1
      timing = windowed_value.PaneInfoTiming.EARLY
      if state.get_state(window, self.NONSPECULATIVE_INDEX):
        # A non-zero non-speculative index was already recorded for this
        # window; reuse and advance it, and log the anomaly.
        nonspeculative_index = state.get_state(
            window, self.NONSPECULATIVE_INDEX)
        state.add_state(window, self.NONSPECULATIVE_INDEX, 1)
        _LOGGER.warning(
            'Watermark moved backwards in time '
            'or late data moved window end forward.')
    else:
      # Non-speculative pane: read its index, then update the stored value.
      nonspeculative_index = state.get_state(window, self.NONSPECULATIVE_INDEX)
      state.add_state(window, self.NONSPECULATIVE_INDEX, 1)
      # Only the first non-speculative pane can be ON_TIME.
      timing = (
          windowed_value.PaneInfoTiming.ON_TIME if maybe_ontime and
          nonspeculative_index == 0 else windowed_value.PaneInfoTiming.LATE)
    pane_info = windowed_value.PaneInfo(
        index == 0, finished, timing, index, nonspeculative_index)

    # Fetch the buffered elements before any of the clears below run.
    values = state.get_state(window, self.ELEMENTS)
    if finished:
      # TODO(robertwb): allowed lateness
      state.clear_state(window, self.ELEMENTS)
      state.add_state(window, self.TOMBSTONE, 1)
    elif self.accumulation_mode == AccumulationMode.DISCARDING:
      # Discarding mode drops the buffer after every firing.
      state.clear_state(window, self.ELEMENTS)

    timestamp = state.get_state(window, self.WATERMARK_HOLD)
    if timestamp is None:
      # If no watermark hold was set, output at end of window.
      timestamp = window.max_timestamp()
    elif input_watermark < window.end and self.trigger_fn.has_ontime_pane():
      # Hold the watermark in case there is an empty pane that needs to be fired
      # at the end of the window.
      pass
    else:
      state.clear_state(window, self.WATERMARK_HOLD)

    return WindowedValue(values, timestamp, (window, ), pane_info)
예제 #7
0
 def test_pane_info_param(self):
     """DoFn.PaneInfoParam is unknown before, and a single pane after, GBK."""
     grouped_pane = windowed_value.PaneInfo(
         is_first=True,
         is_last=True,
         timing=windowed_value.PaneInfoTiming.ON_TIME,
         index=0,
         nonspeculative_index=0)

     with TestPipeline() as pipeline:
         pc = pipeline | Create([(None, None)])

         # NOTE(review): the two lambdas are deliberately kept as separate
         # inline expressions; Beam presumably derives step labels from the
         # callable, so sharing one could collide -- confirm before merging.
         ungrouped_panes = pc | beam.Map(
             lambda _, p=DoFn.PaneInfoParam: p)
         assert_that(
             ungrouped_panes,
             equal_to([windowed_value.PANE_INFO_UNKNOWN]),
             label='CheckUngrouped')

         grouped_panes = pc | beam.GroupByKey() | beam.Map(
             lambda _, p=DoFn.PaneInfoParam: p)
         assert_that(
             grouped_panes, equal_to([grouped_pane]), label='CheckGrouped')
예제 #8
0
 def partition(self, n):
     # type: (int) -> List[List[bytes]]
     """Encode the buffered table into ``n`` partitions and cache the result.

     The work happens only while no non-empty result is cached. Afterwards the
     cached partitions are returned whatever ``n`` is passed, i.e.
     re-partitioning with a different ``n`` is not supported.
     """
     if self._grouped_output:
         return self._grouped_output

     if self._windowing.is_default():
         # Default windowing: each key produces exactly one value in the
         # global window, so a template WindowedValue is re-used per key.
         single_pane = windowed_value.PaneInfo(
             is_first=True,
             is_last=True,
             timing=windowed_value.PaneInfoTiming.ON_TIME,
             index=0,
             nonspeculative_index=0)
         global_template = GlobalWindows.windowed_value(
             None,
             timestamp=GlobalWindow().max_timestamp(),
             pane_info=single_pane)
         rewrap = global_template.with_value

         def windowed_key_values(key, values):
             return [rewrap((key, values))]
     else:
         # TODO(pabloem, BEAM-7514): Trigger driver needs access to the clock
         #   note that this only comes through if windowing is default - but what
         #   about having multiple firings on the global window.
         #   May need to revise.
         driver = trigger.create_trigger_driver(self._windowing, True)
         windowed_key_values = driver.process_entire_key

     value_coder_impl = self._post_grouped_coder.get_impl()
     key_decoder = self._key_coder.get_impl()
     self._grouped_output = [[] for _ in range(n)]
     streams = [create_OutputStream() for _ in range(n)]

     # Keys are dealt out to the output streams round-robin by table position.
     for position, entry in enumerate(self._table.items()):
         encoded_key, windowed_values = entry
         decoded_key = key_decoder.decode(encoded_key)
         for windowed_kv in windowed_key_values(decoded_key, windowed_values):
             value_coder_impl.encode_to_stream(
                 windowed_kv, streams[position % n], True)

     for slot, stream in enumerate(streams):
         self._grouped_output[slot] = [stream.get()]
     self._table.clear()
     return self._grouped_output
예제 #9
0
 def decode_from_stream(self, in_stream, nested):
   """Read one PaneInfo from ``in_stream``.

   The low four bits of the first byte select a base PaneInfo from the
   lookup table; the remaining high bits select the encoding type, which
   determines how many var-int indices follow. ``nested`` is not consulted.

   Raises:
     NotImplementedError: if the encoding type is not recognised.
   """
   header = in_stream.read_byte()
   template = windowed_value._BYTE_TO_PANE_INFO[header & 0xF]
   assert template is not None
   encoding_type = header >> 4

   # No indices follow: the table entry is already the complete answer.
   if encoding_type == PaneInfoEncoding_FIRST:
     return template

   if encoding_type == PaneInfoEncoding.ONE_INDEX:
     index = in_stream.read_var_int64()
     # With a single index, EARLY panes get -1 as non-speculative index;
     # otherwise both indices share the one value read.
     is_early = template.timing == windowed_value.PaneInfoTiming.EARLY
     nonspeculative_index = -1 if is_early else index
   elif encoding_type == PaneInfoEncoding.TWO_INDICES:
     index = in_stream.read_var_int64()
     nonspeculative_index = in_stream.read_var_int64()
   else:
     raise NotImplementedError('Invalid PaneInfoEncoding: %s' % encoding_type)

   return windowed_value.PaneInfo(
       template.is_first,
       template.is_last,
       template.timing,
       index,
       nonspeculative_index)
 def test_pickle(self):
   """A WindowedValue compares equal to its pickle round-trip."""
   pane_info = windowed_value.PaneInfo(
       True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
   original = windowed_value.WindowedValue(1, 3, (), pane_info)
   restored = pickle.loads(pickle.dumps(original))
   # Uses ``==`` explicitly, with the restored value on the left.
   self.assertTrue(restored == original)
 def test_with_value(self):
   """with_value equals a WindowedValue built directly with the new value."""
   pane_info = windowed_value.PaneInfo(
       True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
   source = windowed_value.WindowedValue(1, 3, (), pane_info)
   replaced = source.with_value(10)
   expected = windowed_value.WindowedValue(10, 3, (), pane_info)
   self.assertEqual(replaced, expected)
예제 #12
0
        self.assertFalse(wv is wv_copy)
        self.assertEqual({wv: 100}.get(wv_copy), 100)

    def test_pickle(self):
        """A WindowedValue compares equal to its pickle round-trip."""
        pane_info = windowed_value.PaneInfo(
            True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
        original = windowed_value.WindowedValue(1, 3, (), pane_info)
        restored = pickle.loads(pickle.dumps(original))
        # Uses ``==`` explicitly, with the restored value on the left.
        self.assertTrue(restored == original)


# Sample HomogeneousWindowedBatch fixtures, both with a None payload,
# timestamp 3 and no windows: the first carries PANE_INFO_UNKNOWN, the second
# an explicit ON_TIME pane with both indices 0.
# NOTE(review): presumably consumed by tests outside this view -- confirm.
WINDOWED_BATCH_INSTANCES = [
    windowed_value.HomogeneousWindowedBatch.of(
        None, 3, (), windowed_value.PANE_INFO_UNKNOWN),
    windowed_value.HomogeneousWindowedBatch.of(
        None, 3, (),
        windowed_value.PaneInfo(True, False,
                                windowed_value.PaneInfoTiming.ON_TIME, 0, 0)),
]


class WindowedBatchTest(unittest.TestCase):
    def test_homogeneous_windowed_batch_with_values(self):
        """with_values equals a batch built directly from the new values."""
        pane_info = windowed_value.PaneInfo(
            True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
        source_batch = windowed_value.HomogeneousWindowedBatch.of(
            ['foo', 'bar'], 6, (), pane_info)

        replaced_batch = source_batch.with_values(['baz', 'foo'])
        expected_batch = windowed_value.HomogeneousWindowedBatch.of(
            ['baz', 'foo'], 6, (), pane_info)

        self.assertEqual(replaced_batch, expected_batch)

    def test_homogeneous_windowed_batch_as_windowed_values(self):