def test_windowed_value_coder(self):
    """Check WindowedValueCoder's cloud object, wire bytes and round trips."""
    coder = coders.WindowedValueCoder(
        coders.VarIntCoder(), coders.GlobalWindowCoder())

    # The cloud object must list both component coders, in order.
    expected_cloud_object = {
        '@type': 'kind:windowed_value',
        'is_wrapper': True,
        'component_encodings': [
            coders.VarIntCoder().as_cloud_object(),
            coders.GlobalWindowCoder().as_cloud_object(),
        ],
    }
    self.assertEqual(expected_cloud_object, coder.as_cloud_object())

    # The encoded form of a globally windowed 1 is pinned byte for byte.
    expected_encoding = (
        '\x01\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01')
    self.assertEqual(
        expected_encoding,
        coder.encode(window.GlobalWindows.windowed_value(1)))

    # Unnested: a coder built with only a value coder.
    unnested_coder = coders.WindowedValueCoder(coders.VarIntCoder())
    self.check_coder(
        unnested_coder,
        windowed_value.WindowedValue(3, -100, ()),
        windowed_value.WindowedValue(-1, 100, (1, 2, 3)))

    # Nested: two windowed-value coders inside a tuple coder.
    nested_coder = coders.TupleCoder((
        coders.WindowedValueCoder(coders.FloatCoder()),
        coders.WindowedValueCoder(coders.StrUtf8Coder())))
    nested_value = (
        windowed_value.WindowedValue(1.5, 0, ()),
        windowed_value.WindowedValue("abc", 10, ('window', )))
    self.check_coder(nested_coder, nested_value)
def test_windowedvalue_coder_paneinfo(self):
    """Check WindowedValueCoder against a range of PaneInfo settings."""
    coder = coders.WindowedValueCoder(
        coders.VarIntCoder(), coders.GlobalWindowCoder())

    timing = windowed_value.PaneInfoTiming
    # Positional PaneInfo arguments for every pane under test.
    pane_args = [
        (True, True, timing.EARLY, 0, -1),
        (True, False, timing.ON_TIME, 0, 0),
        (True, False, timing.ON_TIME, 10, 0),
        (False, True, timing.ON_TIME, 0, 23),
        (False, True, timing.ON_TIME, 12, 23),
        (False, False, timing.LATE, 0, 123),
    ]
    panes = [windowed_value.PANE_INFO_UNKNOWN]
    for args in pane_args:
        panes.append(windowed_value.PaneInfo(*args))

    samples = [
        windowed_value.WindowedValue(123, 234, (GlobalWindow(), ), pane)
        for pane in panes
    ]

    # Unnested: the unknown pane explicitly, then each sample in turn.
    self.check_coder(
        coder,
        windowed_value.WindowedValue(
            123, 234, (GlobalWindow(), ), windowed_value.PANE_INFO_UNKNOWN))
    for sample in samples:
        self.check_coder(coder, sample)

    # Nested: every ordered pair of samples through a tuple coder.
    for left in samples:
        for right in samples:
            self.check_coder(
                coders.TupleCoder((coder, coder)), (left, right))
def test_timestamps(self):
    """Check that WindowedValue.timestamp matches Timestamp.of(raw value)."""
    int_stamped = windowed_value.WindowedValue(None, 3, ())
    self.assertEqual(int_stamped.timestamp, Timestamp.of(3))

    # Two successive reads of the attribute must return the same object.
    first_read = int_stamped.timestamp
    second_read = int_stamped.timestamp
    self.assertTrue(first_read is second_read)

    # Fractional, negative timestamps are converted the same way.
    float_stamped = windowed_value.WindowedValue(None, -2.5, ())
    self.assertEqual(float_stamped.timestamp, Timestamp.of(-2.5))
def test_homogeneous_windowed_batch_as_windowed_values(self):
    """Check that a homogeneous batch expands to one WindowedValue per item."""
    pane_info = windowed_value.PaneInfo(
        True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
    batch = windowed_value.HomogeneousWindowedBatch.of(
        ['foo', 'bar'], 3, (), pane_info)

    exploded = list(batch.as_windowed_values(iter))

    # Every element keeps the batch's timestamp, windows and pane info.
    expected = [
        windowed_value.WindowedValue('foo', 3, (), pane_info),
        windowed_value.WindowedValue('bar', 3, (), pane_info),
    ]
    self.assertEqual(exploded, expected)
def test_param_windowed_value_coder(self):
    """Check ParamWindowedValueCoder's wire bytes and round trips."""
    from apache_beam.transforms.window import IntervalWindow
    from apache_beam.utils.windowed_value import PaneInfo

    # Encode a template windowed value; its bytes become the coder payload.
    template = windowed_value.create(
        b'',
        # Milliseconds to microseconds
        1000 * 1000,
        (IntervalWindow(11, 21), ),
        PaneInfo(True, False, 1, 2, 3))
    template_coder = coders.WindowedValueCoder(
        coders.BytesCoder(), coders.IntervalWindowCoder())
    payload = template_coder.encode(template)

    def param_coder(value_coder):
        # Fresh parameterized coder sharing the payload built above.
        return coders.ParamWindowedValueCoder(
            payload, [value_coder, coders.IntervalWindowCoder()])

    def windowed(value):
        # Windowed value carrying the timestamp/window/pane used in checks.
        return windowed_value.WindowedValue(
            value,
            1, (window.IntervalWindow(11, 21), ),
            PaneInfo(True, False, 1, 2, 3))

    # Binary representation is pinned to a single byte here.
    coder = param_coder(coders.VarIntCoder())
    self.assertEqual(
        b'\x01', coder.encode(window.GlobalWindows.windowed_value(1)))

    # Unnested.
    self.check_coder(
        param_coder(coders.VarIntCoder()), windowed(3), windowed(1))

    # Nested inside a tuple coder.
    nested_coder = coders.TupleCoder((
        param_coder(coders.FloatCoder()),
        param_coder(coders.StrUtf8Coder())))
    self.check_coder(nested_coder, (windowed(1.5), windowed("abc")))
def test_homogeneous_from_windowed_values(self):
    """Check that from_windowed_values yields one batch per timestamp."""
    pane_info = windowed_value.PaneInfo(
        True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)

    # (value, timestamp) pairs in the order they are fed in.
    inputs = [
        ('foofoo', 3),
        ('foobar', 6),
        ('foobaz', 9),
        ('barfoo', 3),
        ('barbar', 6),
        ('barbaz', 9),
        ('bazfoo', 3),
        ('bazbar', 6),
        ('bazbaz', 9),
    ]
    windowed_values = [
        windowed_value.WindowedValue(value, ts, (), pane_info)
        for value, ts in inputs
    ]

    actual = list(
        windowed_value.WindowedBatch.from_windowed_values(
            windowed_values, produce_fn=list))

    # (values, timestamp) for each expected batch, in expected order.
    expected_groups = [
        (['foofoo', 'barfoo', 'bazfoo'], 3),
        (['foobar', 'barbar', 'bazbar'], 6),
        (['foobaz', 'barbaz', 'bazbaz'], 9),
    ]
    expected = [
        windowed_value.HomogeneousWindowedBatch.of(
            values, ts, (), pane_info) for values, ts in expected_groups
    ]
    self.assertEqual(actual, expected)
def test_equality(self):
    """Check WindowedValue equality and inequality comparisons."""
    make = windowed_value.WindowedValue

    # Identical value, timestamp and windows compare equal.
    self.assertEqual(make(1, 3, ()), make(1, 3, ()))

    # Each of these differs from WindowedValue(1, 3, ()) in some respect.
    unequal_others = [
        make(100, 3, ()),  # different value
        make(1, 300, ()),  # different timestamp
        make(1, 300, ((), )),  # different timestamp and windows
        object(),  # not a WindowedValue at all
    ]
    for other in unequal_others:
        self.assertNotEqual(make(1, 3, ()), other)
def random_windowed_value(num_windows):
    """Build a WindowedValue with ``num_windows`` interval windows.

    The value comes from ``small_int()``. Window ``i`` starts at ``i * 10``
    and ends ``small_int()`` later. The timestamp is fixed at 12345678.
    """
    # Draw the value first so the order of small_int() calls is unchanged.
    payload = small_int()
    intervals = []
    for index in range(num_windows):
        intervals.append(
            window.IntervalWindow(index * 10, index * 10 + small_int()))
    return windowed_value.WindowedValue(
        value=payload, timestamp=12345678, windows=tuple(intervals))
def finish_bundle(self):
    """Emit one concatenated result per (window, timestamp), then reset.

    For each key in ``self._parts`` the per-tag part lists are joined with
    ``pd.concat`` and yielded as a dict inside a WindowedValue. Afterwards
    ``start_bundle`` is called again.
    """
    for key, tagged_parts in self._parts.items():
        window, timestamp = key
        merged = {}
        for tag, parts in tagged_parts.items():
            merged[tag] = pd.concat(parts)
        yield windowed_value.WindowedValue(merged, timestamp, (window, ))
    self.start_bundle()
def reify_timestamps(
    element, timestamp=DoFn.TimestampParam, window=DoFn.WindowParam):
    """Return ``(key, WindowedValue(value, timestamp, [window]))``.

    The timestamp and window travel inside the value so they can be
    restored later.
    """
    k, v = element
    reified = windowed_value.WindowedValue(v, timestamp, [window])
    return k, reified
def finish_bundle(self):
    """Flush every non-empty pending batch, then reset the batching state."""
    for window, batch in self._batches.items():
        if not batch:
            continue
        # Emission happens inside the estimator's timing context.
        with self._batch_size_estimator.record_time(self._batch_size):
            yield windowed_value.WindowedValue(
                batch, window.max_timestamp(), (window, ))
    self._batches = None
    self._batch_size = self._batch_size_estimator.next_batch_size()
def restore_timestamps(element, window=DoFn.WindowParam):
    """Unwrap reified values into ``(key, value)`` WindowedValues.

    Each output keeps the timestamp stored on the reified value and is
    placed in the current ``window``.
    """
    # The current window is passed explicitly since _IdentityWindowFn
    # wouldn't know how to generate it.
    key, reified_values = element
    restored = []
    for reified in reified_values:
        restored.append(
            windowed_value.WindowedValue(
                (key, reified.value), reified.timestamp, [window]))
    return restored
def finish_bundle(self):
    """Emit one concatenated result per (window, timestamp), then reset.

    For each key in ``self._parts`` every tag's parts are joined with
    ``pd.concat``; a tag whose part list is empty maps to ``None``.
    Afterwards ``start_bundle`` is called again.
    """
    for key, tagged_parts in self._parts.items():
        window, timestamp = key
        merged = {}
        for tag, parts in tagged_parts.items():
            if parts:
                merged[tag] = pd.concat(parts)
            else:
                merged[tag] = None
        yield windowed_value.WindowedValue(merged, timestamp, (window, ))
    self.start_bundle()
def process(self, element, window=DoFn.WindowParam):
    """Buffer ``element`` under its window and flush at most one batch.

    A batch is flushed when either:
      * the current window's batch has reached ``self._batch_size``, or
      * more than ``self._MAX_LIVE_WINDOWS`` windows are buffered, in which
        case the window holding the largest batch is flushed.

    Yields:
        A WindowedValue holding the flushed batch, timestamped at the
        flushed window's ``max_timestamp()`` and placed in that window.
    """
    self._batches[window].append(element)

    if len(self._batches[window]) >= self._batch_size:
        flush_window = window
    elif len(self._batches) > self._MAX_LIVE_WINDOWS:
        # max() returns the first maximal item, exactly what a stable
        # descending sort followed by [0] selects, without sorting everything.
        flush_window, _ = max(
            self._batches.items(),
            key=lambda window_batch: len(window_batch[1]))
    else:
        return

    with self._batch_size_estimator.record_time(self._batch_size):
        yield windowed_value.WindowedValue(
            self._batches[flush_window],
            flush_window.max_timestamp(), (flush_window, ))
    del self._batches[flush_window]
    self._batch_size = self._batch_size_estimator.next_batch_size()
def test_windowed_value_coder(self):
    """Check WindowedValueCoder's cloud object, wire bytes and round trips."""
    coder = coders.WindowedValueCoder(
        coders.VarIntCoder(), coders.GlobalWindowCoder())

    # The cloud object must list both component coders, in order.
    expected_cloud_object = {
        '@type': 'kind:windowed_value',
        'is_wrapper': True,
        'component_encodings': [
            coders.VarIntCoder().as_cloud_object(),
            coders.GlobalWindowCoder().as_cloud_object(),
        ],
    }
    self.assertEqual(expected_cloud_object, coder.as_cloud_object())

    # The encoded form of a globally windowed 1 is pinned byte for byte.
    self.assertEqual(
        b'\x7f\xdf;dZ\x1c\xac\t\x00\x00\x00\x01\x0f\x01',
        coder.encode(window.GlobalWindows.windowed_value(1)))

    # Decoding a large-magnitude timestamp must give MIN_TIMESTAMP.
    decoded = coder.decode(
        b'\x7f\xdf;dZ\x1c\xac\x08\x00\x00\x00\x01\x0f\x00')
    expected_decoded = windowed_value.create(
        0, MIN_TIMESTAMP.micros, (GlobalWindow(), ))
    self.assertEqual(decoded, expected_decoded)

    # Unnested: a coder built with only a value coder.
    unnested_coder = coders.WindowedValueCoder(coders.VarIntCoder())
    self.check_coder(
        unnested_coder,
        windowed_value.WindowedValue(3, -100, ()),
        windowed_value.WindowedValue(-1, 100, (1, 2, 3)))

    # Global window with an explicit GlobalWindowCoder.
    global_coder = coders.WindowedValueCoder(
        coders.VarIntCoder(), coders.GlobalWindowCoder())
    self.check_coder(global_coder, window.GlobalWindows.windowed_value(1))

    # Nested: two windowed-value coders inside a tuple coder.
    nested_coder = coders.TupleCoder((
        coders.WindowedValueCoder(coders.FloatCoder()),
        coders.WindowedValueCoder(coders.StrUtf8Coder())))
    nested_value = (
        windowed_value.WindowedValue(1.5, 0, ()),
        windowed_value.WindowedValue("abc", 10, ('window', )))
    self.check_coder(nested_coder, nested_value)
def process(
    self, element, window=DoFn.WindowParam, timestamp=DoFn.TimestampParam):
    """Split a key/value element and re-emit the value with its context.

    Yields:
        ``(key, WindowedValue(value, timestamp, [window]))``.

    Raises:
        TypeCheckError: if ``element`` cannot be unpacked into exactly two
            items.
    """
    try:
        k, v = element
    except (TypeError, ValueError):
        # TypeError: the element is not iterable at all.
        # ValueError: it is iterable but does not hold exactly two items.
        # Either way it is not a key/value pair.
        raise TypeCheckError(
            'Input to GroupByKey must be a PCollection with '
            'elements compatible with KV[A, B]')

    yield (k, windowed_value.WindowedValue(v, timestamp, [window]))
def _window(self, output, add_window=False):
    """Optionally force an output into the global window.

    While 'process' outputs to the same window as its incoming element,
    'finish_bundle' has to specify a window to output into. Since we are
    dealing with a bounded input, we can use 'GlobalWindow'.

    Args:
      output: The function output that may need to be added to a window.
      add_window: When true, wrap the output in the GlobalWindow.

    Returns:
      output unchanged, or output encapsulated in a 'WindowedValue'.
    """
    if not add_window:
        return output
    return windowed_value.WindowedValue(output, -1, [window.GlobalWindow()])
def test_nested_observables(self):
    """Check that observables nested inside composite values are reported."""
    class FakeObservableIterator(observable.ObservableMixin):
        def __iter__(self):
            return iter([1, 2, 3])

    # Coder for the elements produced by the observable iterator.
    elem_coder = coders.VarIntCoder()
    iter_coder = coders.TupleSequenceCoder(elem_coder)

    # Observable nested inside a WindowedValue.
    windowed_coder = coders.WindowedValueCoder(iter_coder)
    observ = FakeObservableIterator()
    windowed = windowed_value.WindowedValue(observ, 0, ())
    windowed_result = (
        windowed_coder.get_impl().get_estimated_size_and_observables(
            windowed))
    self.assertEqual(
        windowed_result[1], [(observ, elem_coder.get_impl())])

    # Observable nested inside a tuple.
    tuple_coder = coders.TupleCoder((coders.StrUtf8Coder(), iter_coder))
    pair = (u'123', observ)
    tuple_result = (
        tuple_coder.get_impl().get_estimated_size_and_observables(pair))
    self.assertEqual(tuple_result[1], [(observ, elem_coder.get_impl())])
def test_pickle(self):
    """Check that a WindowedValue with pane info survives a pickle round trip."""
    pane_info = windowed_value.PaneInfo(
        True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
    original = windowed_value.WindowedValue(1, 3, (), pane_info)

    restored = pickle.loads(pickle.dumps(original))

    self.assertTrue(restored == original)
def test_hash(self):
    """Check that a copied WindowedValue finds the original's dict entry."""
    original = windowed_value.WindowedValue(1, 3, ())
    duplicate = copy.copy(original)

    # The copy must be a distinct object for the lookup to mean anything.
    self.assertFalse(original is duplicate)

    table = {original: 100}
    self.assertEqual(table.get(duplicate), 100)
def test_with_value(self):
    """Check that with_value swaps the value and keeps everything else."""
    pane_info = windowed_value.PaneInfo(
        True, True, windowed_value.PaneInfoTiming.ON_TIME, 0, 0)
    original = windowed_value.WindowedValue(1, 3, (), pane_info)

    replaced = original.with_value(10)

    # Timestamp, windows and pane info are carried over unchanged.
    expected = windowed_value.WindowedValue(10, 3, (), pane_info)
    self.assertEqual(replaced, expected)
def test_with_value(self):
    """Check that with_value swaps the value and keeps timestamp and windows."""
    original = windowed_value.WindowedValue(1, 3, ())

    replaced = original.with_value(10)

    expected = windowed_value.WindowedValue(10, 3, ())
    self.assertEqual(replaced, expected)
def test_pickle(self):
    """Check that a WindowedValue survives a pickle round trip."""
    original = windowed_value.WindowedValue(1, 3, ())

    restored = pickle.loads(pickle.dumps(original))

    self.assertTrue(restored == original)
def finish_bundle(self):
    """Emit one concatenated result per (window, timestamp), then reset.

    Each entry of ``self._parts`` is joined with ``_concat`` and yielded in
    a WindowedValue. Afterwards ``start_bundle`` is called again.
    """
    for key, parts in self._parts.items():
        window, timestamp = key
        combined = _concat(parts)
        yield windowed_value.WindowedValue(combined, timestamp, (window, ))
    self.start_bundle()
def globally_windowed_value():
    """Build a WindowedValue in a single GlobalWindow.

    The value comes from ``small_int()`` and the timestamp is fixed at
    12345678.
    """
    payload = small_int()
    global_windows = (window.GlobalWindow(), )
    return windowed_value.WindowedValue(
        value=payload, timestamp=12345678, windows=global_windows)
def finish_bundle(self):
    """Yield each ``(key, values)`` pair from ``self.keys``.

    Every pair is emitted at MIN_TIMESTAMP in a GlobalWindow.
    """
    for key, values in self.keys.items():
        grouped = (key, values)
        yield windowed_value.WindowedValue(
            grouped, MIN_TIMESTAMP, [GlobalWindow()])
def set(self, ts):
    """Send a timer record for ``ts`` to the receiver.

    ``ts`` is normalized with ``Timestamp.of``. The normalized value is both
    the record's ``timestamp`` field and the WindowedValue's timestamp.
    """
    fire_time = timestamp.Timestamp.of(ts)
    record = (self._key, dict(timestamp=fire_time))
    self._receiver.receive(
        windowed_value.WindowedValue(record, fire_time, (self._window, )))
def process(self, element, window=DoFn.WindowParam):
    """Unwrap a reified element into a ``(key, value)`` WindowedValue.

    The output keeps the timestamp stored on ``element[1]`` and is placed in
    the current ``window``.
    """
    # The current window is passed explicitly since _IdentityWindowFn
    # wouldn't know how to generate it.
    key = element[0]
    reified = element[1]
    yield windowed_value.WindowedValue(
        (key, reified.value), reified.timestamp, [window])
def clear(self):
    """Send a timer record stamped one millisecond past the maximum.

    The record's ``timestamp`` field is ``MAX_TIMESTAMP_MILLIS + 1``
    converted to microseconds. The enclosing WindowedValue has timestamp 0.
    """
    # NOTE(review): the out-of-range timestamp presumably marks the timer
    # as cleared for the consumer -- confirm against the receiver.
    max_millis = int(common_urns.constants.MAX_TIMESTAMP_MILLIS.constant)
    past_max = timestamp.Timestamp(micros=(max_millis + 1) * 1000)
    record = (self._key, dict(timestamp=past_max))
    self._receiver.receive(
        windowed_value.WindowedValue(record, 0, (self._window, )))