class BasicStatefulDoFn(DoFn):
    """Minimal DoFn declaring one bag state and two watermark timers.

    ``process`` and both timer callbacks only re-emit the ``element`` they
    are given; none of them reads or writes the declared state or timers.
    """

    # Bag state named 'buffer', built with a BytesCoder.
    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    # Timer named 'expiry1' in the WATERMARK time domain.
    EXPIRY_TIMER = TimerSpec('expiry1', TimeDomain.WATERMARK)
    # A second, separately named watermark timer.  NOTE(review): the name
    # suggests it is meant to be used as a timer family (dynamic timers), but
    # it is declared with a plain TimerSpec here -- confirm intent.
    EXPIRY_TIMER_FAMILY = TimerSpec('expiry_family_1', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            buffer=DoFn.StateParam(BUFFER_STATE),
            timer1=DoFn.TimerParam(EXPIRY_TIMER),
            dynamic_timer=DoFn.TimerParam(EXPIRY_TIMER_FAMILY)):
        """Yield ``element`` unchanged; state and timers are requested but unused."""
        yield element

    @on_timer(EXPIRY_TIMER)
    def expiry_callback(self, element, timer=DoFn.TimerParam(EXPIRY_TIMER)):
        """Callback registered for EXPIRY_TIMER; yields ``element`` unchanged."""
        yield element

    @on_timer(EXPIRY_TIMER_FAMILY)
    def expiry_family_callback(
            self,
            element,
            dynamic_timer=DoFn.TimerParam(EXPIRY_TIMER_FAMILY)):
        """Callback registered for EXPIRY_TIMER_FAMILY; yields ``element`` unchanged."""
        yield element
class StatefulDoFnWithTimerWithTypo3(DoFn):
    """DoFn that requests two timers but registers a callback for only one.

    ``process`` takes both EXPIRY_TIMER_1 and EXPIRY_TIMER_2 as parameters,
    yet only ``on_expiry_1`` carries an ``@on_timer`` decorator.
    """

    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    EXPIRY_TIMER_1 = TimerSpec('expiry1', TimeDomain.WATERMARK)
    EXPIRY_TIMER_2 = TimerSpec('expiry2', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            timer1=DoFn.TimerParam(EXPIRY_TIMER_1),
            timer2=DoFn.TimerParam(EXPIRY_TIMER_2)):
        """Accept an element and both timers; does nothing and returns None."""
        pass

    @on_timer(EXPIRY_TIMER_1)
    def on_expiry_1(self, buffer_state=DoFn.StateParam(BUFFER_STATE)):
        """Callback registered for EXPIRY_TIMER_1; yields 'expired1'."""
        yield 'expired1'

    # NOTE(review): there is no @on_timer(EXPIRY_TIMER_2) decorator on this
    # method, so it is an ordinary method rather than a registered timer
    # callback.  Judging by the class name this omission is presumably the
    # deliberate "typo" under test -- do not "fix" it without checking the
    # test that uses this class.
    def on_expiry_2(self, buffer_state=DoFn.StateParam(BUFFER_STATE)):
        """Undecorated method; yields 'expired2' if called directly."""
        yield 'expired2'

    # Use a stable string value for matching.
    def __repr__(self):
        """Return the fixed string 'StatefulDoFnWithTimerWithTypo3'."""
        return 'StatefulDoFnWithTimerWithTypo3'
class TestStatefulDoFn(DoFn):
    """An example stateful DoFn with state and timers.

    Declares two bag states and three watermark timers.  ``process`` requests
    both states and the first two timers; each timer has its own ``@on_timer``
    callback that requests a different mix of state, timer and context
    parameters and yields a fixed marker string.
    """

    # Bag states: 'buffer' uses a BytesCoder, 'buffer2' a VarIntCoder.
    BUFFER_STATE_1 = BagStateSpec('buffer', BytesCoder())
    BUFFER_STATE_2 = BagStateSpec('buffer2', VarIntCoder())
    # Three independently named timers, all in the WATERMARK time domain.
    EXPIRY_TIMER_1 = TimerSpec('expiry1', TimeDomain.WATERMARK)
    EXPIRY_TIMER_2 = TimerSpec('expiry2', TimeDomain.WATERMARK)
    EXPIRY_TIMER_3 = TimerSpec('expiry3', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            t=DoFn.TimestampParam,
            buffer_1=DoFn.StateParam(BUFFER_STATE_1),
            buffer_2=DoFn.StateParam(BUFFER_STATE_2),
            timer_1=DoFn.TimerParam(EXPIRY_TIMER_1),
            timer_2=DoFn.TimerParam(EXPIRY_TIMER_2)):
        """Yield ``element`` unchanged; the requested parameters are not used."""
        yield element

    @on_timer(EXPIRY_TIMER_1)
    def on_expiry_1(
            self,
            window=DoFn.WindowParam,
            timestamp=DoFn.TimestampParam,
            key=DoFn.KeyParam,
            buffer=DoFn.StateParam(BUFFER_STATE_1),
            timer_1=DoFn.TimerParam(EXPIRY_TIMER_1),
            timer_2=DoFn.TimerParam(EXPIRY_TIMER_2),
            timer_3=DoFn.TimerParam(EXPIRY_TIMER_3)):
        """Callback for EXPIRY_TIMER_1.

        Requests window, timestamp and key context plus the first state and
        all three timers; yields 'expired1'.
        """
        yield 'expired1'

    @on_timer(EXPIRY_TIMER_2)
    def on_expiry_2(
            self,
            buffer=DoFn.StateParam(BUFFER_STATE_2),
            timer_2=DoFn.TimerParam(EXPIRY_TIMER_2),
            timer_3=DoFn.TimerParam(EXPIRY_TIMER_3)):
        """Callback for EXPIRY_TIMER_2; yields 'expired2'."""
        yield 'expired2'

    @on_timer(EXPIRY_TIMER_3)
    def on_expiry_3(
            self,
            buffer_1=DoFn.StateParam(BUFFER_STATE_1),
            buffer_2=DoFn.StateParam(BUFFER_STATE_2),
            timer_3=DoFn.TimerParam(EXPIRY_TIMER_3)):
        """Callback for EXPIRY_TIMER_3; yields 'expired3'."""
        yield 'expired3'
class HashJoinStatefulDoFn(DoFn):
    """Joins each incoming value with a previously buffered one, if any.

    ``process`` unpacks every element as ``(key, value)``.  When the bag state
    is empty the value is buffered and the timer is set to 100; otherwise a
    ``Record<key,buffered,value>`` bytes string is emitted and both the
    buffer and the timer are cleared.  If the timer fires, the single
    buffered value is emitted as ``Unmatched<value>``.
    """

    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    UNMATCHED_TIMER = TimerSpec('unmatched', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            state=DoFn.StateParam(BUFFER_STATE),
            timer=DoFn.TimerParam(UNMATCHED_TIMER)):
        """Buffer the value, or emit a joined record if one is already buffered."""
        key, value = element
        pending = list(state.read())

        if pending:
            # A partner is waiting: emit the pair, then reset state and timer.
            yield b'Record<%s,%s,%s>' % (key, pending[0], value)
            state.clear()
            timer.clear()
            return

        # Nothing buffered yet: remember this value and arm the timer.
        state.add(value)
        timer.set(100)

    @on_timer(UNMATCHED_TIMER)
    def expiry_callback(self, state=DoFn.StateParam(BUFFER_STATE)):
        """Emit the lone buffered value as unmatched and clear the buffer."""
        leftovers = list(state.read())
        # Exactly one value is expected to be buffered when the timer fires.
        assert len(leftovers) == 1, leftovers
        state.clear()
        yield b'Unmatched<%s>' % (leftovers[0],)
class StatefulDoFnWithTimerWithTypo2(DoFn):
    """DoFn whose two timer callbacks share the method name ``on_expiry_1``.

    Because the second definition rebinds the class attribute
    ``on_expiry_1``, only the function decorated for EXPIRY_TIMER_2 (yielding
    'expired2') remains reachable under that name on the class.
    """

    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    EXPIRY_TIMER_1 = TimerSpec('expiry1', TimeDomain.WATERMARK)
    EXPIRY_TIMER_2 = TimerSpec('expiry2', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            timer1=DoFn.TimerParam(EXPIRY_TIMER_1),
            timer2=DoFn.TimerParam(EXPIRY_TIMER_2)):
        """Accept an element and both timers; does nothing and returns None."""
        pass

    @on_timer(EXPIRY_TIMER_1)
    def on_expiry_1(self, buffer_state=DoFn.StateParam(BUFFER_STATE)):
        """Callback decorated for EXPIRY_TIMER_1; yields 'expired1'."""
        yield 'expired1'

    # Note that we mistakenly reuse the "on_expiry_1" name; this is valid
    # syntactically in Python.
    @on_timer(EXPIRY_TIMER_2)
    def on_expiry_1(self, buffer_state=DoFn.StateParam(BUFFER_STATE)):
        """Callback decorated for EXPIRY_TIMER_2; yields 'expired2'."""
        yield 'expired2'

    # Use a stable string value for matching.
    def __repr__(self):
        """Return the fixed string 'StatefulDoFnWithTimerWithTypo2'."""
        return 'StatefulDoFnWithTimerWithTypo2'
def test_param_construction(self):
    """Check that StateParam and TimerParam reject the wrong kind of spec.

    Each parameter wrapper is handed the other wrapper's spec type and is
    expected to raise ValueError.
    """
    # A timer spec is not a valid argument for a state parameter.
    with self.assertRaises(ValueError):
        timer_spec = TimerSpec('timer', TimeDomain.WATERMARK)
        DoFn.StateParam(timer_spec)

    # A state spec is not a valid argument for a timer parameter.
    with self.assertRaises(ValueError):
        state_spec = BagStateSpec('elements', BytesCoder())
        DoFn.TimerParam(state_spec)
from apache_beam.runners.worker import bundle_processor
from apache_beam.transforms import trigger
from apache_beam.transforms import window
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import GlobalWindows
from apache_beam.utils import proto_utils
from apache_beam.utils import windowed_value

# These imports are only evaluated when TYPE_CHECKING is true.
if TYPE_CHECKING:
    from apache_beam.coders.coder_impl import CoderImpl
    from apache_beam.runners.portability.fn_api_runner import worker_handlers
    from apache_beam.runners.portability.fn_api_runner.translations import DataSideInput
    from apache_beam.transforms.window import BoundedWindow

# Nested encoding of an empty bytes value in the global window, produced by
# the implementation of WindowedValueCoder(BytesCoder(), GlobalWindowCoder()).
ENCODED_IMPULSE_VALUE = WindowedValueCoder(
    BytesCoder(), GlobalWindowCoder()).get_impl().encode_nested(
        GlobalWindows.windowed_value(b''))

# Every URN registered in WindowFn._known_urns except the pickled-WindowFn
# URN.  NOTE(review): "safe" presumably means these can be handled without
# unpickling user code -- confirm against the call sites.
SAFE_WINDOW_FNS = set(window.WindowFn._known_urns.keys()) - set(
    [python_urns.PICKLED_WINDOWFN])


class Buffer(Protocol):
    """Structural type for a container of encoded items.

    Per the type comments below, implementations iterate over ``bytes`` and
    accept ``bytes`` via ``append``.  The method bodies here are placeholders.
    """

    def __iter__(self):
        # type: () -> Iterator[bytes]
        """Iterate over the buffered items."""
        pass

    def append(self, item):
        # type: (bytes) -> None
        """Add ``item`` to the buffer."""
        pass
class BadStatefulDoFn1(DoFn):
    """DoFn whose ``process`` binds the same state spec to two parameters.

    NOTE(review): judging by the class name, the duplicate use of
    BUFFER_STATE is deliberate and is presumably expected to be rejected by
    DoFn validation -- confirm against the test that uses this class.
    """

    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())

    def process(
            self,
            element,
            b1=DoFn.StateParam(BUFFER_STATE),
            b2=DoFn.StateParam(BUFFER_STATE)):
        """Yield ``element`` unchanged; ``b1`` and ``b2`` are never used."""
        # Both b1 and b2 are declared against the same BUFFER_STATE spec.
        yield element
def _encode_str(str_obj):
    """Wrap a string as a ConfigValue carrying length-prefixed UTF-8 bytes.

    Args:
      str_obj: object with an ``encode('utf-8')`` method (a text string).

    Returns:
      A ConfigValue whose ``coder_urn`` is ``['beam:coder:bytes:v1']`` and
      whose ``payload`` is the UTF-8 bytes encoded with
      ``LengthPrefixCoder(BytesCoder())``.
    """
    utf8_bytes = str_obj.encode('utf-8')
    payload = LengthPrefixCoder(BytesCoder()).encode(utf8_bytes)
    return ConfigValue(coder_urn=['beam:coder:bytes:v1'], payload=payload)
def _encode_list(list_obj):
    """Wrap an iterable of strings as a ConfigValue of length-prefixed bytes.

    Args:
      list_obj: iterable of objects with an ``encode('utf-8')`` method.

    Returns:
      A ConfigValue whose ``coder_urn`` is
      ``['beam:coder:iterable:v1', 'beam:coder:bytes:v1']`` and whose
      ``payload`` is the list of UTF-8 encoded items, encoded with
      ``IterableCoder(LengthPrefixCoder(BytesCoder()))``.
    """
    utf8_items = [item.encode('utf-8') for item in list_obj]
    payload = IterableCoder(LengthPrefixCoder(BytesCoder())).encode(utf8_items)
    return ConfigValue(
        coder_urn=['beam:coder:iterable:v1', 'beam:coder:bytes:v1'],
        payload=payload)