class DynamicTimerDoFn(DoFn):
    """DoFn that sets three tagged timers on each of two watermark timer specs.

    Each timer callback yields ``(tag, timestamp)`` using the values injected
    through ``DoFn.DynamicTimerTagParam`` and ``DoFn.TimestampParam``.
    """

    EMIT_TIMER_FAMILY1 = TimerSpec('emit_family_1', TimeDomain.WATERMARK)
    EMIT_TIMER_FAMILY2 = TimerSpec('emit_family_2', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            emit1=DoFn.TimerParam(EMIT_TIMER_FAMILY1),
            emit2=DoFn.TimerParam(EMIT_TIMER_FAMILY2)):
        """Set the tagged timers; the element itself is not inspected."""
        # First spec: set values ascend with the tag number.
        for when, tag in ((10, 'emit11'), (20, 'emit12'), (30, 'emit13')):
            emit1.set(when, dynamic_timer_tag=tag)
        # Second spec: set values descend with the tag number.
        for when, tag in ((30, 'emit21'), (20, 'emit22'), (10, 'emit23')):
            emit2.set(when, dynamic_timer_tag=tag)

    @on_timer(EMIT_TIMER_FAMILY1)
    def emit_callback(
            self, ts=DoFn.TimestampParam, tag=DoFn.DynamicTimerTagParam):
        """Yield the tag and timestamp for a timer of the first spec."""
        yield tag, ts

    @on_timer(EMIT_TIMER_FAMILY2)
    def emit_callback_2(
            self, ts=DoFn.TimestampParam, tag=DoFn.DynamicTimerTagParam):
        """Yield the tag and timestamp for a timer of the second spec."""
        yield tag, ts
class StatefulDoFnWithTimerWithTypo2(DoFn):
    """Fixture in which two timer callbacks share the name ``on_expiry_1``.

    The second definition rebinds the class attribute ``on_expiry_1``, so the
    statement order in this class body is significant and must not change.
    """

    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    EXPIRY_TIMER_1 = TimerSpec('expiry1', TimeDomain.WATERMARK)
    EXPIRY_TIMER_2 = TimerSpec('expiry2', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            timer1=DoFn.TimerParam(EXPIRY_TIMER_1),
            timer2=DoFn.TimerParam(EXPIRY_TIMER_2)):
        """Declare both timers as parameters; no work is performed."""
        pass

    @on_timer(EXPIRY_TIMER_1)
    def on_expiry_1(self, buffer_state=DoFn.StateParam(BUFFER_STATE)):
        """Callback registered for EXPIRY_TIMER_1 (shadowed below)."""
        yield 'expired1'

    # Note that we mistakenly reuse the "on_expiry_1" name; this is valid
    # syntactically in Python.
    @on_timer(EXPIRY_TIMER_2)
    def on_expiry_1(self, buffer_state=DoFn.StateParam(BUFFER_STATE)):  # pylint: disable=function-redefined
        """Callback registered for EXPIRY_TIMER_2 under the reused name."""
        yield 'expired2'

    # Use a stable string value for matching.
    def __repr__(self):
        """Return a fixed name instead of the default object repr."""
        return 'StatefulDoFnWithTimerWithTypo2'
class TimerEmittingStatefulDoFn(DoFn):
    """DoFn that sets three watermark timers; each callback yields a label."""

    EMIT_TIMER_1 = TimerSpec('emit1', TimeDomain.WATERMARK)
    EMIT_TIMER_2 = TimerSpec('emit2', TimeDomain.WATERMARK)
    EMIT_TIMER_3 = TimerSpec('emit3', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            timer1=DoFn.TimerParam(EMIT_TIMER_1),
            timer2=DoFn.TimerParam(EMIT_TIMER_2),
            timer3=DoFn.TimerParam(EMIT_TIMER_3)):
        """Set timer1/2/3 to 10/20/30; nothing is emitted from here."""
        schedule = ((timer1, 10), (timer2, 20), (timer3, 30))
        for timer, when in schedule:
            timer.set(when)

    @on_timer(EMIT_TIMER_1)
    def emit_callback_1(self):
        """Yield the label of the first timer."""
        yield 'timer1'

    @on_timer(EMIT_TIMER_2)
    def emit_callback_2(self):
        """Yield the label of the second timer."""
        yield 'timer2'

    @on_timer(EMIT_TIMER_3)
    def emit_callback_3(self):
        """Yield the label of the third timer."""
        yield 'timer3'
class SetStateClearingStatefulDoFn(beam.DoFn):
    """DoFn with a set state that one timer clears and another timer emits.

    ``process`` adds ``element[1]`` to the set state and sets the clear timer
    to 100 and the emit timer to 1000.
    """

    SET_STATE = SetStateSpec('buffer', StrUtf8Coder())
    EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.WATERMARK)
    CLEAR_TIMER = TimerSpec('clear_timer', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            set_state=beam.DoFn.StateParam(SET_STATE),
            emit_timer=beam.DoFn.TimerParam(EMIT_TIMER),
            clear_timer=beam.DoFn.TimerParam(CLEAR_TIMER)):
        """Record the element's second item and set both timers."""
        set_state.add(element[1])
        clear_timer.set(100)
        emit_timer.set(1000)

    @on_timer(EMIT_TIMER)
    def emit_values(self, set_state=beam.DoFn.StateParam(SET_STATE)):
        """Yield every value currently held in the set state."""
        yield from set_state.read()

    @on_timer(CLEAR_TIMER)
    def clear_values(self, set_state=beam.DoFn.StateParam(SET_STATE)):
        """Empty the set state, then add a single replacement value."""
        set_state.clear()
        set_state.add('different-value')
class BagStateClearingStatefulDoFn(beam.DoFn):
    """DoFn with a bag state that one timer clears and another timer emits.

    ``process`` adds ``element[1]`` to the bag and sets the clear timer to 100
    and the emit timer to 1000.
    """

    BAG_STATE = BagStateSpec('bag_state', StrUtf8Coder())
    EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.WATERMARK)
    CLEAR_TIMER = TimerSpec('clear_timer', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            bag_state=beam.DoFn.StateParam(BAG_STATE),
            emit_timer=beam.DoFn.TimerParam(EMIT_TIMER),
            clear_timer=beam.DoFn.TimerParam(CLEAR_TIMER)):
        """Record the element's second item and set both timers."""
        bag_state.add(element[1])
        clear_timer.set(100)
        emit_timer.set(1000)

    @on_timer(EMIT_TIMER)
    def emit_values(self, bag_state=beam.DoFn.StateParam(BAG_STATE)):
        """Yield every value in the bag, followed by the marker 'extra'."""
        yield from bag_state.read()
        yield 'extra'

    @on_timer(CLEAR_TIMER)
    def clear_values(self, bag_state=beam.DoFn.StateParam(BAG_STATE)):
        """Empty the bag state."""
        bag_state.clear()
def test_spec_construction(self):
    """Check which spec/param constructor calls succeed and which raise."""
    # Bag state: a string name is accepted, an int name raises TypeError.
    BagStateSpec('statename', VarIntCoder())
    self.assertRaises(TypeError, BagStateSpec, 123, VarIntCoder())

    # Combining value state: bad name or bad combine fn raises TypeError.
    CombiningValueStateSpec('statename', VarIntCoder(), TopCombineFn(10))
    self.assertRaises(
        TypeError,
        CombiningValueStateSpec,
        123,
        VarIntCoder(),
        TopCombineFn(10))
    self.assertRaises(
        TypeError,
        CombiningValueStateSpec,
        'statename',
        VarIntCoder(),
        object())

    # Set state: bad name or bad coder raises TypeError.
    SetStateSpec('setstatename', VarIntCoder())
    self.assertRaises(TypeError, SetStateSpec, 123, VarIntCoder())
    self.assertRaises(TypeError, SetStateSpec, 'setstatename', object())

    # Read-modify-write state: bad name or bad coder raises TypeError.
    ReadModifyWriteStateSpec('valuestatename', VarIntCoder())
    self.assertRaises(TypeError, ReadModifyWriteStateSpec, 123, VarIntCoder())
    self.assertRaises(
        TypeError, ReadModifyWriteStateSpec, 'valuestatename', object())

    # TODO: add more spec tests
    # A state spec handed to TimerParam raises ValueError.
    self.assertRaises(
        ValueError, DoFn.TimerParam, BagStateSpec('elements', BytesCoder()))

    # Timer specs: both known time domains construct; an unknown one raises.
    TimerSpec('timer', TimeDomain.WATERMARK)
    TimerSpec('timer', TimeDomain.REAL_TIME)
    self.assertRaises(ValueError, TimerSpec, 'timer', 'bogus_time_domain')

    # A timer spec handed to StateParam raises ValueError.
    self.assertRaises(
        ValueError, DoFn.StateParam, TimerSpec('timer', TimeDomain.WATERMARK))
def _pardo_group_into_batches(
        input_coder, batch_size, max_buffering_duration_secs, clock=time.time):
    """Build a stateful DoFn that buffers elements and emits them in batches.

    A batch is flushed when the buffered count reaches ``batch_size``, when
    the window-end timer fires, or when the buffering timer fires. The
    buffering timer is only set when ``max_buffering_duration_secs`` is
    greater than zero.

    Args:
        input_coder: coder passed to both the element and the count state
            specs.
        batch_size: number of buffered elements that triggers a flush.
        max_buffering_duration_secs: delay added to ``clock()`` when setting
            the buffering timer for the first element of a batch.
        clock: zero-argument callable returning the current time.

    Returns:
        An instance of the locally defined ``_GroupIntoBatchesDoFn``.
    """
    ELEMENT_STATE = BagStateSpec('values', input_coder)
    COUNT_STATE = CombiningValueStateSpec(
        'count', input_coder, CountCombineFn())
    WINDOW_TIMER = TimerSpec('window_end', TimeDomain.WATERMARK)
    BUFFERING_TIMER = TimerSpec('buffering_end', TimeDomain.REAL_TIME)

    class _GroupIntoBatchesDoFn(DoFn):
        """Buffers elements in state and flushes them as (key, values)."""

        def process(
                self,
                element,
                window=DoFn.WindowParam,
                element_state=DoFn.StateParam(ELEMENT_STATE),
                count_state=DoFn.StateParam(COUNT_STATE),
                window_timer=DoFn.TimerParam(WINDOW_TIMER),
                buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
            """Buffer one element; return a flush generator once full."""
            # Allowed lateness not supported in Python SDK
            # https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
            window_timer.set(window.end)
            element_state.add(element)
            count_state.add(1)
            buffered = count_state.read()

            if buffered == 1 and max_buffering_duration_secs > 0:
                # This is the first element in batch. Start counting
                # buffering time if a limit was set.
                # pylint: disable=deprecated-method
                deadline = clock() + max_buffering_duration_secs
                buffering_timer.set(deadline)

            if buffered >= batch_size:
                return self.flush_batch(
                    element_state, count_state, buffering_timer)

        @on_timer(WINDOW_TIMER)
        def on_window_timer(
                self,
                element_state=DoFn.StateParam(ELEMENT_STATE),
                count_state=DoFn.StateParam(COUNT_STATE),
                buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
            """Flush whatever is buffered when the window timer fires."""
            return self.flush_batch(
                element_state, count_state, buffering_timer)

        @on_timer(BUFFERING_TIMER)
        def on_buffering_timer(
                self,
                element_state=DoFn.StateParam(ELEMENT_STATE),
                count_state=DoFn.StateParam(COUNT_STATE),
                buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
            """Flush whatever is buffered when the buffering timer fires."""
            return self.flush_batch(
                element_state, count_state, buffering_timer)

        def flush_batch(self, element_state, count_state, buffering_timer):
            """Yield ``(key, values)`` for the buffered pairs, then reset.

            Yields nothing when the element state is empty. The key is taken
            from the first buffered pair.
            """
            batch = list(element_state.read())
            if batch:
                key, _ = batch[0]
                batch_values = [value for _, value in batch]
                # State and the buffering timer are reset before emitting.
                element_state.clear()
                count_state.clear()
                buffering_timer.clear()
                yield key, batch_values

    return _GroupIntoBatchesDoFn()
class BadStatefulDoFn4(DoFn):
    # Fixture: the timer callback below declares two TimerParam parameters
    # that both reference EXPIRY_TIMER_2.
    # NOTE(review): the "Bad" prefix suggests this duplication is the
    # intentional defect under test -- confirm against the test that uses it.
    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    EXPIRY_TIMER_1 = TimerSpec('expiry1', TimeDomain.WATERMARK)
    EXPIRY_TIMER_2 = TimerSpec('expiry2', TimeDomain.WATERMARK)

    @on_timer(EXPIRY_TIMER_1)
    def expiry_callback(
            self,
            element,
            t1=DoFn.TimerParam(EXPIRY_TIMER_2),
            t2=DoFn.TimerParam(EXPIRY_TIMER_2)):
        # Registered for EXPIRY_TIMER_1, yet both t1 and t2 use EXPIRY_TIMER_2.
        yield element
class TestStatefulDoFn(DoFn):
    """Sample stateful DoFn declaring two bag states and four timer specs.

    ``process`` passes elements through unchanged; each timer callback yields
    a fixed marker identifying which timer it is registered for.
    """

    BUFFER_STATE_1 = BagStateSpec('buffer', BytesCoder())
    BUFFER_STATE_2 = BagStateSpec('buffer2', VarIntCoder())
    EXPIRY_TIMER_1 = TimerSpec('expiry1', TimeDomain.WATERMARK)
    EXPIRY_TIMER_2 = TimerSpec('expiry2', TimeDomain.WATERMARK)
    EXPIRY_TIMER_3 = TimerSpec('expiry3', TimeDomain.WATERMARK)
    EXPIRY_TIMER_FAMILY = TimerSpec('expiry_family', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            t=DoFn.TimestampParam,
            buffer_1=DoFn.StateParam(BUFFER_STATE_1),
            buffer_2=DoFn.StateParam(BUFFER_STATE_2),
            timer_1=DoFn.TimerParam(EXPIRY_TIMER_1),
            timer_2=DoFn.TimerParam(EXPIRY_TIMER_2),
            dynamic_timer=DoFn.TimerParam(EXPIRY_TIMER_FAMILY)):
        """Yield the element unchanged; state and timers are only declared."""
        yield element

    @on_timer(EXPIRY_TIMER_1)
    def on_expiry_1(
            self,
            window=DoFn.WindowParam,
            timestamp=DoFn.TimestampParam,
            key=DoFn.KeyParam,
            buffer=DoFn.StateParam(BUFFER_STATE_1),
            timer_1=DoFn.TimerParam(EXPIRY_TIMER_1),
            timer_2=DoFn.TimerParam(EXPIRY_TIMER_2),
            timer_3=DoFn.TimerParam(EXPIRY_TIMER_3)):
        """Callback for EXPIRY_TIMER_1."""
        yield 'expired1'

    @on_timer(EXPIRY_TIMER_2)
    def on_expiry_2(
            self,
            buffer=DoFn.StateParam(BUFFER_STATE_2),
            timer_2=DoFn.TimerParam(EXPIRY_TIMER_2),
            timer_3=DoFn.TimerParam(EXPIRY_TIMER_3)):
        """Callback for EXPIRY_TIMER_2."""
        yield 'expired2'

    @on_timer(EXPIRY_TIMER_3)
    def on_expiry_3(
            self,
            buffer_1=DoFn.StateParam(BUFFER_STATE_1),
            buffer_2=DoFn.StateParam(BUFFER_STATE_2),
            timer_3=DoFn.TimerParam(EXPIRY_TIMER_3)):
        """Callback for EXPIRY_TIMER_3."""
        yield 'expired3'

    @on_timer(EXPIRY_TIMER_FAMILY)
    def on_expiry_family(
            self,
            dynamic_timer=DoFn.TimerParam(EXPIRY_TIMER_FAMILY),
            dynamic_timer_tag=DoFn.DynamicTimerTagParam):
        """Callback for EXPIRY_TIMER_FAMILY; yields the tag with a marker."""
        yield dynamic_timer_tag, 'expired_dynamic_timer'
def _pardo_group_into_batches(batch_size, input_coder):
    """Build a stateful DoFn that emits buffered elements as list batches.

    A batch is emitted when the buffered count reaches ``batch_size`` and
    again, for any remainder, when the expiry timer (set to the window end)
    fires.

    Args:
        batch_size: number of buffered elements that triggers a batch.
        input_coder: coder passed to both the element and the count state
            specs.

    Returns:
        An instance of the locally defined ``_GroupIntoBatchesDoFn``.
    """
    ELEMENT_STATE = BagStateSpec('values', input_coder)
    COUNT_STATE = CombiningValueStateSpec(
        'count', input_coder, CountCombineFn())
    EXPIRY_TIMER = TimerSpec('expiry', TimeDomain.WATERMARK)

    class _GroupIntoBatchesDoFn(DoFn):
        """Buffers elements in bag state and yields them as lists."""

        def process(
                self,
                element,
                window=DoFn.WindowParam,
                element_state=DoFn.StateParam(ELEMENT_STATE),
                count_state=DoFn.StateParam(COUNT_STATE),
                expiry_timer=DoFn.TimerParam(EXPIRY_TIMER)):
            """Buffer one element; yield the batch once it is full."""
            # Allowed lateness not supported in Python SDK
            # https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
            expiry_timer.set(window.end)
            element_state.add(element)
            count_state.add(1)
            buffered = count_state.read()
            if buffered >= batch_size:
                # State is cleared only after the batch has been yielded.
                yield list(element_state.read())
                element_state.clear()
                count_state.clear()

        @on_timer(EXPIRY_TIMER)
        def expiry(
                self,
                element_state=DoFn.StateParam(ELEMENT_STATE),
                count_state=DoFn.StateParam(COUNT_STATE)):
            """Yield any remaining buffered elements, then clear state."""
            leftovers = list(element_state.read())
            if not leftovers:
                return
            yield leftovers
            element_state.clear()
            count_state.clear()

    return _GroupIntoBatchesDoFn()
class BigBagDoFn(DoFn):
    """DoFn that stores each value in bag state and reads it all back later.

    The timer callback iterates the whole bag and logs the number of values
    and their total length.
    """

    VALUES_STATE = BagStateSpec('values', BytesCoder())
    END_OF_WINDOW_TIMER = TimerSpec('end_of_window', TimeDomain.WATERMARK)

    def process(
            self,
            element: Tuple[str, bytes],
            window=DoFn.WindowParam,
            values_state=DoFn.StateParam(VALUES_STATE),
            end_of_window_timer=DoFn.TimerParam(END_OF_WINDOW_TIMER)):
        """Set the timer to the window end and append the element's value."""
        logging.info('start process.')
        _, payload = element
        end_of_window_timer.set(window.end)
        values_state.add(payload)
        logging.info('end process.')

    @on_timer(END_OF_WINDOW_TIMER)
    def end_of_window(self, values_state=DoFn.StateParam(VALUES_STATE)):
        """Read the bag once, logging the value count and total length."""
        logging.info('start end_of_window.')
        read_count, read_bytes = 0, 0
        for item in values_state.read():
            read_count += 1
            read_bytes += len(item)
        logging.info('read_count: %s, read_bytes: %s', read_count, read_bytes)
        logging.info('end end_of_window.')
class HashJoinStatefulDoFn(DoFn):
    """DoFn that pairs each value with a previously buffered value.

    If nothing is buffered, the value is stored and the timer is set to 100.
    Otherwise a ``Record<...>`` is emitted and state and timer are cleared.
    If the timer fires first, an ``Unmatched<...>`` is emitted.
    """

    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    UNMATCHED_TIMER = TimerSpec('unmatched', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            state=DoFn.StateParam(BUFFER_STATE),
            timer=DoFn.TimerParam(UNMATCHED_TIMER)):
        """Buffer the value, or emit a record when a partner is buffered."""
        key, value = element
        buffered = list(state.read())
        if buffered:
            # A partner is waiting: emit the joined record and reset.
            yield b'Record<%s,%s,%s>' % (key, buffered[0], value)
            state.clear()
            timer.clear()
            return
        # Nothing buffered yet: keep this value and set the unmatched timer.
        state.add(value)
        timer.set(100)

    @on_timer(UNMATCHED_TIMER)
    def expiry_callback(self, state=DoFn.StateParam(BUFFER_STATE)):
        """Emit the single buffered value as unmatched and clear state."""
        pending = list(state.read())
        assert len(pending) == 1, pending
        state.clear()
        yield b'Unmatched<%s>' % (pending[0], )
class GenerateRecords(beam.DoFn):
    """DoFn whose real-time timer re-sets itself and yields 'value'.

    The timer callback stops once the counter state equals
    ``total_records``.
    """

    EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.REAL_TIME)
    COUNT_STATE = CombiningValueStateSpec(
        'count_state', VarIntCoder(), CountCombineFn())

    def __init__(self, frequency, total_records):
        """Store the timer offset and the number of records to generate."""
        self.frequency = frequency
        self.total_records = total_records

    def process(self, element, emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
        """Set the emit timer and yield ``element[1]``."""
        # Processing time timers should be set on ABSOLUTE TIME.
        emit_timer.set(self.frequency)
        yield element[1]

    @on_timer(EMIT_TIMER)
    def emit_values(
            self,
            emit_timer=beam.DoFn.TimerParam(EMIT_TIMER),
            count_state=beam.DoFn.StateParam(COUNT_STATE)):
        """Yield one 'value' and re-set the timer until the limit is hit."""
        emitted = count_state.read() or 0
        if emitted == self.total_records:
            # Limit reached: emit nothing and do not re-set the timer.
            return

        count_state.add(1)
        # Processing time timers should be set on ABSOLUTE TIME.
        next_firing = emitted + 1 + self.frequency
        emit_timer.set(next_firing)
        yield 'value'
class StatefulDoFnWithTimerWithTypo1(DoFn):  # pylint: disable=unused-variable
    """Fixture in which two callbacks are registered for EXPIRY_TIMER_1.

    EXPIRY_TIMER_2 is declared but no callback is registered for it.
    """

    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    EXPIRY_TIMER_1 = TimerSpec('expiry1', TimeDomain.WATERMARK)
    EXPIRY_TIMER_2 = TimerSpec('expiry2', TimeDomain.WATERMARK)

    def process(self, element):
        """No-op; no state or timer parameters are declared."""
        pass

    @on_timer(EXPIRY_TIMER_1)
    def on_expiry_1(self, buffer_state=DoFn.StateParam(BUFFER_STATE)):
        """First callback registered for EXPIRY_TIMER_1."""
        yield 'expired1'

    # Note that we mistakenly associate this with the first timer.
    @on_timer(EXPIRY_TIMER_1)
    def on_expiry_2(self, buffer_state=DoFn.StateParam(BUFFER_STATE)):
        """Second callback, also registered for EXPIRY_TIMER_1."""
        yield 'expired2'
class BadStatefulDoFn2(DoFn):
    # Fixture: process() declares two TimerParam parameters for the same
    # TimerSpec.
    # NOTE(review): the "Bad" prefix suggests this duplication is the
    # intentional defect under test -- confirm against the test that uses it.
    TIMER = TimerSpec('timer', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            t1=DoFn.TimerParam(TIMER),
            t2=DoFn.TimerParam(TIMER)):
        # Both t1 and t2 reference TIMER.
        yield element
class BadStatefulDoFn5(DoFn):
    # Fixture: process() declares two TimerParam parameters for the same
    # timer spec ('dynamic_timer').
    # NOTE(review): the "Bad" prefix suggests this duplication is the
    # intentional defect under test -- confirm against the test that uses it.
    EXPIRY_TIMER_FAMILY = TimerSpec('dynamic_timer', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            dynamic_timer_1=DoFn.TimerParam(EXPIRY_TIMER_FAMILY),
            dynamic_timer_2=DoFn.TimerParam(EXPIRY_TIMER_FAMILY)):
        # Both parameters reference EXPIRY_TIMER_FAMILY.
        yield element
class TimerEmittingStatefulDoFn(DoFn):
    """DoFn whose single timer reports its timestamp and window bounds."""

    EMIT_TIMER_1 = TimerSpec('emit1', TimeDomain.WATERMARK)

    def process(self, element, timer1=DoFn.TimerParam(EMIT_TIMER_1)):
        """Set the timer to 10; nothing is emitted from here."""
        timer1.set(10)

    @on_timer(EMIT_TIMER_1)
    def emit_callback_1(
            self, window=DoFn.WindowParam, ts=DoFn.TimestampParam):
        """Yield ('timer1', ts, window.start, window.end) as ints."""
        fired_at = int(ts)
        window_start = int(window.start)
        window_end = int(window.end)
        yield 'timer1', fired_at, window_start, window_end
class DynamicTimerDoFn(DoFn):
    """DoFn that sets one timer without a tag and then with an empty tag."""

    EMIT_TIMER_FAMILY = TimerSpec('emit', TimeDomain.WATERMARK)

    def process(self, element, emit=DoFn.TimerParam(EMIT_TIMER_FAMILY)):
        """Set the timer to 10 with no tag, then to 20 with tag ''."""
        emit.set(10)
        emit.set(20, dynamic_timer_tag='')

    @on_timer(EMIT_TIMER_FAMILY)
    def emit_callback(
            self, ts=DoFn.TimestampParam, tag=DoFn.DynamicTimerTagParam):
        """Yield the tag and timestamp injected for the timer."""
        yield tag, ts
class BasicStatefulDoFn(DoFn):
    """Minimal DoFn declaring one bag state and one watermark timer."""

    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    EXPIRY_TIMER = TimerSpec('expiry1', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            buffer=DoFn.StateParam(BUFFER_STATE),
            timer1=DoFn.TimerParam(EXPIRY_TIMER)):
        """Yield the element unchanged; state and timer are only declared."""
        yield element

    @on_timer(EXPIRY_TIMER)
    def expiry_callback(
            self,
            element,
            timer=DoFn.TimerParam(EXPIRY_TIMER)):
        """Timer callback that yields its ``element`` argument."""
        yield element
class StatefulDoFnWithTimerWithTypo3(DoFn):
    """Fixture in which ``on_expiry_2`` carries no ``@on_timer`` decorator.

    ``process`` declares both timers, but only EXPIRY_TIMER_1 has a
    registered callback.
    """

    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    EXPIRY_TIMER_1 = TimerSpec('expiry1', TimeDomain.WATERMARK)
    EXPIRY_TIMER_2 = TimerSpec('expiry2', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            timer1=DoFn.TimerParam(EXPIRY_TIMER_1),
            timer2=DoFn.TimerParam(EXPIRY_TIMER_2)):
        """Declare both timers as parameters; no work is performed."""
        pass

    @on_timer(EXPIRY_TIMER_1)
    def on_expiry_1(self, buffer_state=DoFn.StateParam(BUFFER_STATE)):
        """Callback registered for EXPIRY_TIMER_1."""
        yield 'expired1'

    # No @on_timer decorator here: this is a plain method, not a callback.
    def on_expiry_2(self, buffer_state=DoFn.StateParam(BUFFER_STATE)):
        """Undecorated method; not registered for any timer."""
        yield 'expired2'

    # Use a stable string value for matching.
    def __repr__(self):
        """Return a fixed name instead of the default object repr."""
        return 'StatefulDoFnWithTimerWithTypo3'
class EmitTwoEvents(DoFn):
    """DoFn that yields ('1', 'set') per element and ('1', 'clear') on timer."""

    EMIT_CLEAR_SET_TIMER = TimerSpec('emitclear', TimeDomain.WATERMARK)

    def process(self, element, emit=DoFn.TimerParam(EMIT_CLEAR_SET_TIMER)):
        """Yield the 'set' event, then set the timer to 1."""
        yield '1', 'set'
        # Runs when the generator is resumed after the yield above.
        emit.set(1)

    @on_timer(EMIT_CLEAR_SET_TIMER)
    def emit_clear(self):
        """Yield the 'clear' event."""
        yield '1', 'clear'
class TestStatefulDoFn(DoFn):
    """Sample stateful DoFn declaring two bag states and three timer specs.

    ``process`` passes elements through unchanged; each timer callback yields
    a fixed marker identifying which timer it is registered for.
    """

    BUFFER_STATE_1 = BagStateSpec('buffer', BytesCoder())
    BUFFER_STATE_2 = BagStateSpec('buffer2', VarIntCoder())
    EXPIRY_TIMER_1 = TimerSpec('expiry1', TimeDomain.WATERMARK)
    EXPIRY_TIMER_2 = TimerSpec('expiry2', TimeDomain.WATERMARK)
    EXPIRY_TIMER_3 = TimerSpec('expiry3', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            t=DoFn.TimestampParam,
            buffer_1=DoFn.StateParam(BUFFER_STATE_1),
            buffer_2=DoFn.StateParam(BUFFER_STATE_2),
            timer_1=DoFn.TimerParam(EXPIRY_TIMER_1),
            timer_2=DoFn.TimerParam(EXPIRY_TIMER_2)):
        """Yield the element unchanged; state and timers are only declared."""
        yield element

    @on_timer(EXPIRY_TIMER_1)
    def on_expiry_1(
            self,
            buffer=DoFn.StateParam(BUFFER_STATE_1),
            timer_1=DoFn.TimerParam(EXPIRY_TIMER_1),
            timer_2=DoFn.TimerParam(EXPIRY_TIMER_2),
            timer_3=DoFn.TimerParam(EXPIRY_TIMER_3)):
        """Callback for EXPIRY_TIMER_1."""
        yield 'expired1'

    @on_timer(EXPIRY_TIMER_2)
    def on_expiry_2(
            self,
            buffer=DoFn.StateParam(BUFFER_STATE_2),
            timer_2=DoFn.TimerParam(EXPIRY_TIMER_2),
            timer_3=DoFn.TimerParam(EXPIRY_TIMER_3)):
        """Callback for EXPIRY_TIMER_2."""
        yield 'expired2'

    @on_timer(EXPIRY_TIMER_3)
    def on_expiry_3(
            self,
            buffer_1=DoFn.StateParam(BUFFER_STATE_1),
            buffer_2=DoFn.StateParam(BUFFER_STATE_2),
            timer_3=DoFn.TimerParam(EXPIRY_TIMER_3)):
        """Callback for EXPIRY_TIMER_3."""
        yield 'expired3'
class SimpleTestSetStatefulDoFn(DoFn):
    """DoFn that collects values in set state and emits them sorted."""

    BUFFER_STATE = SetStateSpec('buffer', VarIntCoder())
    EXPIRY_TIMER = TimerSpec('expiry', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            buffer=DoFn.StateParam(BUFFER_STATE),
            timer1=DoFn.TimerParam(EXPIRY_TIMER)):
        """Add the element's value to the set and set the timer to 20."""
        _, item = element
        buffer.add(item)
        timer1.set(20)

    @on_timer(EXPIRY_TIMER)
    def expiry_callback(self, buffer=DoFn.StateParam(BUFFER_STATE)):
        """Yield one sorted list of everything in the set state."""
        collected = buffer.read()
        yield sorted(collected)
class DynamicTimerDoFn(DoFn):
    """DoFn that sets three tagged timers plus a separate 'gc' timer."""

    EMIT_TIMER_FAMILY = TimerSpec('emit', TimeDomain.WATERMARK)
    GC_TIMER = TimerSpec('gc', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            emit=DoFn.TimerParam(EMIT_TIMER_FAMILY),
            gc=DoFn.TimerParam(GC_TIMER)):
        """Set tags emit1..emit3 to 10/20/30 and the gc timer to 40."""
        for when, tag in ((10, 'emit1'), (20, 'emit2'), (30, 'emit3')):
            emit.set(when, dynamic_timer_tag=tag)
        gc.set(40)

    @on_timer(EMIT_TIMER_FAMILY)
    def emit_callback(
            self, ts=DoFn.TimestampParam, tag=DoFn.DynamicTimerTagParam):
        """Yield the tag and timestamp injected for the timer."""
        yield tag, ts

    @on_timer(GC_TIMER)
    def gc(self, ts=DoFn.TimestampParam):
        """Yield ('gc', timestamp) for the gc timer."""
        yield 'gc', ts
class SimpleTestStatefulDoFn(DoFn):
    """DoFn that buffers b'A'-prefixed values and emits them joined."""

    BUFFER_STATE = BagStateSpec('buffer', BytesCoder())
    EXPIRY_TIMER = TimerSpec('expiry', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            buffer=DoFn.StateParam(BUFFER_STATE),
            timer1=DoFn.TimerParam(EXPIRY_TIMER)):
        """Buffer b'A' + the latin1-encoded value; set the timer to 20."""
        _, item = element
        encoded = str(item).encode('latin1')
        buffer.add(b'A' + encoded)
        timer1.set(20)

    @on_timer(EXPIRY_TIMER)
    def expiry_callback(
            self,
            buffer=DoFn.StateParam(BUFFER_STATE),
            timer=DoFn.TimerParam(EXPIRY_TIMER)):
        """Yield the buffered byte strings, sorted and concatenated."""
        ordered = sorted(buffer.read())
        yield b''.join(ordered)
class CountAndSchedule(beam.DoFn):
    """DoFn that counts elements and emits the count on each timer tick.

    ``process`` increments a counter held in bag state and sets the timer to
    the start of the minute after the element timestamp. The timer callback
    emits the count, resets it to 0 and sets the timer 60 later.
    """

    COUNTER = BagStateSpec('counter', VarIntCoder())
    SCHEDULED_TIMESTAMP = BagStateSpec('nextSchedule', VarIntCoder())
    TIMER = TimerSpec('timer', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            timestamp=beam.DoFn.TimestampParam,
            timer=beam.DoFn.TimerParam(TIMER),
            counter=beam.DoFn.StateParam(COUNTER),
            next_schedule=beam.DoFn.StateParam(SCHEDULED_TIMESTAMP),
            *args,
            **kwargs):
        """Increment the stored count and set the timer to the next minute."""
        # The bag holds at most one value; default to 0 when it is empty.
        (seen_so_far,) = list(counter.read()) or [0]
        counter.clear()
        counter.add(seen_so_far + 1)

        # Truncate to the minute, then move to the start of the next one.
        minute_start = timestamp.to_utc_datetime().replace(
            second=0, microsecond=0)
        minute_end = minute_start + timedelta(minutes=1)
        next_tick = calendar.timegm(minute_end.timetuple())

        timer.set(next_tick)
        next_schedule.clear()
        next_schedule.add(next_tick)

    @on_timer(TIMER)
    def timer_ticked(
            self,
            timer=beam.DoFn.TimerParam(TIMER),
            counter=beam.DoFn.StateParam(COUNTER),
            next_schedule=beam.DoFn.StateParam(SCHEDULED_TIMESTAMP)):
        """Emit the current count, reset it, and set the timer 60 later."""
        print("TICKTICK")
        (current_count,) = counter.read()
        (this_tick,) = next_schedule.read()
        following_tick = this_tick + 60

        next_schedule.clear()
        next_schedule.add(following_tick)
        counter.clear()
        counter.add(0)
        timer.clear()
        timer.set(following_tick)

        yield {'count': current_count, 'timestamp': this_tick}
class SimpleTestStatefulDoFn(DoFn):
    """DoFn that accumulates values in combining state and emits a string."""

    BUFFER_STATE = CombiningValueStateSpec(
        'buffer', IterableCoder(VarIntCoder()), ToListCombineFn())
    EXPIRY_TIMER = TimerSpec('expiry1', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            buffer=DoFn.StateParam(BUFFER_STATE),
            timer1=DoFn.TimerParam(EXPIRY_TIMER)):
        """Add the element's value to the state and set the timer to 20."""
        _, item = element
        buffer.add(item)
        timer1.set(20)

    @on_timer(EXPIRY_TIMER)
    def expiry_callback(
            self,
            buffer=DoFn.StateParam(BUFFER_STATE),
            timer=DoFn.TimerParam(EXPIRY_TIMER)):
        """Yield the sorted values, stringified and concatenated."""
        ordered = sorted(buffer.read())
        yield ''.join(map(str, ordered))
class BagInStateOutputAfterTimer(beam.DoFn):
    """DoFn that stores values in set state and emits them on a timer.

    Each emitted value is paired with a random integer in [0, 1000].
    """

    SET_STATE = SetStateSpec('buffer', VarIntCoder())
    EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            set_state=beam.DoFn.StateParam(SET_STATE),
            emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
        """Add every value of the element to state; set the timer to 1."""
        _, incoming = element
        for item in incoming:
            set_state.add(item)
        emit_timer.set(1)

    @on_timer(EMIT_TIMER)
    def emit_values(self, set_state=beam.DoFn.StateParam(SET_STATE)):
        """Return a list of (random int, value) pairs for the stored values."""
        stored = set_state.read()
        return [(random.randint(0, 1000), item) for item in stored]
class DynamicTimerDoFn(DoFn):
    """DoFn that sets or clears tagged timers depending on ``element[1]``."""

    EMIT_TIMER_FAMILY = TimerSpec('emit', TimeDomain.WATERMARK)

    def process(self, element, emit=DoFn.TimerParam(EMIT_TIMER_FAMILY)):
        """Act on the command in ``element[1]`` and return no output.

        'set' sets tags emit1 and emit2; 'clear' sets emit3, clears it and
        sets it again to a different value. Other commands do nothing.
        """
        command = element[1]
        if command == 'set':
            emit.set(10, dynamic_timer_tag='emit1')
            emit.set(20, dynamic_timer_tag='emit2')
        elif command == 'clear':
            emit.set(30, dynamic_timer_tag='emit3')
            emit.clear(dynamic_timer_tag='emit3')
            emit.set(40, dynamic_timer_tag='emit3')
        return []

    @on_timer(EMIT_TIMER_FAMILY)
    def emit_callback(
            self, ts=DoFn.TimestampParam, tag=DoFn.DynamicTimerTagParam):
        """Yield the tag and timestamp injected for the timer."""
        yield tag, ts
class TimerExample(beam.DoFn):
    """DoFn that sets a real-time timer a fixed delay after each element.

    ``process`` logs the element, sets the expiry timer to
    ``time.time() + EXPIRY_TIMER_DURATION_SECONDS`` and re-emits the element.
    The timer callback only logs that the timer expired.
    """

    EXPIRY_TIMER = TimerSpec('expiry', beam.TimeDomain.REAL_TIME)
    # Delay, in seconds, between processing an element and the timer value.
    EXPIRY_TIMER_DURATION_SECONDS = 5

    def process(
            self,
            elem,
            timestamp=beam.DoFn.TimestampParam,
            expiry_timer=beam.DoFn.TimerParam(EXPIRY_TIMER)):
        """Set the expiry timer for this element and yield the element.

        Args:
            elem: a ``(key, msg)`` pair.
            timestamp: element timestamp injected by the runner.
            expiry_timer: timer handle for ``EXPIRY_TIMER``.

        Yields:
            The unchanged input element.
        """
        (key, msg) = elem
        expiration = time.time() + TimerExample.EXPIRY_TIMER_DURATION_SECONDS
        # NOTE(review): the element timestamp is rendered in UTC while
        # datetime.fromtimestamp() without a tz renders local time, so the
        # two values in this message may be in different time zones.
        logging.info(
            'Current element (%s, %s, %s) => Setting the timer to %s',
            timestamp.to_utc_datetime(),
            key,
            msg,
            datetime.fromtimestamp(expiration))
        expiry_timer.set(expiration)
        yield elem

    @on_timer(EXPIRY_TIMER)
    def expiry(self):
        """Log that the expiry timer fired; emits nothing."""
        # Lazy %-style arguments, consistent with the logging call in
        # process(); the emitted message text is unchanged.
        logging.info(
            'Timer expired after %s seconds',
            TimerExample.EXPIRY_TIMER_DURATION_SECONDS)