def watermark_trigger(
        self,
        key=DoFn.KeyParam,
        timer_tag=DoFn.DynamicTimerTagParam,
        timestamp=DoFn.TimestampParam,
        latest_watermark=DoFn.StateParam(LAST_KNOWN_WATERMARK),
        all_elements=DoFn.StateParam(WINDOW_ELEMENT_PAIRS),
        processing_time_timer=DoFn.TimerParam(PROCESSING_TIME_TIMER),
        window_tag_values: BagRuntimeState = DoFn.StateParam(  # type: ignore
            WINDOW_TAG_VALUES),
        finished_windows_state: SetRuntimeState = DoFn.StateParam(  # type: ignore
            FINISHED_WINDOWS),
        watermark_timer=DoFn.TimerParam(WATERMARK_TIMER)):
    """Handle a firing of the watermark timer for ``key``.

    A trigger context is assembled from the injected state cells and timers,
    ``_fire_eligible_windows`` is invoked in the watermark time domain with
    the firing ``timestamp`` and ``timer_tag``, and afterwards ``timestamp``
    is added to ``latest_watermark``.

    No processing-time state is injected here, so the context is built with
    ``latest_processing_time=None``.

    Returns:
        Whatever ``_fire_eligible_windows`` returns.
    """
    context_kwargs = dict(
        processing_time_timer=processing_time_timer,
        watermark_timer=watermark_timer,
        latest_processing_time=None,
        latest_watermark=latest_watermark,
        all_elements_state=all_elements,
        window_tag_values=window_tag_values,
        finished_windows_state=finished_windows_state,
    )
    trigger_context = FnRunnerStatefulTriggerContext(**context_kwargs)

    # The call must precede the watermark update below; keep this order.
    fired = self._fire_eligible_windows(
        key, TimeDomain.WATERMARK, timestamp, timer_tag, trigger_context)
    latest_watermark.add(timestamp)
    return fired
def flush(self,
          stamp=DoFn.TimestampParam,
          key=DoFn.KeyParam,
          buffer=DoFn.StateParam(BUFFER),
          minStamp=DoFn.StateParam(MIN_STAMP),
          flushTimer=DoFn.TimerParam(FLUSH_TIMER)):
    """Flush every buffered item whose stamp is not later than ``stamp``.

    Buffered items are split on their stamp (index 2 of each item): items
    with ``item[2] <= stamp`` are flushed, the rest are kept. Flushed items
    are sorted by stamp and handed to ``self.flushMetrics`` together with
    ``key``. The buffer is then rewritten with the kept items plus the
    latest flushed item.

    If any later items remain, ``flushTimer`` and ``minStamp`` are set to the
    smallest kept stamp; otherwise ``minStamp`` is cleared.

    Returns:
        list: the materialized output of ``self.flushMetrics``, or an empty
        list when nothing was due.
    """
    keep, due = [], []
    minKeepStamp = None
    for item in buffer.read():
        itemStamp = item[2]
        if itemStamp <= stamp:
            due.append(item)
            continue
        keep.append(item)
        # Compare against None explicitly: a falsy stamp (e.g. 0) is still a
        # valid minimum and must not be mistaken for "nothing kept yet".
        if minKeepStamp is None or minKeepStamp > itemStamp:
            minKeepStamp = itemStamp

    outputs = []
    if due:
        due.sort(key=lambda entry: entry[2])
        outputs = list(self.flushMetrics(due, key))
        # The most recent flushed item stays in the buffer.
        # NOTE(review): presumably so the next flush can relate its first
        # item to the previous one -- confirm against flushMetrics.
        keep.append(due[-1])

    buffer.clear()
    for item in keep:
        buffer.add(item)

    if minKeepStamp is not None:
        flushTimer.set(minKeepStamp)
        minStamp.write(minKeepStamp)
    else:
        minStamp.clear()
    return outputs
def process(self,
            element,
            stamp=DoFn.TimestampParam,
            buffer=DoFn.StateParam(BUFFER),
            minStamp=DoFn.StateParam(MIN_STAMP),
            flushTimer=DoFn.TimerParam(FLUSH_TIMER)):
    """Buffer the value of ``element`` and arm the flush timer if needed.

    When no minimum stamp is stored yet, or the stored one equals ``stamp``,
    ``minStamp`` is written and ``flushTimer`` is set to ``stamp``. The
    element's value (``element[1]``) is always added to ``buffer``.

    Returns:
        None.
    """
    currentMinStamp = minStamp.read()
    # Explicit None check: a stored stamp that happens to be falsy (e.g. 0)
    # is a real value and must not be replaced by the current stamp.
    # NOTE(review): an element whose stamp is earlier than the stored minimum
    # does not move the timer -- confirm this is intended.
    if currentMinStamp is None or currentMinStamp == stamp:
        minStamp.write(stamp)
        flushTimer.set(stamp)
    buffer.add(element[1])
def process(self,
            element,
            batch=DoFn.StateParam(BATCH),
            batchSize=DoFn.StateParam(BATCH_SIZE),
            flushTimer=DoFn.TimerParam(FLUSH_TIMER),
            endOfTime=DoFn.TimerParam(EOW_TIMER)):
    """Add the value of ``element`` to the batch, flushing once it is full.

    When the stored batch size is empty or zero, a new batch starts:
    ``flushTimer`` is set to the current time plus ``self.maxWaitTime * 1000``
    microseconds and ``endOfTime`` is set to the global window's maximum
    timestamp. The incremented size is written back and ``element[1]`` is
    added to ``batch``.

    Returns:
        The result of ``self.flush(batch, batchSize)`` once the size reaches
        ``self.batchSize``; otherwise None.
    """
    from apache_beam.utils.timestamp import Timestamp, Duration
    from apache_beam.transforms.window import GlobalWindow

    storedSize = batchSize.read()
    if storedSize:
        newSize = storedSize + 1
    else:
        # First element of a fresh batch: arm both timers.
        newSize = 1
        maxWait = Duration(micros=self.maxWaitTime * 1000)
        flushTimer.set(Timestamp.now() + maxWait)
        endOfTime.set(GlobalWindow().max_timestamp())

    batchSize.write(newSize)
    batch.add(element[1])

    if newSize >= self.batchSize:
        return self.flush(batch, batchSize)
    return None
def process(self, element, counter=DoFn.StateParam(BYTES_STATE)):
    """Delegate to ``self.return_recursive`` with a fixed argument of 1.

    ``element`` and ``counter`` are accepted but not used by this method.

    Returns:
        Whatever ``self.return_recursive(1)`` returns.
    """
    outcome = self.return_recursive(1)
    return outcome
def process(
        self,
        element: typing.Tuple[
            K, typing.Iterable[windowed_value.WindowedValue]],
        all_elements: BagRuntimeState = DoFn.StateParam(  # type: ignore
            WINDOW_ELEMENT_PAIRS),
        latest_processing_time: AccumulatingRuntimeState = DoFn.StateParam(  # type: ignore
            LAST_KNOWN_TIME),
        latest_watermark: AccumulatingRuntimeState = DoFn.StateParam(  # type: ignore
            LAST_KNOWN_WATERMARK),
        window_tag_values: BagRuntimeState = DoFn.StateParam(  # type: ignore
            WINDOW_TAG_VALUES),
        windows_state: SetRuntimeState = DoFn.StateParam(  # type: ignore
            KNOWN_WINDOWS),
        finished_windows_state: SetRuntimeState = DoFn.StateParam(  # type: ignore
            FINISHED_WINDOWS),
        processing_time_timer=DoFn.TimerParam(PROCESSING_TIME_TIMER),
        watermark_timer=DoFn.TimerParam(WATERMARK_TIMER),
        *args,
        **kwargs):
    """Buffer the incoming windowed values of one key and run its triggers.

    Steps, in order:

    1. Group the incoming values by window, skipping windows that are past
       ``window.end + allowed_lateness`` relative to the last known
       watermark and windows already recorded as finished.
    2. If ``self.merging_windows`` is set, merge the new windows with the
       known ones via the window fn, re-home the values of merged-away
       windows, and record the resulting windows in ``windows_state``.
    3. Add every ``(window, TimestampedValue)`` pair to ``all_elements`` and
       call the trigger fn's ``on_element`` once per value.
    4. Call ``_fire_eligible_windows`` in the watermark domain for the
       windows touched by this call.

    Args:
        element: ``(key, windowed_values)`` pair.
        *args, **kwargs: accepted but not used by this method.

    Returns:
        Whatever ``_fire_eligible_windows`` returns.
    """
    context = FnRunnerStatefulTriggerContext(
        processing_time_timer=processing_time_timer,
        watermark_timer=watermark_timer,
        latest_processing_time=latest_processing_time,
        latest_watermark=latest_watermark,
        all_elements_state=all_elements,
        window_tag_values=window_tag_values,
        finished_windows_state=finished_windows_state)
    key, windowed_values = element
    watermark = read_watermark(latest_watermark)

    # Nothing in the loop below modifies the finished-windows state, so read
    # it once instead of once per window of every element.
    finished_windows = set(finished_windows_state.read())

    windows_to_elements = collections.defaultdict(list)
    for wv in windowed_values:
        for window in wv.windows:
            # ignore expired windows
            if watermark > window.end + self.windowing.allowed_lateness:
                continue
            if window in finished_windows:
                continue
            windows_to_elements[window].append(
                TimestampedValue(wv.value, wv.timestamp))

    # Processing merging of windows
    if self.merging_windows:
        old_windows = set(windows_state.read())
        all_windows = old_windows.union(windows_to_elements)
        if all_windows != old_windows:
            merge_context = TriggerMergeContext(
                all_windows, context, self.windowing)
            self.windowing.windowfn.merge(merge_context)

            merged_windows_to_elements = collections.defaultdict(list)
            for window, values in windows_to_elements.items():
                # Follow the chain of merges to the surviving window.
                while window in merge_context.merged_away:
                    window = merge_context.merged_away[window]
                merged_windows_to_elements[window].extend(values)
            windows_to_elements = merged_windows_to_elements

        for w in windows_to_elements:
            windows_state.add(w)
    # Done processing merging of windows

    seen_windows = set()
    for w in windows_to_elements:
        window_context = context.for_window(w)
        seen_windows.add(w)
        for value_w_timestamp in windows_to_elements[w]:
            _LOGGER.debug(value_w_timestamp)
            all_elements.add((w, value_w_timestamp))
            # on_element receives the full iterable of windowed values, as
            # in the original code, not the single value being buffered.
            self.windowing.triggerfn.on_element(
                windowed_values, w, window_context)

    return self._fire_eligible_windows(
        key, TimeDomain.WATERMARK, watermark, None, context, seen_windows)
def onEndOfTime(self,
                batch=DoFn.StateParam(BATCH),
                batchSize=DoFn.StateParam(BATCH_SIZE)):
    """Flush the current batch by delegating to ``self.flush``.

    Returns:
        Whatever ``self.flush(batch, batchSize)`` returns.
    """
    flushed = self.flush(batch, batchSize)
    return flushed