class AfterCount(TriggerFn): """Fire when there are at least count elements in this window pane. AfterCount is experimental. No backwards compatibility guarantees. """ COUNT_TAG = _CombiningValueStateTag('count', combiners.CountCombineFn()) def __init__(self, count): if not isinstance(count, numbers.Integral) or count < 1: raise ValueError("count (%d) must be a positive integer." % count) self.count = count def __repr__(self): return 'AfterCount(%s)' % self.count def __eq__(self, other): return type(self) == type(other) and self.count == other.count def __hash__(self): return hash(self.count) def on_element(self, element, window, context): context.add_state(self.COUNT_TAG, 1) def on_merge(self, to_be_merged, merge_result, context): # states automatically merged pass def should_fire(self, time_domain, watermark, window, context): return context.get_state(self.COUNT_TAG) >= self.count def on_fire(self, watermark, window, context): return True def reset(self, window, context): context.clear_state(self.COUNT_TAG) @staticmethod def from_runner_api(proto, unused_context): return AfterCount(proto.element_count.element_count) def to_runner_api(self, unused_context): return beam_runner_api_pb2.Trigger( element_count=beam_runner_api_pb2.Trigger.ElementCount( element_count=self.count)) def has_ontime_pane(self): return False
class AfterCount(TriggerFn): """Fire when there are at least count elements in this window pane.""" COUNT_TAG = _CombiningValueStateTag('count', combiners.CountCombineFn()) def __init__(self, count): self.count = count def __repr__(self): return 'AfterCount(%s)' % self.count def __eq__(self, other): return type(self) == type(other) and self.count == other.count def on_element(self, element, window, context): context.add_state(self.COUNT_TAG, 1) def on_merge(self, to_be_merged, merge_result, context): # states automatically merged pass def should_fire(self, watermark, window, context): return context.get_state(self.COUNT_TAG) >= self.count def on_fire(self, watermark, window, context): return True def reset(self, window, context): context.clear_state(self.COUNT_TAG) @staticmethod def from_runner_api(proto, unused_context): return AfterCount(proto.element_count.element_count) def to_runner_api(self, unused_context): return beam_runner_api_pb2.Trigger( element_count=beam_runner_api_pb2.Trigger.ElementCount( element_count=self.count))
class GeneralTriggerDriver(TriggerDriver):
  """Breaks a series of bundle and timer firings into window (pane)s.

  Suitable for all variants of Windowing.
  """
  ELEMENTS = ListStateTag('elements')
  TOMBSTONE = CombiningValueStateTag('tombstone', combiners.CountCombineFn())

  def __init__(self, windowing):
    self.window_fn = windowing.windowfn
    self.output_time_fn_impl = OutputTimeFn.get_impl(windowing.output_time_fn,
                                                     self.window_fn)
    # pylint: disable=invalid-name
    self.WATERMARK_HOLD = WatermarkHoldStateTag('watermark',
                                                self.output_time_fn_impl)
    # pylint: enable=invalid-name
    self.trigger_fn = windowing.triggerfn
    self.accumulation_mode = windowing.accumulation_mode
    self.is_merging = True

  def process_elements(self, state, windowed_values, output_watermark):
    if self.is_merging:
      state = MergeableStateAdapter(state)

    windows_to_elements = collections.defaultdict(list)
    for wv in windowed_values:
      for window in wv.windows:
        windows_to_elements[window].append((wv.value, wv.timestamp))

    # First handle merging.
    if self.is_merging:
      old_windows = set(state.known_windows())
      all_windows = old_windows.union(windows_to_elements.keys())

      if all_windows != old_windows:
        merged_away = {}

        class TriggerMergeContext(WindowFn.MergeContext):

          def merge(_, to_be_merged, merge_result):  # pylint: disable=no-self-argument
            for window in to_be_merged:
              if window != merge_result:
                merged_away[window] = merge_result
            state.merge(to_be_merged, merge_result)
            # using the outer self argument.
            self.trigger_fn.on_merge(
                to_be_merged, merge_result, state.at(merge_result))

        self.window_fn.merge(TriggerMergeContext(all_windows))

        merged_windows_to_elements = collections.defaultdict(list)
        for window, values in windows_to_elements.items():
          while window in merged_away:
            window = merged_away[window]
          merged_windows_to_elements[window].extend(values)
        windows_to_elements = merged_windows_to_elements

        for window in merged_away:
          state.clear_state(window, self.WATERMARK_HOLD)

    # Next handle element adding.
    for window, elements in windows_to_elements.items():
      if state.get_state(window, self.TOMBSTONE):
        continue
      # Add watermark hold.
      # TODO(ccy): Add late data and garbage-collection hold support.
      output_time = self.output_time_fn_impl.merge(
          window,
          (element_output_time for element_output_time in
           (self.output_time_fn_impl.assign_output_time(window, timestamp)
            for unused_value, timestamp in elements)
           if element_output_time >= output_watermark))
      if output_time is not None:
        state.add_state(window, self.WATERMARK_HOLD, output_time)

      context = state.at(window)
      for value, unused_timestamp in elements:
        state.add_state(window, self.ELEMENTS, value)
        self.trigger_fn.on_element(value, window, context)

      # Maybe fire this window.
      watermark = MIN_TIMESTAMP
      if self.trigger_fn.should_fire(watermark, window, context):
        finished = self.trigger_fn.on_fire(watermark, window, context)
        yield self._output(window, finished, state)

  def process_timer(self, window_id, unused_name, time_domain, timestamp,
                    state):
    if self.is_merging:
      state = MergeableStateAdapter(state)
    window = state.get_window(window_id)
    if state.get_state(window, self.TOMBSTONE):
      return

    if time_domain == TimeDomain.WATERMARK:
      if not self.is_merging or window in state.known_windows():
        context = state.at(window)
        if self.trigger_fn.should_fire(timestamp, window, context):
          finished = self.trigger_fn.on_fire(timestamp, window, context)
          yield self._output(window, finished, state)
    else:
      raise Exception('Unexpected time domain: %s' % time_domain)

  def _output(self, window, finished, state):
    """Output window and clean up if appropriate."""
    values = state.get_state(window, self.ELEMENTS)
    if finished:
      # TODO(robertwb): allowed lateness
      state.clear_state(window, self.ELEMENTS)
      state.add_state(window, self.TOMBSTONE, 1)
    elif self.accumulation_mode == AccumulationMode.DISCARDING:
      state.clear_state(window, self.ELEMENTS)

    timestamp = state.get_state(window, self.WATERMARK_HOLD)
    if timestamp is None:
      # If no watermark hold was set, output at end of window.
      timestamp = window.end
    else:
      state.clear_state(window, self.WATERMARK_HOLD)

    return WindowedValue(values, timestamp, (window,))
class GeneralTriggerDriver(TriggerDriver):
  """Breaks a series of bundle and timer firings into window (pane)s.

  Suitable for all variants of Windowing.
  """
  ELEMENTS = _ListStateTag('elements')
  TOMBSTONE = _CombiningValueStateTag('tombstone', combiners.CountCombineFn())
  INDEX = _CombiningValueStateTag('index', combiners.CountCombineFn())
  NONSPECULATIVE_INDEX = _CombiningValueStateTag(
      'nonspeculative_index', combiners.CountCombineFn())

  def __init__(self, windowing, clock):
    self.clock = clock
    self.allowed_lateness = windowing.allowed_lateness
    self.window_fn = windowing.windowfn
    self.timestamp_combiner_impl = TimestampCombiner.get_impl(
        windowing.timestamp_combiner, self.window_fn)
    # pylint: disable=invalid-name
    self.WATERMARK_HOLD = _WatermarkHoldStateTag(
        'watermark', self.timestamp_combiner_impl)
    # pylint: enable=invalid-name
    self.trigger_fn = windowing.triggerfn
    self.accumulation_mode = windowing.accumulation_mode
    self.is_merging = True

  def process_elements(
      self,
      state,
      windowed_values,
      output_watermark,
      input_watermark=MIN_TIMESTAMP):
    if self.is_merging:
      state = MergeableStateAdapter(state)

    windows_to_elements = collections.defaultdict(list)
    for wv in windowed_values:
      for window in wv.windows:
        # ignore expired windows
        if input_watermark > window.end + self.allowed_lateness:
          continue
        windows_to_elements[window].append((wv.value, wv.timestamp))

    # First handle merging.
    if self.is_merging:
      old_windows = set(state.known_windows())
      all_windows = old_windows.union(list(windows_to_elements))

      if all_windows != old_windows:
        merged_away = {}

        class TriggerMergeContext(WindowFn.MergeContext):

          def merge(_, to_be_merged, merge_result):  # pylint: disable=no-self-argument
            for window in to_be_merged:
              if window != merge_result:
                merged_away[window] = merge_result
                # Clear state associated with PaneInfo since it is
                # not preserved across merges.
                state.clear_state(window, self.INDEX)
                state.clear_state(window, self.NONSPECULATIVE_INDEX)
            state.merge(to_be_merged, merge_result)
            # using the outer self argument.
            self.trigger_fn.on_merge(
                to_be_merged, merge_result, state.at(merge_result, self.clock))

        self.window_fn.merge(TriggerMergeContext(all_windows))

        merged_windows_to_elements = collections.defaultdict(list)
        for window, values in windows_to_elements.items():
          while window in merged_away:
            window = merged_away[window]
          merged_windows_to_elements[window].extend(values)
        windows_to_elements = merged_windows_to_elements

        for window in merged_away:
          state.clear_state(window, self.WATERMARK_HOLD)

    # Next handle element adding.
    for window, elements in windows_to_elements.items():
      if state.get_state(window, self.TOMBSTONE):
        continue
      # Add watermark hold.
      # TODO(ccy): Add late data and garbage-collection hold support.
      output_time = self.timestamp_combiner_impl.merge(
          window,
          (
              element_output_time for element_output_time in (
                  self.timestamp_combiner_impl.assign_output_time(
                      window, timestamp)
                  for unused_value, timestamp in elements)
              if element_output_time >= output_watermark))
      if output_time is not None:
        state.add_state(window, self.WATERMARK_HOLD, output_time)

      context = state.at(window, self.clock)
      for value, unused_timestamp in elements:
        state.add_state(window, self.ELEMENTS, value)
        self.trigger_fn.on_element(value, window, context)

      # Maybe fire this window.
      if self.trigger_fn.should_fire(TimeDomain.WATERMARK,
                                     input_watermark,
                                     window,
                                     context):
        finished = self.trigger_fn.on_fire(input_watermark, window, context)
        yield self._output(
            window, finished, state, input_watermark, output_watermark, False)

  def process_timer(
      self,
      window_id,
      unused_name,
      time_domain,
      timestamp,
      state,
      input_watermark=None):
    if input_watermark is None:
      input_watermark = timestamp

    if self.is_merging:
      state = MergeableStateAdapter(state)
    window = state.get_window(window_id)
    if state.get_state(window, self.TOMBSTONE):
      return

    if time_domain in (TimeDomain.WATERMARK, TimeDomain.REAL_TIME):
      if not self.is_merging or window in state.known_windows():
        context = state.at(window, self.clock)
        if self.trigger_fn.should_fire(time_domain, timestamp, window,
                                       context):
          finished = self.trigger_fn.on_fire(timestamp, window, context)
          yield self._output(
              window,
              finished,
              state,
              input_watermark,
              timestamp,
              time_domain == TimeDomain.WATERMARK)
    else:
      raise Exception('Unexpected time domain: %s' % time_domain)

  def _output(
      self,
      window,
      finished,
      state,
      input_watermark,
      output_watermark,
      maybe_ontime):
    """Output window and clean up if appropriate."""
    index = state.get_state(window, self.INDEX)
    state.add_state(window, self.INDEX, 1)
    if output_watermark <= window.max_timestamp():
      nonspeculative_index = -1
      timing = windowed_value.PaneInfoTiming.EARLY
      if state.get_state(window, self.NONSPECULATIVE_INDEX):
        nonspeculative_index = state.get_state(
            window, self.NONSPECULATIVE_INDEX)
        state.add_state(window, self.NONSPECULATIVE_INDEX, 1)
        _LOGGER.warning(
            'Watermark moved backwards in time '
            'or late data moved window end forward.')
    else:
      nonspeculative_index = state.get_state(window, self.NONSPECULATIVE_INDEX)
      state.add_state(window, self.NONSPECULATIVE_INDEX, 1)
      timing = (
          windowed_value.PaneInfoTiming.ON_TIME
          if maybe_ontime and nonspeculative_index == 0
          else windowed_value.PaneInfoTiming.LATE)
    pane_info = windowed_value.PaneInfo(
        index == 0, finished, timing, index, nonspeculative_index)

    values = state.get_state(window, self.ELEMENTS)
    if finished:
      # TODO(robertwb): allowed lateness
      state.clear_state(window, self.ELEMENTS)
      state.add_state(window, self.TOMBSTONE, 1)
    elif self.accumulation_mode == AccumulationMode.DISCARDING:
      state.clear_state(window, self.ELEMENTS)

    timestamp = state.get_state(window, self.WATERMARK_HOLD)
    if timestamp is None:
      # If no watermark hold was set, output at end of window.
      timestamp = window.max_timestamp()
    elif input_watermark < window.end and self.trigger_fn.has_ontime_pane():
      # Hold the watermark in case there is an empty pane that needs to be
      # fired at the end of the window.
      pass
    else:
      state.clear_state(window, self.WATERMARK_HOLD)

    return WindowedValue(values, timestamp, (window, ), pane_info)
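# --- Observability sketch (not part of the driver above) ---
# The PaneInfo assembled in _output is what downstream user code can inspect
# via DoFn.PaneInfoParam. The DoFn below is a hedged, illustrative sketch and
# is not taken from the source; it only prints the pane metadata the trigger
# driver attached to each output.
import apache_beam as beam


class LogPaneInfo(beam.DoFn):
  def process(self, element, pane_info=beam.DoFn.PaneInfoParam):
    # pane_info exposes is_first, is_last, timing (EARLY / ON_TIME / LATE),
    # index and nonspeculative_index as set by the trigger driver.
    print(element, pane_info.timing, pane_info.index, pane_info.is_last)
    yield element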
def run(): print("Town of Squirreliwink Bureau Of Tolls and Nuts Affair\n\n[PART-4]") # parse command line args: # - parse both beam args and known script args parser = argparse.ArgumentParser( description="Town of Squirreliwink Bureau Of Tolls and Nuts Affair") parser.add_argument('-i', '--input', type=str, default='./data/input', help='Input folder') parser.add_argument('-o', '--output', type=str, default='./data/output', help='Output folder') known_args, beam_args = parser.parse_known_args(sys.argv) # delete previous run files delete_files(os.path.join(known_args.output, "report*")) # construct pipeline and run options = PipelineOptions(beam_args) with beam.Pipeline(options=options) as pipeline: # create a pcollection of nut prices logger.info("creating nut prices side input") nut_prices = (pipeline | beam.Create([('cornsilk', 2.0), ('slate_gray', 3.5), ('navajo_white', 7.0)])) # read toll records and pass in nut prices as a side_input # you can convert a (k, v) tuple pcollection into a {k: v} with beam.pvalue.AsDict() logger.info("reading toll records") records = (pipeline | beam.io.ReadFromText(os.path.join(known_args.input, 'tollbooth_logs.csv'), skip_header_lines=1) | beam.Map(parse_csv) | beam.ParDo(PrepareAndAddTotalsWithSideInput(), nut_prices=beam.pvalue.AsDict(nut_prices))) # multi-keys multi-values combiner by using beam.combiners.TupleCombineFn() # first normalize rows into ((license_plate, month), (1, total, cornsilk, slate gray, navajo white, total)) tuple # then apply a tuple of combiners over values records = (records | beam.Map(key_by_license_plate_month) | beam.CombinePerKey( beam.combiners.TupleCombineFn(combine.CountCombineFn(), sum, sum, sum, sum, combine.MeanCombineFn()))) # read squirreliwink population file # file consist of newline delimited json rows. read each json row as dict logger.info("reading Squirreliwink's residents file") residents = (pipeline | "residents" >> beam.io.ReadFromText( os.path.join(known_args.input, 'squirreliwink_population.json')) | beam.Map(lambda line: json.loads(line))) # key residents by their license plate logger.info("key residents by license_plate") residents_by_plate = ( residents | beam.Map(lambda element: (element['car'], element))) # lookup residents by their license plate using SideInputs records = ( records | beam.Map( lambda e, lookup: ( # add family_name and address from resident lookup to the keys tuple. # Remember e[0][0] (first value in the keys tuple) should contain our license_plate info (e[0] + tuple(v for k, v in lookup[e[0][0]].items() if k in ('family_name', 'address'))), e[1]), lookup=beam.pvalue.AsDict(residents_by_plate) ) # pass in residents info as a SideInput ) # (records | beam.Map(print)) # output to a newline delimited json file logger.info("output record into csv file") (records | beam.Map( lambda e: e[0] + e[1] ) # flatten ((keys), (values)) tuple into a single tuple (keys + values) | beam.Map(lambda t: dict( zip( # stitch up the results as a dict, adding back column names ('license_plate', 'month', 'family_name', 'address', 'visit_count', 'total', 'cornsilk', 'slate_gray', 'navajo_white', 'avg_total'), t))) | beam.Map(lambda d: json.dumps(d, ensure_ascii=False) ) # json output the results | beam.io.WriteToText(os.path.join(known_args.output, "report"), file_name_suffix='.json'))