def run_trigger_simple(self, window_fn, trigger_fn, accumulation_mode,
                       timestamped_data, expected_panes, *groupings,
                       **kwargs):
    """Run ``self.run_trigger`` once for every requested bundle size.

    Args:
        window_fn: object whose ``assign`` method maps an assign-context to
            the windows of an element.
        trigger_fn: trigger under test, forwarded unchanged.
        accumulation_mode: accumulation mode, forwarded unchanged.
        timestamped_data: iterable of ``(timestamp, element)`` pairs.
        expected_panes: expected result, forwarded unchanged.
        *groupings: bundle sizes to try; defaults to a single run with
            bundles of size 1.
        **kwargs: only ``late_data`` (iterable of ``(timestamp, element)``
            pairs, default empty) is accepted.

    Raises:
        AssertionError: if any keyword other than ``late_data`` is passed.
    """
    late_data = kwargs.pop('late_data', [])
    assert not kwargs

    def bundle_data(data, size):
        """Lazily window ``data`` and yield it in bundles of ``size``."""
        bundle = []
        for timestamp, elem in data:
            windows = window_fn.assign(
                WindowFn.AssignContext(timestamp, elem))
            bundle.append(WindowedValue(elem, timestamp, windows))
            if len(bundle) == size:
                yield bundle
                bundle = []
        # Emit the trailing, partially filled bundle (if any).
        if bundle:
            yield bundle

    if not groupings:
        groupings = [1]
    for group_by in groupings:
        # The bundles are produced by bundle_data(); no eager pre-bundling is
        # needed here (it used to be computed and then thrown away).
        self.run_trigger(
            window_fn,
            trigger_fn,
            accumulation_mode,
            bundle_data(timestamped_data, group_by),
            bundle_data(late_data, group_by),
            expected_panes)
def process(
        self,
        e,
        w=beam.DoFn.WindowParam,
        p=beam.DoFn.PaneInfoParam,
        t=beam.DoFn.TimestampParam):
    """Yield the element reified with its timestamp, window and pane info.

    The element is rebuilt as a ``WindowedValue`` (single window ``w``) and
    wrapped in a ``test_stream.WindowedValueHolder``.
    """
    reified = WindowedValue(e, t, [w], p)
    holder = test_stream.WindowedValueHolder(reified)
    yield holder
def process_elements(self, state, windowed_values, unused_output_watermark):
    """Yield the whole input as one value in ``GLOBAL_WINDOW_TUPLE``.

    A ``list`` input is unwrapped eagerly into a list of plain values; any
    other iterable is wrapped in ``_UnwindowedValues`` instead. The output
    is stamped with ``MIN_TIMESTAMP``.
    """
    unwindowed = (
        [wv.value for wv in windowed_values]
        if isinstance(windowed_values, list)
        else _UnwindowedValues(windowed_values))
    yield WindowedValue(unwindowed, MIN_TIMESTAMP, self.GLOBAL_WINDOW_TUPLE)
def _windowed_value_info_map_fn(
        k,
        vs,
        window=beam.DoFn.WindowParam,
        t=beam.DoFn.TimestampParam,
        p=beam.DoFn.PaneInfoParam):
    """Return ``(k, info)`` where ``info`` describes the grouped values.

    The values are re-wrapped as a single-window ``WindowedValue`` carrying
    the timestamp and pane info, then passed to ``_windowed_value_info``.
    """
    pane = WindowedValue(vs, windows=[window], timestamp=t, pane_info=p)
    return k, _windowed_value_info(pane)
def process_elements(
        self,
        state,
        windowed_values,
        unused_output_watermark,
        unused_input_watermark=MIN_TIMESTAMP):
    """Yield the input, wrapped in ``_UnwindowedValues``, as a single pane.

    The output is stamped with ``MIN_TIMESTAMP``, placed in
    ``GLOBAL_WINDOW_TUPLE`` and tagged with the ``ONLY_FIRING`` pane info.
    """
    lazy_values = _UnwindowedValues(windowed_values)
    yield WindowedValue(
        lazy_values,
        MIN_TIMESTAMP,
        self.GLOBAL_WINDOW_TUPLE,
        self.ONLY_FIRING)
def finish_bundle(self):
    """Flush the remaining buffer as one element in the global window.

    Nothing is emitted when the buffer is empty. Otherwise the buffer is
    logged, yielded with ``MIN_TIMESTAMP`` and replaced by a fresh list.
    """
    from apache_beam.utils import timestamp
    from apache_beam.transforms.window import WindowedValue, GlobalWindow

    if len(self._buffer) == 0:
        return

    logging.info("Final Buffer Length: {}".format(len(self._buffer)))
    flushed = WindowedValue(
        self._buffer, timestamp.MIN_TIMESTAMP, [GlobalWindow()])
    yield flushed
    # Rebind rather than clear: the yielded list must stay intact.
    self._buffer = []
def process(
        self,
        element,
        timestamp=beam.DoFn.TimestampParam,
        window=beam.DoFn.WindowParam):
    """Yield the element re-assigned to the windows of ``self.windowing``.

    The configured window function is asked to assign windows given the
    element, its timestamp and its current window.
    """
    assigner = self.windowing.windowfn
    context = WindowFn.AssignContext(timestamp, element=element, window=window)
    yield WindowedValue(element, timestamp, assigner.assign(context))
def test_picklable_output(self):
    """The discarding global driver must emit picklable values."""
    window_tuple = (trigger.GlobalWindow(), )
    driver = trigger.DiscardingGlobalTriggerDriver()
    lazy_values = (WindowedValue(k, 0, window_tuple) for k in range(10))

    # The generator fed to the driver cannot be pickled itself ...
    self.assertRaises(TypeError, pickle.dumps, lazy_values)

    # ... but whatever the driver emits must survive a pickle round trip.
    for emitted in driver.process_elements(None, lazy_values, None):
        round_tripped = pickle.loads(pickle.dumps(emitted))
        self.assertEqual(round_tripped.value, list(range(10)))
def bundle_data(data, size):
    """Window ``(timestamp, element)`` pairs and yield them in bundles.

    Each pair becomes a ``WindowedValue`` whose windows come from
    ``window_fn.assign``. A bundle is yielded as soon as it holds ``size``
    values; a non-empty remainder is yielded at the end.
    """
    pending = []
    for timestamp, elem in data:
        assigned = window_fn.assign(WindowFn.AssignContext(timestamp, elem))
        pending.append(WindowedValue(elem, timestamp, assigned))
        if len(pending) != size:
            continue
        yield pending
        pending = []
    if pending:
        yield pending
def _output(
    self,
    window,
    finished,
    state,
    input_watermark,
    output_watermark,
    maybe_ontime):
    """Output window and clean up if appropriate.

    Builds the pane for ``window`` from the buffered ELEMENTS state, updates
    the per-window pane counters and returns the pane as a single-window
    ``WindowedValue``.

    Args:
      window: the window being fired.
      finished: when truthy, the element state is cleared and a TOMBSTONE is
        recorded for the window.
      state: state accessor providing get_state / add_state / clear_state.
      input_watermark: compared against ``window.end`` to decide whether the
        watermark hold is kept.
      output_watermark: compared against ``window.max_timestamp()`` to decide
        whether the pane is EARLY.
      maybe_ontime: whether this firing may be reported as ON_TIME.

    Returns:
      ``WindowedValue(values, timestamp, (window,), pane_info)``.
    """
    # Read the pane index before bumping it, so the first pane sees the
    # state's initial value.
    index = state.get_state(window, self.INDEX)
    state.add_state(window, self.INDEX, 1)
    if output_watermark <= window.max_timestamp():
        # Output watermark has not passed the window's max timestamp: EARLY.
        nonspeculative_index = -1
        timing = windowed_value.PaneInfoTiming.EARLY
        if state.get_state(window, self.NONSPECULATIVE_INDEX):
            # A non-speculative pane was already counted for this window, yet
            # we are classifying this one as early; keep counting and warn.
            nonspeculative_index = state.get_state(
                window, self.NONSPECULATIVE_INDEX)
            state.add_state(window, self.NONSPECULATIVE_INDEX, 1)
            _LOGGER.warning(
                'Watermark moved backwards in time '
                'or late data moved window end forward.')
    else:
        nonspeculative_index = state.get_state(window, self.NONSPECULATIVE_INDEX)
        state.add_state(window, self.NONSPECULATIVE_INDEX, 1)
        # Only the first non-speculative pane can be ON_TIME, and only when
        # the caller allows it; every other one is LATE.
        timing = (
            windowed_value.PaneInfoTiming.ON_TIME
            if maybe_ontime and nonspeculative_index == 0 else
            windowed_value.PaneInfoTiming.LATE)
    # NOTE(review): the first two arguments are presumably is_first / is_last
    # -- confirm against the PaneInfo signature.
    pane_info = windowed_value.PaneInfo(
        index == 0, finished, timing, index, nonspeculative_index)

    # Snapshot the values before any state is cleared below.
    values = state.get_state(window, self.ELEMENTS)
    if finished:
        # TODO(robertwb): allowed lateness
        state.clear_state(window, self.ELEMENTS)
        state.add_state(window, self.TOMBSTONE, 1)
    elif self.accumulation_mode == AccumulationMode.DISCARDING:
        state.clear_state(window, self.ELEMENTS)

    timestamp = state.get_state(window, self.WATERMARK_HOLD)
    if timestamp is None:
        # If no watermark hold was set, output at end of window.
        timestamp = window.max_timestamp()
    elif input_watermark < window.end and self.trigger_fn.has_ontime_pane():
        # Hold the watermark in case there is an empty pane that needs to be
        # fired at the end of the window.
        pass
    else:
        state.clear_state(window, self.WATERMARK_HOLD)

    return WindowedValue(values, timestamp, (window, ), pane_info)
def process_element(self, element):
    """Apply one test-stream event to this evaluator.

    A ``WatermarkEvent`` updates ``self._watermark``. An ``ElementEvent``
    emits its timestamped values into a new bundle on the first output and
    appends that bundle to ``self.bundles``. Other event types are ignored.

    Args:
        element: wrapper whose ``value`` attribute is the event to apply.
    """
    # In order to keep the order of the elements between the script and what
    # flows through the pipeline the same, emit the elements here.
    event = element.value
    if isinstance(event, WatermarkEvent):
        self._watermark = event.new_watermark
    elif isinstance(event, ElementEvent):
        main_output = list(self._outputs)[0]
        bundle = self._evaluation_context.create_bundle(main_output)
        for tv in event.timestamped_values:
            # Unreify the value into the correct window. Only the
            # reconstruction is guarded: a TypeError raised by
            # bundle.output() itself must propagate instead of silently
            # re-emitting the element into the global window.
            try:
                windowed = WindowedValue(**tv.value)
            except TypeError:
                # Not a reified value (not a mapping of WindowedValue
                # keyword arguments): emit it as-is in the global window.
                windowed = GlobalWindows.windowed_value(
                    tv.value, timestamp=tv.timestamp)
            bundle.output(windowed)
        self.bundles.append(bundle)
def _output(self, window, finished, state):
    """Build the output pane for ``window`` and tidy up its state.

    Element state is cleared when the window is finished (a TOMBSTONE is
    then recorded as well) or when the accumulation mode is DISCARDING. The
    pane is stamped with the watermark hold, which is cleared afterwards, or
    with ``window.end`` when no hold was set.
    """
    pane_values = state.get_state(window, self.ELEMENTS)

    if finished or self.accumulation_mode == AccumulationMode.DISCARDING:
        state.clear_state(window, self.ELEMENTS)
    if finished:
        # TODO(robertwb): allowed lateness
        state.add_state(window, self.TOMBSTONE, 1)

    hold = state.get_state(window, self.WATERMARK_HOLD)
    if hold is not None:
        state.clear_state(window, self.WATERMARK_HOLD)
        output_time = hold
    else:
        # If no watermark hold was set, output at end of window.
        output_time = window.end

    return WindowedValue(pane_values, output_time, (window,))
def _execute(self, window_fn, trigger_fn, accumulation_mode,
             timestamp_combiner, transcript, unused_spec):
    """Replay a transcript directly against a GeneralTriggerDriver.

    Args:
      window_fn: window function used both for the driver's Windowing and to
        assign windows to the input elements.
      trigger_fn: trigger under test.
      accumulation_mode: accumulation mode passed to Windowing.
      timestamp_combiner: timestamp combiner passed to Windowing.
      transcript: iterable of ``(action, params)`` pairs, where action is one
        of 'input', 'watermark', 'expect' or 'state'.
      unused_spec: ignored by this implementation.
    """
    driver = GeneralTriggerDriver(
        Windowing(window_fn, trigger_fn, accumulation_mode, timestamp_combiner),
        TestClock())
    state = InMemoryUnmergedState()
    # Panes produced so far that have not yet been matched by an 'expect'.
    output = []
    watermark = MIN_TIMESTAMP

    def fire_timers():
        # Keep draining: firing a timer may set further timers that are
        # already due at the current watermark.
        to_fire = state.get_and_clear_timers(watermark)
        while to_fire:
            for timer_window, (name, time_domain, t_timestamp) in to_fire:
                for wvalue in driver.process_timer(timer_window,
                                                   name,
                                                   time_domain,
                                                   t_timestamp,
                                                   state):
                    # Unpacking fails unless the pane has exactly one window.
                    window, = wvalue.windows
                    output.append({
                        'window': [window.start, window.end - 1],
                        'values': sorted(wvalue.value),
                        'timestamp': wvalue.timestamp
                    })
            to_fire = state.get_and_clear_timers(watermark)

    for action, params in transcript:

        if action != 'expect':
            # Fail if we have output that was not expected in the transcript.
            self.assertEqual([],
                             output,
                             msg='Unexpected output: %s before %s: %s' %
                             (output, action, params))

        if action == 'input':
            # Each input value t is used as both the element and its timestamp.
            bundle = [
                WindowedValue(
                    t, t, window_fn.assign(WindowFn.AssignContext(t, t)))
                for t in params
            ]
            # Rebinds `output`; fire_timers() below appends to the new list.
            output = [{
                'window': [wvalue.windows[0].start, wvalue.windows[0].end - 1],
                'values': sorted(wvalue.value),
                'timestamp': wvalue.timestamp
            } for wvalue in driver.process_elements(
                state, bundle, watermark)]
            fire_timers()

        elif action == 'watermark':
            watermark = params
            fire_timers()

        elif action == 'expect':
            for expected_output in params:
                for candidate in output:
                    # A candidate matches when it agrees on every key the
                    # expectation specifies; other keys are ignored.
                    if all(candidate[k] == expected_output[k] for k in candidate
                           if k in expected_output):
                        output.remove(candidate)
                        break
                else:
                    # for/else: no candidate matched this expectation.
                    self.fail(
                        'Unmatched output %s in %s' % (expected_output, output))

        elif action == 'state':
            # TODO(robertwb): Implement once we support allowed lateness.
            pass

        else:
            self.fail('Unknown action: ' + action)

    # Fail if we have output that was not expected in the transcript.
    self.assertEqual([], output, msg='Unexpected output: %s' % output)
def _execute(self, window_fn, trigger_fn, accumulation_mode,
             timestamp_combiner, transcript, spec):
    """Replay a transcript through a streaming TestPipeline.

    The transcript is turned into a TestStream carrying both the inputs and
    the expectations; the inputs are windowed and grouped, and a stateful
    ``Check`` DoFn compares the resulting panes with the expectations.

    Args:
      window_fn: window function applied to the inputs.
      trigger_fn: trigger under test.
      accumulation_mode: accumulation mode for the windowing.
      timestamp_combiner: timestamp combiner for the windowing.
      transcript: iterable of ``(action, params)`` pairs, where action is one
        of 'input', 'watermark', 'clock', 'expect' or 'state'.
      spec: mapping; its optional 'broken_on' entry lists runner class names
        on which the test is skipped.

    Raises:
      ValueError: if the transcript contains an unknown action.
    """
    runner_name = TestPipeline().runner.__class__.__name__
    if runner_name in spec.get('broken_on', ()):
        self.skipTest('Known to be broken on %s' % runner_name)

    test_stream = TestStream()
    for action, params in transcript:
        if action == 'expect':
            test_stream.add_elements([('expect', params)])
        else:
            # An empty expectation before every other action makes Check fail
            # on any output that is still buffered at that point.
            test_stream.add_elements([('expect', [])])
            if action == 'input':
                test_stream.add_elements([('input', e) for e in params])
            elif action == 'watermark':
                test_stream.advance_watermark_to(params)
            elif action == 'clock':
                test_stream.advance_processing_time(params)
            elif action == 'state':
                pass  # Requires inspection of implementation details.
            else:
                raise ValueError('Unexpected action: %s' % action)
    # Final empty expectation: no unexpected output may be left at the end.
    test_stream.add_elements([('expect', [])])

    class Check(beam.DoFn):
        """A StatefulDoFn that verifies outputs are produced as expected.

        This DoFn takes in two kinds of inputs, actual outputs and
        expected outputs.  When an actual output is received, it is buffered
        into state, and when an expected output is received, this buffered
        state is retrieved and compared against the expected value(s) to ensure
        they match.

        The key is ignored, but all items must be on the same key to share state.
        """
        def process(
            self,
            element,
            seen=beam.DoFn.StateParam(
                beam.transforms.userstate.BagStateSpec(
                    'seen', beam.coders.FastPrimitivesCoder()))):
            # Elements look like (key, (action, data)); the key is dropped.
            _, (action, data) = element

            if action == 'actual':
                seen.add(data)

            elif action == 'expect':
                actual = list(seen.read())
                seen.clear()

                if len(actual) > len(data):
                    raise AssertionError(
                        'Unexpected output: expected %s but got %s' %
                        (data, actual))
                elif len(data) > len(actual):
                    raise AssertionError(
                        'Unmatched output: expected %s but got %s' %
                        (data, actual))
                else:

                    def diff(actual, expected):
                        # Returns the first key (in reverse-sorted order) on
                        # which the two mappings disagree, or None (implicitly)
                        # when all keys present in both agree.
                        for key in sorted(expected.keys(), reverse=True):
                            if key in actual:
                                if actual[key] != expected[key]:
                                    return key

                    for output in actual:
                        diffs = [diff(output, expected) for expected in data]
                        # all(diffs) is true only when the output differs from
                        # every expectation, i.e. nothing matches it.
                        if all(diffs):
                            raise AssertionError(
                                'Unmatched output: %s not found in %s (diffs in %s)' %
                                (output, data, diffs))

            else:
                raise ValueError('Unexpected action: %s' % action)

    with TestPipeline(options=PipelineOptions(streaming=True)) as p:
        # Split the test stream into a branch of to-be-processed elements, and
        # a branch of expected results.
        inputs, expected = (
            p
            | test_stream
            | beam.MapTuple(
                lambda tag, value: beam.pvalue.TaggedOutput(
                    tag, ('key', value))).with_outputs('input', 'expect'))

        # Process the inputs with the given windowing to produce actual outputs.
        outputs = (
            inputs
            # Each input value doubles as its own event timestamp.
            | beam.MapTuple(
                lambda key, value: TimestampedValue((key, value), value))
            | beam.WindowInto(
                window_fn,
                trigger=trigger_fn,
                accumulation_mode=accumulation_mode,
                timestamp_combiner=timestamp_combiner)
            | beam.GroupByKey()
            # NOTE(review): _windowed_value_info presumably summarises the pane
            # as a mapping comparable with the expectations -- confirm.
            | beam.MapTuple(
                lambda k,
                vs,
                window=beam.DoFn.WindowParam,
                t=beam.DoFn.TimestampParam,
                p=beam.DoFn.PaneInfoParam: (
                    k,
                    self._windowed_value_info(
                        WindowedValue(
                            vs, windows=[window], timestamp=t, pane_info=p))))
            # Place outputs back into the global window to allow flattening
            # and share a single state in Check.
            | 'Global' >> beam.WindowInto(
                beam.transforms.window.GlobalWindows()))
        # Feed both the expected and actual outputs to Check() for comparison.
        tagged_expected = (
            expected | beam.MapTuple(lambda key, value: (key, ('expect', value))))
        tagged_outputs = (
            outputs | beam.MapTuple(lambda key, value: (key, ('actual', value))))
        # pylint: disable=expression-not-assigned
        (tagged_expected, tagged_outputs) | beam.Flatten() | beam.ParDo(
            Check())
def _execute(self, window_fn, trigger_fn, accumulation_mode,
             timestamp_combiner, transcript, spec):
    """Replay a transcript through a streaming TestPipeline (JSON elements).

    The transcript is turned into a TestStream of JSON-encoded strings
    carrying both the inputs and the expectations; the inputs are windowed
    and grouped, and a stateful ``Check`` DoFn compares the resulting panes
    with the expectations.

    Args:
      window_fn: window function applied to the inputs.
      trigger_fn: trigger under test.
      accumulation_mode: accumulation mode for the windowing.
      timestamp_combiner: timestamp combiner for the windowing.
      transcript: iterable of ``(action, params)`` pairs, where action is one
        of 'input', 'watermark', 'clock', 'expect' or 'state'.
      spec: mapping; its optional 'broken_on' entry lists runner class names
        on which the test is skipped.

    Raises:
      ValueError: if the transcript contains an unknown action.
    """
    runner_name = TestPipeline().runner.__class__.__name__
    if runner_name in spec.get('broken_on', ()):
        self.skipTest('Known to be broken on %s' % runner_name)

    # Elements are encoded as JSON strings to allow other languages to
    # decode elements while executing the test stream.
    # TODO(BEAM-8600): Eliminate these gymnastics.
    test_stream = TestStream(
        coder=coders.StrUtf8Coder()).with_output_types(str)
    for action, params in transcript:
        if action == 'expect':
            test_stream.add_elements([json.dumps(('expect', params))])
        else:
            # An empty expectation is emitted before every other action.
            test_stream.add_elements([json.dumps(('expect', []))])
            if action == 'input':
                test_stream.add_elements(
                    [json.dumps(('input', e)) for e in params])
            elif action == 'watermark':
                test_stream.advance_watermark_to(params)
            elif action == 'clock':
                test_stream.advance_processing_time(params)
            elif action == 'state':
                pass  # Requires inspection of implementation details.
            else:
                raise ValueError('Unexpected action: %s' % action)
    test_stream.add_elements([json.dumps(('expect', []))])

    # Decode the JSON strings back into (tag, value) pairs.
    read_test_stream = test_stream | beam.Map(json.loads)

    class Check(beam.DoFn):
        """A StatefulDoFn that verifies outputs are produced as expected.

        This DoFn takes in two kinds of inputs, actual outputs and
        expected outputs.  When an actual output is received, it is buffered
        into state, and when an expected output is received, this buffered
        state is retrieved and compared against the expected value(s) to ensure
        they match.

        The key is ignored, but all items must be on the same key to share state.
        """
        def __init__(self, allow_out_of_order=True):
            # Some runners don't support cross-stage TestStream semantics.
            self.allow_out_of_order = allow_out_of_order

        def process(
            self,
            element,
            seen=beam.DoFn.StateParam(
                beam.transforms.userstate.BagStateSpec(
                    'seen', beam.coders.FastPrimitivesCoder())),
            expected=beam.DoFn.StateParam(
                beam.transforms.userstate.BagStateSpec(
                    'expected', beam.coders.FastPrimitivesCoder()))):
            # Elements look like (key, (action, data)); the key is dropped.
            _, (action, data) = element

            if self.allow_out_of_order:
                if action == 'expect' and not list(seen.read()):
                    # The expectation arrived before any actual output:
                    # queue it (empty expectations are dropped) and wait.
                    if data:
                        expected.add(data)
                    return
                elif action == 'actual' and list(expected.read()):
                    # An actual output arrived while expectations are queued.
                    seen.add(data)
                    all_data = list(seen.read())
                    all_expected = list(expected.read())
                    if len(all_data) == len(all_expected[0]):
                        # Enough outputs for the oldest queued expectation:
                        # pop it and fall through to the 'expect' check below.
                        expected.clear()
                        for expect in all_expected[1:]:
                            expected.add(expect)
                        action, data = 'expect', all_expected[0]
                    else:
                        return

            if action == 'actual':
                seen.add(data)

            elif action == 'expect':
                actual = list(seen.read())
                seen.clear()

                if len(actual) > len(data):
                    raise AssertionError(
                        'Unexpected output: expected %s but got %s' %
                        (data, actual))
                elif len(data) > len(actual):
                    raise AssertionError(
                        'Unmatched output: expected %s but got %s' %
                        (data, actual))
                else:

                    def diff(actual, expected):
                        # Returns the first key (in reverse-sorted order) on
                        # which the two mappings disagree, or None (implicitly)
                        # when all keys present in both agree.
                        for key in sorted(expected.keys(), reverse=True):
                            if key in actual:
                                if actual[key] != expected[key]:
                                    return key

                    for output in actual:
                        diffs = [diff(output, expected) for expected in data]
                        # all(diffs) is true only when the output differs from
                        # every expectation, i.e. nothing matches it.
                        if all(diffs):
                            raise AssertionError(
                                'Unmatched output: %s not found in %s (diffs in %s)' %
                                (output, data, diffs))

            else:
                raise ValueError('Unexpected action: %s' % action)

    with TestPipeline() as p:
        # TODO(BEAM-8601): Pass this during pipeline construction.
        p.options.view_as(StandardOptions).streaming = True

        # Split the test stream into a branch of to-be-processed elements, and
        # a branch of expected results.
        inputs, expected = (
            p
            | read_test_stream
            | beam.MapTuple(
                lambda tag, value: beam.pvalue.TaggedOutput(
                    tag, ('key', value))).with_outputs('input', 'expect'))

        # Process the inputs with the given windowing to produce actual outputs.
        outputs = (
            inputs
            # Each input value doubles as its own event timestamp.
            | beam.MapTuple(
                lambda key, value: TimestampedValue((key, value), value))
            | beam.WindowInto(
                window_fn,
                trigger=trigger_fn,
                accumulation_mode=accumulation_mode,
                timestamp_combiner=timestamp_combiner)
            | beam.GroupByKey()
            # NOTE(review): _windowed_value_info presumably summarises the pane
            # as a mapping comparable with the expectations -- confirm.
            | beam.MapTuple(
                lambda k,
                vs,
                window=beam.DoFn.WindowParam,
                t=beam.DoFn.TimestampParam,
                p=beam.DoFn.PaneInfoParam: (
                    k,
                    _windowed_value_info(
                        WindowedValue(
                            vs, windows=[window], timestamp=t, pane_info=p))))
            # Place outputs back into the global window to allow flattening
            # and share a single state in Check.
            | 'Global' >> beam.WindowInto(
                beam.transforms.window.GlobalWindows()))
        # Feed both the expected and actual outputs to Check() for comparison.
        tagged_expected = (
            expected | beam.MapTuple(lambda key, value: (key, ('expect', value))))
        tagged_outputs = (
            outputs | beam.MapTuple(lambda key, value: (key, ('actual', value))))
        # pylint: disable=expression-not-assigned
        ([tagged_expected, tagged_outputs]
         | beam.Flatten()
         | beam.ParDo(Check(self.allow_out_of_order)))
def _run_log(self, spec):
    """Run one transcript spec against a GeneralTriggerDriver.

    Args:
        spec: mapping with a required 'transcript' entry (a list of
            single-item ``{action: params}`` mappings) and optional
            'window_fn', 'trigger_fn', 'accumulation_mode' and
            'timestamp_combiner' entries given as strings.
    """
    def parse_int_list(s):
        """Parses strings like '[1, 2, 3]'."""
        s = s.strip()
        assert s[0] == '[' and s[-1] == ']', s
        if not s[1:-1].strip():
            return []
        return [int(x) for x in s[1:-1].split(',')]

    def split_args(s):
        """Splits 'a, b, [c, d]' into ['a', 'b', '[c, d]']."""
        # An empty argument list has no arguments (rather than one empty
        # argument), so that 'Foo()' can be parsed.
        if not s.strip():
            return []
        args = []
        start = 0
        depth = 0
        for ix, c in enumerate(s):
            if c in '({[':
                depth += 1
            elif c in ')}]':
                depth -= 1
            elif c == ',' and depth == 0:
                # Only commas outside any bracket separate arguments.
                args.append(s[start:ix].strip())
                start = ix + 1
        assert depth == 0, s
        args.append(s[start:].strip())
        return args

    def parse(s, names):
        """Parse (recursive) 'Foo(arg, kw=arg)' for Foo in the names dict."""
        s = s.strip()
        if s in names:
            return names[s]
        elif s[0] == '[':
            return parse_int_list(s)
        elif '(' in s:
            assert s[-1] == ')', s
            callee = parse(s[:s.index('(')], names)
            posargs = []
            kwargs = {}
            for arg in split_args(s[s.index('(') + 1:-1]):
                if '=' in arg:
                    kw, value = arg.split('=', 1)
                    kwargs[kw] = parse(value, names)
                else:
                    posargs.append(parse(arg, names))
            return callee(*posargs, **kwargs)
        else:
            try:
                return int(s)
            except ValueError:
                raise ValueError('Unknown function: %s' % s)

    def parse_fn(s, names):
        """Like parse(), but implicitly calls no-arg constructors."""
        fn = parse(s, names)
        if isinstance(fn, type):
            return fn()
        return fn

    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam.transforms import window as window_module
    # pylint: enable=wrong-import-order, wrong-import-position
    window_fn_names = dict(window_module.__dict__)
    window_fn_names.update({
        'CustomTimestampingFixedWindowsWindowFn':
            CustomTimestampingFixedWindowsWindowFn
    })
    trigger_names = {'Default': DefaultTrigger}
    trigger_names.update(trigger.__dict__)

    window_fn = parse_fn(
        spec.get('window_fn', 'GlobalWindows'), window_fn_names)
    trigger_fn = parse_fn(spec.get('trigger_fn', 'Default'), trigger_names)
    accumulation_mode = getattr(
        AccumulationMode, spec.get('accumulation_mode', 'ACCUMULATING').upper())
    timestamp_combiner = getattr(
        TimestampCombiner,
        spec.get('timestamp_combiner', 'OUTPUT_AT_EOW').upper())

    driver = GeneralTriggerDriver(
        Windowing(window_fn, trigger_fn, accumulation_mode, timestamp_combiner),
        TestClock())
    state = InMemoryUnmergedState()
    # Panes produced so far that have not yet been matched by an 'expect'.
    output = []
    watermark = MIN_TIMESTAMP

    def fire_timers():
        """Fire every timer due at the current watermark, until none is left."""
        to_fire = state.get_and_clear_timers(watermark)
        while to_fire:
            for timer_window, (name, time_domain, t_timestamp) in to_fire:
                for wvalue in driver.process_timer(
                        timer_window, name, time_domain, t_timestamp, state):
                    window, = wvalue.windows
                    output.append({
                        'window': [window.start, window.end - 1],
                        'values': sorted(wvalue.value),
                        'timestamp': wvalue.timestamp
                    })
            to_fire = state.get_and_clear_timers(watermark)

    for line in spec['transcript']:

        # Each transcript line is a single-item mapping. dict views are not
        # subscriptable on Python 3, so take the first item via an iterator.
        action, params = next(iter(line.items()))

        if action != 'expect':
            # Fail if we have output that was not expected in the transcript.
            self.assertEqual(
                [], output,
                msg='Unexpected output: %s before %s' % (output, line))

        if action == 'input':
            # Each input value t is used as both the element and its timestamp.
            bundle = [
                WindowedValue(
                    t, t, window_fn.assign(WindowFn.AssignContext(t, t)))
                for t in params
            ]
            output = [{
                'window': [wvalue.windows[0].start, wvalue.windows[0].end - 1],
                'values': sorted(wvalue.value),
                'timestamp': wvalue.timestamp
            } for wvalue in driver.process_elements(state, bundle, watermark)]
            fire_timers()

        elif action == 'watermark':
            watermark = params
            fire_timers()

        elif action == 'expect':
            for expected_output in params:
                for candidate in output:
                    # A candidate matches when it agrees on every key the
                    # expectation specifies; other keys are ignored.
                    if all(candidate[k] == expected_output[k]
                           for k in candidate if k in expected_output):
                        output.remove(candidate)
                        break
                else:
                    self.fail(
                        'Unmatched output %s in %s' % (expected_output, output))

        elif action == 'state':
            # TODO(robertwb): Implement once we support allowed lateness.
            pass

        else:
            self.fail('Unknown action: ' + action)

    # Fail if we have output that was not expected in the transcript.
    self.assertEqual([], output, msg='Unexpected output: %s' % output)
def timestamped_key_values(self, pipeline, key, *timestamps):
    """Return a PCollection of ``(key, ts)`` values, one per timestamp.

    Each value is emitted as a ``WindowedValue`` stamped with its own
    timestamp and placed in a ``GlobalWindow``.
    """
    created = pipeline | 'start' >> Create(timestamps)
    return created | Map(
        lambda ts: WindowedValue((key, ts), ts, [GlobalWindow()]))
def process_element(self, element):
    """Emit the element paired with this evaluator's timing info.

    The value becomes ``(element.value, self.timing_info)``; timestamp,
    windows and pane info are carried over unchanged.
    """
    paired = (element.value, self.timing_info)
    self.bundle.output(
        WindowedValue(
            paired,
            element.timestamp,
            element.windows,
            element.pane_info))
def finish_bundle(self):
    """Emit one engagement record per (question, qualifying engagement).

    ``self.batch`` holds ``(uid, data)`` groups where ``data`` maps
    'questions', 'engagements' and 'users' to lists. An engagement qualifies
    for a question when its push notification was sent strictly before the
    question was created and strictly after ``self.engagement_range`` days
    before that. Groups missing any of the three lists are skipped.

    Yields:
        ``WindowedValue`` objects (timestamp 0, window ``self.window``)
        wrapping the engagement record dicts.
    """
    user_id_list = [
        int(group[0]) for group in self.batch
        if group[1].get('engagements')
    ]

    events = []
    if user_id_list:
        # Get open PN events that happened from (from_ts - engagement_range)
        # to (to_ts). This makes sure that if a question is created right
        # after from_ts, we still get the open PN event that happened up to
        # (engagement_range) days before it and led to that question.
        events = get_open_pn_events_of_user_list(
            user_id_list,
            from_ts=self.from_ts - self.engagement_range * 24 * 60 * 60,
            to_ts=self.to_ts,
            giap_es_username=self.giap_es_username,
            giap_es_password=self.giap_es_password,
            giap_es_index=self.giap_es_index)

    # Index the events by user once instead of re-scanning the whole event
    # list for every group.
    events_by_uid = {}
    for event in events:
        events_by_uid.setdefault(event['_source']['uid'], []).append(event)

    new_engagement_records = []
    for group in self.batch:
        uid = int(group[0])
        questions = group[1].get('questions')
        engagements = group[1].get('engagements')
        users = group[1].get('users')
        # Missing (None) and empty lists are both skipped.
        if not (users and questions and engagements):
            continue

        user = users[0]
        user_events = events_by_uid.get(uid, [])

        for question in questions:
            question_created = int(question.get('created'))
            # The look-back bound depends on the question only, so compute
            # it once per question rather than once per engagement.
            n_days_ago_date = (
                datetime.fromtimestamp(question_created) -
                timedelta(days=self.engagement_range))
            n_days_ago_ts = n_days_ago_date.timestamp()

            for engagement in engagements:
                send_push_noti_time = int(
                    engagement.get('send_push_noti_time'))
                if not question_created > send_push_noti_time > n_days_ago_ts:
                    continue

                # Campaign of the first event (in the order returned by the
                # event query) strictly between the push and the question;
                # event '$time' is compared in milliseconds.
                # NOTE(review): earlier wording said "last event"; whether
                # the first match is the latest one depends on the ordering
                # of get_open_pn_events_of_user_list -- confirm.
                pn_campaign = next(
                    (event['_source'].get('campaign')
                     for event in user_events
                     if send_push_noti_time * 1000 <
                     event['_source']['$time'] < question_created * 1000),
                    None)

                new_engagement_records.append({
                    'created': int(datetime.now().timestamp()),
                    'action': 'ask',
                    'question_id': question['qid'],
                    'uid': uid,
                    'type': engagement.get('type'),
                    'inactive_days': engagement.get('inactive_days'),
                    'balance': user.get('balance', 0),
                    'grade': engagement.get('grade'),
                    'send_push_noti_time': engagement.get('send_push_noti_time'),
                    'action_time': question['created'],
                    'campaign': pn_campaign,
                })

    for record in new_engagement_records:
        yield WindowedValue(
            value=record,
            timestamp=0,
            windows=[self.window],
        )