def test_param_windowed_value_coder(self):
    # Covers ParamWindowedValueCoder three ways: its raw encoding, a
    # standalone check via check_coder, and use as a TupleCoder component.
    from apache_beam.transforms.window import IntervalWindow
    from apache_beam.utils.windowed_value import PaneInfo

    # The payload is a fully encoded windowed value; every coder under test
    # is constructed from it.
    template = windowed_value.create(
        b'',
        # Milliseconds to microseconds
        1000 * 1000,
        (IntervalWindow(11, 21),),
        PaneInfo(True, False, 1, 2, 3))
    payload = coders.WindowedValueCoder(
        coders.BytesCoder(), coders.IntervalWindowCoder()).encode(template)

    def param_coder(value_coder):
        # Build a fresh coder per use, since each check gets its own instance.
        return coders.ParamWindowedValueCoder(
            payload, [value_coder, coders.IntervalWindowCoder()])

    def windowed(value):
        # Windowed value with the timestamp, window and pane shared by all
        # examples below.
        return windowed_value.WindowedValue(
            value,
            1,
            (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3))

    # Test binary representation
    encoded = param_coder(coders.VarIntCoder()).encode(
        window.GlobalWindows.windowed_value(1))
    self.assertEqual(b'\x01', encoded)

    # Test unnested
    self.check_coder(
        param_coder(coders.VarIntCoder()), windowed(3), windowed(1))

    # Test nested
    self.check_coder(
        coders.TupleCoder((
            param_coder(coders.FloatCoder()),
            param_coder(coders.StrUtf8Coder()),
        )),
        (windowed(1.5), windowed("abc")))
def test_windowed_values_interpreted_correctly(self):
    # Feeds one WindowedValueHolder through a TestStream and asserts on the
    # element, timestamp and window that a downstream DoFn observes.
    holder = WindowedValueHolder(
        WindowedValue(
            'a',
            Timestamp(5),
            [beam.window.IntervalWindow(5, 10)],
            PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)))

    # Build the stream one event at a time.
    test_stream = TestStream()
    test_stream = test_stream.advance_processing_time(10)
    test_stream = test_stream.advance_watermark_to(10)
    test_stream = test_stream.add_elements([holder])
    test_stream = test_stream.advance_watermark_to_infinity()

    class RecordFn(beam.DoFn):
        # Emits what the runner supplies for each element.
        def process(
                self,
                element=beam.DoFn.ElementParam,
                timestamp=beam.DoFn.TimestampParam,
                window=beam.DoFn.WindowParam):
            yield (element, timestamp, window)

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True

    expected_records = [
        ('a', timestamp.Timestamp(5), beam.window.IntervalWindow(5, 10)),
    ]
    with TestPipeline(options=options) as p:
        records = p | test_stream | beam.ParDo(RecordFn())
        assert_that(records, equal_to(expected_records))
def test_pane_info_formatter(self):
    # A last, non-first EARLY pane with index 0 must be rendered as
    # 'Pane 0: Final Early'.
    pane = PaneInfo(
        is_first=False,
        is_last=True,
        timing=PaneInfoTiming.EARLY,
        index=0,
        nonspeculative_index=0)
    rendered = pv.pane_info_formatter(pane)
    self.assertEqual('Pane 0: Final Early', rendered)
def test_instance_check_windowed_value_holder(self):
    # Checks which objects isinstance() reports as WindowedValueHolder.
    windowed_value = WindowedValue(
        'a',
        Timestamp(5),
        [beam.window.IntervalWindow(5, 10)],
        PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))

    # Reported as holders: a real holder, and a Row carrying both a
    # windowed_value field and the ROW coder urn.
    self.assertIsInstance(
        WindowedValueHolder(windowed_value), WindowedValueHolder)
    row_with_urn = beam.Row(
        windowed_value=windowed_value, urn=common_urns.coders.ROW.urn)
    self.assertIsInstance(row_with_urn, WindowedValueHolder)

    # Not reported as holders: a Row without the urn, the bare windowed
    # value, a Row using a different field name, and a Row whose
    # windowed_value field is not a WindowedValue.
    self.assertNotIsInstance(
        beam.Row(windowed_value=windowed_value), WindowedValueHolder)
    self.assertNotIsInstance(windowed_value, WindowedValueHolder)
    self.assertNotIsInstance(beam.Row(x=windowed_value), WindowedValueHolder)
    self.assertNotIsInstance(beam.Row(windowed_value=1), WindowedValueHolder)
def test_wordcount(self):
    # Runs a batch word count on the InteractiveRunner and checks the raw
    # results, the collected DataFrame and the reified windowed values.
    class WordExtractingDoFn(beam.DoFn):
        def process(self, element):
            # Split one line of text into its whitespace-separated words.
            return element.strip().split()

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

    # Count the occurrences of each word.
    counts = (
        p
        | beam.Create(['to be or not to be that is the question'])
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that counts will be cached.
    ib.watch(locals())

    result = p.run()
    result.wait_until_finish()

    actual = list(result.get(counts))
    expected_pairs = {
        ('or', 1),
        ('that', 1),
        ('be', 2),
        ('is', 1),
        ('question', 1),
        ('to', 2),
        ('the', 1),
        ('not', 1),
    }
    self.assertSetEqual(set(actual), expected_pairs)

    # Truncate the precision to millis because the window coder uses millis
    # as units then gets upcast to micros.
    end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
    df_counts = ib.collect(counts, include_window_info=True)

    # One expected row per actual (word, count) pair, in the same order.
    expected_columns = {
        0: [word for word, _ in actual],
        1: [count for _, count in actual],
        'event_time': [end_of_window] * len(actual),
        'windows': [[GlobalWindow()] for _ in actual],
        'pane_info': [
            PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0) for _ in actual
        ],
    }
    df_expected = pd.DataFrame(
        expected_columns,
        columns=[0, 1, 'event_time', 'windows', 'pane_info'])
    pd.testing.assert_frame_equal(df_expected, df_counts)

    actual_reified = result.get(counts, include_window_info=True)
    expected_reified = []
    for pair in actual:
        expected_reified.append(
            WindowedValue(
                pair,
                Timestamp(micros=end_of_window),
                [GlobalWindow()],
                PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)))
    self.assertEqual(actual_reified, expected_reified)
def test_streaming_wordcount(self):
    # Runs a streaming word count fed by a TestStream on the
    # InteractiveRunner and checks both the cached source data and the
    # per-window counts.
    class WordExtractingDoFn(beam.DoFn):
        def process(self, element):
            # Split one line of text into its whitespace-separated words.
            return element.strip().split()

    # Add the TestStream so that it can be cached.
    ib.options.capturable_sources.add(TestStream)

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(),
        options=StandardOptions(streaming=True))

    data = (
        p
        | TestStream()
            .advance_watermark_to(0)
            .advance_processing_time(1)
            .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
            .advance_watermark_to(20)
            .advance_processing_time(1)
            .add_elements(['that', 'is', 'the', 'question'])
        | beam.WindowInto(beam.window.FixedWindows(10)))  # yapf: disable

    counts = (
        data
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that referenced PCollections
    # will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    # Create a fake limiter that cancels the BCJ once the main job receives the
    # expected amount of results.
    class FakeLimiter:
        def __init__(self, p, pcoll):
            self.p = p
            self.pcoll = pcoll

        def is_triggered(self):
            pipeline_result = ie.current_env().pipeline_result(self.p)
            if not pipeline_result:
                return False
            try:
                received = pipeline_result.get(self.pcoll)
            except ValueError:
                return False
            return len(received) >= 10

    # This sets the limiters to stop reading when the test receives 10 elements.
    ie.current_env().options.capture_control.set_limiters_for_test(
        [FakeLimiter(p, data)])

    # This tests that the data was correctly cached.
    pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
    first_batch = ['to', 'be', 'or', 'not', 'to', 'be']
    second_batch = ['that', 'is', 'the', 'question']
    expected_data_rows = [
        (word, 0, [IntervalWindow(0, 10)], pane_info) for word in first_batch
    ] + [
        (word, 20000000, [IntervalWindow(20, 30)], pane_info)
        for word in second_batch
    ]
    expected_data_df = pd.DataFrame(
        expected_data_rows,
        columns=[0, 'event_time', 'windows', 'pane_info'])

    data_df = ib.collect(data, include_window_info=True)
    pd.testing.assert_frame_equal(expected_data_df, data_df)

    # This tests that the windowing was passed correctly so that all the data
    # is aggregated also correctly.
    pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
    first_window_counts = [('be', 2), ('not', 1), ('or', 1), ('to', 2)]
    second_window_counts = [
        ('is', 1), ('question', 1), ('that', 1), ('the', 1)
    ]
    expected_count_rows = [
        (word, count, 9999999, [IntervalWindow(0, 10)], pane_info)
        for word, count in first_window_counts
    ] + [
        (word, count, 29999999, [IntervalWindow(20, 30)], pane_info)
        for word, count in second_window_counts
    ]
    expected_counts_df = pd.DataFrame(
        expected_count_rows,
        columns=[0, 1, 'event_time', 'windows', 'pane_info'])

    counts_df = ib.collect(counts, include_window_info=True)

    # The group by key has no guarantee of order. So we post-process the DF by
    # sorting so we can test equality.
    ordered = counts_df.sort_values(['event_time', 0], ascending=True)
    sorted_counts_df = ordered.reset_index(drop=True)
    pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
def test_streaming_wordcount(self):
    # Streaming word count fed by a TestStream on the InteractiveRunner;
    # checks the cached source data and the per-window counts.
    #
    # The test is currently skipped; everything after skipTest() is kept so
    # the test can be re-enabled by removing that one call.
    self.skipTest('[BEAM-9601] Test is breaking PreCommits')

    class WordExtractingDoFn(beam.DoFn):
        def process(self, element):
            text_line = element.strip()
            words = text_line.split()
            return words

    # Add the TestStream so that it can be cached.
    ib.options.capturable_sources.add(TestStream)
    ib.options.capture_duration = timedelta(seconds=1)

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(),
        options=StandardOptions(streaming=True))

    data = (
        p
        | TestStream()
            .advance_watermark_to(0)
            .advance_processing_time(1)
            .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
            .advance_watermark_to(20)
            .advance_processing_time(1)
            .add_elements(['that', 'is', 'the', 'question'])
        | beam.WindowInto(beam.window.FixedWindows(10)))  # yapf: disable

    counts = (
        data
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that referenced PCollections
    # will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    # This tests that the data was correctly cached.
    pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
    expected_data_df = pd.DataFrame(
        [
            ('to', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('be', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('or', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('not', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('to', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('be', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('that', 20000000, [beam.window.IntervalWindow(20, 30)],
             pane_info),
            ('is', 20000000, [beam.window.IntervalWindow(20, 30)], pane_info),
            ('the', 20000000, [beam.window.IntervalWindow(20, 30)],
             pane_info),
            ('question', 20000000, [beam.window.IntervalWindow(20, 30)],
             pane_info),
        ],
        columns=[0, 'event_time', 'windows', 'pane_info'])

    data_df = ib.collect(data, include_window_info=True)
    pd.testing.assert_frame_equal(expected_data_df, data_df)

    # This tests that the windowing was passed correctly so that all the data
    # is aggregated also correctly.
    #
    # The expected rows are listed in (event_time, word) order so they can be
    # compared against the sorted result below.
    pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
    expected_counts_df = pd.DataFrame(
        [
            ('be', 2, 9999999, [beam.window.IntervalWindow(0, 10)],
             pane_info),
            ('not', 1, 9999999, [beam.window.IntervalWindow(0, 10)],
             pane_info),
            ('or', 1, 9999999, [beam.window.IntervalWindow(0, 10)],
             pane_info),
            ('to', 2, 9999999, [beam.window.IntervalWindow(0, 10)],
             pane_info),
            ('is', 1, 29999999, [beam.window.IntervalWindow(20, 30)],
             pane_info),
            ('question', 1, 29999999, [beam.window.IntervalWindow(20, 30)],
             pane_info),
            ('that', 1, 29999999, [beam.window.IntervalWindow(20, 30)],
             pane_info),
            ('the', 1, 29999999, [beam.window.IntervalWindow(20, 30)],
             pane_info),
        ],
        columns=[0, 1, 'event_time', 'windows', 'pane_info'])

    counts_df = ib.collect(counts, include_window_info=True)

    # The group by key has no guarantee of order, so comparing the collected
    # frame row-by-row as-is is order dependent. Sort it (and renumber the
    # index) before testing equality.
    sorted_counts_df = (
        counts_df
        .sort_values(['event_time', 0], ascending=True)
        .reset_index(drop=True))
    pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
class StandardCodersTest(unittest.TestCase):
    """Checks coders against the examples loaded from STANDARD_CODERS_YAML.

    Each test case supplies a coder spec plus a mapping from expected encoded
    bytes to a JSON-style value. The value is converted to a Python object
    with the parsers below and then encoded and/or decoded with the coder
    built from the spec.
    """

    # Maps a coder URN to a function converting an example value into the
    # Python object the coder handles. Parsers for composite coders take the
    # parsers of their component coders as extra positional arguments.
    _urn_to_json_value_parser = {
        'beam:coder:bytes:v1': lambda x: x.encode('utf-8'),
        'beam:coder:bool:v1': lambda x: x,
        'beam:coder:string_utf8:v1': lambda x: x,
        'beam:coder:varint:v1': lambda x: x,
        'beam:coder:kv:v1': lambda x, key_parser, value_parser: (
            key_parser(x['key']), value_parser(x['value'])),
        'beam:coder:interval_window:v1': lambda x: IntervalWindow(
            start=Timestamp(micros=(x['end'] - x['span']) * 1000),
            end=Timestamp(micros=x['end'] * 1000)),
        'beam:coder:iterable:v1': lambda x, parser: list(map(parser, x)),
        'beam:coder:global_window:v1': lambda x: window.GlobalWindow(),
        'beam:coder:windowed_value:v1': lambda x, value_parser, window_parser:
        windowed_value.create(
            value_parser(x['value']),
            x['timestamp'] * 1000,
            tuple(window_parser(w) for w in x['windows'])),
        'beam:coder:param_windowed_value:v1': lambda x, value_parser,
        window_parser: windowed_value.create(
            value_parser(x['value']),
            x['timestamp'] * 1000,
            tuple(window_parser(w) for w in x['windows']),
            PaneInfo(
                x['pane']['is_first'],
                x['pane']['is_last'],
                PaneInfoTiming.from_string(x['pane']['timing']),
                x['pane']['index'],
                x['pane']['on_time_index'])),
        # A cleared timer carries no timestamps or pane info.
        'beam:coder:timer:v1': lambda x, value_parser, window_parser: (
            userstate.Timer(
                user_key=value_parser(x['userKey']),
                dynamic_timer_tag=x['dynamicTimerTag'],
                clear_bit=x['clearBit'],
                windows=tuple(window_parser(w) for w in x['windows']),
                fire_timestamp=None,
                hold_timestamp=None,
                paneinfo=None)
            if x['clearBit'] else userstate.Timer(
                user_key=value_parser(x['userKey']),
                dynamic_timer_tag=x['dynamicTimerTag'],
                clear_bit=x['clearBit'],
                fire_timestamp=Timestamp(micros=x['fireTimestamp'] * 1000),
                hold_timestamp=Timestamp(micros=x['holdTimestamp'] * 1000),
                windows=tuple(window_parser(w) for w in x['windows']),
                paneinfo=PaneInfo(
                    x['pane']['is_first'],
                    x['pane']['is_last'],
                    PaneInfoTiming.from_string(x['pane']['timing']),
                    x['pane']['index'],
                    x['pane']['on_time_index']))),
        'beam:coder:double:v1': parse_float,
        'beam:coder:sharded_key:v1': lambda x, value_parser: ShardedKey(
            key=value_parser(x['key']),
            shard_id=x['shardId'].encode('utf-8')),
        'beam:coder:custom_window:v1': lambda x, window_parser: window_parser(
            x['window']),
    }

    def test_standard_coders(self):
        # Run every test case loaded from the standard coders YAML file.
        for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
            logging.info('Executing %s test.', name)
            self._run_standard_coder(name, spec)

    def _run_standard_coder(self, name, spec):
        """Check all examples of one test case.

        Args:
          name: test case name (not used by the checks themselves).
          spec: test case mapping with a 'coder' spec, an 'examples' mapping
            of expected encoded string -> example value, and optionally
            'nested'. When 'nested' is absent both True and False are tried.
        """
        def assert_equal(actual, expected):
            """Handle nan values which self.assertEqual fails on."""
            if (isinstance(actual, float) and isinstance(expected, float) and
                    math.isnan(actual) and math.isnan(expected)):
                return
            self.assertEqual(actual, expected)

        coder = self.parse_coder(spec['coder'])
        parse_value = self.json_value_parser(spec['coder'])
        nested_list = [spec['nested']] if 'nested' in spec else [True, False]
        for nested in nested_list:
            for expected_encoded, json_value in spec['examples'].items():
                value = parse_value(json_value)
                expected_encoded = expected_encoded.encode('latin1')
                if not spec['coder'].get('non_deterministic', False):
                    actual_encoded = encode_nested(coder, value, nested)
                    if self.fix and actual_encoded != expected_encoded:
                        # In fix mode a mismatch is recorded for
                        # tearDownClass instead of failing the test.
                        self.to_fix[spec['index'],
                                    expected_encoded] = actual_encoded
                    else:
                        self.assertEqual(expected_encoded, actual_encoded)
                        decoded = decode_nested(
                            coder, expected_encoded, nested)
                        assert_equal(decoded, value)
                else:
                    # Only verify decoding for a non-deterministic coder
                    self.assertEqual(
                        decode_nested(coder, expected_encoded, nested), value)

    def parse_coder(self, spec):
        """Build a coder object from a coder spec mapping.

        Component coders are parsed recursively and registered in a fresh
        PipelineContext; the spec itself is then stored as a Coder proto and
        resolved through the same context.

        Args:
          spec: mapping with 'urn' and optionally 'payload' and 'components'.

        Returns:
          The coder the context yields for the registered proto.
        """
        context = pipeline_context.PipelineContext()
        coder_id = str(hash(str(spec)))
        component_ids = [
            context.coders.get_id(self.parse_coder(c))
            for c in spec.get('components', ())
        ]
        context.coders.put_proto(
            coder_id,
            beam_runner_api_pb2.Coder(
                spec=beam_runner_api_pb2.FunctionSpec(
                    urn=spec['urn'],
                    payload=spec.get('payload', '').encode('latin1')),
                component_coder_ids=component_ids))
        return context.coders.get_by_id(coder_id)

    def json_value_parser(self, coder_spec):
        """Return a one-argument function parsing example values for a coder.

        Row coders are handled through their schema payload; every other URN
        is looked up in _urn_to_json_value_parser and supplied with the
        parsers of its component coders.
        """
        # TODO: integrate this with the logic for the other parsers
        if coder_spec['urn'] == 'beam:coder:row:v1':
            schema = schema_pb2.Schema.FromString(
                coder_spec['payload'].encode('latin1'))
            return value_parser_from_schema(schema)

        component_parsers = [
            self.json_value_parser(c)
            for c in coder_spec.get('components', ())
        ]
        return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
            x, *component_parsers)

    # Used when --fix is passed.

    fix = False
    to_fix = {}  # type: Dict[Tuple[int, bytes], bytes]

    @classmethod
    def tearDownClass(cls):
        # In fix mode, rewrite the YAML file so that each recorded expected
        # encoding is replaced by the encoding actually produced.
        if cls.fix and cls.to_fix:
            print("FIXING", len(cls.to_fix), "TESTS")
            doc_sep = '\n---\n'
            # Close the file deterministically instead of relying on garbage
            # collection of an anonymous file object.
            with open(STANDARD_CODERS_YAML) as yaml_file:
                docs = yaml_file.read().split(doc_sep)

            def quote(s):
                return json.dumps(s.decode('latin1')).replace(
                    r'\u0000', r'\0')

            for (doc_ix, expected_encoded), actual_encoded in (
                    cls.to_fix.items()):
                print(quote(expected_encoded), "->", quote(actual_encoded))
                docs[doc_ix] = docs[doc_ix].replace(
                    quote(expected_encoded) + ':',
                    quote(actual_encoded) + ':')
            # The with block guarantees the rewritten content is flushed and
            # the handle closed before the class teardown returns.
            with open(STANDARD_CODERS_YAML, 'w') as yaml_file:
                yaml_file.write(doc_sep.join(docs))
# NOTE(review): this snippet looks incomplete or garbled — TODO confirm the
# intended test before relying on it.
# - The PaneInfo(...) value is constructed and then discarded.
# - self.assertEqual receives a single argument; presumably `self` is a
#   unittest.TestCase, in which case the call would raise TypeError because
#   there is no second value to compare against.
# - Both statements share the `def` line with no separator, which is not
#   valid Python as written.
def pane_info_formatter(self): PaneInfo(is_last=True, timing=PaneInfoTiming.EARLY) self.assertEqual('Pane Final EARLY')