def test_windowing_behavior(self):
    """Check the per-window counts produced by TaxiCountTransform.

    The TestStream adds four messages once the watermark is at 0 (three
    stamped 0, one stamped 60) and a fifth stamped 120 later on.  The
    assertion expects counts of 3, 1 and 1 for the windows [0, 60),
    [60, 120) and [120, 180).
    """
    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=streaming_options) as p:
        base_json_pickup = (
            '{"ride_id":"x","point_idx":1,"latitude":0.0,"longitude":0.0,'
            '"timestamp":"00:00:00","meter_reading":1.0,"meter_increment":0.1,'
            '"ride_status":"pickup","passenger_count":1}')
        # NOTE(review): this payload is identical to base_json_pickup,
        # including "ride_status":"pickup" -- confirm whether an "enroute"
        # status was intended before changing it, since the expected count
        # of 3 below may depend on it.
        base_json_enroute = (
            '{"ride_id":"x","point_idx":1,"latitude":0.0,"longitude":0.0,'
            '"timestamp":"00:00:00","meter_reading":1.0,"meter_increment":0.1,'
            '"ride_status":"pickup","passenger_count":1}')

        initial_messages = [
            TimestampedValue(base_json_pickup, 0),
            TimestampedValue(base_json_pickup, 0),
            TimestampedValue(base_json_enroute, 0),
            TimestampedValue(base_json_pickup, 60),
        ]
        later_messages = [TimestampedValue(base_json_pickup, 120)]

        test_stream = (
            TestStream()
            .advance_watermark_to(0)
            .add_elements(initial_messages)
            .advance_watermark_to(60)
            .advance_processing_time(60)
            .add_elements(later_messages)
            .advance_watermark_to_infinity())

        taxi_counts = p | test_stream | TaxiCountTransform()

        expected_window_counts = {
            IntervalWindow(0, 60): [3],
            IntervalWindow(60, 120): [1],
            IntervalWindow(120, 180): [1],
        }
        assert_that(
            taxi_counts,
            equal_to_per_window(expected_window_counts),
            reify_windows=True)
def test_gbk_execution_after_processing_trigger_fired(self):
    """Advance TestClock to (X + delta) and see the pipeline does finish."""
    # TODO(mariagh): Add test_gbk_execution_after_processing_trigger_unfired
    # Advance TestClock to (X + delta) and see the pipeline does finish
    # Possibly to the framework trigger_transcripts.yaml

    # One element at watermark 10, then processing time moved by 5.1, which
    # is past the AfterProcessingTime(5) trigger configured below.
    test_stream = (
        TestStream()
        .advance_watermark_to(10)
        .add_elements(['a'])
        .advance_processing_time(5.1)
        .advance_watermark_to_infinity())

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    windowed = p | test_stream | beam.WindowInto(
        beam.window.FixedWindows(15),
        trigger=trigger.AfterProcessingTime(5),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    records = windowed | beam.Map(lambda x: ('k', x)) | beam.GroupByKey()

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    expected_by_window = {
        window.IntervalWindow(0, 15): [('k', ['a'])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        label='assert per window')

    p.run()
def test_gbk_execution_after_watermark_trigger(self):
    """GroupByKey under AfterWatermark(early=AfterCount(1)), discarding mode.

    Each window is expected to hold two grouped results for key 'k': one
    with the element and one empty.
    """
    first_batch = [TimestampedValue('a', 11)]
    second_batch = [TimestampedValue('b', 21)]
    test_stream = (
        TestStream()
        .advance_watermark_to(10)
        .add_elements(first_batch)
        .advance_watermark_to(20)
        .add_elements(second_batch)
        .advance_watermark_to_infinity())

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    windowed = p | test_stream | beam.WindowInto(
        FixedWindows(15),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    records = windowed | beam.Map(lambda x: ('k', x)) | beam.GroupByKey()

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # assert per window
    expected_by_window = {
        window.IntervalWindow(0, 15): [('k', ['a']), ('k', [])],
        window.IntervalWindow(15, 30): [('k', ['b']), ('k', [])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        label='assert per window')

    p.run()
def test_combiner_latest(self):
    """Test TimestampCombiner with LATEST."""
    streaming_options = PipelineOptions(streaming=True)
    with TestPipeline(options=streaming_options) as p:
        keyed_events = (
            TestStream()
            .add_elements([window.TimestampedValue(('k', 100), 2)])
            .add_elements([window.TimestampedValue(('k', 400), 7)])
            .advance_watermark_to_infinity())

        windowed = p | keyed_events | beam.WindowInto(
            window.FixedWindows(10),
            timestamp_combiner=TimestampCombiner.OUTPUT_AT_LATEST)
        result = windowed | beam.CombinePerKey(sum)

        # Pair every combined value with the timestamp it was emitted at.
        records = result | beam.Map(
            lambda e, ts=beam.DoFn.TimestampParam: (e, ts))

        # All the KV pairs are applied GBK using LATEST timestamp for
        # the same key.
        expected_by_window = {
            window.IntervalWindow(0, 10): [(('k', 500), Timestamp(7))],
        }
        assert_that(
            records,
            equal_to_per_window(expected_by_window),
            use_global_window=False,
            label='assert per window')
def test_late_data_behavior(self):
    """Check the counts TaxiCountTransform emits when elements arrive late.

    Every message is stamped 0.  Two are added at watermark 0, one after the
    watermark moved to 60 and one after it moved to 300.  The assertion
    expects the results 2 and 3 in window [0, 60).
    """
    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=streaming_options) as p:
        base_json_pickup = (
            '{"ride_id":"x","point_idx":1,"latitude":0.0,"longitude":0.0,'
            '"timestamp":"00:00:00","meter_reading":1.0,"meter_increment":0.1,'
            '"ride_status":"pickup","passenger_count":1}')

        on_time = [
            TimestampedValue(base_json_pickup, 0),
            TimestampedValue(base_json_pickup, 0),
        ]
        after_watermark_60 = [TimestampedValue(base_json_pickup, 0)]
        after_watermark_300 = [TimestampedValue(base_json_pickup, 0)]

        test_stream = (
            TestStream()
            .advance_watermark_to(0)
            .add_elements(on_time)
            .advance_watermark_to(60)
            .advance_processing_time(60)
            .add_elements(after_watermark_60)
            .advance_watermark_to(300)
            .advance_processing_time(240)
            .add_elements(after_watermark_300))

        # On Time and Late Result
        expected_results = {IntervalWindow(0, 60): [2, 3]}

        taxi_counts_late = p | test_stream | TaxiCountTransform()

        assert_that(
            taxi_counts_late,
            equal_to_per_window(expected_results),
            reify_windows=True)
def test_multi_triggered_gbk_side_input(self):
    """Test a GBK sideinput, with multiple triggering."""
    # TODO(BEAM-9322): Remove use of this experiment.
    # This flag is only necessary when using the multi-output TestStream b/c
    # it relies on using the PCollection output tags as the PCollection output
    # ids.
    p = TestPipeline(additional_pipeline_args=[
        '--experiments=passthrough_pcollection_output_ids'
    ])

    # A single TestStream feeds both the 'main' and the 'side' outputs.
    events = (
        TestStream()
        .advance_watermark_to(3, tag='main')
        .add_elements(['a1'], tag='main')
        .advance_watermark_to(8, tag='main')
        .add_elements(['a2'], tag='main')
        .add_elements([window.TimestampedValue(('k', 100), 2)], tag='side')
        .add_elements([window.TimestampedValue(('k', 400), 7)], tag='side')
        .advance_watermark_to_infinity(tag='main')
        .advance_watermark_to_infinity(tag='side'))
    test_stream = p | 'Mixed TestStream' >> events

    main_data = test_stream['main'] | 'Main windowInto' >> beam.WindowInto(
        window.FixedWindows(5),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)

    side_windowed = test_stream['side'] | 'Side windowInto' >> beam.WindowInto(
        window.FixedWindows(5),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    side_data = (
        side_windowed
        | beam.CombinePerKey(sum)
        | 'Values' >> Map(lambda k_vs: k_vs[1]))

    class RecordFn(beam.DoFn):
        """Emit (element, timestamp, side input) for every main element."""
        def process(
                self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
            yield (elm, ts, side)

    records = main_data | beam.ParDo(
        RecordFn(), beam.pvalue.AsList(side_data))

    expected_by_window = {
        window.IntervalWindow(0, 5): [('a1', Timestamp(3), [100, 0])],
        window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        use_global_window=False,
        label='assert per window')

    p.run()
def test_basic_execution_sideinputs_fixed_windows(self):
    """Main elements in 1-wide windows read a side input in 3-wide windows.

    One multi-output TestStream produces both the 'main' and the 'side'
    PCollections.  Every main element is recorded together with its
    timestamp and the side-input list it was given.
    """
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment(
        'passthrough_pcollection_output_ids')
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    test_stream = (
        p
        | TestStream()
        .advance_watermark_to(12, tag='side')
        .add_elements([window.TimestampedValue('s1', 10)], tag='side')
        .advance_watermark_to(20, tag='side')
        .add_elements([window.TimestampedValue('s2', 20)], tag='side')
        .advance_watermark_to(9, tag='main')
        .add_elements(['a1', 'a2', 'a3', 'a4'], tag='main')
        .add_elements(['b'], tag='main')
        .advance_watermark_to(18, tag='main')
        # Pass a list, like every other add_elements call in this test,
        # instead of a bare string.
        .add_elements(['c'], tag='main'))

    main_stream = (
        test_stream['main']
        | 'main windowInto' >> beam.WindowInto(window.FixedWindows(1)))

    side_stream = (
        test_stream['side']
        | 'side windowInto' >> beam.WindowInto(window.FixedWindows(3)))

    class RecordFn(beam.DoFn):
        """Emit (element, timestamp, side input) for every main element."""
        def process(
                self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
            yield (elm, ts, side)

    records = (
        main_stream  # pylint: disable=unused-variable
        | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(9, 10): [
            ('a1', Timestamp(9), ['s1']),
            ('a2', Timestamp(9), ['s1']),
            ('a3', Timestamp(9), ['s1']),
            ('a4', Timestamp(9), ['s1']),
            ('b', Timestamp(9), ['s1']),
        ],
        window.IntervalWindow(18, 19): [('c', Timestamp(18), ['s2'])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        label='assert per window')

    p.run()
def test_basic_execution(self):
    """Record every TestStream element together with its timestamp.

    The output is asserted twice: once as a flat collection and once per
    window, with FixedWindows(15) passed as custom_windowing.
    """
    test_stream = TestStream().advance_watermark_to(10)
    test_stream = test_stream.add_elements(['a', 'b', 'c'])
    test_stream = test_stream.advance_watermark_to(20)
    test_stream = test_stream.add_elements(['d']).add_elements(['e'])
    test_stream = test_stream.advance_processing_time(10)
    test_stream = test_stream.advance_watermark_to(300)
    test_stream = test_stream.add_elements([TimestampedValue('late', 12)])
    test_stream = test_stream.add_elements([TimestampedValue('last', 310)])

    class RecordFn(beam.DoFn):
        """Emit (element, timestamp) for every input element."""
        def process(
                self,
                element=beam.DoFn.ElementParam,
                timestamp=beam.DoFn.TimestampParam):
            yield (element, timestamp)

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)
    my_record_fn = RecordFn()
    records = p | test_stream | beam.ParDo(my_record_fn)

    expected_flat = [
        ('a', timestamp.Timestamp(10)),
        ('b', timestamp.Timestamp(10)),
        ('c', timestamp.Timestamp(10)),
        ('d', timestamp.Timestamp(20)),
        ('e', timestamp.Timestamp(20)),
        ('late', timestamp.Timestamp(12)),
        ('last', timestamp.Timestamp(310)),
    ]
    assert_that(records, equal_to(expected_flat))

    # assert per window
    expected_by_window = {
        window.IntervalWindow(0, 15): [
            ('a', Timestamp(10)),
            ('b', Timestamp(10)),
            ('c', Timestamp(10)),
            ('late', Timestamp(12)),
        ],
        window.IntervalWindow(15, 30): [
            ('d', Timestamp(20)),
            ('e', Timestamp(20)),
        ],
        window.IntervalWindow(300, 315): [
            ('last', Timestamp(310)),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        custom_windowing=window.FixedWindows(15),
        label='assert per window')

    p.run()
def test_basic_execution(self):
    """Feed a TestStream through ReverseTestStream and check the events.

    The expected output, all in the global window, is a list of event
    groups: processing-time/watermark pairs and element batches.
    """
    test_stream = (
        TestStream()
        .advance_watermark_to(0)
        .advance_processing_time(5)
        .add_elements(['a', 'b', 'c']))
    # Five identical steps: move the watermark, then the processing time.
    for watermark in (2, 4, 6, 8, 10):
        test_stream = (
            test_stream
            .advance_watermark_to(watermark)
            .advance_processing_time(1))
    test_stream = test_stream.add_elements([
        TimestampedValue('1', 15),
        TimestampedValue('2', 15),
        TimestampedValue('3', 15),
    ])

    options = StandardOptions(streaming=True)
    p = TestPipeline(options=options)

    records = (
        p
        | test_stream
        | ReverseTestStream(sample_resolution_sec=1, output_tag=None))

    expected_events = [
        [ProcessingTimeEvent(5), WatermarkEvent(0)],
        [ElementEvent([TimestampedValue(v, 0) for v in ('a', 'b', 'c')])],
    ]
    expected_events.extend(
        [ProcessingTimeEvent(1), WatermarkEvent(watermark)]
        for watermark in (2000000, 4000000, 6000000, 8000000, 10000000))
    expected_events.append(
        [ElementEvent([TimestampedValue(v, 15) for v in ('1', '2', '3')])])

    assert_that(
        records,
        equal_to_per_window({beam.window.GlobalWindow(): expected_events}))

    p.run()
def test_multi_triggered_gbk_side_input(self):
    """Test a GBK sideinput, with multiple triggering."""
    streaming_options = StandardOptions(streaming=True)
    p = TestPipeline(options=streaming_options)

    # A single TestStream feeds both the 'main' and the 'side' outputs.
    events = (
        TestStream()
        .advance_watermark_to(3, tag='main')
        .add_elements(['a1'], tag='main')
        .advance_watermark_to(8, tag='main')
        .add_elements(['a2'], tag='main')
        .add_elements([window.TimestampedValue(('k', 100), 2)], tag='side')
        .add_elements([window.TimestampedValue(('k', 400), 7)], tag='side')
        .advance_watermark_to_infinity(tag='main')
        .advance_watermark_to_infinity(tag='side'))
    test_stream = p | 'Mixed TestStream' >> events

    main_data = test_stream['main'] | 'Main windowInto' >> beam.WindowInto(
        window.FixedWindows(5),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)

    side_windowed = test_stream['side'] | 'Side windowInto' >> beam.WindowInto(
        window.FixedWindows(5),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    side_data = (
        side_windowed
        | beam.CombinePerKey(sum)
        | 'Values' >> Map(lambda k_vs: k_vs[1]))

    class RecordFn(beam.DoFn):
        """Emit (element, timestamp, side input) for every main element."""
        def process(
                self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
            yield (elm, ts, side)

    records = main_data | beam.ParDo(
        RecordFn(), beam.pvalue.AsList(side_data))

    expected_by_window = {
        window.IntervalWindow(0, 5): [('a1', Timestamp(3), [100, 0])],
        window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        use_global_window=False,
        label='assert per window')

    p.run()
def test_basic_execution_sideinputs_fixed_windows(self):
    """Main elements in 1-wide windows read a side input in 3-wide windows.

    The main and the side PCollections come from two separate TestStreams.
    Every main element is recorded together with its timestamp and the
    side-input list it was given.
    """
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    main_stream = (
        p
        | 'main TestStream' >> TestStream()
        .advance_watermark_to(9)
        .add_elements(['a1', 'a2', 'a3', 'a4'])
        .add_elements(['b'])
        .advance_watermark_to(18)
        # Pass a list, like every other add_elements call in this test,
        # instead of a bare string.
        .add_elements(['c'])
        | 'main windowInto' >> beam.WindowInto(window.FixedWindows(1)))

    side_stream = (
        p
        | 'side TestStream' >> TestStream()
        .advance_watermark_to(12)
        .add_elements([window.TimestampedValue('s1', 10)])
        .advance_watermark_to(20)
        .add_elements([window.TimestampedValue('s2', 20)])
        | 'side windowInto' >> beam.WindowInto(window.FixedWindows(3)))

    class RecordFn(beam.DoFn):
        """Emit (element, timestamp, side input) for every main element."""
        def process(
                self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
            yield (elm, ts, side)

    records = (
        main_stream  # pylint: disable=unused-variable
        | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(9, 10): [
            ('a1', Timestamp(9), ['s1']),
            ('a2', Timestamp(9), ['s1']),
            ('a3', Timestamp(9), ['s1']),
            ('a4', Timestamp(9), ['s1']),
            ('b', Timestamp(9), ['s1']),
        ],
        window.IntervalWindow(18, 19): [('c', Timestamp(18), ['s2'])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        use_global_window=False,
        label='assert per window')

    p.run()
def test_gbk_execution_no_triggers(self):
    """GroupByKey over FixedWindows(15) with no explicit trigger.

    The grouped output is asserted twice: once as a flat collection and
    once per window, with FixedWindows(15) passed as custom_windowing.
    """
    test_stream = TestStream().advance_watermark_to(10)
    test_stream = test_stream.add_elements(['a', 'b', 'c'])
    test_stream = test_stream.advance_watermark_to(20)
    test_stream = test_stream.add_elements(['d']).add_elements(['e'])
    test_stream = test_stream.advance_processing_time(10)
    test_stream = test_stream.advance_watermark_to(300)
    test_stream = test_stream.add_elements([TimestampedValue('late', 12)])
    test_stream = test_stream.add_elements([TimestampedValue('last', 310)])

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    windowed = p | test_stream | beam.WindowInto(FixedWindows(15))
    records = windowed | beam.Map(lambda x: ('k', x)) | beam.GroupByKey()

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    expected_flat = [
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last']),
    ]
    assert_that(records, equal_to(expected_flat))

    # assert per window
    expected_by_window = {
        window.IntervalWindow(15, 30): [
            ('k', ['a', 'b', 'c']),
            ('k', ['late']),
        ],
        window.IntervalWindow(30, 45): [
            ('k', ['d', 'e']),
        ],
        window.IntervalWindow(300, 315): [
            ('k', ['last']),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        custom_windowing=window.FixedWindows(15),
        label='assert per window')

    p.run()
def test_equal_to_per_window_fail_unmatched_window(self):
    """equal_to_per_window must fail for a window the pipeline never emits.

    The expectation names IntervalWindow(50, 100) while the pipeline windows
    its input with FixedWindows(20); a BeamAssertException is expected.
    """
    unmatched_expectation = {
        window.IntervalWindow(50, 100): [('k', [1])],
    }
    with self.assertRaises(BeamAssertException):
        with TestPipeline(options=StandardOptions(streaming=True)) as p:
            grouped = (
                p
                | Create([1])
                | beam.WindowInto(
                    FixedWindows(20),
                    trigger=trigger.AfterWatermark(),
                    accumulation_mode=trigger.AccumulationMode.DISCARDING)
                | beam.Map(lambda x: ('k', x))
                | beam.GroupByKey())
            assert_that(
                grouped,
                equal_to_per_window(unmatched_expectation),
                reify_windows=True)
def test_equal_to_per_window_succeeds_no_reify_windows(self):
    """equal_to_per_window passes when assert_that is not told to reify.

    The expected window is 20 wide and starts 5 below MIN_TIMESTAMP
    expressed in whole seconds.
    """
    window_start = int(MIN_TIMESTAMP.micros // 1e6) - 5
    window_end = window_start + 20
    expected = {
        window.IntervalWindow(window_start, window_end): [('k', [1])],
    }
    with TestPipeline(options=StandardOptions(streaming=True)) as p:
        grouped = (
            p
            | Create([1])
            | beam.WindowInto(
                FixedWindows(20),
                trigger=trigger.AfterWatermark(),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.Map(lambda x: ('k', x))
            | beam.GroupByKey())
        assert_that(grouped, equal_to_per_window(expected))
def test_gbk_execution_no_triggers(self):
    """GroupByKey over FixedWindows(15) with no explicit trigger.

    The grouped output is asserted per window with use_global_window=False.
    """
    test_stream = TestStream().advance_watermark_to(10)
    test_stream = test_stream.add_elements(['a', 'b', 'c'])
    test_stream = test_stream.advance_watermark_to(20)
    test_stream = test_stream.add_elements(['d']).add_elements(['e'])
    test_stream = test_stream.advance_processing_time(10)
    test_stream = test_stream.advance_watermark_to(300)
    test_stream = test_stream.add_elements([TimestampedValue('late', 12)])
    test_stream = test_stream.add_elements([TimestampedValue('last', 310)])

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    windowed = p | test_stream | beam.WindowInto(FixedWindows(15))
    records = windowed | beam.Map(lambda x: ('k', x)) | beam.GroupByKey()

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # assert per window
    expected_by_window = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', ['late']),
        ],
        window.IntervalWindow(15, 30): [
            ('k', ['d', 'e']),
        ],
        window.IntervalWindow(300, 315): [
            ('k', ['last']),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        use_global_window=False,
        label='assert per window')

    p.run()
def test_gbk_execution_no_triggers(self):
    """GroupByKey over FixedWindows(15) with allowed_lateness=300.

    The stream ends by advancing the watermark to infinity, and the grouped
    output is asserted per window.
    """
    test_stream = TestStream().advance_watermark_to(10)
    test_stream = test_stream.add_elements(['a', 'b', 'c'])
    test_stream = test_stream.advance_watermark_to(20)
    test_stream = test_stream.add_elements(['d']).add_elements(['e'])
    test_stream = test_stream.advance_processing_time(10)
    test_stream = test_stream.advance_watermark_to(300)
    test_stream = test_stream.add_elements([TimestampedValue('late', 12)])
    test_stream = test_stream.add_elements([TimestampedValue('last', 310)])
    test_stream = test_stream.advance_watermark_to_infinity()

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    windowed = p | test_stream | beam.WindowInto(
        FixedWindows(15), allowed_lateness=300)
    records = windowed | beam.Map(lambda x: ('k', x)) | beam.GroupByKey()

    # TODO(https://github.com/apache/beam/issues/18441): timestamp assignment
    # for elements from a GBK should respect the TimestampCombiner.  The test
    # below should also verify the timestamps of the outputted elements once
    # this is implemented.

    # assert per window
    expected_by_window = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', ['late']),
        ],
        window.IntervalWindow(15, 30): [
            ('k', ['d', 'e']),
        ],
        window.IntervalWindow(300, 315): [
            ('k', ['last']),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        label='assert per window')

    p.run()
def test_basic_execution_sideinputs(self):
    """A single main element reads a four-element TestStream side input.

    The recorded (element, timestamp, side input) tuple is asserted per
    window and then as a flat collection.
    """
    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    main_events = TestStream().advance_watermark_to(10).add_elements(['e'])
    main_stream = p | 'main TestStream' >> main_events

    side_events = TestStream()
    for value in (2, 1, 7, 4):
        # One add_elements call per value; each value is its own timestamp.
        side_events = side_events.add_elements(
            [window.TimestampedValue(value, value)])
    side_stream = p | 'side TestStream' >> side_events

    class RecordFn(beam.DoFn):
        """Emit (element, timestamp, side input) for every main element."""
        def process(
                self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
            yield (elm, ts, side)

    records = (
        main_stream  # pylint: disable=unused-variable
        | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    # assert per window
    expected_by_window = {
        window.IntervalWindow(0, 15): [
            ('e', Timestamp(10), [2, 1, 7, 4]),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        custom_windowing=window.FixedWindows(15),
        label='assert per window')

    assert_that(records, equal_to([('e', Timestamp(10), [2, 1, 7, 4])]))

    p.run()
def test_basic_execution_batch_sideinputs_fixed_windows(self):
    """A TestStream main input reads a side input built with beam.Create.

    The main input uses FixedWindows(1) and the side input FixedWindows(2);
    each main element is expected to see the one side value listed below.
    """
    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    main_events = (
        TestStream()
        .advance_watermark_to(2)
        .add_elements(['a'])
        .advance_watermark_to(4)
        .add_elements(['b'])
        .advance_watermark_to_infinity())
    main_stream = (
        p
        | 'main TestStream' >> main_events
        | 'main window' >> beam.WindowInto(window.FixedWindows(1)))

    # Each created value is stamped with itself as its timestamp.
    side = (
        p
        | beam.Create([2, 1, 4])
        | beam.Map(lambda t: window.TimestampedValue(t, t))
        | beam.WindowInto(window.FixedWindows(2)))

    class RecordFn(beam.DoFn):
        """Emit (element, timestamp, side input) for every main element."""
        def process(
                self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
            yield (elm, ts, side)

    records = (
        main_stream  # pylint: disable=unused-variable
        | beam.ParDo(RecordFn(), beam.pvalue.AsList(side)))

    # assert per window
    expected_by_window = {
        window.IntervalWindow(2, 3): [('a', Timestamp(2), [2])],
        window.IntervalWindow(4, 5): [('b', Timestamp(4), [4])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        label='assert per window')

    p.run()
def test_deduplication_in_different_windows(self):
    """Deduplicate is applied after windowing into FixedWindows(30).

    Keys k1..k3 appear in both [0, 30) and [30, 60) and are expected in
    both windows.
    """
    with self.create_pipeline() as p:
        stamped_keys = [
            window.TimestampedValue('k1', 0),
            window.TimestampedValue('k2', 10),
            window.TimestampedValue('k3', 20),
            window.TimestampedValue('k1', 30),
            window.TimestampedValue('k2', 40),
            window.TimestampedValue('k3', 50),
            window.TimestampedValue('k4', 60),
            window.TimestampedValue('k5', 70),
            window.TimestampedValue('k6', 80),
        ]
        test_stream = (
            TestStream(coder=coders.StrUtf8Coder())
            .advance_watermark_to(0)
            .add_elements(stamped_keys)
            .advance_watermark_to_infinity())

        windowed = p | test_stream | beam.WindowInto(window.FixedWindows(30))
        deduplicated = windowed | deduplicate.Deduplicate(
            processing_time_duration=10 * 60)
        res = deduplicated | beam.Map(
            lambda e, ts=beam.DoFn.TimestampParam: (e, ts))

        # Deduplication should happen per window.
        expect_unique_keys_per_window = {
            window.IntervalWindow(0, 30): [
                ('k1', Timestamp(0)),
                ('k2', Timestamp(10)),
                ('k3', Timestamp(20)),
            ],
            window.IntervalWindow(30, 60): [
                ('k1', Timestamp(30)),
                ('k2', Timestamp(40)),
                ('k3', Timestamp(50)),
            ],
            window.IntervalWindow(60, 90): [
                ('k4', Timestamp(60)),
                ('k5', Timestamp(70)),
                ('k6', Timestamp(80)),
            ],
        }
        assert_that(
            res,
            equal_to_per_window(expect_unique_keys_per_window),
            use_global_window=False,
            label='assert per window')
def test_basic_execution_batch_sideinputs_fixed_windows(self):
    """A TestStream main input reads a side input built with beam.Create.

    The main input uses FixedWindows(1) and the side input FixedWindows(2);
    the result is asserted per window with use_global_window=False.
    """
    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    main_events = (
        TestStream()
        .advance_watermark_to(2)
        .add_elements(['a'])
        .advance_watermark_to(4)
        .add_elements(['b']))
    main_stream = (
        p
        | 'main TestStream' >> main_events
        | 'main window' >> beam.WindowInto(window.FixedWindows(1)))

    # Each created value is stamped with itself as its timestamp.
    side = (
        p
        | beam.Create([2, 1, 4])
        | beam.Map(lambda t: window.TimestampedValue(t, t))
        | beam.WindowInto(window.FixedWindows(2)))

    class RecordFn(beam.DoFn):
        """Emit (element, timestamp, side input) for every main element."""
        def process(
                self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
            yield (elm, ts, side)

    records = (
        main_stream  # pylint: disable=unused-variable
        | beam.ParDo(RecordFn(), beam.pvalue.AsList(side)))

    # assert per window
    expected_by_window = {
        window.IntervalWindow(2, 3): [('a', Timestamp(2), [2])],
        window.IntervalWindow(4, 5): [('b', Timestamp(4), [4])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        use_global_window=False,
        label='assert per window')

    p.run()
def test_gbk_execution_after_processing_trigger_fired(self):
    """Advance TestClock to (X + delta) and see the pipeline does finish."""
    # TODO(mariagh): Add test_gbk_execution_after_processing_trigger_unfired
    # Advance TestClock to (X + delta) and see the pipeline does finish
    # Possibly to the framework trigger_transcripts.yaml

    # One element at watermark 10, then processing time moved by 5.1, which
    # is past the AfterProcessingTime(5) trigger configured below.
    test_stream = TestStream().advance_watermark_to(10)
    test_stream = test_stream.add_elements(['a'])
    test_stream = test_stream.advance_processing_time(5.1)

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    windowed = p | test_stream | beam.WindowInto(
        beam.window.FixedWindows(15),
        trigger=trigger.AfterProcessingTime(5),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    records = windowed | beam.Map(lambda x: ('k', x)) | beam.GroupByKey()

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    expected_by_window = {
        window.IntervalWindow(0, 15): [('k', ['a'])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        use_global_window=False,
        label='assert per window')

    p.run()
def test_gbk_execution_after_watermark_trigger(self):
    """GroupByKey under AfterWatermark(early=AfterCount(1)), discarding mode.

    The single element 'a' is expected to yield two grouped results for key
    'k' in the asserted window: one with the element and one empty.
    """
    test_stream = TestStream().advance_watermark_to(10)
    test_stream = test_stream.add_elements(['a'])
    test_stream = test_stream.advance_watermark_to(20)

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    windowed = p | test_stream | beam.WindowInto(
        FixedWindows(15),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    records = windowed | beam.Map(lambda x: ('k', x)) | beam.GroupByKey()

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # assert per window
    expected_by_window = {
        window.IntervalWindow(15, 30): [
            ('k', ['a']),
            ('k', []),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        use_global_window=False,
        label='assert per window')

    p.run()
def test_multiple_outputs_with_watermark_advancement(self):
    """Tests that the TestStream can independently control output watermarks."""

    # Purposely set the watermark of numbers to 20 then letters to 5 to test
    # that the watermark advancement is per PCollection.
    #
    # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
    # emitted at different times so that they will have different windows. The
    # watermark advancement is checked by checking their windows. If the
    # watermark does not advance, then the windows will be [-inf, -inf). If the
    # windows do not advance separately, then the PCollections will both
    # windowed in [15, 30).
    letters_elements = [
        TimestampedValue(letter, stamp)
        for letter, stamp in (('a', 6), ('b', 7), ('c', 8))
    ]
    numbers_elements = [
        TimestampedValue(digit, stamp)
        for digit, stamp in (('1', 21), ('2', 22), ('3', 23))
    ]
    test_stream = (
        TestStream()
        .advance_watermark_to(0, tag='letters')
        .advance_watermark_to(0, tag='numbers')
        .advance_watermark_to(20, tag='numbers')
        .advance_watermark_to(5, tag='letters')
        .add_elements(letters_elements, tag='letters')
        .advance_watermark_to(10, tag='letters')
        .add_elements(numbers_elements, tag='numbers')
        .advance_watermark_to(30, tag='numbers'))

    streaming_options = StandardOptions(streaming=True)
    p = TestPipeline(is_integration_test=True, options=streaming_options)

    main = p | test_stream

    # Use an AfterWatermark trigger with an early firing to test that the
    # watermark is advancing properly and that the element is being emitted in
    # the correct window.
    letters_windowed = main['letters'] | 'letter windows' >> beam.WindowInto(
        FixedWindows(15),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    letters = (
        letters_windowed
        | 'letter with key' >> beam.Map(lambda x: ('k', x))
        | 'letter gbk' >> beam.GroupByKey())

    numbers_windowed = main['numbers'] | 'number windows' >> beam.WindowInto(
        FixedWindows(15),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    numbers = (
        numbers_windowed
        | 'number with key' >> beam.Map(lambda x: ('k', x))
        | 'number gbk' >> beam.GroupByKey())

    # The letters were emitted when the watermark was at 5, thus we expect to
    # see the elements in the [0, 15) window. We used an early trigger to make
    # sure that the ON_TIME empty pane was also emitted with a TestStream.
    # This pane has no data because of the early trigger causes the elements to
    # fire before the end of the window and because the accumulation mode
    # discards any data after the trigger fired.
    expected_letters = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', []),
        ],
    }

    # Same here, except the numbers were emitted at watermark = 20, thus they
    # are in the [15, 30) window.
    expected_numbers = {
        window.IntervalWindow(15, 30): [
            ('k', ['1', '2', '3']),
            ('k', []),
        ],
    }

    assert_that(
        letters,
        equal_to_per_window(expected_letters),
        label='letters assert per window')
    assert_that(
        numbers,
        equal_to_per_window(expected_numbers),
        label='numbers assert per window')

    p.run()
def assertion_matcher(pcol, expected_values):
    """Attach a per-window equality assertion to ``pcol``.

    Args:
      pcol: the pipeline output to check.
      expected_values: the expectation handed to ``equal_to_per_window``.
    """
    per_window_matcher = equal_to_per_window(expected_values)
    assert_that(
        pcol,
        per_window_matcher,
        use_global_window=False,
        label='Assert events per window.')
def run(argv=None):
    """Build and run the pipeline.

    Reads lines from a Pub/Sub topic or subscription, counts words in fixed
    windows, writes the formatted counts to the output topic and attaches two
    assertions on the formatted output.

    Args:
      argv: command-line arguments. ``--output_topic`` is required, and
        exactly one of ``--input_topic`` / ``--input_subscription`` must be
        given. Arguments the parser does not know are passed on as pipeline
        options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_topic',
        required=True,
        help=(
            'Output PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--input_topic',
        help=(
            'Input PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=(
            'Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Read from PubSub into a PCollection.
        if known_args.input_subscription:
            lines = p | beam.io.ReadFromPubSub(
                subscription=known_args.input_subscription)
        else:
            lines = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)

        # Count the occurrences of each word.
        def count_ones(word_ones):
            (word, ones) = word_ones
            return (word, sum(ones))

        counts = (
            lines
            | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn())
            | 'After AddTimestampFn' >> ParDo(PrintFn('After AddTimestampFn'))
            | 'Split' >> (
                beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | beam.WindowInto(window.FixedWindows(5, 0))
            | 'GroupByKey' >> beam.GroupByKey()
            | 'CountOnes' >> beam.Map(count_ones))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %d' % (word, count)

        output = counts | 'format' >> beam.Map(format_result)

        # Write to PubSub.
        # pylint: disable=expression-not-assigned
        output | beam.io.WriteStringsToPubSub(known_args.output_topic)

        def check_gbk_format():
            # A matcher that checks that the output of GBK is of the form
            # word: count.
            def matcher(elements):
                # pylint: disable=unused-variable
                actual_elements_in_window, window = elements
                for elm in actual_elements_in_window:
                    assert re.match(r'\S+:\s+\d+', elm) is not None

            return matcher

        # Check that the format of the output is correct.
        assert_that(
            output,
            check_gbk_format(),
            use_global_window=False,
            label='Assert word:count format.')

        # Check also that elements are output in the right window.
        # This expects exactly 1 occurrence of any subset of the elements
        # 150, 151, 152, 153, 154 in the window [150, 155)
        # or exactly 1 occurrence of any subset of the elements
        # 210, 211, 212, 213, 214 in the window [210, 215).
        expected_window_to_elements = {
            window.IntervalWindow(150, 155): [
                ('150: 1'),
                ('151: 1'),
                ('152: 1'),
                ('153: 1'),
                ('154: 1'),
            ],
            window.IntervalWindow(210, 215): [
                ('210: 1'),
                ('211: 1'),
                ('212: 1'),
                ('213: 1'),
                ('214: 1'),
            ],
        }

        # To pass, publish numbers in [150-155) or [210-215) with no repeats.
        # To fail, publish a repeated number in the range above range.
        # For example: '210 213 151 213'
        assert_that(
            output,
            equal_to_per_window(expected_window_to_elements),
            use_global_window=False,
            label='Assert correct streaming windowing.')
def test_basic_execution_in_records_format(self):
    """Check ReverseTestStream output in TEST_STREAM_FILE_RECORDS format.

    A TestStream interleaving watermark and processing-time advances with
    two batches of elements is piped through ReverseTestStream; the
    stringified records must match the header followed by one record per
    recorded event. Expected durations and timestamps are the stream's
    inputs scaled by 1,000,000.
    """
    stream = (TestStream()
              .advance_watermark_to(0)
              .advance_processing_time(5)
              .add_elements(['a', 'b', 'c']))
    # Five identical "advance watermark, then tick processing time" steps.
    for watermark in (2, 4, 6, 8, 10):
        stream = (stream
                  .advance_watermark_to(watermark)
                  .advance_processing_time(1))
    test_stream = stream.add_elements(
        [TimestampedValue(digit, 15) for digit in ('1', '2', '3')])

    options = StandardOptions(streaming=True)
    p = TestPipeline(options=options)

    coder = beam.coders.FastPrimitivesCoder()

    records = (
        p
        | test_stream
        | ReverseTestStream(
            sample_resolution_sec=1,
            coder=coder,
            output_format=OutputFormat.TEST_STREAM_FILE_RECORDS,
            output_tag=None)
        | 'stringify' >> beam.Map(str))

    def as_record(**event_fields):
        # Wrap a single payload event in a stringified file record.
        event = TestStreamPayload.Event(**event_fields)
        return str(TestStreamFileRecord(recorded_event=event))

    def processing_time_record(duration):
        return as_record(
            processing_time_event=TestStreamPayload.Event.
            AdvanceProcessingTime(advance_duration=duration))

    def watermark_record(new_watermark):
        return as_record(
            watermark_event=TestStreamPayload.Event.AdvanceWatermark(
                new_watermark=new_watermark))

    def elements_record(values, timestamp):
        encoded = [
            TestStreamPayload.TimestampedElement(
                encoded_element=coder.encode(value), timestamp=timestamp)
            for value in values
        ]
        return as_record(
            element_event=TestStreamPayload.Event.AddElements(
                elements=encoded))

    expected_records = [
        str(TestStreamFileHeader()),
        processing_time_record(5000000),
        watermark_record(0),
        elements_record(['a', 'b', 'c'], 0),
    ]
    for new_watermark in (2000000, 4000000, 6000000, 8000000, 10000000):
        expected_records.append(watermark_record(new_watermark))
        expected_records.append(processing_time_record(1000000))
    expected_records.append(elements_record(['1', '2', '3'], 15000000))

    assert_that(
        records,
        equal_to_per_window({
            beam.window.GlobalWindow(): expected_records,
        }))

    p.run()
def test_windowing(self):
    """Check the events ReverseTestStream records after a windowed GBK.

    Letters and digits are added to a TestStream, windowed into
    FixedWindows(5), keyed by 'k' and grouped; the grouped output is then
    fed through ReverseTestStream and compared against the expected list of
    processing-time, watermark and element events.
    """
    stream = (TestStream()
              .advance_watermark_to(0)
              .add_elements(['a', 'b', 'c']))
    # Five processing-time ticks before the watermark moves to 5.
    for _ in range(5):
        stream = stream.advance_processing_time(1)
    stream = (stream
              .advance_watermark_to(5)
              .add_elements(['1', '2', '3'])
              .advance_processing_time(1))
    # Watermark steps 6..15, each followed by one processing-time tick.
    for watermark in range(6, 16):
        stream = (stream
                  .advance_watermark_to(watermark)
                  .advance_processing_time(1))
    test_stream = stream

    options = StandardOptions(streaming=True)
    p = TestPipeline(options=options)

    records = (
        p
        | test_stream
        | 'letter windows' >> beam.WindowInto(
            FixedWindows(5),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | 'letter with key' >> beam.Map(lambda x: ('k', x))
        | 'letter gbk' >> beam.GroupByKey()
        | ReverseTestStream(sample_resolution_sec=1, output_tag=None))

    def tick(step):
        # One processing-time advance paired with the watermark at `step`.
        return [ProcessingTimeEvent(1), WatermarkEvent(step * 1000000)]

    def grouped(values, timestamp):
        return [ElementEvent([TimestampedValue(('k', values), timestamp)])]

    expected_events = [
        [ProcessingTimeEvent(5), WatermarkEvent(4999998)],
        grouped(['a', 'b', 'c'], 4.999999),
    ]
    expected_events.extend(tick(step) for step in range(5, 10))
    expected_events.append(grouped(['1', '2', '3'], 9.999999))
    expected_events.extend(tick(step) for step in range(10, 16))

    assert_that(
        records,
        equal_to_per_window({
            beam.window.GlobalWindow(): expected_events,
        }))

    p.run()
def test_output():
    """Feed one "event" per second through apply_transform and check output.

    Elements are stamped at 2021-03-01 00:00:01 .. 00:00:20 UTC, with the
    watermark advanced at every fifth second (5, 10, 15, 20), then to
    second 25 and finally to infinity. The result is compared per window
    against the expected values for the 2021-03-01 .. 2021-03-02 window.
    """
    def at(second):
        # Epoch timestamp of 2021-03-01 00:00:<second> UTC.
        return datetime(
            2021, 3, 1, 0, 0, second, 0, tzinfo=pytz.UTC).timestamp()

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    test_pipeline = TestPipeline(options=options)

    stream = TestStream()
    for second in range(1, 21):
        # The watermark moves just before the element that shares its
        # timestamp is added.
        if second % 5 == 0:
            stream = stream.advance_watermark_to(at(second))
        stream = stream.add_elements(
            elements=["event"], event_timestamp=at(second))
    stream = stream.advance_watermark_to(at(25)).advance_watermark_to_infinity()

    events = test_pipeline | stream

    results = apply_transform(events)

    window_start = datetime(2021, 3, 1, 0, 0, 0, 0, tzinfo=pytz.UTC)
    window_end = datetime(2021, 3, 2, 0, 0, 0, 0, tzinfo=pytz.UTC)
    answers = {
        window.IntervalWindow(
            window_start.timestamp(), window_end.timestamp()):
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
    }

    assert_that(results,
                equal_to_per_window(answers),
                label='count assert per window')

    test_pipeline.run()
def run(argv=None):
    """Build and run the streaming word-count debugging pipeline.

    Reads strings from a Pub/Sub topic or subscription, counts words in
    fixed 5-unit windows, writes the formatted "word: count" strings to the
    output topic, attaches assertions on the output format and windowing,
    then runs the pipeline and blocks until it finishes.

    Args:
        argv: Command-line arguments. Arguments not recognised by the local
            parser are forwarded to ``PipelineOptions``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_topic',
        required=True,
        # Topic paths use the plural "topics" segment, matching the
        # --input_topic help below.
        help=('Output PubSub topic of the form '
              '"projects/<PROJECT>/topics/<TOPIC>".'))
    # Exactly one input source (topic or subscription) must be given.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--input_topic',
        help=('Input PubSub topic of the form '
              '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
        lines = p | beam.io.ReadStringsFromPubSub(
            subscription=known_args.input_subscription)
    else:
        lines = p | beam.io.ReadStringsFromPubSub(
            topic=known_args.input_topic)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (
        lines
        | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn())
        | 'After AddTimestampFn' >> ParDo(PrintFn('After AddTimestampFn'))
        | 'Split' >> (beam.ParDo(WordExtractingDoFn())
                      .with_output_types(six.text_type))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | beam.WindowInto(window.FixedWindows(5, 0))
        | 'GroupByKey' >> beam.GroupByKey()
        | 'CountOnes' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    output | beam.io.WriteStringsToPubSub(known_args.output_topic)

    def check_gbk_format():
        # A matcher that checks that the output of GBK is of the form
        # word: count.
        def matcher(elements):
            # Only the elements are inspected; the accompanying window is
            # ignored (and deliberately not named `window`, which would
            # shadow the module used below).
            actual_elements_in_window, _ = elements
            for elm in actual_elements_in_window:
                assert re.match(r'\S+:\s+\d+', elm) is not None

        return matcher

    # Check that the format of the output is correct.
    assert_that(
        output,
        check_gbk_format(),
        use_global_window=False,
        label='Assert word:count format.')

    # Check also that elements are output in the right window.
    # This expects exactly 1 occurrence of any subset of the elements
    # 150, 151, 152, 153, 154 in the window [150, 155)
    # or exactly 1 occurrence of any subset of the elements
    # 210, 211, 212, 213, 214 in the window [210, 215).
    expected_window_to_elements = {
        window.IntervalWindow(150, 155): [
            '150: 1',
            '151: 1',
            '152: 1',
            '153: 1',
            '154: 1',
        ],
        window.IntervalWindow(210, 215): [
            '210: 1',
            '211: 1',
            '212: 1',
            '213: 1',
            '214: 1',
        ],
    }

    # To make it pass, publish numbers in [150-155) or [210-215) with no
    # repeats. To make it fail, publish a repeated number in one of the
    # ranges above. For example: '210 213 151 213'
    assert_that(
        output,
        equal_to_per_window(expected_window_to_elements),
        use_global_window=False,
        label='Assert correct streaming windowing.')

    # Run explicitly and block until the pipeline finishes.
    result = p.run()
    result.wait_until_finish()