def test_windowing_encoding(self):
  """Round-trip several Windowing configurations through the runner API."""
  candidates = [
      Windowing(GlobalWindows()),
      Windowing(
          FixedWindows(1, 3),
          AfterCount(6),
          accumulation_mode=AccumulationMode.ACCUMULATING),
      Windowing(
          SlidingWindows(10, 15, 21),
          AfterCount(28),
          timestamp_combiner=TimestampCombiner.OUTPUT_AT_LATEST,
          accumulation_mode=AccumulationMode.DISCARDING),
  ]
  for original in candidates:
    # Each case is encoded and decoded against its own fresh context.
    ctx = pipeline_context.PipelineContext()
    encoded = original.to_runner_api(ctx)
    decoded = Windowing.from_runner_api(encoded, ctx)
    self.assertEqual(original, decoded)
def test_trigger_encoding(self):
  """Round-trip a selection of trigger functions through the runner API."""
  candidates = [
      DefaultTrigger(),
      AfterAll(AfterCount(1), AfterCount(10)),
      AfterAny(AfterCount(10), AfterCount(100)),
      AfterWatermark(early=AfterCount(1000)),
      AfterWatermark(early=AfterCount(1000), late=AfterCount(1)),
      Repeatedly(AfterCount(100)),
      trigger.OrFinally(AfterCount(3), AfterCount(10)),
  ]
  for original in candidates:
    # Each case is encoded and decoded against its own fresh context.
    ctx = pipeline_context.PipelineContext()
    encoded = original.to_runner_api(ctx)
    decoded = TriggerFn.from_runner_api(encoded, ctx)
    self.assertEqual(original, decoded)
def test_fixed_watermark_with_early_late(self):
  """Check AfterWatermark with early and late count triggers, DISCARDING."""
  window_fn = FixedWindows(100)
  trigger_fn = AfterWatermark(early=AfterCount(3), late=AfterCount(2))
  main_input = zip(range(9), 'abcdefghi')
  expected_panes = {
      IntervalWindow(0, 100): [
          set('abcd'),  # early
          set('efgh'),  # early
          set('i'),  # on time
          set('vw'),  # late
          set('xy'),  # late
      ]
  }
  self.run_trigger_simple(
      window_fn,
      trigger_fn,
      AccumulationMode.DISCARDING,
      main_input,
      expected_panes,
      2,
      late_data=zip(range(5), 'vwxyz'))
def test_fixed_watermark_with_early(self):
  """Check AfterWatermark(early=AfterCount(2)) in ACCUMULATING mode.

  The same input is run twice; only the expected panes and the trailing
  positional argument passed to run_trigger_simple differ.
  """
  cases = (
      ([set('ab'), set('abc')], 2),
      ([set('abc'), set('abc')], 3),
  )
  for expected_panes, trailing_arg in cases:
    self.run_trigger_simple(
        FixedWindows(10),
        AfterWatermark(early=AfterCount(2)),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        {IntervalWindow(0, 10): expected_panes},
        trailing_arg)
def expand(self, pcoll):
  """Group by key under a fire-per-element trigger, then ungroup again.

  Timestamps are carried inside the values across the GroupByKey and
  re-attached afterwards; the input's windowing is restored on the result.
  """
  class ReifyTimestamps(DoFn):
    def process(self, element, timestamp=DoFn.TimestampParam):
      key = element[0]
      payload = element[1]
      yield key, TimestampedValue(payload, timestamp)

  class RestoreTimestamps(DoFn):
    def process(self, element, window=DoFn.WindowParam):
      # Pass the current window explicitly, because _IdentityWindowFn
      # wouldn't know how to generate it.
      key = element[0]
      stamped = element[1]
      yield windowed_value.WindowedValue(
          (key, stamped.value), stamped.timestamp, [window])

  windowing_saved = pcoll.windowing
  # The linter is confused.
  # pylint: disable=abstract-class-instantiated
  reified = pcoll | ParDo(ReifyTimestamps())
  rewindowed = reified | 'IdentityWindow' >> WindowInto(
      _IdentityWindowFn(windowing_saved.windowfn.get_window_coder()),
      trigger=AfterCount(1),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST,
  )
  grouped = rewindowed | GroupByKey()
  flattened = grouped | 'ExpandIterable' >> FlatMap(
      lambda kv: [(kv[0], item) for item in kv[1]])
  result = flattened | ParDo(RestoreTimestamps())
  result._windowing = windowing_saved
  return result
def test_multiple_accumulating_firings(self):
  """Check that two ACCUMULATING early firings emit cumulative output."""
  # The PCollection holds the integers 1 through 10.
  elements = list(range(1, 11))

  stream = TestStream().advance_watermark_to(0)
  for value in elements:
    stream.add_elements([('key', str(value))])
    if value % 5 == 0:
      stream.advance_watermark_to(value)
      stream.advance_processing_time(5)

  pipeline_options = PipelineOptions()
  pipeline_options.view_as(StandardOptions).streaming = True

  with TestPipeline(options=pipeline_options) as p:
    windowed = p | stream | beam.WindowInto(
        FixedWindows(10),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        trigger=AfterWatermark(
            early=AfterAll(AfterCount(1), AfterProcessingTime(5))))
    _ = (
        windowed
        | beam.GroupByKey()
        | beam.FlatMap(lambda x: x[1])
        | beam.ParDo(self.record_dofn()))

  # The trigger should fire twice: once after 5 seconds and once after 10.
  # Because the mode is ACCUMULATING, the second firing repeats the first
  # five elements.
  first_firing = [str(value) for value in elements[:5]]
  second_firing = [str(value) for value in elements]
  self.assertListEqual(
      first_firing + second_firing, TriggerPipelineTest.all_records)
def test_fixed_after_count_accumulating(self):
  """Check Repeatedly(AfterCount(2)) on fixed windows in ACCUMULATING mode."""
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k1', 1), ('k2', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k1', 1)])
      .advance_watermark_to(2)
      .add_elements([('k1', 2), ('k2', 2)])  # These values are discarded.
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed windows of size 2, firing repeatedly after every two elements.
  windowing = Windowing(
      FixedWindows(2),
      triggerfn=Repeatedly(AfterCount(2)),
      accumulation_mode=AccumulationMode.ACCUMULATING)

  expected = [
      ('k1', IntervalWindow(0, 2), [1, 1]),
      ('k2', IntervalWindow(0, 2), [1, 1]),
      ('k1', IntervalWindow(0, 2), [1, 1, 1, 1]),
  ]

  with TestPipeline() as p:
    triggered = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing)))
    # Flatten each firing to (key, window of the first value, raw values).
    result = triggered | Map(
        lambda elm:
        (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]]))
    assert_that(result, equal_to(expected))
def test_after_count_streaming(self):
  """Check a non-repeating AfterCount(3) trigger on a streaming TestStream."""
  test_options = PipelineOptions(
      flags=['--allow_unsafe_triggers', '--streaming'])
  expected = [
      ('A', [1, 2, 3]),  # 4 - 6 discarded because trigger finished
      ('B', [1, 2, 3]),
  ]
  with TestPipeline(options=test_options) as p:
    # yapf: disable
    test_stream = (
        TestStream()
        .advance_watermark_to(0)
        .add_elements([('A', 1), ('A', 2), ('A', 3)])
        .add_elements([('A', 4), ('A', 5), ('A', 6)])
        .add_elements([('B', 1), ('B', 2), ('B', 3)])
        .advance_watermark_to_infinity())
    # yapf: enable
    windowed = p | test_stream | beam.WindowInto(
        FixedWindows(10),
        trigger=AfterCount(3),
        accumulation_mode=AccumulationMode.ACCUMULATING)
    results = windowed | beam.GroupByKey()
    assert_that(results, equal_to(expected))
def test_model_early_late_triggers(self):
  """Exercise the documented early/late AfterWatermark trigger snippet."""
  streaming_options = PipelineOptions()
  streaming_options.view_as(StandardOptions).streaming = True

  with TestPipeline(options=streaming_options) as p:
    test_stream = (
        TestStream()
        .advance_watermark_to(10)
        .add_elements(['a', 'a', 'a', 'b', 'b'])
        .add_elements([TimestampedValue('a', 10)])
        .advance_watermark_to(20)
        .advance_processing_time(60)
        .add_elements([TimestampedValue('a', 10)]))

    # The START/END markers delimit a documentation snippet; keep the
    # expression between them untouched.
    early_late_trigger = (
        # [START model_early_late_triggers]
        AfterWatermark(
            early=AfterProcessingTime(delay=1 * 60), late=AfterCount(1))
        # [END model_early_late_triggers]
    )

    paired = p | test_stream | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
    windowed = paired | WindowInto(
        FixedWindows(15),
        trigger=early_late_trigger,
        allowed_lateness=20,
        accumulation_mode=AccumulationMode.DISCARDING)
    counts = (
        windowed
        | 'group' >> beam.GroupByKey()
        | 'count' >>
        beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
    assert_that(counts, equal_to([('a', 4), ('b', 2), ('a', 1)]))
def test_after_count(self):
  """Check a non-repeating AfterCount(3) trigger over fixed windows."""
  expected = [
      ('A-5', {1, 2, 3, 4, 5}),
      # A-10, A-11 never emitted due to AfterCount(3) never firing.
      ('B-4', {6, 7, 8, 9}),
      ('B-3', {10, 15, 16}),
  ]
  with TestPipeline() as p:

    def construct_timestamped(k_t):
      # The numeric value doubles as the element's timestamp.
      key, stamp = k_t[0], k_t[1]
      return TimestampedValue((key, stamp), stamp)

    def format_result(k_v):
      key, values = k_v[0], k_v[1]
      return ('%s-%s' % (key, len(values)), set(values))

    keyed = (
        p
        | beam.Create([1, 2, 3, 4, 5, 10, 11])
        | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
        | beam.Map(construct_timestamped))
    result = (
        keyed
        | beam.WindowInto(
            FixedWindows(10),
            trigger=AfterCount(3),
            accumulation_mode=AccumulationMode.DISCARDING)
        | beam.GroupByKey()
        | beam.Map(format_result))
    assert_that(result, equal_to(expected))
def test_combining_with_accumulation_mode_and_fanout(self):
  """Check CombineGlobally(sum) with fanout under ACCUMULATING early firings."""
  # The PCollection holds the integers 1 through 5.
  elements = list(range(1, 6))

  stream = TestStream().advance_watermark_to(0)
  for value in elements:
    stream.add_elements([value])
  stream.advance_watermark_to_infinity()

  pipeline_options = PipelineOptions()
  pipeline_options.view_as(StandardOptions).streaming = True

  with TestPipeline(options=pipeline_options) as p:
    windowed = p | stream | beam.WindowInto(
        GlobalWindows(),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
    result = windowed | (
        beam.CombineGlobally(sum).without_defaults().with_fanout(2))

    def has_expected_values(actual):
      from hamcrest.core import assert_that as hamcrest_assert
      from hamcrest.library.collection import contains
      from hamcrest.library.collection import only_contains
      ordered = sorted(actual)
      early, remainder = ordered[:4], ordered[4:]
      # Early firings.
      hamcrest_assert(early, contains(1, 3, 6, 10))
      # Different runners produce a different number of 15s, but there
      # should be at least one 15.
      hamcrest_assert(remainder, only_contains(15))

    assert_that(result, has_expected_values)
def test_model_other_composite_triggers(self):
  """Exercise the documented Repeatedly(AfterAny(...)) trigger snippet."""
  streaming_options = PipelineOptions()
  streaming_options.view_as(StandardOptions).streaming = True

  with TestPipeline(options=streaming_options) as p:
    test_stream = (
        TestStream()
        .advance_watermark_to(10)
        .add_elements(['a', 'a'])
        .add_elements(['a', 'b', 'b'])
        .advance_processing_time(60)
        .add_elements(['a'] * 100))
    pcollection = (
        p | test_stream | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

    # The START/END markers delimit a documentation snippet; keep the
    # code between them untouched.
    counts = (
        # [START model_other_composite_triggers]
        pcollection | WindowInto(
            FixedWindows(1 * 60),
            trigger=Repeatedly(
                AfterAny(AfterCount(100), AfterProcessingTime(1 * 60))),
            accumulation_mode=AccumulationMode.DISCARDING)
        # [END model_other_composite_triggers]
        | 'group' >> beam.GroupByKey()
        | 'count' >>
        beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
    expected = [('a', 3), ('b', 2), ('a', 100)]
    assert_that(counts, equal_to(expected))
def test_fixed_after_count(self):
  """Check AfterCount(2) on fixed windows in ACCUMULATING mode.

  The same input is run twice; only the expected pane and the trailing
  positional arguments passed to run_trigger_simple differ.
  """
  cases = (
      (set('ab'), (1, 2)),
      (set('abc'), (3, 4)),
  )
  for expected_pane, trailing_args in cases:
    self.run_trigger_simple(
        FixedWindows(10),
        AfterCount(2),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c'), (11, 'z')],
        {IntervalWindow(0, 10): [expected_pane]},
        *trailing_args)
def test_sessions_repeatedly_after_count(self):
  """Check Repeatedly(AfterCount(2)) on sessions in both accumulation modes."""
  cases = (
      (AccumulationMode.ACCUMULATING, [set('abc'), set('abcde')]),
      (AccumulationMode.DISCARDING, [set('abc'), set('de')]),
  )
  for mode, expected_panes in cases:
    self.run_trigger_simple(
        Sessions(10),
        Repeatedly(AfterCount(2)),
        mode,
        [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')],
        {IntervalWindow(1, 25): expected_panes},
        1,
        3)
def test_sessions_after_all(self):
  """Check AfterAll(AfterCount(n), AfterWatermark()) on session windows."""
  on_time_input = [(1, 'a'), (2, 'b'), (3, 'c')]

  # Count threshold of 2, no late data.
  self.run_trigger_simple(
      Sessions(10),
      AfterAll(AfterCount(2), AfterWatermark()),
      AccumulationMode.ACCUMULATING,
      list(on_time_input),
      {IntervalWindow(1, 13): [set('abc')]},
      1,
      2)

  # Count threshold of 5, with three late elements supplied.
  self.run_trigger_simple(
      Sessions(10),
      AfterAll(AfterCount(5), AfterWatermark()),
      AccumulationMode.ACCUMULATING,
      list(on_time_input),
      {IntervalWindow(1, 13): [set('abcxy')]},
      1,
      2,
      late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
def expand(self, events):
  """Window events into one-day fixed windows and count them per window.

  The window uses AfterWatermark with an early AfterCount(1) trigger,
  ACCUMULATING mode and zero allowed lateness.
  """
  one_day_secs = 1 * 24 * 60 * 60  # 1 Day Window
  daily_window = beam.WindowInto(
      FixedWindows(one_day_secs),
      trigger=AfterWatermark(early=AfterCount(1)),
      accumulation_mode=AccumulationMode.ACCUMULATING,
      allowed_lateness=Duration(seconds=0))
  count_events = beam.CombineGlobally(
      beam.combiners.CountCombineFn()).without_defaults()
  return events | daily_window | count_events
def test_fixed_after_first(self):
  """Check AfterAny(AfterCount(n), AfterWatermark()) on fixed windows."""
  on_time_input = [(1, 'a'), (2, 'b'), (3, 'c')]

  # Count threshold of 2, no late data.
  self.run_trigger_simple(
      FixedWindows(10),
      AfterAny(AfterCount(2), AfterWatermark()),
      AccumulationMode.ACCUMULATING,
      list(on_time_input),
      {IntervalWindow(0, 10): [set('ab')]},
      1,
      2)

  # Count threshold of 5, with three late elements supplied.
  self.run_trigger_simple(
      FixedWindows(10),
      AfterAny(AfterCount(5), AfterWatermark()),
      AccumulationMode.ACCUMULATING,
      list(on_time_input),
      {IntervalWindow(0, 10): [set('abc')]},
      1,
      2,
      late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
def run(argv=None):
  """Build and run the streaming pipeline that windows face-emotion data.

  Messages are read from a Pub/Sub topic, decoded and parsed, filtered,
  re-timestamped from their 'ts_seconds' field, windowed into fixed
  30-second windows, grouped by emotion and appended to a BigQuery table.

  Args:
    argv: Not read by this function; pipeline options come from
      get_pipeline_options(). Kept so existing callers keep working.
  """
  from apache_beam.transforms.window import TimestampedValue, FixedWindows

  pubsub_input_topic = (
      'projects/professionaldataengineercourse/topics/faces_on_images')
  window_size_s = 30
  allowed_lateness_s = 60

  with beam.Pipeline(options=get_pipeline_options()) as pipeline:
    logging.info("pubsub_input_topic = %s", pubsub_input_topic)

    json_messages = (
        pipeline
        | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(
            topic=pubsub_input_topic).with_output_types(bytes)
        | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message))

    high_confidence_faces_grouped_by_emotion_count_per_window = (
        json_messages
        | 'ParseJsonMessage' >> beam.Map(parse_jsons)
        | 'FilterHighFaceConfidence' >> beam.ParDo(
            FilterHighConfidenceFacesDoFn())
        | 'FlatMapFAcesWithHighEmotionLikelihood' >> beam.FlatMap(
            get_faces_with_high_emotion_likelihood)
        | 'UseCustomTimestamp' >> beam.Map(
            lambda face_info: TimestampedValue(
                face_info, face_info['ts_seconds']))
        | 'WindowFaceInfo' >> beam.WindowInto(
            FixedWindows(window_size_s, 0),
            trigger=AfterWatermark(
                early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
            allowed_lateness=allowed_lateness_s,
            accumulation_mode=AccumulationMode.DISCARDING)
        | 'PairEmotionWithFace' >> beam.Map(
            lambda face_info: (face_info['emotion'], face_info))
        | 'GroupByEmotion' >> beam.GroupByKey()
        | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow()))

    log_p_collection(
        high_confidence_faces_grouped_by_emotion_count_per_window,
        "OutputToBigQuery")

    _ = (
        high_confidence_faces_grouped_by_emotion_count_per_window
        | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            bq_faces_windowed_table_name,
            schema={"fields": bq_faces_windowed_table_schema},
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # No explicit pipeline.run() here: leaving the `with` block already runs
    # the pipeline and waits for it to finish, so calling run() as well
    # would launch the job a second time.
def test_sessions_and_complex_trigger_accumulating(self):
  """Check early/on-time/late firings on merging session windows."""
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                     tsv('k1', 3, 7), tsv('k1', 4, 30)])
      .advance_watermark_to(50)
      .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2),])
      .add_elements([tsv('k1', -1, 21)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Session windows with a gap of 10, early and late count triggers,
  # ACCUMULATING mode and the maximum allowed lateness.
  windowing = Windowing(
      Sessions(10),
      triggerfn=AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
      accumulation_mode=AccumulationMode.ACCUMULATING,
      allowed_lateness=MAX_TIMESTAMP.seconds())

  expected = [
      ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
      ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
      ('k1', IntervalWindow(30, 40), {4}),  # on time
      ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
      ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2, -1}),  # late
  ]

  with TestPipeline() as p:
    triggered = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing)))
    # Flatten each firing to (key, window of the first value, value set).
    result = triggered | Map(
        lambda elm:
        (elm[0], elm[1][0].windows[0], set(v.value for v in elm[1])))
    assert_that(result, equal_to(expected))
def test_sessions_after_each(self):
  """Check AfterEach(AfterCount(2), AfterCount(3)), plain and repeated."""
  # Non-repeating AfterEach.
  single_pass_expected = {
      IntervalWindow(0, 11): [set('ab')],
      IntervalWindow(0, 15): [set('abcdef')],
  }
  self.run_trigger_simple(
      Sessions(10),
      AfterEach(AfterCount(2), AfterCount(3)),
      AccumulationMode.ACCUMULATING,
      zip(range(10), 'abcdefghij'),
      single_pass_expected,
      2)

  # The same trigger wrapped in Repeatedly.
  repeated_expected = {
      IntervalWindow(0, 11): [set('ab')],
      IntervalWindow(0, 15): [set('abcdef')],
      IntervalWindow(0, 17): [set('abcdefgh')],
  }
  self.run_trigger_simple(
      Sessions(10),
      Repeatedly(AfterEach(AfterCount(2), AfterCount(3))),
      AccumulationMode.ACCUMULATING,
      zip(range(10), 'abcdefghij'),
      repeated_expected,
      2)
def expand(self, pcoll):
  """Group by key under a fire-per-element trigger, then ungroup again.

  Timestamps (and, for non-default windowing, the window) are carried
  across the GroupByKey and re-attached afterwards. The input's windowing
  is restored on the returned PCollection.
  """
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case a trivial trigger driver is enough and the
    # (expensive) window param can be avoided.
    global_template = window.GlobalWindows.windowed_value(None)
    window_fn = window.GlobalWindows()
    min_ts = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      # The minimum timestamp is carried as None.
      return key, (value, None if timestamp == min_ts else timestamp)

    def restore_timestamps(element):
      key, values = element
      restored = []
      for value, timestamp in values:
        if timestamp is None:
          # Reuse the prebuilt globally-windowed value.
          restored.append(global_template.with_value((key, value)))
        else:
          restored.append(
              window.GlobalWindows.windowed_value((key, value), timestamp))
      return restored
  else:
    # The linter is confused.
    # hash(1) is used to force "runtime" selection of _IdentityWindowFn
    # pylint: disable=abstract-class-instantiated
    cls = hash(1) and _IdentityWindowFn
    window_fn = cls(windowing_saved.windowfn.get_window_coder())

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      return key, TimestampedValue(value, timestamp)

    def restore_timestamps(element, window=DoFn.WindowParam):
      # Pass the current window explicitly, because _IdentityWindowFn
      # wouldn't know how to generate it.
      key, values = element
      restored = []
      for stamped in values:
        restored.append(
            windowed_value.WindowedValue(
                (key, stamped.value), stamped.timestamp, [window]))
      return restored

  ungrouped = pcoll | Map(reify_timestamps)
  ungrouped._windowing = Windowing(
      window_fn,
      triggerfn=AfterCount(1),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  grouped = ungrouped | GroupByKey()
  result = grouped | FlatMap(restore_timestamps)
  result._windowing = windowing_saved
  return result
def expand(self, pcoll):
  """Group by key in the global window, then restore the original windowing.

  Timestamps (and, for non-default windowing, the full windowed value) are
  carried inside the values across the GroupByKey and re-attached
  afterwards. The input's windowing is restored on the returned PCollection.
  """
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case a trivial trigger driver is enough and the
    # (expensive) window param can be avoided.
    global_template = window.GlobalWindows.windowed_value(None)
    min_ts = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      # The minimum timestamp is carried as None.
      return key, (value, None if timestamp == min_ts else timestamp)

    def restore_timestamps(element):
      key, values = element
      restored = []
      for value, timestamp in values:
        if timestamp is None:
          # Reuse the prebuilt globally-windowed value.
          restored.append(global_template.with_value((key, value)))
        else:
          restored.append(
              window.GlobalWindows.windowed_value((key, value), timestamp))
      return restored
  else:

    def reify_timestamps(
        element, timestamp=DoFn.TimestampParam, window=DoFn.WindowParam):
      key, value = element
      # Transport the window as part of the value and restore it later.
      carried = windowed_value.WindowedValue(value, timestamp, [window])
      return key, carried

    def restore_timestamps(element):
      key, windowed_values = element
      restored = []
      for wv in windowed_values:
        restored.append(wv.with_value((key, wv.value)))
      return restored

  ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

  # TODO(BEAM-8104) Using global window as one of the standard window.
  # This is to mitigate the Dataflow Java Runner Harness limitation to
  # accept only standard coders.
  ungrouped._windowing = Windowing(
      window.GlobalWindows(),
      triggerfn=Repeatedly(AfterCount(1)),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  grouped = ungrouped | GroupByKey()
  result = grouped | FlatMap(restore_timestamps).with_output_types(Any)
  result._windowing = windowing_saved
  return result
def test_sessions_watermark_with_early_late(self):
  """Check early, on-time and late panes on merging session windows."""
  window_fn = Sessions(10)
  trigger_fn = AfterWatermark(early=AfterCount(2), late=AfterCount(1))
  on_time_input = [(1, 'a'), (15, 'b'), (7, 'c'), (30, 'd')]
  expected_panes = {
      IntervalWindow(1, 25): [
          set('abc'),  # early
          set('abc'),  # on time
          set('abcxy'),  # late
      ],
      IntervalWindow(30, 40): [
          set('d'),  # on time
      ],
      IntervalWindow(1, 40): [
          set('abcdxyz'),  # late
      ],
  }
  self.run_trigger_simple(
      window_fn,
      trigger_fn,
      AccumulationMode.ACCUMULATING,
      on_time_input,
      expected_panes,
      2,
      late_data=[(1, 'x'), (2, 'y'), (21, 'z')])
def test_sessions_after_count(self):
  """Check AfterCount(2) on session windows in ACCUMULATING mode."""
  timestamped_input = [
      (1, 'a'),
      (15, 'b'),
      (6, 'c'),
      (30, 's'),
      (31, 't'),
      (50, 'z'),
      (50, 'y'),
  ]
  expected_panes = {
      IntervalWindow(1, 25): [set('abc')],
      IntervalWindow(30, 41): [set('st')],
      IntervalWindow(50, 60): [set('yz')],
  }
  self.run_trigger_simple(
      Sessions(10),
      AfterCount(2),
      AccumulationMode.ACCUMULATING,
      timestamped_input,
      expected_panes,
      1,
      2,
      3)
def test_repeatedly_after_first(self):
  """Check Repeatedly(AfterAny(AfterCount(3), AfterWatermark())) panes."""
  trigger_fn = Repeatedly(AfterAny(AfterCount(3), AfterWatermark()))
  # ACCUMULATING mode: every pane is a superset of the previous one.
  expected_panes = {
      IntervalWindow(0, 100): [
          set('abc'),
          set('abcdef'),
          set('abcdefg'),
          set('abcdefgx'),
          set('abcdefgxy'),
          set('abcdefgxyz'),
      ]
  }
  self.run_trigger_simple(
      FixedWindows(100),
      trigger_fn,
      AccumulationMode.ACCUMULATING,
      zip(range(7), 'abcdefg'),
      expected_panes,
      1,
      late_data=zip(range(3), 'xyz'))
def expand(self, pcoll):
  """Count 'pickup' rides per 60-second fixed window.

  Elements are parsed with JsonToTaxiRide, filtered on ride_status, windowed
  with a late AfterCount(1) trigger (60 allowed lateness, ACCUMULATING) and
  counted without a default value.
  """
  parsed = pcoll | "ParseJson" >> beam.ParDo(JsonToTaxiRide())
  pickups = parsed | "FilterForPickups" >> beam.Filter(
      lambda x: x.ride_status == 'pickup')
  windowed = pickups | "WindowByMinute" >> beam.WindowInto(
      beam.window.FixedWindows(60),
      trigger=AfterWatermark(late=AfterCount(1)),
      allowed_lateness=60,
      accumulation_mode=AccumulationMode.ACCUMULATING)
  return windowed | "CountPerMinute" >> beam.CombineGlobally(
      CountCombineFn()).without_defaults()
def test_after_count(self):
  """Check a non-repeating AfterCount(3) trigger over fixed windows.

  The lambdas index into their tuple argument instead of using Python 2
  tuple-parameter unpacking, and the expected dict is expanded with
  items(), so the test also parses and runs on Python 3.
  """
  expected = {
      'A-5': {1, 2, 3, 4, 5},
      # A-10, A-11 never emitted due to AfterCount(3) never firing.
      'B-4': {6, 7, 8, 9},
      'B-3': {10, 15, 16},
  }
  # The `with` block runs the pipeline on exit; without a run the
  # assert_that check below would never be evaluated.
  with Pipeline('DirectRunner') as p:
    result = (
        p
        | beam.Create([1, 2, 3, 4, 5, 10, 11])
        | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
        # The numeric value doubles as the element's timestamp.
        | beam.Map(lambda k_t: TimestampedValue((k_t[0], k_t[1]), k_t[1]))
        | beam.WindowInto(
            FixedWindows(10),
            trigger=AfterCount(3),
            accumulation_mode=AccumulationMode.DISCARDING)
        | beam.GroupByKey()
        | beam.Map(
            lambda k_v: ('%s-%s' % (k_v[0], len(k_v[1])), set(k_v[1]))))
    assert_that(result, equal_to(list(expected.items())))
def test_buffering_timer_in_global_window_streaming(self):
  """Check GroupIntoBatches flushes on the buffering timer in a global window."""
  max_buffering_duration_secs = 42

  start_time = timestamp.Timestamp(0)
  test_stream = TestStream().advance_watermark_to(start_time)
  for offset, value in enumerate(GroupIntoBatchesTest._create_test_data()):
    timestamped = TimestampedValue(value, start_time + offset)
    test_stream.add_elements([timestamped]).advance_processing_time(5)
  final_watermark = start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1
  test_stream.advance_watermark_to(
      final_watermark).advance_watermark_to_infinity()

  with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
    # Use a batch size larger than the total number of elements. In a
    # global window, all elements would otherwise be awaited if there were
    # no buffering time limit.
    batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

    # To trigger the processing time timer, use a fake clock whose start
    # time is Timestamp(0). The fake clock never really advances during
    # pipeline execution, so the timer is always set to the same value and
    # fires on every element after the first firing.
    fake_clock = FakeClock(now=start_time)

    batches = (
        pipeline
        | test_stream
        | WindowInto(
            GlobalWindows(),
            trigger=Repeatedly(AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | util.GroupIntoBatches(
            batch_size, max_buffering_duration_secs, fake_clock))
    num_elements_per_batch = (
        batches
        | 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
        | GroupByKey()
        | FlatMapTuple(lambda k, vs: vs))

    # Two flushes are expected: one when the max buffering duration is
    # reached and one when the global window ends.
    assert_that(num_elements_per_batch, equal_to([9, 1]))
def main():
  """Build and run a pipeline that groups large elements in a global window.

  Elements are keyed with '' and grouped under a repeated trigger that fires
  on either an element count or a processing-time delay; each group's key
  and value count is logged.
  """
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(SetupOptions).save_main_session = True

  batch_size = 1000000
  buffering_secs = 600

  pipeline = Pipeline(options=pipeline_options)
  keyed = (
      pipeline
      | Create(range(100), reshuffle=True)
      | ParDo(make_large_elements)  # 128 KiB
      | WithKeys(''))
  _ = (
      keyed
      | WindowInto(
          GlobalWindows(),
          trigger=Repeatedly(
              AfterAny(
                  AfterCount(batch_size),
                  AfterProcessingTime(buffering_secs))),
          accumulation_mode=AccumulationMode.DISCARDING)
      | GroupByKey()
      | Map(
          lambda kv: logging.info(
              'key: %s, value count: %s', kv[0], len(kv[1]))))

  pipeline_result = pipeline.run()
  pipeline_result.wait_until_finish()
def test_combining_with_accumulation_mode_and_fanout(self):
  """Check CombineGlobally(sum) with fanout under ACCUMULATING early firings."""
  # The PCollection holds the integers 1 through 5.
  elements = list(range(1, 6))

  stream = TestStream().advance_watermark_to(0)
  for value in elements:
    stream.add_elements([value])
  stream.advance_watermark_to_infinity()

  pipeline_options = PipelineOptions()
  pipeline_options.view_as(StandardOptions).streaming = True

  with TestPipeline(options=pipeline_options) as p:
    windowed = p | stream | beam.WindowInto(
        GlobalWindows(),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
        trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
    result = windowed | (
        beam.CombineGlobally(sum).without_defaults().with_fanout(2))

    # The firings for DISCARDING mode would be [1, 2, 3, 4, 5, 0, 0].
    firings = [1, 3, 6, 10, 15, 15, 15]
    assert_that(result, equal_to(firings))