def test_sessions_watermark(self):
    """Run AfterWatermark over Sessions(10) in ACCUMULATING mode, twice.

    The trailing integers of each call are forwarded to
    ``run_trigger_simple`` as extra positional arguments.
    """
    # First scenario: two elements, one expected window with one pane.
    close_pair = [(1, 'a'), (2, 'b')]
    one_window = {IntervalWindow(1, 12): [set('ab')]}
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterWatermark(),
        AccumulationMode.ACCUMULATING,
        close_pair,
        one_window,
        1,
        2)

    # Second scenario: eight elements, expected to land in two windows.
    scattered = [
        (1, 'a'),
        (2, 'b'),
        (15, 'c'),
        (16, 'd'),
        (30, 'z'),
        (9, 'e'),
        (10, 'f'),
        (30, 'y'),
    ]
    two_windows = {
        IntervalWindow(1, 26): [set('abcdef')],
        IntervalWindow(30, 40): [set('yz')],
    }
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterWatermark(),
        AccumulationMode.ACCUMULATING,
        scattered,
        two_windows,
        1,
        2,
        3,
        4,
        5,
        6)
def test_trigger_encoding(self):
    """Round-trip several triggers through to_runner_api/from_runner_api.

    Each trigger must compare equal to the object rebuilt from its own
    runner-API form.
    """
    sample_triggers = (
        DefaultTrigger(),
        AfterAll(AfterCount(1), AfterCount(10)),
        AfterAny(AfterCount(10), AfterCount(100)),
        AfterWatermark(early=AfterCount(1000)),
        AfterWatermark(early=AfterCount(1000), late=AfterCount(1)),
        Repeatedly(AfterCount(100)),
        trigger.OrFinally(AfterCount(3), AfterCount(10)),
    )
    for trigger_fn in sample_triggers:
        # A new context is created for every trigger under test.
        context = pipeline_context.PipelineContext()
        encoded = trigger_fn.to_runner_api(context)
        decoded = TriggerFn.from_runner_api(encoded, context)
        self.assertEqual(trigger_fn, decoded)
def test_fixed_watermark_with_early(self):
    """Run AfterWatermark(early=AfterCount(2)) on FixedWindows(10) twice.

    Both runs use the same three elements in ACCUMULATING mode; they differ
    only in the expected panes and in the final positional integer handed
    to ``run_trigger_simple``.
    """
    cases = (
        ([set('ab'), set('abc')], 2),
        ([set('abc'), set('abc')], 3),
    )
    for expected_panes, last_arg in cases:
        self.run_trigger_simple(
            FixedWindows(10),  # pyformat break
            AfterWatermark(early=AfterCount(2)),
            AccumulationMode.ACCUMULATING,
            [(1, 'a'), (2, 'b'), (3, 'c')],
            {IntervalWindow(0, 10): expected_panes},
            last_arg)
def test_combining_with_accumulation_mode_and_fanout(self):
    """Sum 1..5 globally with fanout under an accumulating early trigger.

    The sorted output must start with 1, 3, 6, 10 and everything after
    that must be 15.
    """
    # The stream emits the integers 1 through 5, one element per bundle.
    numbers = list(range(1, 6))

    stream = TestStream().advance_watermark_to(0)
    for number in numbers:
        stream.add_elements([number])
    stream.advance_watermark_to_infinity()

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=streaming_options) as p:
        result = (
            p
            | stream
            | beam.WindowInto(
                GlobalWindows(),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
            | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

        def has_expected_values(actual):
            from hamcrest.core import assert_that as hamcrest_assert
            from hamcrest.library.collection import contains
            from hamcrest.library.collection import only_contains

            ordered = sorted(actual)
            early_sums = ordered[:4]
            remaining = ordered[4:]
            # Early firings.
            hamcrest_assert(early_sums, contains(1, 3, 6, 10))
            # Runners differ in how many 15s they emit, but there should be
            # at least one 15 and nothing else.
            hamcrest_assert(remaining, only_contains(15))

        assert_that(result, has_expected_values)
def test_model_composite_triggers(self):
    """Exercise the ``model_composite_triggers`` snippet on a TestStream.

    The snippet windows (key, 1) pairs into FixedWindows(1 * 60) with
    AfterWatermark(late=AfterProcessingTime(10 * 60)) in DISCARDING mode.
    The grouped per-key sums are then compared against the expected list.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
        # Five untimestamped elements are added at watermark 10; after the
        # watermark moves to 70, four more are added with timestamp 10.
        test_stream = (TestStream()
                       .advance_watermark_to(10)
                       .add_elements(['a', 'a', 'a', 'b', 'b'])
                       .advance_watermark_to(70)
                       .add_elements([TimestampedValue('a', 10),
                                      TimestampedValue('a', 10),
                                      TimestampedValue('c', 10),
                                      TimestampedValue('c', 10)])
                       .advance_processing_time(600))
        pcollection = (p
                       | test_stream
                       | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

        # NOTE(review): the START/END markers presumably delimit a snippet
        # extracted into documentation -- keep the code between them as is.
        counts = (
            # [START model_composite_triggers]
            pcollection | WindowInto(
                FixedWindows(1 * 60),
                trigger=AfterWatermark(
                    late=AfterProcessingTime(10 * 60)),
                accumulation_mode=AccumulationMode.DISCARDING)
            # [END model_composite_triggers]
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(
                lambda word_ones: (word_ones[0], sum(word_ones[1]))))

        assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 2), ('c', 2)]))
def test_model_early_late_triggers(self):
    """Exercise the ``model_early_late_triggers`` snippet on a TestStream.

    The snippet builds AfterWatermark(early=AfterProcessingTime(delay=1 * 60),
    late=AfterCount(1)). It is applied to FixedWindows(15) with
    allowed_lateness=20 in DISCARDING mode, and the per-key sums are
    compared against the expected list.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
        test_stream = (
            TestStream().advance_watermark_to(10).add_elements([
                'a', 'a', 'a', 'b', 'b'
            ]).add_elements([
                TimestampedValue('a', 10)
            ]).advance_watermark_to(20).advance_processing_time(60).add_elements(
                [TimestampedValue('a', 10)]))

        # NOTE(review): the START/END markers presumably delimit a snippet
        # extracted into documentation -- keep the code between them as is.
        # This local name shadows any module-level ``trigger`` inside this
        # function.
        trigger = (
            # [START model_early_late_triggers]
            AfterWatermark(
                early=AfterProcessingTime(delay=1 * 60), late=AfterCount(1))
            # [END model_early_late_triggers]
        )

        counts = (
            p
            | test_stream
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | WindowInto(
                FixedWindows(15),
                trigger=trigger,
                allowed_lateness=20,
                accumulation_mode=AccumulationMode.DISCARDING)
            | 'group' >> beam.GroupByKey()
            | 'count' >>
            beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))

        assert_that(counts, equal_to([('a', 4), ('b', 2), ('a', 1)]))
def test_multiple_accumulating_firings(self):
    """Check that an accumulating early trigger re-emits earlier elements.

    Ten keyed elements are streamed. The records collected by
    ``record_dofn`` must equal the first five values followed by all ten.
    """
    # The stream carries the string forms of 1 through 10 under one key.
    numbers = list(range(1, 11))

    stream = TestStream().advance_watermark_to(0)
    for number in numbers:
        stream.add_elements([('key', str(number))])
        if number % 5 != 0:
            continue
        # Every fifth element also moves the watermark and processing time.
        stream.advance_watermark_to(number)
        stream.advance_processing_time(5)

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=streaming_options) as p:
        _ = (
            p
            | stream
            | beam.WindowInto(
                FixedWindows(10),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                trigger=AfterWatermark(
                    early=AfterAll(AfterCount(1), AfterProcessingTime(5))))
            | beam.GroupByKey()
            | beam.FlatMap(lambda x: x[1])
            | beam.ParDo(self.record_dofn()))

    # Two firings are expected, one after 5 seconds and one after 10; the
    # second repeats the first because output accumulates.
    expected_records = (
        [str(n) for n in numbers if n <= 5] + [str(n) for n in numbers])
    self.assertListEqual(expected_records, TriggerPipelineTest.all_records)
def apply_transform(events):
    """Window ``events`` into FixedWindows(5) and count them per window.

    Uses AfterWatermark() in DISCARDING mode with zero allowed lateness.
    The count is produced without defaults.
    """
    windowed = events | beam.WindowInto(
        FixedWindows(5),
        trigger=AfterWatermark(),
        accumulation_mode=AccumulationMode.DISCARDING,
        allowed_lateness=Duration(seconds=0))
    count_all = beam.CombineGlobally(
        beam.combiners.CountCombineFn()).without_defaults()
    return windowed | count_all
def expand(self, events):
    """Count ``events`` in one-day fixed windows with an early firing per element.

    Uses AfterWatermark(early=AfterCount(1)) in ACCUMULATING mode with zero
    allowed lateness. The count is produced without defaults.
    """
    one_day_seconds = 1 * 24 * 60 * 60  # 1 Day Window
    windowed = events | beam.WindowInto(
        FixedWindows(one_day_seconds),
        trigger=AfterWatermark(early=AfterCount(1)),
        accumulation_mode=AccumulationMode.ACCUMULATING,
        allowed_lateness=Duration(seconds=0))
    count_all = beam.CombineGlobally(
        beam.combiners.CountCombineFn()).without_defaults()
    return windowed | count_all
def test_fixed_after_first(self):
    """Run AfterAny(AfterCount(n), AfterWatermark()) on FixedWindows(10).

    Two ACCUMULATING runs: n=2 without late data, then n=5 with late data.
    """
    # n=2: the single expected pane holds 'a' and 'b'.
    count_two = AfterAny(AfterCount(2), AfterWatermark())
    expected_for_two = {IntervalWindow(0, 10): [set('ab')]}
    self.run_trigger_simple(
        FixedWindows(10),  # pyformat break
        count_two,
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        expected_for_two,
        1,
        2)

    # n=5: the single expected pane holds 'a', 'b' and 'c'; the late
    # elements x, y, z do not appear in it.
    count_five = AfterAny(AfterCount(5), AfterWatermark())
    expected_for_five = {IntervalWindow(0, 10): [set('abc')]}
    late_elements = [(1, 'x'), (2, 'y'), (3, 'z')]
    self.run_trigger_simple(
        FixedWindows(10),  # pyformat break
        count_five,
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        expected_for_five,
        1,
        2,
        late_data=late_elements)
def test_sessions_after_all(self):
    """Run AfterAll(AfterCount(n), AfterWatermark()) on Sessions(10).

    Two ACCUMULATING runs: n=2 without late data, then n=5 with late data.
    """
    # n=2: one expected pane holding 'a', 'b' and 'c'.
    count_two = AfterAll(AfterCount(2), AfterWatermark())
    expected_for_two = {IntervalWindow(1, 13): [set('abc')]}
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        count_two,
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        expected_for_two,
        1,
        2)

    # n=5: the expected pane also includes the late 'x' and 'y' (not 'z').
    count_five = AfterAll(AfterCount(5), AfterWatermark())
    expected_for_five = {IntervalWindow(1, 13): [set('abcxy')]}
    late_elements = [(1, 'x'), (2, 'y'), (3, 'z')]
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        count_five,
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        expected_for_five,
        1,
        2,
        late_data=late_elements)
def test_fixed_watermark(self):
    """Run AfterWatermark() on FixedWindows(10) in ACCUMULATING mode.

    Three elements are expected to yield one pane in each of two windows.
    """
    elements = [(1, 'a'), (2, 'b'), (13, 'c')]
    expected_panes = {
        IntervalWindow(0, 10): [set('ab')],
        IntervalWindow(10, 20): [set('c')],
    }
    self.run_trigger_simple(
        FixedWindows(10),  # pyformat break
        AfterWatermark(),
        AccumulationMode.ACCUMULATING,
        elements,
        expected_panes,
        1,
        2,
        3)
def expand(self, pcoll):
    """Turn raw messages into BigQuery rows of per-offer account statistics.

    Steps: parse, window into FixedWindows(60) with an early
    processing-time trigger in ACCUMULATING mode, key by ``offer_id``,
    group, count distinct accounts, and convert to a BigQuery row.
    """
    parsed = pcoll | "Parse message" >> beam.ParDo(PubsubMessageParser())

    windowed = parsed | "Windowing" >> beam.WindowInto(
        FixedWindows(60),
        trigger=AfterWatermark(early=AfterProcessingTime(delay=20)),
        accumulation_mode=AccumulationMode.ACCUMULATING)

    # Each element becomes (offer_id, element) so it can be grouped by offer.
    keyed = windowed | "WithKeys" >> beam.Map(
        lambda account_offer: (account_offer['offer_id'], account_offer))

    grouped = keyed | beam.GroupByKey()
    stats = grouped | 'Count distinct accounts' >> beam.ParDo(
        DistinctAccountCount())
    return stats | 'Map to BQ row' >> beam.ParDo(ConvertStatToBQRow())
def test_sessions_watermark(self):
    """Run AfterWatermark() on Sessions(10) in ACCUMULATING mode.

    Two elements are expected to yield one pane in IntervalWindow(1, 12).
    The trailing integers 1, 2, -2, -1 are forwarded to
    ``run_trigger_simple`` as extra positional arguments.
    """
    elements = [(1, 'a'), (2, 'b')]
    expected_panes = {IntervalWindow(1, 12): [set('ab')]}
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterWatermark(),
        AccumulationMode.ACCUMULATING,
        elements,
        expected_panes,
        1,
        2,
        -2,
        -1)
def test_fixed_watermark_with_early_late(self):
    """Run AfterWatermark with early and late counts on FixedWindows(100).

    Uses early=AfterCount(3) and late=AfterCount(2) in DISCARDING mode, so
    each expected pane holds only that pane's elements.
    """
    early_late = AfterWatermark(early=AfterCount(3), late=AfterCount(2))
    expected_panes = {
        IntervalWindow(0, 100): [
            set('abcd'),
            set('efgh'),  # early
            set('i'),  # on time
            set('vw'),
            set('xy'),  # late
        ]
    }
    self.run_trigger_simple(
        FixedWindows(100),  # pyformat break
        early_late,
        AccumulationMode.DISCARDING,
        zip(range(9), 'abcdefghi'),
        expected_panes,
        2,
        late_data=zip(range(5), 'vwxyz'))
def expand(self, pcoll):
    """Count pickup rides per 60-second window, re-firing on late data.

    Steps: parse JSON into rides, keep only ``ride_status == 'pickup'``,
    window into FixedWindows(60) with AfterWatermark(late=AfterCount(1)),
    allowed_lateness=60 and ACCUMULATING mode, then count without defaults.
    """
    rides = pcoll | "ParseJson" >> beam.ParDo(JsonToTaxiRide())
    pickups = rides | "FilterForPickups" >> beam.Filter(
        lambda x: x.ride_status == 'pickup')
    windowed = pickups | "WindowByMinute" >> beam.WindowInto(
        beam.window.FixedWindows(60),
        trigger=AfterWatermark(late=AfterCount(1)),
        allowed_lateness=60,
        accumulation_mode=AccumulationMode.ACCUMULATING)
    return windowed | "CountPerMinute" >> beam.CombineGlobally(
        CountCombineFn()).without_defaults()
def test_repeatedly_after_first(self):
    """Run Repeatedly(AfterAny(AfterCount(3), AfterWatermark())).

    Uses FixedWindows(100) in ACCUMULATING mode, so every expected pane
    contains all elements of the panes before it.
    """
    repeating = Repeatedly(AfterAny(AfterCount(3), AfterWatermark()))
    expected_panes = {
        IntervalWindow(0, 100): [
            set('abc'),
            set('abcdef'),
            set('abcdefg'),
            set('abcdefgx'),
            set('abcdefgxy'),
            set('abcdefgxyz'),
        ]
    }
    self.run_trigger_simple(
        FixedWindows(100),  # pyformat break
        repeating,
        AccumulationMode.ACCUMULATING,
        zip(range(7), 'abcdefg'),
        expected_panes,
        1,
        late_data=zip(range(3), 'xyz'))
def run(argv=None):
    """Build and run the streaming face-emotion pipeline.

    Reads JSON messages from a Pub/Sub topic, keeps high-confidence faces
    with a high emotion likelihood, and re-timestamps each face from its
    ``ts_seconds`` field. Faces are windowed into 30-second fixed windows
    with early and late firings, grouped by emotion, and appended to
    BigQuery.

    The pipeline is executed exactly once, by the ``with`` block: a Beam
    ``Pipeline`` used as a context manager runs and waits for completion on
    exit. The previous explicit ``pipeline.run()`` /
    ``wait_until_finish()`` caused a second execution and has been removed.

    Args:
        argv: Not read by this function; kept so existing callers that pass
            it continue to work.
    """
    from apache_beam.transforms.window import TimestampedValue, FixedWindows

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'
    window_size_s = 30
    allowed_lateness_s = 60

    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = %s", pubsub_input_topic)

        json_messages = (
            pipeline
            | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(
                topic=pubsub_input_topic).with_output_types(bytes)
            | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message))

        high_confidence_faces_grouped_by_emotion_count_per_window = (
            json_messages
            | 'ParseJsonMessage' >> beam.Map(parse_jsons)
            | 'FilterHighFaceConfidence' >> beam.ParDo(
                FilterHighConfidenceFacesDoFn())
            | 'FlatMapFAcesWithHighEmotionLikelihood' >> beam.FlatMap(
                get_faces_with_high_emotion_likelihood)
            # Replace the element timestamp with the one in the payload.
            | 'UseCustomTimestamp' >> beam.Map(
                lambda face_info: TimestampedValue(
                    face_info, face_info['ts_seconds']))
            | 'WindowFaceInfo' >> beam.WindowInto(
                FixedWindows(window_size_s, 0),
                trigger=AfterWatermark(
                    early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                    late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
                allowed_lateness=allowed_lateness_s,
                accumulation_mode=AccumulationMode.DISCARDING)
            | 'PairEmotionWithFace' >> beam.Map(
                lambda face_info: (face_info['emotion'], face_info))
            | 'GroupByEmotion' >> beam.GroupByKey()
            | 'FormatOutputForBigQuery' >> beam.ParDo(
                FormatFaceInfoPerWindow()))

        log_p_collection(
            high_confidence_faces_grouped_by_emotion_count_per_window,
            "OutputToBigQuery")

        _ = (
            high_confidence_faces_grouped_by_emotion_count_per_window
            | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
                bq_faces_windowed_table_name,
                schema={"fields": bq_faces_windowed_table_schema},
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    # No explicit pipeline.run() here: leaving the `with` block above has
    # already run the pipeline and waited for it to finish.
def test_sessions_and_complex_trigger_accumulating(self):
    """Drive GeneralTriggerManagerDoFn with sessions and early/late firings.

    Windowing is Sessions(10) with AfterWatermark(early=AfterCount(2),
    late=AfterCount(1)) in ACCUMULATING mode, and allowed lateness is set
    to MAX_TIMESTAMP.seconds(). The output is reduced to
    (key, first window, set of values) and compared to the expected panes.
    """

    def keyed_at(key, value, ts):
        # Wrap a (key, value) pair with an explicit event timestamp.
        return TimestampedValue((key, value), timestamp=ts)

    # yapf: disable
    stream = (
        TestStream()
        .advance_watermark_to(0)
        .add_elements([keyed_at('k1', 1, 1), keyed_at('k1', 2, 15),
                       keyed_at('k1', 3, 7), keyed_at('k1', 4, 30)])
        .advance_watermark_to(50)
        .add_elements([keyed_at('k1', -3, 1), keyed_at('k1', -2, 2),])
        .add_elements([keyed_at('k1', -1, 21)])
        .advance_watermark_to_infinity())
    # yapf: enable

    # Session windows with both an early and a late trigger, accumulating.
    windowing = Windowing(
        Sessions(10),
        triggerfn=AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
        accumulation_mode=AccumulationMode.ACCUMULATING,
        allowed_lateness=MAX_TIMESTAMP.seconds())

    expected_panes = [
        ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
        ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
        ('k1', IntervalWindow(30, 40), {4}),  # on time
        ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
        ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2, -1}),  # late
    ]

    with TestPipeline() as p:
        result = (
            p
            | stream
            | WindowInto(windowing.windowfn)
            | ParDo(trigger_manager._ReifyWindows())
            | ParDo(trigger_manager._GroupBundlesByKey())
            | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
            | Map(
                lambda elm: (
                    elm[0],
                    elm[1][0].windows[0],
                    {v.value for v in elm[1]})))
        assert_that(result, equal_to(expected_panes))
def test_sessions_watermark_with_early_late(self):
    """Run AfterWatermark with early and late counts on Sessions(10).

    Uses early=AfterCount(2) and late=AfterCount(1) in ACCUMULATING mode.
    The expected output lists panes for three windows, including
    IntervalWindow(1, 40), which holds every element.
    """
    early_late = AfterWatermark(early=AfterCount(2), late=AfterCount(1))
    on_time_elements = [(1, 'a'), (15, 'b'), (7, 'c'), (30, 'd')]
    late_elements = [(1, 'x'), (2, 'y'), (21, 'z')]
    expected_panes = {
        IntervalWindow(1, 25): [
            set('abc'),  # early
            set('abc'),  # on time
            set('abcxy'),  # late
        ],
        IntervalWindow(30, 40): [
            set('d'),  # on time
        ],
        IntervalWindow(1, 40): [
            set('abcdxyz'),  # late
        ],
    }
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        early_late,
        AccumulationMode.ACCUMULATING,
        on_time_elements,
        expected_panes,
        2,
        late_data=late_elements)
def test_with_trigger_window_that_finish(self):
    """Drive GeneralTriggerManagerDoFn with a window that fires only once.

    Windowing is FixedWindows(1) with AfterWatermark(), zero allowed
    lateness and DISCARDING mode. Only the three elements added before the
    watermark moves to 2 are expected in the output.
    """

    def keyed_at(key, value, ts):
        # Wrap a (key, value) pair with an explicit event timestamp.
        return TimestampedValue((key, value), timestamp=ts)

    # yapf: disable
    stream = (
        TestStream()
        .advance_watermark_to(0)
        .add_elements([keyed_at('k1', 1, 0), keyed_at('k1', 2, 0)])
        .add_elements([keyed_at('k1', 3, 0)])
        .advance_watermark_to(2)
        .add_elements([keyed_at('k1', 6, 0)])  # This value is discarded.
        .advance_watermark_to_infinity())
    # yapf: enable

    # One-unit fixed windows that fire at the watermark, no lateness allowed.
    windowing = Windowing(
        FixedWindows(1),
        triggerfn=AfterWatermark(),
        allowed_lateness=0,
        accumulation_mode=AccumulationMode.DISCARDING)

    expected_panes = [
        ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
    ]

    with TestPipeline() as p:
        result = (
            p
            | stream
            | WindowInto(windowing.windowfn)
            | ParDo(trigger_manager._ReifyWindows())
            | ParDo(trigger_manager._GroupBundlesByKey())
            | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
            | Map(
                lambda elm: (
                    elm[0],
                    elm[1][0].windows[0],
                    [v.value for v in elm[1]])))
        assert_that(result, equal_to(expected_panes))
def test_combining_with_accumulation_mode_and_fanout(self):
    """Sum 1..5 globally with fanout under an accumulating early trigger.

    The output must equal the running sums 1, 3, 6, 10, 15 followed by two
    more 15s.
    """
    # The stream emits the integers 1 through 5, one element per bundle.
    numbers = list(range(1, 6))

    stream = TestStream().advance_watermark_to(0)
    for number in numbers:
        stream.add_elements([number])
    stream.advance_watermark_to_infinity()

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=streaming_options) as p:
        result = (
            p
            | stream
            | beam.WindowInto(
                GlobalWindows(),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
            | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

        # For comparison, the firings for DISCARDING mode would be
        # [1, 2, 3, 4, 5, 0, 0].
        expected_firings = [1, 3, 6, 10, 15, 15, 15]
        assert_that(result, equal_to(expected_firings))
def test_after_watermark_no_allowed_lateness(self):
    """Expect NO_POTENTIAL_LOSS for a plain AfterWatermark() with 0.

    The integer is the second argument of ``_test`` (presumably the
    allowed lateness, going by the test name).
    """
    trigger_fn = AfterWatermark()
    expected_reason = DataLossReason.NO_POTENTIAL_LOSS
    self._test(trigger_fn, 0, expected_reason)
def test_after_watermark_safe_late(self):
    """Expect NO_POTENTIAL_LOSS for AfterWatermark(late=DefaultTrigger()) with 60."""
    trigger_fn = AfterWatermark(late=DefaultTrigger())
    expected_reason = DataLossReason.NO_POTENTIAL_LOSS
    self._test(trigger_fn, 60, expected_reason)
def test_after_watermark_may_finish_late(self):
    """Expect NO_POTENTIAL_LOSS for AfterWatermark(late=AfterProcessingTime()) with 60."""
    trigger_fn = AfterWatermark(late=AfterProcessingTime())
    expected_reason = DataLossReason.NO_POTENTIAL_LOSS
    self._test(trigger_fn, 60, expected_reason)
def test_after_watermark_no_allowed_lateness_condition_late(self):
    """Expect NO_POTENTIAL_LOSS for AfterWatermark(late=AfterCount(5)) with 0."""
    trigger_fn = AfterWatermark(late=AfterCount(5))
    expected_reason = DataLossReason.NO_POTENTIAL_LOSS
    self._test(trigger_fn, 0, expected_reason)
def test_after_watermark_condition_late(self):
    """Expect CONDITION_NOT_GUARANTEED for AfterWatermark(late=AfterCount(5)) with 60."""
    trigger_fn = AfterWatermark(late=AfterCount(5))
    expected_reason = DataLossReason.CONDITION_NOT_GUARANTEED
    self._test(trigger_fn, 60, expected_reason)
def run():
    """Parse CLI flags, build the taxi-ride counting pipeline and submit it.

    The pipeline reads taxi events from a public Pub/Sub topic and windows
    them into 60-second fixed windows with an early processing-time
    trigger. It counts events per window using the accumulation mode named
    by ``--accum_mode`` and appends the counts to a BigQuery table named
    after that mode.

    Raises:
        ValueError: if ``--accum_mode`` is neither ``accumulating`` nor
            ``discarding``.
    """
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from PubSub into BigQuery')
    parser.add_argument('--project', required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region', required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location', required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location', required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--accum_mode', required=True,
                        help='Accumulation mode for pipeline')

    # Flags not declared above are passed through to PipelineOptions.
    opts, pipeline_args = parser.parse_known_args()

    options = PipelineOptions(pipeline_args, save_main_session=True)
    # The nanosecond timestamp makes the job name differ between launches.
    options.view_as(
        GoogleCloudOptions).job_name = f"{opts.accum_mode}-{time.time_ns()}"
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    table_schema = {
        "fields": [
            {
                "name": "taxi_events",
                "type": "INTEGER"
            },
            {
                "name": "timestamp",
                "type": "STRING"
            },
        ]
    }

    input_topic = "projects/pubsub-public-data/topics/taxirides-realtime"
    output_table = f"{opts.project}:dataflow_demos.{opts.accum_mode}"

    if opts.accum_mode == 'accumulating':
        accum_mode = beam.transforms.trigger.AccumulationMode.ACCUMULATING
    elif opts.accum_mode == 'discarding':
        accum_mode = beam.transforms.trigger.AccumulationMode.DISCARDING
    else:
        # The message is assembled from adjacent literals so that no string
        # literal spans a physical line break.
        raise ValueError(
            "Invalid accumulation mode value. "
            "Use 'accumulating' or 'discarding' "
        )

    p = beam.Pipeline(options=options)

    (p
     | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
     | 'ParseJson' >> beam.Map(parse_json).with_output_types(TaxiRide)
     | 'WindowByMinute' >> beam.WindowInto(
         beam.window.FixedWindows(60),
         trigger=AfterWatermark(early=AfterProcessingTime(10)),
         accumulation_mode=accum_mode)
     | "CountPerMinute" >> beam.CombineGlobally(
         CountCombineFn()).without_defaults()
     | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
     | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
         output_table,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()
def test_after_watermark_late_none(self):
    """Expect MAY_FINISH for a plain AfterWatermark() (no late trigger) with 60."""
    trigger_fn = AfterWatermark()
    expected_reason = DataLossReason.MAY_FINISH
    self._test(trigger_fn, 60, expected_reason)