Example 1
    def test_sessions_watermark(self):
        self.run_trigger_simple(
            Sessions(10),  # pyformat break
            AfterWatermark(),
            AccumulationMode.ACCUMULATING,
            [(1, 'a'), (2, 'b')],
            {IntervalWindow(1, 12): [set('ab')]},
            1,
            2)

        self.run_trigger_simple(
            Sessions(10),  # pyformat break
            AfterWatermark(),
            AccumulationMode.ACCUMULATING,
            [(1, 'a'), (2, 'b'), (15, 'c'), (16, 'd'), (30, 'z'), (9, 'e'),
             (10, 'f'), (30, 'y')],
            {
                IntervalWindow(1, 26): [set('abcdef')],
                IntervalWindow(30, 40): [set('yz')]
            },
            1,
            2,
            3,
            4,
            5,
            6)
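Why the expected window is IntervalWindow(1, 12) (and, in the second run, IntervalWindow(1, 26)): Sessions(10) gives each element a proto-session [ts, ts + 10), and overlapping proto-sessions merge. A minimal sketch of that merge, assuming the standard Beam window primitives (the variable names are illustrative):

from apache_beam.transforms.window import IntervalWindow, Sessions, WindowFn

s = Sessions(10)
w1 = s.assign(WindowFn.AssignContext(1))[0]  # proto-session [1, 11)
w2 = s.assign(WindowFn.AssignContext(2))[0]  # proto-session [2, 12)
assert w1.intersects(w2)
assert w1.union(w2) == IntervalWindow(1, 12)  # the merged session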
Example 2
 def test_trigger_encoding(self):
   for trigger_fn in (DefaultTrigger(),
                      AfterAll(AfterCount(1), AfterCount(10)),
                      AfterAny(AfterCount(10), AfterCount(100)),
                      AfterWatermark(early=AfterCount(1000)),
                      AfterWatermark(early=AfterCount(1000),
                                     late=AfterCount(1)),
                      Repeatedly(AfterCount(100)),
                      trigger.OrFinally(AfterCount(3), AfterCount(10))):
     context = pipeline_context.PipelineContext()
     self.assertEqual(
         trigger_fn,
         TriggerFn.from_runner_api(trigger_fn.to_runner_api(context), context))
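The same round-trip outside the test harness, as a minimal standalone sketch (module paths as used in the test above; the trigger chosen here is illustrative):

from apache_beam.runners import pipeline_context
from apache_beam.transforms.trigger import AfterCount, AfterWatermark, TriggerFn

context = pipeline_context.PipelineContext()
trigger_fn = AfterWatermark(early=AfterCount(1000))
proto = trigger_fn.to_runner_api(context)  # serialize to the Runner API proto
# deserializing yields an equal trigger
assert TriggerFn.from_runner_api(proto, context) == trigger_fn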
Example 3
 def test_fixed_watermark_with_early(self):
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterWatermark(early=AfterCount(2)),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(0, 10): [set('ab'), set('abc')]},
         2)
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterWatermark(early=AfterCount(2)),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(0, 10): [set('abc'), set('abc')]},
         3)
Example 4
  def test_combining_with_accumulation_mode_and_fanout(self):
    # The PCollection will contain elements from 1 to 5.
    elements = [i for i in range(1, 6)]

    ts = TestStream().advance_watermark_to(0)
    for i in elements:
      ts.add_elements([i])
    ts.advance_watermark_to_infinity()

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
      result = (
          p
          | ts
          | beam.WindowInto(
              GlobalWindows(),
              accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
              trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
          | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

      def has_expected_values(actual):
        from hamcrest.core import assert_that as hamcrest_assert
        from hamcrest.library.collection import contains
        from hamcrest.library.collection import only_contains
        ordered = sorted(actual)
        # Early firings.
        hamcrest_assert(ordered[:4], contains(1, 3, 6, 10))
        # Different runners have different number of 15s, but there should
        # be at least one 15.
        hamcrest_assert(ordered[4:], only_contains(15))

      assert_that(result, has_expected_values)
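The asserted values follow from ACCUMULATING mode: each early firing re-emits the running sum of everything seen so far, and every later firing repeats the complete sum. A plain-Python sketch of that arithmetic:

from itertools import accumulate

# running sums of 1..5 — the early firings asserted above
assert list(accumulate([1, 2, 3, 4, 5])) == [1, 3, 6, 10, 15]
# each on-time/final firing then repeats the full sum, 15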
Example 5
  def test_model_composite_triggers(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (TestStream()
                     .advance_watermark_to(10)
                     .add_elements(['a', 'a', 'a', 'b', 'b'])
                     .advance_watermark_to(70)
                     .add_elements([TimestampedValue('a', 10),
                                    TimestampedValue('a', 10),
                                    TimestampedValue('c', 10),
                                    TimestampedValue('c', 10)])
                     .advance_processing_time(600))
      pcollection = (p
                     | test_stream
                     | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

      counts = (
          # [START model_composite_triggers]
          pcollection | WindowInto(
              FixedWindows(1 * 60),
              trigger=AfterWatermark(
                  late=AfterProcessingTime(10 * 60)),
              accumulation_mode=AccumulationMode.DISCARDING)
          # [END model_composite_triggers]
          | 'group' >> beam.GroupByKey()
          | 'count' >> beam.Map(
              lambda word_ones: (word_ones[0], sum(word_ones[1]))))
      assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 2), ('c', 2)]))
Example 6
  def test_model_early_late_triggers(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (
          TestStream().advance_watermark_to(10).add_elements([
              'a', 'a', 'a', 'b', 'b'
          ]).add_elements([
              TimestampedValue('a', 10)
          ]).advance_watermark_to(20).advance_processing_time(60).add_elements(
              [TimestampedValue('a', 10)]))
      trigger = (
          # [START model_early_late_triggers]
          AfterWatermark(
              early=AfterProcessingTime(delay=1 * 60), late=AfterCount(1))
          # [END model_early_late_triggers]
      )
      counts = (
          p
          | test_stream
          | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
          | WindowInto(
              FixedWindows(15),
              trigger=trigger,
              allowed_lateness=20,
              accumulation_mode=AccumulationMode.DISCARDING)
          | 'group' >> beam.GroupByKey()
          | 'count' >>
          beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
      assert_that(counts, equal_to([('a', 4), ('b', 2), ('a', 1)]))
Example 7
    def test_multiple_accumulating_firings(self):
        # The PCollection will contain elements from 1 to 10.
        elements = [i for i in range(1, 11)]

        ts = TestStream().advance_watermark_to(0)
        for i in elements:
            ts.add_elements([('key', str(i))])
            if i % 5 == 0:
                ts.advance_watermark_to(i)
                ts.advance_processing_time(5)

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            _ = (
                p
                | ts
                | beam.WindowInto(
                    FixedWindows(10),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                    trigger=AfterWatermark(
                        early=AfterAll(AfterCount(1), AfterProcessingTime(5))))
                | beam.GroupByKey()
                | beam.FlatMap(lambda x: x[1])
                | beam.ParDo(self.record_dofn()))

        # The trigger should fire twice: once after 5 seconds and once after
        # 10. The firings should accumulate the output.
        first_firing = [str(i) for i in elements if i <= 5]
        second_firing = [str(i) for i in elements]
        self.assertListEqual(first_firing + second_firing,
                             TriggerPipelineTest.all_records)
Example 8
def apply_transform(events):
    return (events
            | beam.WindowInto(FixedWindows(5),
                              trigger=AfterWatermark(),
                              accumulation_mode=AccumulationMode.DISCARDING,
                              allowed_lateness=Duration(seconds=0))
            | beam.CombineGlobally(
                beam.combiners.CountCombineFn()).without_defaults())
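A hedged usage sketch for apply_transform (the TestStream events and timestamps are illustrative assumptions, not from the original source; apply_transform itself is assumed to have FixedWindows, AfterWatermark, AccumulationMode, and Duration in scope as above):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.testing.test_stream import TestStream

options = PipelineOptions()
options.view_as(StandardOptions).streaming = True
with beam.Pipeline(options=options) as p:
    events = (p
              | TestStream()
                .advance_watermark_to(0)
                .add_elements(['e1', 'e2'])  # hypothetical events
                .advance_watermark_to_infinity())
    counts = apply_transform(events)
    counts | beam.Map(print)  # one count per 5-second window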
Example 9
 def expand(self, events):
     return (events
             | beam.WindowInto(
                 FixedWindows(1 * 24 * 60 * 60),  # 1 Day Window
                 trigger=AfterWatermark(early=AfterCount(1)),
                 accumulation_mode=AccumulationMode.ACCUMULATING,
                 allowed_lateness=Duration(seconds=0))
             | beam.CombineGlobally(
                 beam.combiners.CountCombineFn()).without_defaults())
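This expand is a fragment; a minimal sketch of the PTransform subclass it would typically live in (the class name DailyEventCount is hypothetical, and the imports listed are the ones the body needs):

import apache_beam as beam
from apache_beam.transforms.trigger import (
    AccumulationMode, AfterCount, AfterWatermark)
from apache_beam.transforms.window import FixedWindows
from apache_beam.utils.timestamp import Duration


class DailyEventCount(beam.PTransform):  # hypothetical name
    def expand(self, events):
        return (events
                | beam.WindowInto(
                    FixedWindows(1 * 24 * 60 * 60),  # 1-day windows
                    trigger=AfterWatermark(early=AfterCount(1)),
                    accumulation_mode=AccumulationMode.ACCUMULATING,
                    allowed_lateness=Duration(seconds=0))
                | beam.CombineGlobally(
                    beam.combiners.CountCombineFn()).without_defaults())

# usage: counts = events | DailyEventCount()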
Example 10
 def test_fixed_after_first(self):
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterAny(AfterCount(2), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(0, 10): [set('ab')]},
         1,
         2)
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterAny(AfterCount(5), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(0, 10): [set('abc')]},
         1,
         2,
         late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
Example 11
 def test_sessions_after_all(self):
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         AfterAll(AfterCount(2), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(1, 13): [set('abc')]},
         1,
         2)
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         AfterAll(AfterCount(5), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(1, 13): [set('abcxy')]},
         1,
         2,
         late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
Example 12
 def test_fixed_watermark(self):
   self.run_trigger_simple(
       FixedWindows(10),  # pyformat break
       AfterWatermark(),
       AccumulationMode.ACCUMULATING,
       [(1, 'a'), (2, 'b'), (13, 'c')],
       {IntervalWindow(0, 10): [set('ab')],
        IntervalWindow(10, 20): [set('c')]},
       1,
       2,
       3)
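Why element (13, 'c') lands in IntervalWindow(10, 20): FixedWindows(10) assigns a timestamp t to the window [t - t % 10, t - t % 10 + 10). A tiny sketch, assuming the standard Beam window primitives:

from apache_beam.transforms.window import FixedWindows, IntervalWindow, WindowFn

fw = FixedWindows(10)
assert fw.assign(WindowFn.AssignContext(13)) == [IntervalWindow(10, 20)]
assert fw.assign(WindowFn.AssignContext(2)) == [IntervalWindow(0, 10)]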
Example 13
 def expand(self, pcoll):
     return (pcoll
             | "Parse message" >> beam.ParDo(PubsubMessageParser())
             | "Windowing" >> beam.WindowInto(FixedWindows(60),
                                              trigger=AfterWatermark(
                                                  early=AfterProcessingTime(delay=20)),
                                              accumulation_mode=AccumulationMode.ACCUMULATING)
             | "WithKeys" >> beam.Map(lambda account_offer: ((account_offer['offer_id']), account_offer))
             | beam.GroupByKey()
             | 'Count distinct accounts' >> beam.ParDo(DistinctAccountCount())
             | 'Map to BQ row' >> beam.ParDo(ConvertStatToBQRow()))
Example 14
 def test_sessions_watermark(self):
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         AfterWatermark(),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b')],
         {IntervalWindow(1, 12): [set('ab')]},
         1,
         2,
         -2,
         -1)
Example 15
 def test_fixed_watermark_with_early_late(self):
   self.run_trigger_simple(
       FixedWindows(100),  # pyformat break
       AfterWatermark(early=AfterCount(3),
                      late=AfterCount(2)),
       AccumulationMode.DISCARDING,
       zip(range(9), 'abcdefghi'),
       {IntervalWindow(0, 100): [
           set('abcd'), set('efgh'),  # early
           set('i'),                  # on time
           set('vw'), set('xy')       # late
           ]},
       2,
       late_data=zip(range(5), 'vwxyz'))
Example 16
    def expand(self, pcoll):

        output = (pcoll
                  | "ParseJson" >> beam.ParDo(JsonToTaxiRide())
                  | "FilterForPickups" >>
                  beam.Filter(lambda x: x.ride_status == 'pickup')
                  | "WindowByMinute" >> beam.WindowInto(
                      beam.window.FixedWindows(60),
                      trigger=AfterWatermark(late=AfterCount(1)),
                      allowed_lateness=60,
                      accumulation_mode=AccumulationMode.ACCUMULATING)
                  | "CountPerMinute" >> beam.CombineGlobally(
                      CountCombineFn()).without_defaults())

        return output
Example 17
 def test_repeatedly_after_first(self):
   self.run_trigger_simple(
       FixedWindows(100),  # pyformat break
       Repeatedly(AfterAny(AfterCount(3), AfterWatermark())),
       AccumulationMode.ACCUMULATING,
       zip(range(7), 'abcdefg'),
       {IntervalWindow(0, 100): [
           set('abc'),
           set('abcdef'),
           set('abcdefg'),
           set('abcdefgx'),
           set('abcdefgxy'),
           set('abcdefgxyz')]},
       1,
       late_data=zip(range(3), 'xyz'))
Example 18
def run(argv=None):
    from apache_beam.transforms.window import TimestampedValue, FixedWindows

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'

    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

        json_messages = \
            (pipeline
             | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(topic=pubsub_input_topic).with_output_types(bytes)
             | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message)
             )

        window_size_s = 30
        allowed_lateness_s = 60
        high_confidence_faces_grouped_by_emotion_count_per_window = (
                json_messages
                | 'ParseJsonMessage' >> beam.Map(parse_jsons)
                | 'FilterHighFaceConfidence' >> beam.ParDo(FilterHighConfidenceFacesDoFn())
                | 'FlatMapFacesWithHighEmotionLikelihood' >> beam.FlatMap(get_faces_with_high_emotion_likelihood)
                | 'UseCustomTimestamp' >> beam.Map(lambda face_info:
                                                   TimestampedValue(face_info, face_info['ts_seconds']))
                | 'WindowFaceInfo' >> beam.WindowInto(
                        FixedWindows(window_size_s, 0),
                        trigger=AfterWatermark(
                            early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                            late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
                        allowed_lateness=allowed_lateness_s,
                        accumulation_mode=AccumulationMode.DISCARDING)
                | 'PairEmotionWithFace' >> beam.Map(lambda face_info: (face_info['emotion'], face_info))
                | 'GroupByEmotion' >> beam.GroupByKey()
                | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow())
        )

        log_p_collection(high_confidence_faces_grouped_by_emotion_count_per_window, "OutputToBigQuery")

        high_confidence_faces_grouped_by_emotion_count_per_window | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            bq_faces_windowed_table_name,
            schema={"fields": bq_faces_windowed_table_schema},
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

        # Note: the `with` block runs the pipeline and waits for completion on
        # exit, so an explicit run()/wait_until_finish() here would start it twice.
Example 19
    def test_sessions_and_complex_trigger_accumulating(self):
        def tsv(key, value, ts):
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                             tsv('k1', 3, 7), tsv('k1', 4, 30)])
              .advance_watermark_to(50)
              .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2),])
              .add_elements([tsv('k1', -1, 21)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Session windows (10s gap) with AfterWatermark early/late firings
        windowing = Windowing(Sessions(10),
                              triggerfn=AfterWatermark(early=AfterCount(2),
                                                       late=AfterCount(1)),
                              accumulation_mode=AccumulationMode.ACCUMULATING,
                              allowed_lateness=MAX_TIMESTAMP.seconds())

        with TestPipeline() as p:
            result = (p
                      | test_stream
                      | WindowInto(windowing.windowfn)
                      | ParDo(trigger_manager._ReifyWindows())
                      | ParDo(trigger_manager._GroupBundlesByKey())
                      | ParDo(
                          trigger_manager.GeneralTriggerManagerDoFn(windowing))
                      | Map(lambda elm: (elm[0], elm[1][0].windows[0],
                                         set(v.value for v in elm[1]))))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
                    ('k1', IntervalWindow(30, 40), {4}),  # on time
                    ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
                    ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2,
                                                   -1}),  # late
                ]))
Example 20
 def test_sessions_watermark_with_early_late(self):
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (15, 'b'), (7, 'c'), (30, 'd')],
         {
             IntervalWindow(1, 25): [
                 set('abc'),  # early
                 set('abc'),  # on time
                 set('abcxy')  # late
             ],
             IntervalWindow(30, 40): [
                 set('d'),  # on time
             ],
             IntervalWindow(1, 40): [
                 set('abcdxyz')  # late
             ],
         },
         2,
         late_data=[(1, 'x'), (2, 'y'), (21, 'z')])
Example 21
    def test_with_trigger_window_that_finish(self):
        def tsv(key, value, ts):
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 0), tsv('k1', 2, 0)])
              .add_elements([tsv('k1', 3, 0)])
              .advance_watermark_to(2)
              .add_elements([tsv('k1', 6, 0)])  # This value is discarded.
              .advance_watermark_to_infinity())
        # yapf: enable

        # Fixed, one-second windows with an AfterWatermark trigger
        # (fires once when the watermark passes the window)
        windowing = Windowing(FixedWindows(1),
                              triggerfn=AfterWatermark(),
                              allowed_lateness=0,
                              accumulation_mode=AccumulationMode.DISCARDING)

        with TestPipeline() as p:
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                |
                Map(lambda elm:
                    (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                ]))
Example 22
    def test_combining_with_accumulation_mode_and_fanout(self):
        # The PCollection will contain elements from 1 to 5.
        elements = [i for i in range(1, 6)]

        ts = TestStream().advance_watermark_to(0)
        for i in elements:
            ts.add_elements([i])
        ts.advance_watermark_to_infinity()

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            result = (
                p
                | ts
                | beam.WindowInto(
                    GlobalWindows(),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                    trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
                | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

            # The firings for DISCARDING mode would be [1, 2, 3, 4, 5, 0, 0].
            firings = [1, 3, 6, 10, 15, 15, 15]
            assert_that(result, equal_to(firings))
Example 23
 def test_after_watermark_no_allowed_lateness(self):
     self._test(AfterWatermark(), 0, DataLossReason.NO_POTENTIAL_LOSS)
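A hedged guess at what the _test helper verifies: recent Beam SDKs expose TriggerFn.may_lose_data(windowing), which returns a DataLossReason, so the helper presumably builds a Windowing with the given allowed lateness and compares. A minimal sketch under that assumption:

from apache_beam.transforms.core import Windowing
from apache_beam.transforms.trigger import (
    AccumulationMode, AfterWatermark, DataLossReason)
from apache_beam.transforms.window import GlobalWindows

trigger_fn = AfterWatermark()
windowing = Windowing(
    GlobalWindows(),
    triggerfn=trigger_fn,
    accumulation_mode=AccumulationMode.DISCARDING,
    allowed_lateness=0)
assert trigger_fn.may_lose_data(windowing) == DataLossReason.NO_POTENTIAL_LOSS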
Example 24
 def test_after_watermark_safe_late(self):
     self._test(AfterWatermark(late=DefaultTrigger()), 60,
                DataLossReason.NO_POTENTIAL_LOSS)
Example 25
 def test_after_watermark_may_finish_late(self):
     self._test(AfterWatermark(late=AfterProcessingTime()), 60,
                DataLossReason.NO_POTENTIAL_LOSS)
Example 26
 def test_after_watermark_no_allowed_lateness_condition_late(self):
     self._test(AfterWatermark(late=AfterCount(5)), 0,
                DataLossReason.NO_POTENTIAL_LOSS)
Example 27
 def test_after_watermark_condition_late(self):
     self._test(AfterWatermark(late=AfterCount(5)), 60,
                DataLossReason.CONDITION_NOT_GUARANTEED)
Example 28
def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from PubSub into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--accum_mode',
                        required=True,
                        help='Accumulation mode for pipeline')

    opts, pipeline_args = parser.parse_known_args()

    options = PipelineOptions(pipeline_args, save_main_session=True)

    options.view_as(
        GoogleCloudOptions).job_name = f"{opts.accum_mode}-{time.time_ns()}"
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    table_schema = {
        "fields": [
            {
                "name": "taxi_events",
                "type": "INTEGER"
            },
            {
                "name": "timestamp",
                "type": "STRING"
            },
        ]
    }

    input_topic = "projects/pubsub-public-data/topics/taxirides-realtime"
    output_table = f"{opts.project}:dataflow_demos.{opts.accum_mode}"

    if opts.accum_mode == 'accumulating':
        accum_mode = beam.transforms.trigger.AccumulationMode.ACCUMULATING
    elif opts.accum_mode == 'discarding':
        accum_mode = beam.transforms.trigger.AccumulationMode.DISCARDING
    else:
        raise ValueError(
            "Invalid accumulation mode value. Use 'accumulating' or 'discarding'.")

    p = beam.Pipeline(options=options)

    (p | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
     | 'ParseJson' >> beam.Map(parse_json).with_output_types(TaxiRide)
     | 'WindowByMinute' >> beam.WindowInto(
         beam.window.FixedWindows(60),
         trigger=AfterWatermark(early=AfterProcessingTime(10)),
         accumulation_mode=accum_mode)
     | "CountPerMinute" >> beam.CombineGlobally(
         CountCombineFn()).without_defaults()
     | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
     | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
         output_table,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()
Example 29
 def test_after_watermark_late_none(self):
     self._test(AfterWatermark(), 60, DataLossReason.MAY_FINISH)