Beispiel #1
0
    def test_windowing_behavior(self):
        """Check the per-window output of ``TaxiCountTransform``.

        Five ride messages (event times 0, 0, 0, 60 and 120) are replayed
        through a ``TestStream`` and the window-reified result is compared
        with the expected values for [0, 60), [60, 120) and [120, 180).
        """
        # NOTE(review): the "enroute" payload below is identical to the
        # "pickup" payload, including ride_status "pickup" -- confirm that
        # this is intended.
        pickup_msg = (
            "{\"ride_id\":\"x\",\"point_idx\":1,\"latitude\":0.0,\"longitude\":0.0,"
            "\"timestamp\":\"00:00:00\",\"meter_reading\":1.0,\"meter_increment\":0.1,"
            "\"ride_status\":\"pickup\",\"passenger_count\":1}")
        enroute_msg = (
            "{\"ride_id\":\"x\",\"point_idx\":1,\"latitude\":0.0,\"longitude\":0.0,"
            "\"timestamp\":\"00:00:00\",\"meter_reading\":1.0,\"meter_increment\":0.1,"
            "\"ride_status\":\"pickup\",\"passenger_count\":1}")

        streaming_options = PipelineOptions()
        streaming_options.view_as(StandardOptions).streaming = True

        with TestPipeline(options=streaming_options) as p:
            # First batch arrives while the watermark is at 0.
            stream = TestStream().advance_watermark_to(0)
            stream = stream.add_elements([
                TimestampedValue(pickup_msg, 0),
                TimestampedValue(pickup_msg, 0),
                TimestampedValue(enroute_msg, 0),
                TimestampedValue(pickup_msg, 60),
            ])
            # Move both clocks forward by a minute before the last element.
            stream = stream.advance_watermark_to(60)
            stream = stream.advance_processing_time(60)
            stream = stream.add_elements([TimestampedValue(pickup_msg, 120)])
            stream = stream.advance_watermark_to_infinity()

            taxi_counts = p | stream | TaxiCountTransform()

            expected_counts = {
                IntervalWindow(0, 60): [3],
                IntervalWindow(60, 120): [1],
                IntervalWindow(120, 180): [1],
            }

            assert_that(
                taxi_counts,
                equal_to_per_window(expected_counts),
                reify_windows=True)
Beispiel #2
0
    def test_gbk_execution_after_processing_trigger_fired(self):
        """Group one element under an ``AfterProcessingTime(5)`` trigger.

        The stream adds 'a' at watermark 10, advances processing time by 5.1
        and then moves the watermark to infinity; the grouped result is
        expected in the fixed window [0, 15).
        """
        # TODO(mariagh): Add test_gbk_execution_after_processing_trigger_unfired
        # Advance TestClock to (X + delta) and see the pipeline does finish
        # Possibly to the framework trigger_transcripts.yaml

        stream = TestStream().advance_watermark_to(10)
        stream = stream.add_elements(['a'])
        stream = stream.advance_processing_time(5.1)
        stream = stream.advance_watermark_to_infinity()

        streaming_options = PipelineOptions()
        streaming_options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=streaming_options)

        windowing = beam.WindowInto(
            beam.window.FixedWindows(15),
            trigger=trigger.AfterProcessingTime(5),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        grouped = (
            p
            | stream
            | windowing
            | beam.Map(lambda x: ('k', x))
            | beam.GroupByKey())

        # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
        # respect the TimestampCombiner.  The test below should also verify the
        # timestamps of the outputted elements once this is implemented.

        expected_by_window = {
            window.IntervalWindow(0, 15): [('k', ['a'])],
        }
        assert_that(
            grouped,
            equal_to_per_window(expected_by_window),
            label='assert per window')

        p.run()
Beispiel #3
0
    def test_gbk_execution_after_watermark_trigger(self):
        """Group elements under ``AfterWatermark`` with an early count trigger.

        Two elements ('a' at 11, 'b' at 21) are windowed into 15-second fixed
        windows in DISCARDING mode; each window is expected to contain one
        pane with the element and one empty pane.
        """
        stream = TestStream().advance_watermark_to(10)
        stream = stream.add_elements([TimestampedValue('a', 11)])
        stream = stream.advance_watermark_to(20)
        stream = stream.add_elements([TimestampedValue('b', 21)])
        stream = stream.advance_watermark_to_infinity()

        streaming_options = PipelineOptions()
        streaming_options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=streaming_options)

        windowing = beam.WindowInto(
            FixedWindows(15),
            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        grouped = (
            p
            | stream
            | windowing
            | beam.Map(lambda x: ('k', x))
            | beam.GroupByKey())

        # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
        # respect the TimestampCombiner.  The test below should also verify the
        # timestamps of the outputted elements once this is implemented.

        # Expected panes, keyed by the window they belong to.
        expected_by_window = {
            window.IntervalWindow(0, 15): [('k', ['a']), ('k', [])],
            window.IntervalWindow(15, 30): [('k', ['b']), ('k', [])],
        }
        assert_that(
            grouped,
            equal_to_per_window(expected_by_window),
            label='assert per window')

        p.run()
Beispiel #4
0
    def test_combiner_latest(self):
        """Check ``TimestampCombiner.OUTPUT_AT_LATEST`` on a per-key sum.

        Two values for key 'k' (100 at timestamp 2, 400 at timestamp 7) are
        summed inside the fixed window [0, 10); the combined value 500 is
        expected to carry timestamp 7.
        """
        streaming_options = PipelineOptions(streaming=True)
        with TestPipeline(options=streaming_options) as p:
            stream = TestStream().add_elements(
                [window.TimestampedValue(('k', 100), 2)])
            stream = stream.add_elements(
                [window.TimestampedValue(('k', 400), 7)])
            stream = stream.advance_watermark_to_infinity()

            summed = (
                p
                | stream
                | beam.WindowInto(
                    window.FixedWindows(10),
                    timestamp_combiner=TimestampCombiner.OUTPUT_AT_LATEST)
                | beam.CombinePerKey(sum))

            # Pair every combined element with the timestamp it was emitted at.
            stamped = summed | beam.Map(
                lambda e, ts=beam.DoFn.TimestampParam: (e, ts))

            # Both values share one key, so a single output is expected and it
            # should carry the later of the two input timestamps.
            expected_by_window = {
                window.IntervalWindow(0, 10): [
                    (('k', 500), Timestamp(7)),
                ],
            }

            assert_that(
                stamped,
                equal_to_per_window(expected_by_window),
                use_global_window=False,
                label='assert per window')
    def test_late_data_behavior(self):
        """Check how ``TaxiCountTransform`` reports data behind the watermark.

        Two pickups with event time 0 arrive at watermark 0, a third with
        event time 0 arrives after the watermark reaches 60, and a fourth
        after it reaches 300.  The window [0, 60) is expected to yield the
        values 2 and 3.
        """
        pickup_msg = (
            "{\"ride_id\":\"x\",\"point_idx\":1,\"latitude\":0.0,\"longitude\":0.0,"
            "\"timestamp\":\"00:00:00\",\"meter_reading\":1.0,\"meter_increment\":0.1,"
            "\"ride_status\":\"pickup\",\"passenger_count\":1}")

        streaming_options = PipelineOptions()
        streaming_options.view_as(StandardOptions).streaming = True

        with TestPipeline(options=streaming_options) as p:
            stream = TestStream().advance_watermark_to(0)
            stream = stream.add_elements([
                TimestampedValue(pickup_msg, 0),
                TimestampedValue(pickup_msg, 0),
            ])
            # Element added once the watermark has already reached 60.
            stream = stream.advance_watermark_to(60)
            stream = stream.advance_processing_time(60)
            stream = stream.add_elements([TimestampedValue(pickup_msg, 0)])
            # Element added once the watermark has already reached 300.
            stream = stream.advance_watermark_to(300)
            stream = stream.advance_processing_time(240)
            stream = stream.add_elements([TimestampedValue(pickup_msg, 0)])

            # On-time result followed by the late result.
            expected_results = {IntervalWindow(0, 60): [2, 3]}

            taxi_counts_late = p | stream | TaxiCountTransform()

            assert_that(
                taxi_counts_late,
                equal_to_per_window(expected_results),
                reify_windows=True)
Beispiel #6
0
    def test_multi_triggered_gbk_side_input(self):
        """Feed a multiply-triggered per-key sum to a ParDo as a side input.

        A single tagged ``TestStream`` produces a 'main' and a 'side' output.
        The side values are summed per key under an early-firing watermark
        trigger and handed to the main ParDo through ``AsList``.
        """
        # TODO(BEAM-9322): Remove use of this experiment.
        # This flag is only necessary when using the multi-output TestStream b/c
        # it relies on using the PCollection output tags as the PCollection output
        # ids.
        p = TestPipeline(additional_pipeline_args=[
            '--experiments=' + 'passthrough_pcollection_output_ids'
        ])

        # Main-output events first, then side-output events, then close both.
        events = TestStream().advance_watermark_to(3, tag='main')
        events = events.add_elements(['a1'], tag='main')
        events = events.advance_watermark_to(8, tag='main')
        events = events.add_elements(['a2'], tag='main')
        events = events.add_elements(
            [window.TimestampedValue(('k', 100), 2)], tag='side')
        events = events.add_elements(
            [window.TimestampedValue(('k', 400), 7)], tag='side')
        events = events.advance_watermark_to_infinity(tag='main')
        events = events.advance_watermark_to_infinity(tag='side')

        outputs = p | 'Mixed TestStream' >> events

        main_windowing = beam.WindowInto(
            window.FixedWindows(5),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        main_data = outputs['main'] | 'Main windowInto' >> main_windowing

        side_windowing = beam.WindowInto(
            window.FixedWindows(5),
            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        side_data = (
            outputs['side']
            | 'Side windowInto' >> side_windowing
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda k_vs: k_vs[1]))

        class RecordFn(beam.DoFn):
            """Emit (element, timestamp, side input) for every main element."""
            def process(
                    self,
                    elm=beam.DoFn.ElementParam,
                    ts=beam.DoFn.TimestampParam,
                    side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        recorded = main_data | beam.ParDo(
            RecordFn(), beam.pvalue.AsList(side_data))

        expected_by_window = {
            window.IntervalWindow(0, 5): [('a1', Timestamp(3), [100, 0])],
            window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(
            recorded,
            equal_to_per_window(expected_by_window),
            use_global_window=False,
            label='assert per window')

        p.run()
  def test_basic_execution_sideinputs_fixed_windows(self):
    """Run a tagged multi-output TestStream with a fixed-window side input.

    The 'main' output is windowed into 1-second fixed windows and the 'side'
    output into 3-second fixed windows.  Every main element is recorded
    together with its timestamp and the side-input list passed via
    ``AsList``, and the result is checked per window.
    """
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment(
        'passthrough_pcollection_output_ids')
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    # add_elements takes an iterable of elements (every other call here
    # passes a list), so the final 'c' is wrapped in a list as well; a bare
    # string is only correct while it has a single character.
    test_stream = (p | TestStream()
        .advance_watermark_to(12, tag='side')
        .add_elements([window.TimestampedValue('s1', 10)], tag='side')
        .advance_watermark_to(20, tag='side')
        .add_elements([window.TimestampedValue('s2', 20)], tag='side')

        .advance_watermark_to(9, tag='main')
        .add_elements(['a1', 'a2', 'a3', 'a4'], tag='main')
        .add_elements(['b'], tag='main')
        .advance_watermark_to(18, tag='main')
        .add_elements(['c'], tag='main')
        ) # yapf: disable

    main_stream = (
        test_stream['main']
        | 'main windowInto' >> beam.WindowInto(window.FixedWindows(1)))

    side_stream = (
        test_stream['side']
        | 'side windowInto' >> beam.WindowInto(window.FixedWindows(3)))

    class RecordFn(beam.DoFn):
      """Emit (element, timestamp, side input) for every main element."""
      def process(
          self,
          elm=beam.DoFn.ElementParam,
          ts=beam.DoFn.TimestampParam,
          side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (
        main_stream  # pylint: disable=unused-variable
        | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    # Expected output, keyed by the 1-second main window of each element.
    expected_window_to_elements = {
        window.IntervalWindow(9, 10): [
            ('a1', Timestamp(9), ['s1']), ('a2', Timestamp(9), ['s1']),
            ('a3', Timestamp(9), ['s1']), ('a4', Timestamp(9), ['s1']),
            ('b', Timestamp(9), ['s1'])
        ],
        window.IntervalWindow(18, 19): [('c', Timestamp(18), ['s2'])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        label='assert per window')

    p.run()
  def test_basic_execution(self):
    """Replay a TestStream and check elements globally and per window.

    Every element is paired with its timestamp by a small DoFn.  The output
    is first compared as a flat collection and then per 15-second fixed
    window, using ``custom_windowing`` on the assertion.
    """
    stream = TestStream().advance_watermark_to(10)
    stream = stream.add_elements(['a', 'b', 'c'])
    stream = stream.advance_watermark_to(20)
    stream = stream.add_elements(['d'])
    stream = stream.add_elements(['e'])
    stream = stream.advance_processing_time(10)
    stream = stream.advance_watermark_to(300)
    stream = stream.add_elements([TimestampedValue('late', 12)])
    stream = stream.add_elements([TimestampedValue('last', 310)])

    class RecordFn(beam.DoFn):
      """Emit (element, timestamp) for every input element."""
      def process(
          self,
          element=beam.DoFn.ElementParam,
          timestamp=beam.DoFn.TimestampParam):
        yield (element, timestamp)

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)
    records = p | stream | beam.ParDo(RecordFn())

    # Flat comparison, ignoring windows.
    expected_flat = [
        ('a', timestamp.Timestamp(10)),
        ('b', timestamp.Timestamp(10)),
        ('c', timestamp.Timestamp(10)),
        ('d', timestamp.Timestamp(20)),
        ('e', timestamp.Timestamp(20)),
        ('late', timestamp.Timestamp(12)),
        ('last', timestamp.Timestamp(310)),
    ]
    assert_that(records, equal_to(expected_flat))

    # Per-window comparison after re-windowing into 15-second windows.
    expected_by_window = {
        window.IntervalWindow(0, 15): [
            ('a', Timestamp(10)),
            ('b', Timestamp(10)),
            ('c', Timestamp(10)),
            ('late', Timestamp(12)),
        ],
        window.IntervalWindow(15, 30): [
            ('d', Timestamp(20)),
            ('e', Timestamp(20)),
        ],
        window.IntervalWindow(300, 315): [
            ('last', Timestamp(310)),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_by_window),
        custom_windowing=window.FixedWindows(15),
        label='assert per window')

    p.run()
Beispiel #9
0
  def test_basic_execution(self):
    """Pipe a TestStream through ``ReverseTestStream`` and check its events.

    The expected output, all in the global window, is a list of event
    groups: processing-time/watermark pairs and element batches.
    """
    watermark_steps = (2, 4, 6, 8, 10)

    stream = TestStream().advance_watermark_to(0).advance_processing_time(5)
    stream = stream.add_elements(['a', 'b', 'c'])
    # Each step moves the watermark forward and processing time by one second.
    for watermark in watermark_steps:
      stream = stream.advance_watermark_to(watermark)
      stream = stream.advance_processing_time(1)
    stream = stream.add_elements([
        TimestampedValue('1', 15),
        TimestampedValue('2', 15),
        TimestampedValue('3', 15),
    ])

    p = TestPipeline(options=StandardOptions(streaming=True))

    records = (
        p
        | stream
        | ReverseTestStream(sample_resolution_sec=1, output_tag=None))

    expected_events = [
        [ProcessingTimeEvent(5), WatermarkEvent(0)],
        [
            ElementEvent([
                TimestampedValue('a', 0),
                TimestampedValue('b', 0),
                TimestampedValue('c', 0),
            ])
        ],
    ]
    # The expected watermark events use 1000000 units per stream second.
    expected_events.extend(
        [ProcessingTimeEvent(1), WatermarkEvent(watermark * 1000000)]
        for watermark in watermark_steps)
    expected_events.append([
        ElementEvent([
            TimestampedValue('1', 15),
            TimestampedValue('2', 15),
            TimestampedValue('3', 15),
        ])
    ])

    assert_that(
        records,
        equal_to_per_window({beam.window.GlobalWindow(): expected_events}))

    p.run()
Beispiel #10
0
    def test_multi_triggered_gbk_side_input(self):
        """Feed a multiply-triggered per-key sum to a ParDo as a side input.

        A single tagged ``TestStream`` produces a 'main' and a 'side' output
        in a streaming pipeline.  The side values are summed per key under an
        early-firing watermark trigger and handed to the main ParDo through
        ``AsList``.
        """
        p = TestPipeline(options=StandardOptions(streaming=True))

        # Main-output events first, then side-output events, then close both.
        events = TestStream().advance_watermark_to(3, tag='main')
        events = events.add_elements(['a1'], tag='main')
        events = events.advance_watermark_to(8, tag='main')
        events = events.add_elements(['a2'], tag='main')
        events = events.add_elements(
            [window.TimestampedValue(('k', 100), 2)], tag='side')
        events = events.add_elements(
            [window.TimestampedValue(('k', 400), 7)], tag='side')
        events = events.advance_watermark_to_infinity(tag='main')
        events = events.advance_watermark_to_infinity(tag='side')

        outputs = p | 'Mixed TestStream' >> events

        main_windowing = beam.WindowInto(
            window.FixedWindows(5),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        main_data = outputs['main'] | 'Main windowInto' >> main_windowing

        side_windowing = beam.WindowInto(
            window.FixedWindows(5),
            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        side_data = (
            outputs['side']
            | 'Side windowInto' >> side_windowing
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda k_vs: k_vs[1]))

        class RecordFn(beam.DoFn):
            """Emit (element, timestamp, side input) for every main element."""
            def process(
                    self,
                    elm=beam.DoFn.ElementParam,
                    ts=beam.DoFn.TimestampParam,
                    side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        recorded = main_data | beam.ParDo(
            RecordFn(), beam.pvalue.AsList(side_data))

        expected_by_window = {
            window.IntervalWindow(0, 5): [('a1', Timestamp(3), [100, 0])],
            window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(
            recorded,
            equal_to_per_window(expected_by_window),
            use_global_window=False,
            label='assert per window')

        p.run()
Beispiel #11
0
  def test_basic_execution_sideinputs_fixed_windows(self):
    """Join two TestStreams through a fixed-window ``AsList`` side input.

    The main stream is windowed into 1-second fixed windows and the side
    stream into 3-second fixed windows.  Every main element is recorded
    together with its timestamp and the side-input list, and the result is
    checked per window.
    """
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    # add_elements takes an iterable of elements (every other call here
    # passes a list), so the final 'c' is wrapped in a list as well; a bare
    # string is only correct while it has a single character.
    main_stream = (p
                   | 'main TestStream' >> TestStream()
                   .advance_watermark_to(9)
                   .add_elements(['a1', 'a2', 'a3', 'a4'])
                   .add_elements(['b'])
                   .advance_watermark_to(18)
                   .add_elements(['c'])
                   | 'main windowInto' >> beam.WindowInto(
                       window.FixedWindows(1))
                  )
    side_stream = (p
                   | 'side TestStream' >> TestStream()
                   .advance_watermark_to(12)
                   .add_elements([window.TimestampedValue('s1', 10)])
                   .advance_watermark_to(20)
                   .add_elements([window.TimestampedValue('s2', 20)])
                   | 'side windowInto' >> beam.WindowInto(
                       window.FixedWindows(3))
                  )

    class RecordFn(beam.DoFn):
      """Emit (element, timestamp, side input) for every main element."""
      def process(self,
                  elm=beam.DoFn.ElementParam,
                  ts=beam.DoFn.TimestampParam,
                  side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (main_stream     # pylint: disable=unused-variable
               | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    # Expected output, keyed by the 1-second main window of each element.
    expected_window_to_elements = {
        window.IntervalWindow(9, 10): [
            ('a1', Timestamp(9), ['s1']),
            ('a2', Timestamp(9), ['s1']),
            ('a3', Timestamp(9), ['s1']),
            ('a4', Timestamp(9), ['s1']),
            ('b', Timestamp(9), ['s1'])
        ],
        window.IntervalWindow(18, 19): [('c', Timestamp(18), ['s2'])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        use_global_window=False,
        label='assert per window')

    p.run()
  def test_gbk_execution_no_triggers(self):
    """Group a TestStream by key in 15-second windows without a trigger.

    The grouped output is checked twice: once as a flat collection and once
    per window after the assertion re-windows it with ``custom_windowing``.
    """
    stream = TestStream().advance_watermark_to(10)
    stream = stream.add_elements(['a', 'b', 'c'])
    stream = stream.advance_watermark_to(20)
    stream = stream.add_elements(['d'])
    stream = stream.add_elements(['e'])
    stream = stream.advance_processing_time(10)
    stream = stream.advance_watermark_to(300)
    stream = stream.add_elements([TimestampedValue('late', 12)])
    stream = stream.add_elements([TimestampedValue('last', 310)])

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    grouped = (
        p
        | stream
        | beam.WindowInto(FixedWindows(15))
        | beam.Map(lambda x: ('k', x))
        | beam.GroupByKey())

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.
    expected_flat = [
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last']),
    ]
    assert_that(grouped, equal_to(expected_flat))

    # NOTE(review): the first two expected windows are (15, 30) and (30, 45)
    # although their inputs carry timestamps 10/12 and 20; presumably this
    # reflects the GBK output timestamp mentioned in the TODO above --
    # confirm.
    expected_by_window = {
        window.IntervalWindow(15, 30): [
            ('k', ['a', 'b', 'c']),
            ('k', ['late']),
        ],
        window.IntervalWindow(30, 45): [
            ('k', ['d', 'e']),
        ],
        window.IntervalWindow(300, 315): [
            ('k', ['last']),
        ],
    }
    assert_that(
        grouped,
        equal_to_per_window(expected_by_window),
        custom_windowing=window.FixedWindows(15),
        label='assert per window')

    p.run()
Beispiel #13
0
 def test_equal_to_per_window_fail_unmatched_window(self):
   """Expect ``BeamAssertException`` when the expected window is (50, 100).

   A single element is created, windowed into 20-second fixed windows and
   grouped by key; the window-reified assertion is given only the window
   (50, 100) and the whole run is expected to raise.
   """
   with self.assertRaises(BeamAssertException):
     expected_by_window = {
         window.IntervalWindow(50, 100): [('k', [1])],
     }
     streaming_options = StandardOptions(streaming=True)
     with TestPipeline(options=streaming_options) as p:
       windowing = beam.WindowInto(
           FixedWindows(20),
           trigger=trigger.AfterWatermark(),
           accumulation_mode=trigger.AccumulationMode.DISCARDING)
       grouped = (
           p
           | Create([1])
           | windowing
           | beam.Map(lambda x: ('k', x))
           | beam.GroupByKey())
       assert_that(
           grouped,
           equal_to_per_window(expected_by_window),
           reify_windows=True)
Beispiel #14
0
 def test_equal_to_per_window_succeeds_no_reify_windows(self):
   """Check ``equal_to_per_window`` without ``reify_windows``.

   The expected window is a 20-second interval that starts 5 seconds before
   ``MIN_TIMESTAMP`` (converted from microseconds to whole seconds).
   """
   window_start = int(MIN_TIMESTAMP.micros // 1e6) - 5
   window_end = window_start + 20
   expected_by_window = {
       window.IntervalWindow(window_start, window_end): [('k', [1])],
   }
   streaming_options = StandardOptions(streaming=True)
   with TestPipeline(options=streaming_options) as p:
     windowing = beam.WindowInto(
         FixedWindows(20),
         trigger=trigger.AfterWatermark(),
         accumulation_mode=trigger.AccumulationMode.DISCARDING)
     grouped = (
         p
         | Create([1])
         | windowing
         | beam.Map(lambda x: ('k', x))
         | beam.GroupByKey())
     assert_that(grouped, equal_to_per_window(expected_by_window))
Beispiel #15
0
  def test_gbk_execution_no_triggers(self):
    """Group a TestStream by key in 15-second windows without a trigger.

    The grouped output is compared per window with ``use_global_window``
    disabled on the assertion.
    """
    stream = TestStream().advance_watermark_to(10)
    stream = stream.add_elements(['a', 'b', 'c'])
    stream = stream.advance_watermark_to(20)
    stream = stream.add_elements(['d'])
    stream = stream.add_elements(['e'])
    stream = stream.advance_processing_time(10)
    stream = stream.advance_watermark_to(300)
    stream = stream.add_elements([TimestampedValue('late', 12)])
    stream = stream.add_elements([TimestampedValue('last', 310)])

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    grouped = (
        p
        | stream
        | beam.WindowInto(FixedWindows(15))
        | beam.Map(lambda x: ('k', x))
        | beam.GroupByKey())

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # Expected groups, keyed by the window they belong to.
    expected_by_window = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', ['late']),
        ],
        window.IntervalWindow(15, 30): [
            ('k', ['d', 'e']),
        ],
        window.IntervalWindow(300, 315): [
            ('k', ['last']),
        ],
    }
    assert_that(
        grouped,
        equal_to_per_window(expected_by_window),
        use_global_window=False,
        label='assert per window')

    p.run()
Beispiel #16
0
    def test_gbk_execution_no_triggers(self):
        """Group a TestStream by key in 15-second windows without a trigger.

        The windowing allows 300 seconds of lateness and the stream ends by
        advancing the watermark to infinity; the grouped output is compared
        per window.
        """
        stream = TestStream().advance_watermark_to(10)
        stream = stream.add_elements(['a', 'b', 'c'])
        stream = stream.advance_watermark_to(20)
        stream = stream.add_elements(['d'])
        stream = stream.add_elements(['e'])
        stream = stream.advance_processing_time(10)
        stream = stream.advance_watermark_to(300)
        stream = stream.add_elements([TimestampedValue('late', 12)])
        stream = stream.add_elements([TimestampedValue('last', 310)])
        stream = stream.advance_watermark_to_infinity()

        streaming_options = PipelineOptions()
        streaming_options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=streaming_options)

        grouped = (
            p
            | stream
            | beam.WindowInto(FixedWindows(15), allowed_lateness=300)
            | beam.Map(lambda x: ('k', x))
            | beam.GroupByKey())

        # TODO(https://github.com/apache/beam/issues/18441): timestamp assignment
        # for elements from a GBK should respect the TimestampCombiner.  The test
        # below should also verify the timestamps of the outputted elements once
        # this is implemented.

        # Expected groups, keyed by the window they belong to.
        expected_by_window = {
            window.IntervalWindow(0, 15): [
                ('k', ['a', 'b', 'c']),
                ('k', ['late']),
            ],
            window.IntervalWindow(15, 30): [
                ('k', ['d', 'e']),
            ],
            window.IntervalWindow(300, 315): [
                ('k', ['last']),
            ],
        }
        assert_that(
            grouped,
            equal_to_per_window(expected_by_window),
            label='assert per window')

        p.run()
  def test_basic_execution_sideinputs(self):
    """Pass one TestStream to another as an ``AsList`` side input.

    The main stream carries a single element 'e' at watermark 10; the side
    stream carries the values 2, 1, 7 and 4.  The recorded tuple is checked
    per 15-second window and then as a flat collection.
    """
    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    main_events = TestStream().advance_watermark_to(10).add_elements(['e'])
    main_stream = p | 'main TestStream' >> main_events

    # One add_elements call per side value, in this order.
    side_events = TestStream()
    for value in (2, 1, 7, 4):
      side_events = side_events.add_elements(
          [window.TimestampedValue(value, value)])
    side_stream = p | 'side TestStream' >> side_events

    class RecordFn(beam.DoFn):
      """Emit (element, timestamp, side input) for every main element."""
      def process(
          self,
          elm=beam.DoFn.ElementParam,
          ts=beam.DoFn.TimestampParam,
          side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    recorded = main_stream | beam.ParDo(
        RecordFn(), beam.pvalue.AsList(side_stream))

    # Per-window comparison after re-windowing into 15-second windows.
    expected_by_window = {
        window.IntervalWindow(0, 15): [
            ('e', Timestamp(10), [2, 1, 7, 4]),
        ],
    }
    assert_that(
        recorded,
        equal_to_per_window(expected_by_window),
        custom_windowing=window.FixedWindows(15),
        label='assert per window')

    # Flat comparison, ignoring windows.
    assert_that(recorded, equal_to([('e', Timestamp(10), [2, 1, 7, 4])]))
    p.run()
Beispiel #18
0
  def test_basic_execution_batch_sideinputs_fixed_windows(self):
    """Combine a streaming main input with a ``Create``-based side input.

    The main TestStream is windowed into 1-second fixed windows; the side
    values 2, 1 and 4 are timestamped with their own value and windowed into
    2-second fixed windows.  Each main element is recorded with its
    timestamp and the side-input list.
    """
    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    main_events = TestStream().advance_watermark_to(2)
    main_events = main_events.add_elements(['a'])
    main_events = main_events.advance_watermark_to(4)
    main_events = main_events.add_elements(['b'])
    main_events = main_events.advance_watermark_to_infinity()

    main_stream = (
        p
        | 'main TestStream' >> main_events
        | 'main window' >> beam.WindowInto(window.FixedWindows(1)))

    side = (
        p
        | beam.Create([2, 1, 4])
        | beam.Map(lambda t: window.TimestampedValue(t, t))
        | beam.WindowInto(window.FixedWindows(2)))

    class RecordFn(beam.DoFn):
      """Emit (element, timestamp, side input) for every main element."""
      def process(
          self,
          elm=beam.DoFn.ElementParam,
          ts=beam.DoFn.TimestampParam,
          side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    recorded = main_stream | beam.ParDo(RecordFn(), beam.pvalue.AsList(side))

    # Expected output, keyed by the 1-second main window of each element.
    expected_by_window = {
        window.IntervalWindow(2, 3): [('a', Timestamp(2), [2])],
        window.IntervalWindow(4, 5): [('b', Timestamp(4), [4])],
    }
    assert_that(
        recorded,
        equal_to_per_window(expected_by_window),
        label='assert per window')

    p.run()
Beispiel #19
0
  def test_deduplication_in_different_windows(self):
    """Check that ``Deduplicate`` output is compared per 30-second window.

    Keys k1-k3 appear in both [0, 30) and [30, 60), and k4-k6 in [60, 90);
    every (key, timestamp) pair is expected in the output of its own window.
    """
    # (key, event timestamp) pairs in the order they are added to the stream.
    keyed_timestamps = [
        ('k1', 0), ('k2', 10), ('k3', 20),
        ('k1', 30), ('k2', 40), ('k3', 50),
        ('k4', 60), ('k5', 70), ('k6', 80),
    ]

    with self.create_pipeline() as p:
      stream = TestStream(coder=coders.StrUtf8Coder())
      stream = stream.advance_watermark_to(0)
      stream = stream.add_elements([
          window.TimestampedValue(key, ts) for key, ts in keyed_timestamps
      ])
      stream = stream.advance_watermark_to_infinity()

      deduplicated = (
          p
          | stream
          | beam.WindowInto(window.FixedWindows(30))
          | deduplicate.Deduplicate(processing_time_duration=10 * 60)
          | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

      # Deduplication should happen per window.
      expected_by_window = {
          window.IntervalWindow(0, 30): [
              ('k1', Timestamp(0)),
              ('k2', Timestamp(10)),
              ('k3', Timestamp(20)),
          ],
          window.IntervalWindow(30, 60): [
              ('k1', Timestamp(30)),
              ('k2', Timestamp(40)),
              ('k3', Timestamp(50)),
          ],
          window.IntervalWindow(60, 90): [
              ('k4', Timestamp(60)),
              ('k5', Timestamp(70)),
              ('k6', Timestamp(80)),
          ],
      }
      assert_that(
          deduplicated,
          equal_to_per_window(expected_by_window),
          use_global_window=False,
          label='assert per window')
Beispiel #20
0
  def test_basic_execution_batch_sideinputs_fixed_windows(self):
    """Combine a streaming main input with a ``Create``-based side input.

    The main TestStream is windowed into 1-second fixed windows; the side
    values 2, 1 and 4 are timestamped with their own value and windowed into
    2-second fixed windows.  The per-window assertion runs with
    ``use_global_window`` disabled.
    """
    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    main_events = TestStream().advance_watermark_to(2)
    main_events = main_events.add_elements(['a'])
    main_events = main_events.advance_watermark_to(4)
    main_events = main_events.add_elements(['b'])

    main_stream = (
        p
        | 'main TestStream' >> main_events
        | 'main window' >> beam.WindowInto(window.FixedWindows(1)))

    side = (
        p
        | beam.Create([2, 1, 4])
        | beam.Map(lambda t: window.TimestampedValue(t, t))
        | beam.WindowInto(window.FixedWindows(2)))

    class RecordFn(beam.DoFn):
      """Emit (element, timestamp, side input) for every main element."""
      def process(
          self,
          elm=beam.DoFn.ElementParam,
          ts=beam.DoFn.TimestampParam,
          side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    recorded = main_stream | beam.ParDo(RecordFn(), beam.pvalue.AsList(side))

    # Expected output, keyed by the 1-second main window of each element.
    expected_by_window = {
        window.IntervalWindow(2, 3): [('a', Timestamp(2), [2])],
        window.IntervalWindow(4, 5): [('b', Timestamp(4), [4])],
    }
    assert_that(
        recorded,
        equal_to_per_window(expected_by_window),
        use_global_window=False,
        label='assert per window')

    p.run()
Beispiel #21
0
  def test_gbk_execution_after_processing_trigger_fired(self):
    """Advance TestClock to (X + delta) and see the pipeline does finish.

    One element 'a' is added after the watermark reaches 10, then processing
    time is advanced by 5.1 so that it passes the AfterProcessingTime(5)
    trigger. The grouped result is expected in the [0, 15) window.
    """
    # TODO(mariagh): Add test_gbk_execution_after_processing_trigger_unfired
    # Advance TestClock to (X + delta) and see the pipeline does finish
    # Possibly to the framework trigger_transcripts.yaml

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    test_stream = (
        TestStream()
        .advance_watermark_to(10)
        .add_elements(['a'])
        .advance_processing_time(5.1))

    # Fixed windows of size 15 that fire on processing time and discard
    # already-emitted panes.
    windowing = beam.WindowInto(
        beam.window.FixedWindows(15),
        trigger=trigger.AfterProcessingTime(5),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)

    keyed = p | test_stream | windowing | beam.Map(lambda x: ('k', x))
    records = keyed | beam.GroupByKey()

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [('k', ['a'])],
    }
    per_window_matcher = equal_to_per_window(expected_window_to_elements)
    assert_that(
        records,
        per_window_matcher,
        use_global_window=False,
        label='assert per window')

    p.run()
Beispiel #22
0
  def test_gbk_execution_after_watermark_trigger(self):
    """Checks GroupByKey output under an AfterWatermark trigger with early firing.

    A single element 'a' is added after the watermark reaches 10 and the
    watermark is then advanced to 20. With an early AfterCount(1) firing and
    DISCARDING accumulation, two panes are expected for the key: one holding
    'a' and one empty.
    """
    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    test_stream = (
        TestStream()
        .advance_watermark_to(10)
        .add_elements(['a'])
        .advance_watermark_to(20))

    windowing = beam.WindowInto(
        FixedWindows(15),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)

    keyed = p | test_stream | windowing | beam.Map(lambda x: ('k', x))
    records = keyed | beam.GroupByKey()

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # NOTE(review): 'a' is added while the watermark is 10, yet the expected
    # window below is [15, 30) rather than [0, 15) -- confirm which timestamp
    # the TestStream implementation in use assigns to untimestamped elements.
    expected_window_to_elements = {
        window.IntervalWindow(15, 30): [
            ('k', ['a']),
            ('k', []),
        ],
    }
    per_window_matcher = equal_to_per_window(expected_window_to_elements)
    assert_that(
        records,
        per_window_matcher,
        use_global_window=False,
        label='assert per window')

    p.run()
    def test_multiple_outputs_with_watermark_advancement(self):
        """Tests that the TestStream can independently control output watermarks."""

        # Two tagged outputs are produced: letters (a, b, c) and numbers
        # (1, 2, 3). The 'numbers' watermark is deliberately moved to 20
        # before the 'letters' watermark is moved to 5, so that the test
        # exercises per-PCollection watermark advancement.
        #
        # The outcome is verified through the windows the elements land in.
        # If the watermark does not advance, then the windows will be
        # [-inf, -inf). If the watermarks do not advance separately, then both
        # PCollections will be windowed in [15, 30).
        letters_elements = [
            TimestampedValue(letter, stamp)
            for letter, stamp in (('a', 6), ('b', 7), ('c', 8))
        ]
        numbers_elements = [
            TimestampedValue(digit, stamp)
            for digit, stamp in (('1', 21), ('2', 22), ('3', 23))
        ]

        test_stream = (
            TestStream()
            .advance_watermark_to(0, tag='letters')
            .advance_watermark_to(0, tag='numbers')
            .advance_watermark_to(20, tag='numbers')
            .advance_watermark_to(5, tag='letters')
            .add_elements(letters_elements, tag='letters')
            .advance_watermark_to(10, tag='letters')
            .add_elements(numbers_elements, tag='numbers')
            .advance_watermark_to(30, tag='numbers'))

        options = StandardOptions(streaming=True)
        p = TestPipeline(is_integration_test=True, options=options)

        main = p | test_stream

        def group_with_early_firing(tagged_output, kind):
            # Window, key and group one tagged output. The AfterWatermark
            # trigger with an early firing is used to test that the watermark
            # is advancing properly and that the elements are emitted in the
            # correct window.
            return (
                tagged_output
                | '{} windows'.format(kind) >> beam.WindowInto(
                    FixedWindows(15),
                    trigger=trigger.AfterWatermark(
                        early=trigger.AfterCount(1)),
                    accumulation_mode=trigger.AccumulationMode.DISCARDING)
                | '{} with key'.format(kind) >> beam.Map(lambda x: ('k', x))
                | '{} gbk'.format(kind) >> beam.GroupByKey())

        letters = group_with_early_firing(main['letters'], 'letter')
        numbers = group_with_early_firing(main['numbers'], 'number')

        # The letters are expected in the [0, 15) window. Besides the pane
        # holding the data, an empty ON_TIME pane is expected as well: the
        # early trigger fires before the end of the window, and the
        # DISCARDING accumulation mode drops the data once it has fired.
        expected_letters = {
            window.IntervalWindow(0, 15): [
                ('k', ['a', 'b', 'c']),
                ('k', []),
            ],
        }

        # Same reasoning for the numbers, which are expected in [15, 30).
        expected_numbers = {
            window.IntervalWindow(15, 30): [
                ('k', ['1', '2', '3']),
                ('k', []),
            ],
        }

        assert_that(
            letters,
            equal_to_per_window(expected_letters),
            label='letters assert per window')
        assert_that(
            numbers,
            equal_to_per_window(expected_numbers),
            label='numbers assert per window')

        p.run()
 def assertion_matcher(pcol, expected_values):
     """Attach a per-window equality assertion to a PCollection.

     Args:
       pcol: the pipeline output to check.
       expected_values: expected contents, passed to equal_to_per_window.
     """
     per_window_matcher = equal_to_per_window(expected_values)
     assert_that(
         pcol,
         per_window_matcher,
         use_global_window=False,
         label='Assert events per window.')
def run(argv=None):
  """Build and run the pipeline.

  Reads text from a PubSub topic or subscription, counts words in fixed
  windows, writes "word: count" strings to the output topic, and attaches
  assertions about the format and the windowing of the output.

  Args:
    argv: command-line arguments. `--output_topic` and exactly one of
      `--input_topic` / `--input_subscription` are required; anything the
      parser does not recognise is forwarded to PipelineOptions.
  """
  parser = argparse.ArgumentParser()
  # PubSub topic paths use the plural "topics" segment; the help text used to
  # say "topic", which does not match the --input_topic help below.
  parser.add_argument(
      '--output_topic', required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument(
      '--input_topic',
      help=('Input PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
  # The sentence-ending period sits outside the quotes so it is not mistaken
  # for part of the subscription path.
  group.add_argument(
      '--input_subscription',
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  with beam.Pipeline(options=pipeline_options) as p:

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
      lines = p | beam.io.ReadFromPubSub(
          subscription=known_args.input_subscription)
    else:
      lines = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)

    # Count the occurrences of each word.
    def count_ones(word_ones):
      (word, ones) = word_ones
      return (word, sum(ones))

    counts = (lines
              | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn())
              | 'After AddTimestampFn' >> ParDo(PrintFn('After AddTimestampFn'))
              | 'Split' >> (beam.ParDo(WordExtractingDoFn())
                            .with_output_types(unicode))
              | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
              | beam.WindowInto(window.FixedWindows(5, 0))
              | 'GroupByKey' >> beam.GroupByKey()
              | 'CountOnes' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
      (word, count) = word_count
      return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    output | beam.io.WriteStringsToPubSub(known_args.output_topic)

    def check_gbk_format():
      # A matcher that checks that the output of GBK is of the form word: count.
      def matcher(elements):
        # `elements` unpacks into (elements in the window, window). The window
        # is not needed here and is bound to an underscore-prefixed name so
        # that it does not shadow the `window` module used in this function.
        actual_elements_in_window, _window = elements
        for elm in actual_elements_in_window:
          assert re.match(r'\S+:\s+\d+', elm) is not None
      return matcher

    # Check that the format of the output is correct.
    assert_that(
        output,
        check_gbk_format(),
        use_global_window=False,
        label='Assert word:count format.')

    # Check also that elements are output in the right window.
    # This expects exactly 1 occurrence of any subset of the elements
    # 150, 151, 152, 153, 154 in the window [150, 155)
    # or exactly 1 occurrence of any subset of the elements
    # 210, 211, 212, 213, 214 in the window [210, 215).
    expected_window_to_elements = {
        window.IntervalWindow(150, 155): [
            ('150: 1'), ('151: 1'), ('152: 1'), ('153: 1'), ('154: 1'),
        ],
        window.IntervalWindow(210, 215): [
            ('210: 1'), ('211: 1'), ('212: 1'), ('213: 1'), ('214: 1'),
        ],
    }

    # To pass, publish numbers in [150-155) or [210-215) with no repeats.
    # To fail, publish a repeated number from one of the ranges above.
    # For example: '210 213 151 213'
    assert_that(
        output,
        equal_to_per_window(expected_window_to_elements),
        use_global_window=False,
        label='Assert correct streaming windowing.')
Beispiel #26
0
    def test_basic_execution_in_records_format(self):
        """Checks ReverseTestStream output in TEST_STREAM_FILE_RECORDS format.

        A TestStream script is replayed through ReverseTestStream and every
        output is stringified. The expected output is a file header followed
        by one record per event; the time values in the records are the
        TestStream inputs scaled by 1,000,000.
        """
        test_stream = (TestStream()
                       .advance_watermark_to(0)
                       .advance_processing_time(5)
                       .add_elements(['a', 'b', 'c']))
        # Five rounds of "bump the watermark by 2, let one unit of processing
        # time pass", ending with the watermark at 10.
        for watermark in (2, 4, 6, 8, 10):
            test_stream = (test_stream
                           .advance_watermark_to(watermark)
                           .advance_processing_time(1))
        test_stream = test_stream.add_elements([
            TimestampedValue(digit, 15) for digit in ('1', '2', '3')
        ])

        options = StandardOptions(streaming=True)
        p = TestPipeline(options=options)

        coder = beam.coders.FastPrimitivesCoder()
        reverse_stream = ReverseTestStream(
            sample_resolution_sec=1,
            coder=coder,
            output_format=OutputFormat.TEST_STREAM_FILE_RECORDS,
            output_tag=None)
        records = (p
                   | test_stream
                   | reverse_stream
                   | 'stringify' >> beam.Map(str))

        def file_record(**event_kwargs):
            # Stringified file record wrapping a single TestStreamPayload.Event.
            event = TestStreamPayload.Event(**event_kwargs)
            return str(TestStreamFileRecord(recorded_event=event))

        def processing_time_record(duration):
            advance = TestStreamPayload.Event.AdvanceProcessingTime(
                advance_duration=duration)
            return file_record(processing_time_event=advance)

        def watermark_record(new_watermark):
            advance = TestStreamPayload.Event.AdvanceWatermark(
                new_watermark=new_watermark)
            return file_record(watermark_event=advance)

        def elements_record(values, timestamp):
            # All values in one AddElements event share the same timestamp.
            added = TestStreamPayload.Event.AddElements(elements=[
                TestStreamPayload.TimestampedElement(
                    encoded_element=coder.encode(value), timestamp=timestamp)
                for value in values
            ])
            return file_record(element_event=added)

        expected_records = [
            str(TestStreamFileHeader()),
            processing_time_record(5000000),
            watermark_record(0),
            elements_record(['a', 'b', 'c'], 0),
        ]
        for new_watermark in (2000000, 4000000, 6000000, 8000000, 10000000):
            expected_records.append(watermark_record(new_watermark))
            expected_records.append(processing_time_record(1000000))
        expected_records.append(elements_record(['1', '2', '3'], 15000000))

        assert_that(
            records,
            equal_to_per_window({
                beam.window.GlobalWindow(): expected_records,
            }))

        p.run()
Beispiel #27
0
    def test_windowing(self):
        """Checks the events ReverseTestStream reports for a windowed GroupByKey.

        Two batches of elements are grouped in FixedWindows(5) and the grouped
        output is passed through ReverseTestStream. The expected output is a
        sequence of processing-time/watermark event pairs interleaved with
        one element event per grouped window.
        """
        test_stream = (TestStream()
                       .advance_watermark_to(0)
                       .add_elements(['a', 'b', 'c']))
        # Five single-unit processing time steps before the watermark moves.
        for _ in range(5):
            test_stream = test_stream.advance_processing_time(1)
        test_stream = (test_stream
                       .advance_watermark_to(5)
                       .add_elements(['1', '2', '3'])
                       .advance_processing_time(1))
        # Walk the watermark from 6 to 15, one processing time unit per step.
        for watermark in range(6, 16):
            test_stream = (test_stream
                           .advance_watermark_to(watermark)
                           .advance_processing_time(1))

        options = StandardOptions(streaming=True)
        p = TestPipeline(options=options)

        grouped = (p
                   | test_stream
                   | 'letter windows' >> beam.WindowInto(
                       FixedWindows(5),
                       accumulation_mode=trigger.AccumulationMode.DISCARDING)
                   | 'letter with key' >> beam.Map(lambda x: ('k', x))
                   | 'letter gbk' >> beam.GroupByKey())
        records = grouped | ReverseTestStream(sample_resolution_sec=1,
                                              output_tag=None)

        def tick(watermark):
            # One processing time step paired with the reported watermark.
            return [ProcessingTimeEvent(1), WatermarkEvent(watermark)]

        def grouped_output(values, timestamp):
            # A single grouped ('k', values) element with its timestamp.
            return [ElementEvent([TimestampedValue(('k', values), timestamp)])]

        expected_events = [
            [ProcessingTimeEvent(5), WatermarkEvent(4999998)],
            grouped_output(['a', 'b', 'c'], 4.999999),
        ]
        expected_events.extend(
            tick(step * 1000000) for step in range(5, 10))
        expected_events.append(grouped_output(['1', '2', '3'], 9.999999))
        expected_events.extend(
            tick(step * 1000000) for step in range(10, 16))

        assert_that(
            records,
            equal_to_per_window({
                beam.window.GlobalWindow(): expected_events,
            }))

        p.run()
Beispiel #28
0
def test_output():
    """Checks the per-window output of apply_transform on a scripted stream.

    Twenty "event" elements are emitted, one per second from 00:00:01 to
    00:00:20 UTC on 2021-03-01. The watermark is advanced to every multiple
    of five seconds just before the element with that timestamp is added,
    then to 00:00:25 and finally to infinity. The results are compared with
    the expected values for the window spanning that whole day.
    """
    def utc_seconds(second):
        # POSIX timestamp of 2021-03-01 00:00:<second> UTC.
        moment = datetime(2021, 3, 1, 0, 0, second, 0, tzinfo=pytz.UTC)
        return moment.timestamp()

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    test_pipeline = TestPipeline(options=options)

    stream = TestStream()
    for second in range(1, 21):
        if second % 5 == 0:
            # Move the watermark forward before adding the element that
            # carries the same timestamp.
            stream = stream.advance_watermark_to(utc_seconds(second))
        stream = stream.add_elements(
            elements=["event"], event_timestamp=utc_seconds(second))
    stream = (stream
              .advance_watermark_to(utc_seconds(25))
              .advance_watermark_to_infinity())

    events = test_pipeline | stream

    results = apply_transform(events)

    day_start = datetime(2021, 3, 1, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()
    day_end = datetime(2021, 3, 2, 0, 0, 0, 0, tzinfo=pytz.UTC).timestamp()
    answers = {
        window.IntervalWindow(day_start, day_end):
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
    }

    assert_that(
        results,
        equal_to_per_window(answers),
        label='count assert per window')

    test_pipeline.run()
def run(argv=None):
  """Build and run the streaming word-count debugging pipeline.

  The pipeline reads strings from a PubSub topic or subscription, passes them
  through AddTimestampFn and WordExtractingDoFn, counts each word per fixed
  window of size 5, writes the formatted "word: count" strings to the output
  PubSub topic, and attaches assertions on the output format and windowing.

  Args:
    argv: Command-line arguments. `--output_topic` is required, and exactly
      one of `--input_topic` / `--input_subscription` must be given. Arguments
      not recognised here are forwarded to PipelineOptions. When None,
      argparse falls back to the process arguments.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic', required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
  # Exactly one input source must be chosen: a topic or a subscription.
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument(
      '--input_topic',
      help=('Input PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
  group.add_argument(
      '--input_subscription',
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  p = beam.Pipeline(options=pipeline_options)

  # Read from PubSub into a PCollection.
  if known_args.input_subscription:
    lines = p | beam.io.ReadStringsFromPubSub(
        subscription=known_args.input_subscription)
  else:
    lines = p | beam.io.ReadStringsFromPubSub(topic=known_args.input_topic)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    """Collapse a grouped (word, iterable of ones) pair into (word, total)."""
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (lines
            | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn())
            | 'After AddTimestampFn' >> beam.ParDo(
                PrintFn('After AddTimestampFn'))
            | 'Split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(six.text_type))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | beam.WindowInto(window.FixedWindows(5, 0))
            | 'GroupByKey' >> beam.GroupByKey()
            | 'CountOnes' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    """Render a (word, count) pair as the string 'word: count'."""
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write to PubSub.
  # pylint: disable=expression-not-assigned
  output | beam.io.WriteStringsToPubSub(known_args.output_topic)

  def check_gbk_format():
    """Return a matcher asserting every output element looks like word: count."""
    def matcher(elements):
      # Each item is an (elements_in_window, window) pair; the window itself
      # is not needed here, so it is discarded rather than bound to a name
      # that would shadow the `window` module.
      actual_elements_in_window, _ = elements
      for elm in actual_elements_in_window:
        assert re.match(r'\S+:\s+\d+', elm) is not None, (
            'Unexpected output format: %r' % (elm,))
    return matcher

  # Check that the format of the output is correct.
  assert_that(
      output,
      check_gbk_format(),
      use_global_window=False,
      label='Assert word:count format.')

  # Check also that elements are output in the right window.
  # This expects exactly 1 occurrence of any subset of the elements
  # 150, 151, 152, 153, 154 in the window [150, 155)
  # or exactly 1 occurrence of any subset of the elements
  # 210, 211, 212, 213, 214 in the window [210, 215).
  expected_window_to_elements = {
      window.IntervalWindow(150, 155): [
          ('150: 1'), ('151: 1'), ('152: 1'), ('153: 1'), ('154: 1'),
      ],
      window.IntervalWindow(210, 215): [
          ('210: 1'), ('211: 1'), ('212: 1'), ('213: 1'), ('214: 1'),
      ],
  }

  # To make it pass, publish numbers in [150-155) or [210-215) with no repeats.
  # To make it fail, publish a repeated number from the ranges above.
  # For example: '210 213 151 213'
  assert_that(
      output,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='Assert correct streaming windowing.')

  result = p.run()
  result.wait_until_finish()