Beispiel #1
0
    def test_sessions_default(self):
        """Exercise Sessions(10) with DefaultTrigger, then with AfterWatermark.

        Both runs use ACCUMULATING mode. The trailing integers are forwarded
        positionally to ``run_trigger_simple``; they look like bundle
        groupings (TODO confirm against the helper's signature).
        """
        # Run 1: two elements, one expected window holding a single pane.
        default_events = [(1, 'a'), (2, 'b')]
        default_panes = {IntervalWindow(1, 12): [set('ab')]}
        self.run_trigger_simple(
            Sessions(10), DefaultTrigger(), AccumulationMode.ACCUMULATING,
            default_events, default_panes, 1, 2, -2, -1)

        # Run 2: out-of-order timestamps expected to land in two windows.
        watermark_events = [
            (1, 'a'), (2, 'b'), (15, 'c'), (16, 'd'),
            (30, 'z'), (9, 'e'), (10, 'f'), (30, 'y'),
        ]
        watermark_panes = {
            IntervalWindow(1, 26): [set('abcdef')],
            IntervalWindow(30, 40): [set('yz')],
        }
        groupings = (1, 2, 3, 4, 5, 6, -4, -2, -1)
        self.run_trigger_simple(
            Sessions(10), AfterWatermark(), AccumulationMode.ACCUMULATING,
            watermark_events, watermark_panes, *groupings)
Beispiel #2
0
 def test_reshuffle_window_fn_preserved(self):
   """Check windowed values before/after Reshuffle and after a GroupByKey.

   The same unmerged expectation is asserted on both sides of the Reshuffle;
   the merged expectation is asserted on the GroupByKey output.
   """
   def as_windowed_values(rows):
     # Each row is (value, timestamp, window); the window is wrapped in a list.
     return [TestWindowedValue(v, t, [w]) for (v, t, w) in rows]

   pipeline = TestPipeline()
   data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]

   unmerged_rows = [
       ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
       ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
       ((1, 4), 4.0, IntervalWindow(4.0, 6.0)),
   ]
   expected_windows = as_windowed_values(unmerged_rows)

   merged_rows = [
       ((1, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
       ((2, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
       ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
       ((1, [4]), 6.0, IntervalWindow(4.0, 6.0)),
   ]
   expected_merged_windows = as_windowed_values(merged_rows)

   # The second tuple field doubles as the element's event timestamp.
   timestamped = (
       pipeline
       | 'start' >> beam.Create(data)
       | 'add_timestamp' >> beam.Map(lambda kv: TimestampedValue(kv, kv[1])))
   before_reshuffle = (
       timestamped | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
   assert_that(
       before_reshuffle,
       equal_to(expected_windows),
       label='before_reshuffle',
       reify_windows=True)

   after_reshuffle = before_reshuffle | beam.Reshuffle()
   assert_that(
       after_reshuffle,
       equal_to(expected_windows),
       label='after_reshuffle',
       reify_windows=True)

   after_group = after_reshuffle | beam.GroupByKey()
   assert_that(
       after_group,
       equal_to(expected_merged_windows),
       label='after_group',
       reify_windows=True)

   pipeline.run()
Beispiel #3
0
 def test_reshuffle_windows_unchanged(self):
     """Assert the same grouped, windowed output before and after Reshuffle."""
     pipeline = TestPipeline()
     data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]

     # (value, timestamp, window) rows expected from the GroupByKey output.
     expected_rows = [
         ((1, [2, 1]), 4.0, IntervalWindow(1.0, 4.0)),
         ((2, [2, 1]), 4.0, IntervalWindow(1.0, 4.0)),
         ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
         ((1, [4]), 6.0, IntervalWindow(4.0, 6.0)),
     ]
     expected_data = [
         TestWindowedValue(value, timestamp, [window])
         for (value, timestamp, window) in expected_rows
     ]

     # The second tuple field doubles as the element's event timestamp.
     timestamped = (
         pipeline
         | 'start' >> beam.Create(data)
         | 'add_timestamp' >> beam.Map(
             lambda kv: beam.window.TimestampedValue(kv, kv[1])))
     windowed = timestamped | 'window' >> beam.WindowInto(Sessions(gap_size=2))
     before_reshuffle = windowed | 'group_by_key' >> beam.GroupByKey()
     assert_that(
         before_reshuffle,
         equal_to(expected_data),
         label='before_reshuffle',
         reify_windows=True)

     after_reshuffle = before_reshuffle | 'reshuffle' >> beam.Reshuffle()
     assert_that(
         after_reshuffle,
         equal_to(expected_data),
         label='after reshuffle',
         reify_windows=True)

     pipeline.run()
    def get_window(self):
        """
        Build the windowing strategy used by this object.

        :return: a ``Sessions`` instance created from ``self.gap_threshold``
        """
        gap = self.gap_threshold
        return Sessions(gap)
Beispiel #5
0
 def test_sessions_repeatedly_after_count(self):
     """Run Repeatedly(AfterCount(2)) on Sessions(10) in both accumulation modes.

     The input and the expected window are the same for both runs; only the
     accumulation mode and the expected second pane differ.
     """
     events = [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')]
     cases = [
         (AccumulationMode.ACCUMULATING, [set('abc'), set('abcde')]),
         (AccumulationMode.DISCARDING, [set('abc'), set('de')]),
     ]
     for mode, panes in cases:
         # Pass a fresh copy of the events to every run.
         self.run_trigger_simple(
             Sessions(10),
             Repeatedly(AfterCount(2)),
             mode,
             list(events),
             {IntervalWindow(1, 25): panes},
             1,
             3)
Beispiel #6
0
 def test_windowfn_encoding(self):
     """Each window fn must compare equal to its runner-API round trip."""
     window_fns = (
         GlobalWindows(),
         FixedWindows(37),
         SlidingWindows(2, 389),
         Sessions(5077),
     )
     for window_fn in window_fns:
         # A new context per window fn, shared by encode and decode.
         context = pipeline_context.PipelineContext()
         encoded = window_fn.to_runner_api(context)
         decoded = WindowFn.from_runner_api(encoded, context)
         self.assertEqual(window_fn, decoded)
Beispiel #7
0
 def test_sessions_after_all(self):
     """Run AfterAll(AfterCount(n), AfterWatermark()) on Sessions(10).

     The first run uses a count of 2 and no late data; the second uses a
     count of 5 and supplies extra elements through ``late_data``.
     """
     events = [(1, 'a'), (2, 'b'), (3, 'c')]

     # Run 1: AfterCount(2), no late data.
     on_time_panes = {IntervalWindow(1, 13): [set('abc')]}
     self.run_trigger_simple(
         Sessions(10),
         AfterAll(AfterCount(2), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         list(events),
         on_time_panes,
         1,
         2)

     # Run 2: AfterCount(5) with three late elements.
     late_events = [(1, 'x'), (2, 'y'), (3, 'z')]
     with_late_panes = {IntervalWindow(1, 13): [set('abcxy')]}
     self.run_trigger_simple(
         Sessions(10),
         AfterAll(AfterCount(5), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         list(events),
         with_late_panes,
         1,
         2,
         late_data=late_events)
Beispiel #8
0
  def test_sessions_after_each(self):
    """Run AfterEach(AfterCount(2), AfterCount(3)) once and under Repeatedly."""
    def fresh_events():
      # zip is a single-use iterator on Python 3, so build one per run.
      return zip(range(10), 'abcdefghij')

    # Run 1: plain AfterEach, two expected windows.
    single_pass_panes = {
        IntervalWindow(0, 11): [set('ab')],
        IntervalWindow(0, 15): [set('abcdef')],
    }
    self.run_trigger_simple(
        Sessions(10),
        AfterEach(AfterCount(2), AfterCount(3)),
        AccumulationMode.ACCUMULATING,
        fresh_events(),
        single_pass_panes,
        2)

    # Run 2: the same trigger under Repeatedly, one more expected window.
    repeated_panes = {
        IntervalWindow(0, 11): [set('ab')],
        IntervalWindow(0, 15): [set('abcdef')],
        IntervalWindow(0, 17): [set('abcdefgh')],
    }
    self.run_trigger_simple(
        Sessions(10),
        Repeatedly(AfterEach(AfterCount(2), AfterCount(3))),
        AccumulationMode.ACCUMULATING,
        fresh_events(),
        repeated_panes,
        2)
Beispiel #9
0
 def test_sessions_watermark(self):
     """Run AfterWatermark on Sessions(10) with two elements, expecting one pane."""
     events = [(1, 'a'), (2, 'b')]
     expected_panes = {IntervalWindow(1, 12): [set('ab')]}
     # Trailing ints are extra positional arguments for the helper.
     groupings = (1, 2, -2, -1)
     self.run_trigger_simple(
         Sessions(10), AfterWatermark(), AccumulationMode.ACCUMULATING,
         events, expected_panes, *groupings)
 def test_sessions(self):
     """Group timestamped 'key' values into Sessions(10) and check the result."""
     expected = [
         ('key @ [1.0, 13.0)', [1, 2, 3]),
         ('key @ [20.0, 45.0)', [20, 27, 35]),
     ]
     with TestPipeline() as p:
         pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
         windowed = pcoll | 'w' >> WindowInto(Sessions(10))
         grouped = windowed | GroupByKey()
         result = grouped | sort_values | reify_windows
         assert_that(result, equal_to(expected))
Beispiel #11
0
 def test_sessions_after_count(self):
   """Run AfterCount(2) on Sessions(10), expecting one pane in each of three windows."""
   events = [
       (1, 'a'), (15, 'b'), (6, 'c'),
       (30, 's'), (31, 't'),
       (50, 'z'), (50, 'y'),
   ]
   expected_panes = {
       IntervalWindow(1, 25): [set('abc')],
       IntervalWindow(30, 41): [set('st')],
       IntervalWindow(50, 60): [set('yz')],
   }
   self.run_trigger_simple(
       Sessions(10), AfterCount(2), AccumulationMode.ACCUMULATING,
       events, expected_panes, 1, 2, 3)
    def test_sessions_and_complex_trigger_accumulating(self):
        """Drive session windows with an early/late AfterWatermark trigger.

        A TestStream feeds elements for key 'k1' in three batches separated
        by watermark advances, and the output of GeneralTriggerManagerDoFn is
        compared against the expected (key, window, values) tuples in
        ACCUMULATING mode.
        """
        def tsv(key, value, ts):
            # Build a timestamped (key, value) element for the stream.
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        # The first batch is added with the watermark at 0; the later batches
        # are added after the watermark has been advanced to 50.
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                             tsv('k1', 3, 7), tsv('k1', 4, 30)])
              .advance_watermark_to(50)
              .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2),])
              .add_elements([tsv('k1', -1, 21)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Session windows with a gap of 10 and an AfterWatermark trigger that
        # has early (AfterCount(2)) and late (AfterCount(1)) sub-triggers;
        # allowed lateness is set to MAX_TIMESTAMP.seconds().
        windowing = Windowing(Sessions(10),
                              triggerfn=AfterWatermark(early=AfterCount(2),
                                                       late=AfterCount(1)),
                              accumulation_mode=AccumulationMode.ACCUMULATING,
                              allowed_lateness=MAX_TIMESTAMP.seconds())

        with TestPipeline() as p:
            # The final Map reduces each output to
            # (key, first window of the first value, set of values).
            result = (p
                      | test_stream
                      | WindowInto(windowing.windowfn)
                      | ParDo(trigger_manager._ReifyWindows())
                      | ParDo(trigger_manager._GroupBundlesByKey())
                      | ParDo(
                          trigger_manager.GeneralTriggerManagerDoFn(windowing))
                      | Map(lambda elm: (elm[0], elm[1][0].windows[0],
                                         set(v.value for v in elm[1]))))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
                    ('k1', IntervalWindow(30, 40), {4}),  # on time
                    ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
                    ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2,
                                                   -1}),  # late
                ]))
Beispiel #13
0
 def test_sessions_watermark_with_early_late(self):
     """Run AfterWatermark with early and late sub-triggers on Sessions(10).

     Expected panes are listed per window and labelled early / on time / late.
     """
     trigger_fn = AfterWatermark(early=AfterCount(2), late=AfterCount(1))
     events = [(1, 'a'), (15, 'b'), (7, 'c'), (30, 'd')]
     late_events = [(1, 'x'), (2, 'y'), (21, 'z')]

     first_session_panes = [
         set('abc'),  # early
         set('abc'),  # on time
         set('abcxy'),  # late
     ]
     second_session_panes = [set('d')]  # on time
     merged_session_panes = [set('abcdxyz')]  # late
     expected_panes = {
         IntervalWindow(1, 25): first_session_panes,
         IntervalWindow(30, 40): second_session_panes,
         IntervalWindow(1, 40): merged_session_panes,
     }

     self.run_trigger_simple(
         Sessions(10),
         trigger_fn,
         AccumulationMode.ACCUMULATING,
         events,
         expected_panes,
         2,
         late_data=late_events)
Beispiel #14
0
    def test_sessions_merging(self):
        """Check which windows Sessions(10) leaves after merging timestamps.

        Each case gives the expected sorted window list for a sequence of
        timestamps, including different arrival orders of the same values.
        """
        windowfn = Sessions(10)

        def merge(*timestamps):
            # Assign every timestamp up front, then add the windows one
            # group at a time, merging after each addition.
            assigned = [
                windowfn.assign(context(None, ts, [])) for ts in timestamps
            ]
            live = set()

            class TestMergeContext(WindowFn.MergeContext):
                def __init__(self):
                    super(TestMergeContext, self).__init__(live)

                def merge(self, to_be_merged, merge_result):
                    # Replace the merged windows with their union.
                    for stale in to_be_merged:
                        live.discard(stale)
                    live.add(merge_result)

            for group in assigned:
                live.update(group)
                windowfn.merge(TestMergeContext())
            # One more merge pass after all windows have been added.
            windowfn.merge(TestMergeContext())
            return sorted(live)

        cases = [
            ([IntervalWindow(2, 12)], (2,)),
            ([IntervalWindow(2, 12), IntervalWindow(19, 29)], (2, 19)),
            ([IntervalWindow(2, 19)], (2, 9)),
            ([IntervalWindow(2, 19)], (9, 2)),
            ([IntervalWindow(2, 19), IntervalWindow(19, 29)], (2, 9, 19)),
            ([IntervalWindow(2, 19), IntervalWindow(19, 29)], (19, 9, 2)),
            ([IntervalWindow(2, 25)], (2, 15, 10)),
        ]
        for expected, timestamps in cases:
            self.assertEqual(expected, merge(*timestamps))
Beispiel #15
0
 def expand(self, pcoll):
     """Window ``pcoll`` into sessions, then apply ``Count.PerElement``.

     The session gap is ``ONE_HOUR_IN_SECONDS``.
     """
     session_windowing = beam.WindowInto(
         Sessions(gap_size=ONE_HOUR_IN_SECONDS))
     windowed = pcoll | 'ComputeSessionsWindow' >> session_windowing
     return windowed | combiners.Count.PerElement()
        {
            "userId": "Andy",
            "click": 1,
            "timestamp": 1603113600
        },  # Event time: 13:20
    ])
    # fmt: on

    # Assign timestamp to metadata of elements such that Beam's window functions can
    # access and use them to group events.
    timestamped_events = events | "AddTimestamp" >> beam.ParDo(
        AddTimestampDoFn())

    windowed_events = timestamped_events | beam.WindowInto(
        # Each session must be separated by a time gap of at least 30 minutes (1800 sec)
        Sessions(gap_size=30 * 60),
        # Triggers determine when to emit the aggregated results of each window. Default
        # trigger outputs the aggregated result when it estimates all data has arrived,
        # and discards all subsequent data for that window.
        trigger=None,
        # Since a trigger can fire multiple times, the accumulation mode determines
        # whether the system accumulates the window panes as the trigger fires, or
        # discards them.
        accumulation_mode=None,
        # Policies for combining timestamps that occur within a window. Only relevant if
        # a grouping operation is applied to windows.
        timestamp_combiner=None,
        # By setting allowed_lateness we can handle late data. If allowed lateness is
        # set, the default trigger will emit new results immediately whenever late
        # data arrives.
        allowed_lateness=Duration(seconds=1 * 24 * 60 * 60),  # 1 day