def test_sliding_windows_assignment(self):
    """Timestamps in the same phase map to the same three sliding windows."""
    windowfn = SlidingWindows(size=15, period=5, offset=2)
    expected = [
        IntervalWindow(7, 22),
        IntervalWindow(2, 17),
        IntervalWindow(-3, 12),
    ]
    # 7, 8 and 11 all fall inside the identical set of overlapping windows.
    for ts in (7, 8, 11):
        self.assertEqual(expected, windowfn.assign(context('v', ts)))
def assign(self, context):
    """Bucket an element by timestamp: [0, 3), [3, 5), then [5, timestamp)."""
    ts = context.timestamp
    if ts < 3:
        return [IntervalWindow(0, 3)]
    if ts < 5:
        return [IntervalWindow(3, 5)]
    # Everything at or after 5 gets a window that ends at its own timestamp.
    return [IntervalWindow(5, ts)]
def test_sliding_windows_assignment_fraction_large_offset(self):
    """Fractional size/period with an offset greater than the period."""
    windowfn = SlidingWindows(size=3.5, period=2.5, offset=4.0)
    cases = [
        (1.7, [IntervalWindow(1.5, 5.0), IntervalWindow(-1.0, 2.5)]),
        (4.5, [IntervalWindow(4.0, 7.5), IntervalWindow(1.5, 5.0)]),
    ]
    for ts, expected in cases:
        self.assertEqual(expected, windowfn.assign(context('v', ts)))
def assign(self, context):
    """Open a window at the element's timestamp.

    A "final" element (per ``self.is_final``) opens a window that extends to
    MAX_TIMESTAMP; any other element opens a window of ``self.gap_size``.
    """
    ts = context.timestamp
    _logger.info("ASSIGN: %s %s" % (context.element, format_timestamp(ts)))
    if self.is_final(context.element):
        # Terminal marker: this window never closes.
        return [IntervalWindow(ts, MAX_TIMESTAMP)]
    return [IntervalWindow(ts, ts + self.gap_size)]
def test_fixed_watermark(self):
    # Default watermark trigger with fixed windows: each window emits a
    # single on-time pane holding everything assigned to it.
    self.run_trigger_simple(
        FixedWindows(10),  # pyformat break
        AfterWatermark(),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (13, 'c')],
        {IntervalWindow(0, 10): [set('ab')],
         IntervalWindow(10, 20): [set('c')]},
        1,
        2,
        3)
def setUp(self):
    """Create the fixtures shared by every test in this case."""
    # Three consecutive ten-unit windows: (0,10), (10,20), (20,30).
    self.window1, self.window2, self.window3 = (
        IntervalWindow(start, start + 10) for start in (0, 10, 20))
    # A single value stamped at 57, assigned to all three windows.
    self.windowed_value = WindowedValue(
        'a', 57, (self.window1, self.window2, self.window3))
    self.restriction = OffsetRange(0, 100)
    self.watermark_estimator_state = Timestamp(21)
    self.restriction_provider = TestOffsetRestrictionProvider()
    self.watermark_estimator = ManualWatermarkEstimator(Timestamp(42))
    # Show full diffs on assertion failures.
    self.maxDiff = None
def test_sessions_after_count(self):
    # AfterCount(2): each session fires once it holds two elements.
    # (6, 'c') bridges the sessions started at 1 and 15, producing the
    # merged window (1, 25) seen in the expectations.
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterCount(2),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (15, 'b'), (6, 'c'), (30, 's'), (31, 't'),
         (50, 'z'), (50, 'y')],
        {IntervalWindow(1, 25): [set('abc')],
         IntervalWindow(30, 41): [set('st')],
         IntervalWindow(50, 60): [set('yz')]},
        1,
        2,
        3)
def test_fixed_watermark_with_early(self):
    """Early firings via AfterCount(2), then the on-time pane at watermark."""
    data = [(1, 'a'), (2, 'b'), (3, 'c')]
    # The bundle size determines which elements land in the early pane.
    for bundle_size, panes in (
        (2, [set('ab'), set('abc')]),
        (3, [set('abc'), set('abc')])):
        self.run_trigger_simple(
            FixedWindows(10),  # pyformat break
            AfterWatermark(early=AfterCount(2)),
            AccumulationMode.ACCUMULATING,
            list(data),
            {IntervalWindow(0, 10): panes},
            bundle_size)
def test_late_data_behavior(self):
    """Late elements should produce an additional (late) pane per window."""
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
        # A single synthetic taxi "pickup" event, reused for every element.
        base_json_pickup = "{\"ride_id\":\"x\",\"point_idx\":1,\"latitude\":0.0,\"longitude\":0.0," \
            "\"timestamp\":\"00:00:00\",\"meter_reading\":1.0,\"meter_increment\":0.1," \
            "\"ride_status\":\"pickup\",\"passenger_count\":1}"
        # Two on-time elements at t=0, then two more elements still stamped
        # t=0 after the watermark has passed the window end — i.e. late data.
        test_stream = TestStream().advance_watermark_to(0).add_elements([
            TimestampedValue(base_json_pickup, 0),
            TimestampedValue(base_json_pickup, 0),
        ]).advance_watermark_to(
            60).advance_processing_time(60).add_elements([
                TimestampedValue(base_json_pickup, 0)
            ]).advance_watermark_to(300).advance_processing_time(
                240).add_elements([TimestampedValue(base_json_pickup, 0)])
        # Window [0, 60): presumably the on-time count followed by the
        # late re-count — confirm against TaxiCountTransform's semantics.
        EXPECTED_RESULTS = {
            IntervalWindow(0, 60): [2, 3]  # On Time and Late Result
        }
        taxi_counts_late = (p | test_stream | TaxiCountTransform())
        assert_that(
            taxi_counts_late,
            equal_to_per_window(EXPECTED_RESULTS),
            reify_windows=True)
def test_reified_value_assert_fail_unmatched_window(self):
    """equal_to with reify_windows fails when the windows do not match."""
    expected = [
        TestWindowedValue(value, MIN_TIMESTAMP, [IntervalWindow(0, 1)])
        for value in [1, 2, 3]
    ]
    # The created elements live in the global window, not in (0, 1), so
    # the pipeline-level assertion must raise.
    with self.assertRaises(Exception):
        with TestPipeline() as p:
            assert_that(
                p | Create([2, 3, 1]), equal_to(expected), reify_windows=True)
def decode_from_stream(self, in_, nested):
    """Decode an IntervalWindow: big-endian end time, then the span varint."""
    # Import locally to avoid a circular import at module load time.
    from apache_beam.transforms.window import IntervalWindow
    # The wire order matters: end time first, then the window span.
    end_ms = self._to_normal_time(in_.read_bigendian_uint64())
    start_ms = end_ms - in_.read_var_int64()
    return IntervalWindow(
        start=Timestamp(micros=start_ms * 1000),
        end=Timestamp(micros=end_ms * 1000))
def test_reshuffle_window_fn_preserved(self):
    """Reshuffle must preserve both window assignment and window merging."""
    any_order = contains_in_any_order
    with TestPipeline() as pipeline:
        data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
        # Each element is timestamped with its second field, so Sessions(2)
        # initially assigns each element a [t, t+2) window.
        expected_windows = [
            TestWindowedValue(v, t, [w]) for (v, t, w) in [
                ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
                ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
                ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
                ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
                ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
                ((1, 4), 4.0, IntervalWindow(4.0, 6.0))]
        ]
        # After GroupByKey, overlapping sessions are merged per key and the
        # output timestamp is shifted by -.001 relative to t (see below).
        expected_merged_windows = [
            TestWindowedValue(v, t - .001, [w]) for (v, t, w) in [
                ((1, any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
                ((2, any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
                ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
                ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]
        ]
        before_reshuffle = (
            pipeline
            | 'start' >> beam.Create(data)
            | 'add_timestamp' >> beam.Map(lambda v: TimestampedValue(v, v[1]))
            | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
        assert_that(
            before_reshuffle,
            equal_to(expected_windows),
            label='before_reshuffle',
            reify_windows=True)
        after_reshuffle = before_reshuffle | beam.Reshuffle()
        # Window assignment is unchanged by the reshuffle.
        assert_that(
            after_reshuffle,
            equal_to(expected_windows),
            label='after_reshuffle',
            reify_windows=True)
        # Grouping still merges the session windows after the reshuffle.
        after_group = after_reshuffle | beam.GroupByKey()
        assert_that(
            after_group,
            equal_to(expected_merged_windows),
            label='after_group',
            reify_windows=True)
def test_sessions_repeatedly_after_count(self):
    """Repeatedly(AfterCount(2)) refires; pane content depends on mode."""
    data = [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')]
    # ACCUMULATING panes keep everything seen so far; DISCARDING panes hold
    # only the elements since the previous firing.
    for mode, panes in (
        (AccumulationMode.ACCUMULATING, [set('abc'), set('abcde')]),
        (AccumulationMode.DISCARDING, [set('abc'), set('de')])):
        self.run_trigger_simple(
            Sessions(10),  # pyformat break
            Repeatedly(AfterCount(2)),
            mode,
            list(data),
            {IntervalWindow(1, 25): panes},
            1,
            3)
def test_fixed_after_count(self):
    """AfterCount(2) fires once per window; bundling changes the pane."""
    data = [(1, 'a'), (2, 'b'), (3, 'c'), (11, 'z')]
    # Smaller bundles fire after 'ab'; larger bundles deliver 'abc' at once.
    for pane, bundle_sizes in ((set('ab'), (1, 2)), (set('abc'), (3, 4))):
        self.run_trigger_simple(
            FixedWindows(10),  # pyformat break
            AfterCount(2),
            AccumulationMode.ACCUMULATING,
            list(data),
            {IntervalWindow(0, 10): [pane]},
            *bundle_sizes)
def test_fixed_after_first(self):
    # AfterAny fires as soon as either sub-trigger does.  In the first run
    # AfterCount(2) wins; in the second AfterCount(5) is never reached, so
    # the watermark fires the single pane — the expected pane omits the
    # late x/y/z elements.
    self.run_trigger_simple(
        FixedWindows(10),  # pyformat break
        AfterAny(AfterCount(2), AfterWatermark()),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        {IntervalWindow(0, 10): [set('ab')]},
        1,
        2)
    self.run_trigger_simple(
        FixedWindows(10),  # pyformat break
        AfterAny(AfterCount(5), AfterWatermark()),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        {IntervalWindow(0, 10): [set('abc')]},
        1,
        2,
        late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
def test_sessions_after_all(self):
    # AfterAll fires only once every sub-trigger is satisfied.  In the
    # second run AfterCount(5) needs the late 'x' and 'y' to reach five
    # elements, so the expected pane includes them; 'z' is absent —
    # presumably it arrives after the trigger has already fired.
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterAll(AfterCount(2), AfterWatermark()),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        {IntervalWindow(1, 13): [set('abc')]},
        1,
        2)
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterAll(AfterCount(5), AfterWatermark()),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        {IntervalWindow(1, 13): [set('abcxy')]},
        1,
        2,
        late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
def test_fixed_windows_simple_watermark(self):
    """Fixed one-second windows with the default (watermark) trigger."""
    def tsv(key, value, ts):
        return TimestampedValue((key, value), timestamp=ts)

    # yapf: disable
    test_stream = (
        TestStream()
        .advance_watermark_to(0)
        .add_elements([tsv('k1', 1, 0), tsv('k2', 1, 0),
                       tsv('k1', 2, 0), tsv('k2', 2, 0)])
        .add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
        .add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
        .add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
        .advance_watermark_to(1)
        # Stamped t=0 after the watermark passed 1: late for window [0, 1).
        .add_elements([tsv('k1', 6, 0)])
        .advance_watermark_to_infinity())
    # yapf: enable

    # Fixed, one-second windows with DefaultTrigger (after watermark)
    windowing = Windowing(FixedWindows(1),
                          allowed_lateness=MAX_TIMESTAMP.seconds())
    with TestPipeline() as p:
        # Reify windows, group bundles per key, and run the trigger manager
        # directly; output is (key, window, values-in-pane).
        result = (
            p
            | test_stream
            | WindowInto(windowing.windowfn)
            | ParDo(trigger_manager._ReifyWindows())
            | ParDo(trigger_manager._GroupBundlesByKey())
            | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
            | Map(lambda elm:
                  (elm[0], elm[1][0].windows[0],
                   [v.value for v in elm[1]])))
        assert_that(
            result,
            equal_to([
                ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
                ('k2', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
                ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
                ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
                ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
            ]))
def decode_from_stream(self, in_, nested):
    # Resolve IntervalWindow lazily and cache it in the module-level global
    # to avoid a circular import at module load time.
    global IntervalWindow
    if IntervalWindow is None:
        from apache_beam.transforms.window import IntervalWindow
    # Build an empty window and set the micros fields directly.  Wire
    # format: big-endian (shifted) end time in millis, then the window
    # span in millis as a varint.
    typed_value = IntervalWindow(None, None)
    typed_value._end_micros = (
        1000 * self._to_normal_time(in_.read_bigendian_uint64()))
    typed_value._start_micros = (
        typed_value._end_micros - 1000 * in_.read_var_int64())
    return typed_value
def test_sessions_after_each(self):
    # AfterEach runs its sub-triggers in sequence: fire after two elements,
    # then after three more.  Without Repeatedly the sequence runs once;
    # wrapped in Repeatedly it starts over, yielding a third pane.
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterEach(AfterCount(2), AfterCount(3)),
        AccumulationMode.ACCUMULATING,
        zip(range(10), 'abcdefghij'),
        {IntervalWindow(0, 11): [set('ab')],
         IntervalWindow(0, 15): [set('abcdef')]},
        2)
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        Repeatedly(AfterEach(AfterCount(2), AfterCount(3))),
        AccumulationMode.ACCUMULATING,
        zip(range(10), 'abcdefghij'),
        {IntervalWindow(0, 11): [set('ab')],
         IntervalWindow(0, 15): [set('abcdef')],
         IntervalWindow(0, 17): [set('abcdefgh')]},
        2)
def test_sessions_and_complex_trigger_accumulating(self):
    """Session windows with early/late firings and accumulating panes."""
    def tsv(key, value, ts):
        return TimestampedValue((key, value), timestamp=ts)

    # yapf: disable
    test_stream = (
        TestStream()
        .advance_watermark_to(0)
        .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                       tsv('k1', 3, 7), tsv('k1', 4, 30)])
        .advance_watermark_to(50)
        # These arrive after the watermark passed 50 — late data.
        .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2),])
        .add_elements([tsv('k1', -1, 21)])
        .advance_watermark_to_infinity())
    # yapf: enable

    # Sessions with gap 10, firing early after 2 elements and late after
    # each additional element, accumulating across panes.
    windowing = Windowing(Sessions(10),
                          triggerfn=AfterWatermark(early=AfterCount(2),
                                                   late=AfterCount(1)),
                          accumulation_mode=AccumulationMode.ACCUMULATING,
                          allowed_lateness=MAX_TIMESTAMP.seconds())
    with TestPipeline() as p:
        # Run the trigger manager directly; output (key, window, value-set).
        result = (p
                  | test_stream
                  | WindowInto(windowing.windowfn)
                  | ParDo(trigger_manager._ReifyWindows())
                  | ParDo(trigger_manager._GroupBundlesByKey())
                  | ParDo(
                      trigger_manager.GeneralTriggerManagerDoFn(windowing))
                  | Map(lambda elm:
                        (elm[0],
                         elm[1][0].windows[0],
                         set(v.value for v in elm[1]))))
        assert_that(
            result,
            equal_to([
                ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
                ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
                ('k1', IntervalWindow(30, 40), {4}),  # on time
                ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
                # The late element at t=21 bridges the two sessions,
                # merging them into (1, 40).
                ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2, -1}),  # late
            ]))
def test_param_windowed_value_coder(self):
    """ParamWindowedValueCoder encodes only the value; window/timestamp/pane
    come from a fixed, pre-encoded payload."""
    from apache_beam.transforms.window import IntervalWindow
    from apache_beam.utils.windowed_value import PaneInfo

    # Template windowed value whose timestamp/windows/pane become the
    # coder's fixed parameters.
    wv = windowed_value.create(
        b'',
        # Milliseconds to microseconds
        1000 * 1000,
        (IntervalWindow(11, 21),),
        PaneInfo(True, False, 1, 2, 3))
    windowed_value_coder = coders.WindowedValueCoder(
        coders.BytesCoder(), coders.IntervalWindowCoder())
    payload = windowed_value_coder.encode(wv)
    coder = coders.ParamWindowedValueCoder(
        payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()])

    # Test binary representation: only the value byte is emitted.
    self.assertEqual(b'\x01',
                     coder.encode(window.GlobalWindows.windowed_value(1)))

    # Test unnested: decoding restores the fixed window/timestamp/pane,
    # so a round trip of the first value yields the second.
    self.check_coder(
        coders.ParamWindowedValueCoder(
            payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()]),
        windowed_value.WindowedValue(
            3, 1, (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)),
        windowed_value.WindowedValue(
            1, 1, (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)))

    # Test nested
    self.check_coder(
        coders.TupleCoder((
            coders.ParamWindowedValueCoder(
                payload,
                [coders.FloatCoder(), coders.IntervalWindowCoder()]),
            coders.ParamWindowedValueCoder(
                payload,
                [coders.StrUtf8Coder(), coders.IntervalWindowCoder()]))),
        (windowed_value.WindowedValue(
            1.5, 1, (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)),
         windowed_value.WindowedValue(
             "abc", 1, (window.IntervalWindow(11, 21),),
             PaneInfo(True, False, 1, 2, 3))))
def test_reshuffle_windows_unchanged(self):
    # A Reshuffle applied after GroupByKey must not disturb the
    # already-merged session windows or the grouped values.
    pipeline = TestPipeline()
    data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
    # Elements are timestamped by their second field; Sessions(2) merges
    # overlapping windows per key during the GroupByKey.
    expected_data = [
        TestWindowedValue(v, t, [w]) for (v, t, w) in [
            ((1, contains_in_any_order([2, 1])), 4.0,
             IntervalWindow(1.0, 4.0)),
            ((2, contains_in_any_order([2, 1])), 4.0,
             IntervalWindow(1.0, 4.0)),
            ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
            ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]]
    before_reshuffle = (pipeline
                        | 'start' >> beam.Create(data)
                        | 'add_timestamp' >> beam.Map(
                            lambda v: beam.window.TimestampedValue(v, v[1]))
                        | 'window' >> beam.WindowInto(Sessions(gap_size=2))
                        | 'group_by_key' >> beam.GroupByKey())
    assert_that(before_reshuffle,
                equal_to(expected_data),
                label='before_reshuffle',
                reify_windows=True)
    after_reshuffle = before_reshuffle | beam.Reshuffle()
    # Same expectation after the reshuffle: windows are unchanged.
    assert_that(after_reshuffle,
                equal_to(expected_data),
                label='after reshuffle',
                reify_windows=True)
    pipeline.run()
def test_sessions_watermark(self):
    # Two elements one second apart merge into a single session (1, 12)
    # that fires once at the watermark.
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterWatermark(),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b')],
        {IntervalWindow(1, 12): [set('ab')]},
        1,
        2,
        -2,
        -1)
def test_sessions_watermark_with_early_late(self):
    # Session (1, 25) forms from a/b/c merging.  The late element at 21
    # ('z') bridges it with the (30, 40) session, so a final late pane
    # appears for the merged window (1, 40).
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (15, 'b'), (7, 'c'), (30, 'd')],
        {
            IntervalWindow(1, 25): [
                set('abc'),  # early
                set('abc'),  # on time
                set('abcxy')  # late
            ],
            IntervalWindow(30, 40): [
                set('d'),  # on time
            ],
            IntervalWindow(1, 40): [
                set('abcdxyz')  # late
            ],
        },
        2,
        late_data=[(1, 'x'), (2, 'y'), (21, 'z')])
def decode_from_stream(self, in_, nested):
    # type: (create_InputStream, bool) -> IntervalWindow

    # Resolve IntervalWindow lazily (cached in the module-level global) to
    # avoid a circular import.  Under TYPE_CHECKING the name is already
    # imported for annotations, so the runtime import is skipped.
    if not TYPE_CHECKING:
        global IntervalWindow
        if IntervalWindow is None:
            from apache_beam.transforms.window import IntervalWindow
    # instantiating with None is not part of the public interface
    typed_value = IntervalWindow(None, None)  # type: ignore[arg-type]
    # Wire format: big-endian (shifted) end time in millis, then the window
    # span in millis as a varint; both converted to micros here.
    typed_value._end_micros = (
        1000 * self._to_normal_time(in_.read_bigendian_uint64()))
    typed_value._start_micros = (typed_value._end_micros -
                                 1000 * in_.read_var_int64())
    return typed_value
def test_shard_naming(self):
    """Default naming: prefix[-window][-shard-of-total][suffix][.compression]."""
    namer = fileio.default_file_naming(prefix='/path/to/file', suffix='.txt')
    cases = [
        ((GlobalWindow(), None, None, None, None, None),
         '/path/to/file.txt'),
        ((GlobalWindow(), None, 1, 5, None, None),
         '/path/to/file-00001-of-00005.txt'),
        ((GlobalWindow(), None, 1, 5, 'gz', None),
         '/path/to/file-00001-of-00005.txt.gz'),
        # A bounded window adds ISO start/end timestamps to the name.
        ((IntervalWindow(0, 100), None, 1, 5, None, None),
         '/path/to/file'
         '-1970-01-01T00:00:00-1970-01-01T00:01:40-00001-of-00005.txt'),
    ]
    for args, expected in cases:
        self.assertEqual(namer(*args), expected)
def test_sessions_merging(self):
    """Sessions(10) merges overlapping windows, transitively and in any order."""
    windowfn = Sessions(10)

    def merge(*timestamps):
        # Assign each timestamp its initial [t, t+10) session window.
        assigned = [
            windowfn.assign(context(None, t, [])) for t in timestamps
        ]
        merged = set()

        class RecordingMergeContext(WindowFn.MergeContext):
            def __init__(self):
                super().__init__(merged)

            def merge(self, to_be_merged, merge_result):
                # Replace the merged windows with their single result.
                for window in to_be_merged:
                    merged.discard(window)
                merged.add(merge_result)

        for windows in assigned:
            merged.update(windows)
        # Merging twice verifies the operation is idempotent.
        windowfn.merge(RecordingMergeContext())
        windowfn.merge(RecordingMergeContext())
        return sorted(merged)

    self.assertEqual([IntervalWindow(2, 12)], merge(2))
    self.assertEqual([IntervalWindow(2, 12), IntervalWindow(19, 29)],
                     merge(2, 19))
    self.assertEqual([IntervalWindow(2, 19)], merge(2, 9))
    self.assertEqual([IntervalWindow(2, 19)], merge(9, 2))
    self.assertEqual([IntervalWindow(2, 19), IntervalWindow(19, 29)],
                     merge(2, 9, 19))
    self.assertEqual([IntervalWindow(2, 19), IntervalWindow(19, 29)],
                     merge(19, 9, 2))
    self.assertEqual([IntervalWindow(2, 25)], merge(2, 15, 10))
def test_fixed_watermark_with_early_late(self):
    # Early panes fire per AfterCount(3), the on-time pane at the
    # watermark, and late panes per AfterCount(2).  DISCARDING mode means
    # each pane holds only the elements since the previous firing.
    self.run_trigger_simple(
        FixedWindows(100),  # pyformat break
        AfterWatermark(early=AfterCount(3), late=AfterCount(2)),
        AccumulationMode.DISCARDING,
        zip(range(9), 'abcdefghi'),
        {IntervalWindow(0, 100): [
            set('abcd'),
            set('efgh'),  # early
            set('i'),  # on time
            set('vw'),
            set('xy')  # late
        ]},
        2,
        late_data=zip(range(5), 'vwxyz'))
def merge(self, merge_context):
    """Merge overlapping windows; a MAX_TIMESTAMP window ends the scan.

    Windows are visited in start order.  ``to_merge`` accumulates the
    current overlapping run and ``end`` tracks the furthest end seen in
    that run.
    """
    to_merge = []
    end = MIN_TIMESTAMP
    _logger.info("%d windows" % len(merge_context.windows))
    for w in sorted(merge_context.windows, key=lambda w: w.start):
        _logger.info("WINDOW: (%s, %s)" % (format_timestamp(w.start),
                                           format_timestamp(w.end)))
        if to_merge:
            if end > w.start:
                # window `w` overlaps with `to_merge`: add it
                to_merge.append(w)
                if w.end == MAX_TIMESTAMP:
                    _logger.info("FINAL: (%s, %s)" % (format_timestamp(
                        to_merge[0].start), format_timestamp(end)))
                    # we don't want any more windows on this key
                    # NOTE(review): `end` is reset to w.start here, so the
                    # flush below merges up to the final window's start —
                    # confirm this truncation is intended.
                    end = w.start
                    break
                elif w.end > end:
                    end = w.end
            else:
                # No overlap: flush the finished run and start a new one.
                # FIXME: this check seems superfluous
                if len(to_merge) > 1:
                    _logger.info("NEW: (%s, %s)" % (format_timestamp(
                        to_merge[0].start), format_timestamp(end)))
                    merge_context.merge(
                        to_merge, IntervalWindow(to_merge[0].start, end))
                to_merge = [w]
                end = w.end
        else:
            # First window of the scan starts the initial run.
            to_merge = [w]
            end = w.end
    # Flush the last run (also reached via `break` on a FINAL window).
    if len(to_merge) > 1:
        _logger.info(
            "NEW: (%s, %s)" %
            (format_timestamp(to_merge[0].start), format_timestamp(end)))
        merge_context.merge(to_merge, IntervalWindow(to_merge[0].start, end))
def test_repeatedly_after_first(self):
    # Repeatedly(AfterAny(...)) keeps firing: panes appear every three
    # elements and at the watermark, and the late x/y/z elements keep
    # extending the accumulated pane one at a time.
    self.run_trigger_simple(
        FixedWindows(100),  # pyformat break
        Repeatedly(AfterAny(AfterCount(3), AfterWatermark())),
        AccumulationMode.ACCUMULATING,
        zip(range(7), 'abcdefg'),
        {IntervalWindow(0, 100): [
            set('abc'),
            set('abcdef'),
            set('abcdefg'),
            set('abcdefgx'),
            set('abcdefgxy'),
            set('abcdefgxyz')]},
        1,
        late_data=zip(range(3), 'xyz'))