def test_deduplication_with_event_time(self):
  """Run Deduplicate with an event-time duration over a TestStream.

  Seven timestamped keys are fed in three batches, with the watermark
  advanced to 0, 30 and then to the deduplication duration (60) before
  each batch. The expected output is the first occurrence of 'k1', 'k2'
  and 'k3' plus the 'k1' stamped 70; the repeats stamped 40, 50 and 60
  are expected to be dropped.
  """
  deduplicate_duration = 60
  with self.create_pipeline() as p:
    # Element batches, in the order they are added to the stream.
    first_batch = [
        window.TimestampedValue(key, stamp)
        for key, stamp in (('k1', 0), ('k2', 20), ('k3', 30))
    ]
    repeat_batch = [
        window.TimestampedValue(key, stamp)
        for key, stamp in (('k1', 40), ('k2', 50), ('k3', 60))
    ]
    final_batch = [window.TimestampedValue('k1', 70)]

    # Build the stream one watermark step at a time.
    stream = TestStream(coder=coders.StrUtf8Coder()).with_output_types(str)
    stream = stream.advance_watermark_to(0).add_elements(first_batch)
    stream = stream.advance_watermark_to(30).add_elements(repeat_batch)
    stream = stream.advance_watermark_to(deduplicate_duration)
    stream = stream.add_elements(final_batch)
    test_stream = stream.advance_watermark_to_infinity()

    dedup = deduplicate.Deduplicate(
        event_time_duration=Duration(deduplicate_duration))
    # Pair every surviving element with its timestamp so the assertion
    # can check which occurrence was kept.
    res = (
        p
        | test_stream
        | dedup
        | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

    # NOTE(review): the 'k1' at 70 is presumably emitted again because the
    # 60-unit duration since the first 'k1' has elapsed -- confirm against
    # the Deduplicate implementation.
    expected = [
        (key, Timestamp(stamp))
        for key, stamp in (('k1', 0), ('k2', 20), ('k3', 30), ('k1', 70))
    ]
    assert_that(res, equal_to(expected))
def test_deduplication_in_different_windows(self):
  """Run Deduplicate after fixed windowing and check results per window.

  Nine timestamped keys are added to a TestStream, windowed into
  FixedWindows(30), and passed through Deduplicate with a processing-time
  duration of 10 * 60. 'k1', 'k2' and 'k3' each occur twice, once in the
  [0, 30) window and once in the [30, 60) window, and both occurrences
  are expected in the output.
  """
  with self.create_pipeline() as p:
    stamped_keys = (
        ('k1', 0), ('k2', 10), ('k3', 20),
        ('k1', 30), ('k2', 40), ('k3', 50),
        ('k4', 60), ('k5', 70), ('k6', 80),
    )
    elements = [
        window.TimestampedValue(key, stamp) for key, stamp in stamped_keys
    ]

    stream = TestStream(coder=coders.StrUtf8Coder())
    stream = stream.advance_watermark_to(0).add_elements(elements)
    test_stream = stream.advance_watermark_to_infinity()

    windowed = p | test_stream | beam.WindowInto(window.FixedWindows(30))
    deduped = windowed | deduplicate.Deduplicate(
        processing_time_duration=10 * 60)
    # Pair each element with its timestamp for the per-window assertion.
    res = deduped | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts))

    def with_timestamps(pairs):
      # Turn (key, seconds) pairs into (key, Timestamp) tuples.
      return [(key, Timestamp(stamp)) for key, stamp in pairs]

    # Deduplication is expected to be scoped to each window, so a key seen
    # in one window may appear again in the next.
    expect_unique_keys_per_window = {
        window.IntervalWindow(0, 30): with_timestamps(
            [('k1', 0), ('k2', 10), ('k3', 20)]),
        window.IntervalWindow(30, 60): with_timestamps(
            [('k1', 30), ('k2', 40), ('k3', 50)]),
        window.IntervalWindow(60, 90): with_timestamps(
            [('k4', 60), ('k5', 70), ('k6', 80)]),
    }
    assert_that(
        res,
        equal_to_per_window(expect_unique_keys_per_window),
        use_global_window=False,
        label='assert per window')