def test_deduplication_with_event_time(self):
        """Check event-time deduplication against a three-batch TestStream.

        The stream emits 'k1'..'k3' at timestamps 0/20/30, repeats the same
        keys at 40/50/60 once the watermark has been advanced to 30, and
        emits 'k1' one more time at 70 after the watermark has been advanced
        to the deduplication duration. The assertion expects only the first
        batch plus the final 'k1' to come out of ``Deduplicate``, each paired
        with its element timestamp.
        """
        dedup_seconds = 60

        initial_batch = [
            window.TimestampedValue('k1', 0),
            window.TimestampedValue('k2', 20),
            window.TimestampedValue('k3', 30),
        ]
        # Same keys again; none of these appear in the expected output.
        repeated_batch = [
            window.TimestampedValue('k1', 40),
            window.TimestampedValue('k2', 50),
            window.TimestampedValue('k3', 60),
        ]
        # Added after the watermark reaches ``dedup_seconds``; expected to be
        # emitted again.
        late_batch = [window.TimestampedValue('k1', 70)]

        with self.create_pipeline() as pipeline:
            source = (
                TestStream(coder=coders.StrUtf8Coder())
                .with_output_types(str)
                .advance_watermark_to(0)
                .add_elements(initial_batch)
                .advance_watermark_to(30)
                .add_elements(repeated_batch)
                .advance_watermark_to(dedup_seconds)
                .add_elements(late_batch)
                .advance_watermark_to_infinity())

            dedup = deduplicate.Deduplicate(
                event_time_duration=Duration(dedup_seconds))

            # Pair every surviving element with its timestamp so the
            # assertion can check both.
            stamped = (
                pipeline
                | source
                | dedup
                | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

            expected = [
                ('k1', Timestamp(0)),
                ('k2', Timestamp(20)),
                ('k3', Timestamp(30)),
                ('k1', Timestamp(70)),
            ]
            assert_that(stamped, equal_to(expected))
Example #2
0
  def test_deduplication_in_different_windows(self):
    """Check that ``Deduplicate`` is applied independently in each window.

    Nine timestamped keys are windowed with ``FixedWindows(30)`` and then
    deduplicated with a processing-time duration of ``10 * 60``. 'k1'..'k3'
    occur in both the [0, 30) and the [30, 60) window, and the assertion
    expects them to be emitted once in each of those windows.
    """
    # (key, event timestamp) pairs, in the order they are fed to the stream.
    keyed_timestamps = [
        ('k1', 0),
        ('k2', 10),
        ('k3', 20),
        ('k1', 30),
        ('k2', 40),
        ('k3', 50),
        ('k4', 60),
        ('k5', 70),
        ('k6', 80),
    ]

    with self.create_pipeline() as pipeline:
      elements = [
          window.TimestampedValue(key, ts) for key, ts in keyed_timestamps
      ]
      source = (
          TestStream(coder=coders.StrUtf8Coder())
          .advance_watermark_to(0)
          .add_elements(elements)
          .advance_watermark_to_infinity())

      stamped = (
          pipeline
          | source
          | beam.WindowInto(window.FixedWindows(30))
          | deduplicate.Deduplicate(processing_time_duration=10 * 60)
          | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

      # Deduplication should happen per window, so keys repeated across
      # windows show up once in every window they occur in.
      first_window = window.IntervalWindow(0, 30)
      second_window = window.IntervalWindow(30, 60)
      third_window = window.IntervalWindow(60, 90)
      expected_per_window = {
          first_window: [
              ('k1', Timestamp(0)),
              ('k2', Timestamp(10)),
              ('k3', Timestamp(20)),
          ],
          second_window: [
              ('k1', Timestamp(30)),
              ('k2', Timestamp(40)),
              ('k3', Timestamp(50)),
          ],
          third_window: [
              ('k4', Timestamp(60)),
              ('k5', Timestamp(70)),
              ('k6', Timestamp(80)),
          ],
      }

      assert_that(
          stamped,
          equal_to_per_window(expected_per_window),
          use_global_window=False,
          label='assert per window')