Example #1
    def test_multi_triggered_gbk_side_input(self):
        """Test a GBK sideinput, with multiple triggering."""
        # TODO(BEAM-9322): Remove use of this experiment.
        # This flag is only necessary when using the multi-output TestStream b/c
        # it relies on using the PCollection output tags as the PCollection output
        # ids.
        p = TestPipeline(additional_pipeline_args=[
            '--experiments=' + 'passthrough_pcollection_output_ids'
        ])

        test_stream = (
            p
            | 'Mixed TestStream' >> TestStream().advance_watermark_to(
                3, tag='main').add_elements(
                    ['a1'], tag='main').advance_watermark_to(
                        8, tag='main').add_elements(['a2'], tag='main').
            add_elements([window.TimestampedValue(
                ('k', 100), 2)], tag='side').add_elements(
                    [window.TimestampedValue(('k', 400), 7)],
                    tag='side').advance_watermark_to_infinity(
                        tag='main').advance_watermark_to_infinity(tag='side'))

        main_data = (
            test_stream['main']
            | 'Main windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                accumulation_mode=trigger.AccumulationMode.DISCARDING))

        side_data = (
            test_stream['side']
            | 'Side windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda k_vs: k_vs[1]))

        class RecordFn(beam.DoFn):
            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        records = (main_data
                   | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

        expected_window_to_elements = {
            window.IntervalWindow(0, 5): [
                ('a1', Timestamp(3), [100, 0]),
            ],
            window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='assert per window')

        p.run()
Example #2
    def test_basic_execution_sideinputs(self):
        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:

            test_stream = (p | TestStream()
                .advance_watermark_to(0, tag='side')
                .advance_watermark_to(10, tag='main')
                .add_elements(['e'], tag='main')
                .add_elements([window.TimestampedValue(2, 2)], tag='side')
                .add_elements([window.TimestampedValue(1, 1)], tag='side')
                .add_elements([window.TimestampedValue(7, 7)], tag='side')
                .add_elements([window.TimestampedValue(4, 4)], tag='side')
                ) # yapf: disable

            main_stream = test_stream['main']
            side_stream = test_stream['side']

            class RecordFn(beam.DoFn):
                def process(self,
                            elm=beam.DoFn.ElementParam,
                            ts=beam.DoFn.TimestampParam,
                            side=beam.DoFn.SideInputParam):
                    yield (elm, ts, side)

            records = (
                main_stream  # pylint: disable=unused-variable
                | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

            assert_that(records, equal_to([('e', Timestamp(10), [2, 1, 7,
                                                                 4])]))
Example #3
    def test_combiner_latest(self):
        """Test TimestampCombiner with LATEST."""
        options = PipelineOptions(streaming=True)
        with TestPipeline(options=options) as p:
            result = (
                p
                | TestStream().add_elements([
                    window.TimestampedValue(('k', 100), 2)
                ]).add_elements([window.TimestampedValue(
                    ('k', 400), 7)]).advance_watermark_to_infinity()
                | beam.WindowInto(
                    window.FixedWindows(10),
                    timestamp_combiner=TimestampCombiner.OUTPUT_AT_LATEST)
                | beam.CombinePerKey(sum))

            records = (
                result
                | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

            # All KV pairs with the same key are grouped, and the combined
            # value's output timestamp is the LATEST of the input timestamps.
            expected_window_to_elements = {
                window.IntervalWindow(0, 10): [
                    (('k', 500), Timestamp(7)),
                ],
            }

            assert_that(records,
                        equal_to_per_window(expected_window_to_elements),
                        use_global_window=False,
                        label='assert per window')
Example #4
  def test_globally(self):
    l = [
        window.TimestampedValue(3, 100),
        window.TimestampedValue(1, 200),
        window.TimestampedValue(2, 300)
    ]
    with TestPipeline() as p:
      # Map(lambda x: x) PTransform is added after Create here, because when
      # a PCollection of TimestampedValues is created with Create PTransform,
      # the timestamps are not assigned to it. Adding a Map forces the
      # PCollection to go through a DoFn so that the PCollection consists of
      # the elements with timestamps assigned to them instead of a PCollection
      # of TimestampedValue(element, timestamp).
      pcoll = p | Create(l) | Map(lambda x: x)
      latest = pcoll | combine.Latest.Globally()
      assert_that(latest, equal_to([2]))

      # Now for global combines without default
      windowed = pcoll | 'window' >> WindowInto(FixedWindows(180))
      result_windowed = (
          windowed
          |
          'latest wo defaults' >> combine.Latest.Globally().without_defaults())

      assert_that(result_windowed, equal_to([3, 2]), label='latest-wo-defaults')
Example #5
 def test_per_key(self):
   l = [window.TimestampedValue(('a', 1), 300),
        window.TimestampedValue(('b', 3), 100),
        window.TimestampedValue(('a', 2), 200)]
   with TestPipeline() as p:
     pc = p | Create(l) | Map(lambda x: x)
     latest = pc | combine.Latest.PerKey()
     assert_that(latest, equal_to([('a', 1), ('b', 3)]))
Example #6
    def test_multi_triggered_gbk_side_input(self):
        """Test a GBK sideinput, with multiple triggering."""
        options = StandardOptions(streaming=True)
        p = TestPipeline(options=options)

        test_stream = (
            p
            | 'Mixed TestStream' >> TestStream().advance_watermark_to(
                3, tag='main').add_elements(
                    ['a1'], tag='main').advance_watermark_to(
                        8, tag='main').add_elements(['a2'], tag='main').
            add_elements([window.TimestampedValue(
                ('k', 100), 2)], tag='side').add_elements(
                    [window.TimestampedValue(('k', 400), 7)],
                    tag='side').advance_watermark_to_infinity(
                        tag='main').advance_watermark_to_infinity(tag='side'))

        main_data = (
            test_stream['main']
            | 'Main windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                accumulation_mode=trigger.AccumulationMode.DISCARDING))

        side_data = (
            test_stream['side']
            | 'Side windowInto' >> beam.WindowInto(
                window.FixedWindows(5),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda k_vs: k_vs[1]))

        class RecordFn(beam.DoFn):
            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        records = (main_data
                   | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

        expected_window_to_elements = {
            window.IntervalWindow(0, 5): [
                ('a1', Timestamp(3), [100, 0]),
            ],
            window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='assert per window')

        p.run()
Example #7
  def test_basic_execution_sideinputs_fixed_windows(self):
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment(
        'passthrough_pcollection_output_ids')
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    test_stream = (p | TestStream()
        .advance_watermark_to(12, tag='side')
        .add_elements([window.TimestampedValue('s1', 10)], tag='side')
        .advance_watermark_to(20, tag='side')
        .add_elements([window.TimestampedValue('s2', 20)], tag='side')

        .advance_watermark_to(9, tag='main')
        .add_elements(['a1', 'a2', 'a3', 'a4'], tag='main')
        .add_elements(['b'], tag='main')
        .advance_watermark_to(18, tag='main')
        .add_elements('c', tag='main')
        ) # yapf: disable

    main_stream = (
        test_stream['main']
        | 'main windowInto' >> beam.WindowInto(window.FixedWindows(1)))

    side_stream = (
        test_stream['side']
        | 'side windowInto' >> beam.WindowInto(window.FixedWindows(3)))

    class RecordFn(beam.DoFn):
      def process(
          self,
          elm=beam.DoFn.ElementParam,
          ts=beam.DoFn.TimestampParam,
          side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (
        main_stream  # pylint: disable=unused-variable
        | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(9, 10): [
            ('a1', Timestamp(9), ['s1']), ('a2', Timestamp(9), ['s1']),
            ('a3', Timestamp(9), ['s1']), ('a4', Timestamp(9), ['s1']),
            ('b', Timestamp(9), ['s1'])
        ],
        window.IntervalWindow(18, 19): [('c', Timestamp(18), ['s2'])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        label='assert per window')

    p.run()
Example #8
  def test_basic_execution_sideinputs_fixed_windows(self):

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result     # pylint: disable=global-variable-undefined
    result = []

    def recorded_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    main_stream = (p
                   | 'main TestStream' >> TestStream()
                   .advance_watermark_to(9)
                   .add_elements(['a1', 'a2', 'a3', 'a4'])
                   .add_elements(['b'])
                   .advance_watermark_to(18)
                   .add_elements('c')
                   | 'main windowInto' >> beam.WindowInto(
                       window.FixedWindows(1))
                  )
    side_stream = (p
                   | 'side TestStream' >> TestStream()
                   .advance_watermark_to(12)
                   .add_elements([window.TimestampedValue('s1', 10)])
                   .advance_watermark_to(20)
                   .add_elements([window.TimestampedValue('s2', 20)])
                   | 'side windowInto' >> beam.WindowInto(
                       window.FixedWindows(3))
                  )

    class RecordFn(beam.DoFn):
      def process(self,
                  elm=beam.DoFn.ElementParam,
                  ts=beam.DoFn.TimestampParam,
                  side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (main_stream     # pylint: disable=unused-variable
               | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream))
               | beam.Map(recorded_elements))
    p.run()

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([('a1', Timestamp(9), ['s1']),
                      ('a2', Timestamp(9), ['s1']),
                      ('a3', Timestamp(9), ['s1']),
                      ('a4', Timestamp(9), ['s1']),
                      ('b', Timestamp(9), ['s1']),
                      ('c', Timestamp(18), ['s2'])], result)
Example #9
 def test_globally(self):
   l = [window.TimestampedValue(3, 100),
        window.TimestampedValue(1, 200),
        window.TimestampedValue(2, 300)]
   with TestPipeline() as p:
     # Map(lambda x: x) PTransform is added after Create here, because when
     # a PCollection of TimestampedValues is created with Create PTransform,
     # the timestamps are not assigned to it. Adding a Map forces the
     # PCollection to go through a DoFn so that the PCollection consists of
     # the elements with timestamps assigned to them instead of a PCollection
     # of TimestampedValue(element, timestamp).
     pc = p | Create(l) | Map(lambda x: x)
     latest = pc | combine.Latest.Globally()
     assert_that(latest, equal_to([2]))
Example #10
    def generate_events(self):
        publish_client = pubsub.Client(project=self.project)
        topic = publish_client.topic(self.topic_name)
        sub = topic.subscription(self.subscription_name)

        logging.info('Generating auction events to topic %s', topic.name)

        if self.args.input.startswith('gs://'):
            from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
            fs = GCSFileSystem(self.pipeline_options)
            with fs.open(self.args.input) as infile:
                for line in infile:
                    topic.publish(line)
        else:
            with open(self.args.input) as infile:
                for line in infile:
                    topic.publish(line)

        logging.info('Finished event generation.')

        # Read from PubSub into a PCollection.
        if self.args.subscription_name:
            raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription=sub.full_name)
        else:
            raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                topic=topic.full_name)
        raw_events = (
            raw_events
            | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEvnetFn())
            | 'timestamping' >>
            beam.Map(lambda e: window.TimestampedValue(e, e.date_time)))
        return raw_events
Example #11
File: sync.py  Project: sambvfx/rillbeam
def main_without_pubsub(options):
    from rillbeam.transforms import SleepFn

    with beam.Pipeline(options=options) as pipe:

        # FIXME: still can't "fake" timestamp data like we get from pubsub...
        graph = (
            pipe
            | 'start' >> beam.Create([(k, k) for k in range(5)])
            # The purpose of the WindowInto transform is to establish a
            # FixedWindows windowing function for the PCollection.
            # It does not bucket elements into windows since the timestamps
            # from Create are not spaced 5 ms apart and very likely they all
            # fall into the same window.
            | 'w' >> beam.WindowInto(window.FixedWindows(5))
            # Generate timestamped values using the values as timestamps.
            # Now there are values 5 ms apart and since Map propagates the
            # windowing function from input to output the output PCollection
            # will have elements falling into different 5ms windows.
            | beam.Map(lambda x_t2: window.TimestampedValue(x_t2[0], x_t2[1])))

        b1 = (graph
              | 'AsInt' >> beam.Map(lambda x: int(x))
              | 'LogInt' >> Log())

        b2 = (graph
              | 'AsStr' >> beam.Map(lambda x: str(x))
              | 'LogStr' >> Log())

        b3 = (b1
              | 'Sleep' >> beam.ParDo(SleepFn(), duration=0.2)
              | 'AsFloat' >> beam.Map(lambda x: float(x))
              | 'LogFloat' >> Log())

        ((b1, b2, b3) | Sync() | 'SyncLog' >> Log())
Example #12
    def test_stateful_set_state_clean_portably(self):
        class SetStateClearingStatefulDoFn(beam.DoFn):

            SET_STATE = SetStateSpec('buffer', VarIntCoder())
            EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.WATERMARK)

            def process(self,
                        element,
                        set_state=beam.DoFn.StateParam(SET_STATE),
                        emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
                _, value = element
                set_state.add(value)

                all_elements = [element for element in set_state.read()]

                if len(all_elements) == 5:
                    set_state.clear()
                    set_state.add(100)
                    emit_timer.set(1)

            @on_timer(EMIT_TIMER)
            def emit_values(self, set_state=beam.DoFn.StateParam(SET_STATE)):
                yield sorted(set_state.read())

        with TestPipeline() as p:
            values = p | beam.Create([('key', 1), ('key', 2), ('key', 3),
                                      ('key', 4), ('key', 5)])
            actual_values = (values
                             |
                             beam.Map(lambda t: window.TimestampedValue(t, 1))
                             | beam.WindowInto(window.FixedWindows(1))
                             | beam.ParDo(SetStateClearingStatefulDoFn()))

            assert_that(actual_values, equal_to([[100]]))
Example #13
  def test_pardo_side_inputs(self):
    def cross_product(elem, sides):
      for side in sides:
        yield elem, side
    with self.create_pipeline() as p:
      main = p | 'main' >> beam.Create(['a', 'b', 'c'])
      side = p | 'side' >> beam.Create(['x', 'y'])
      assert_that(main | beam.FlatMap(cross_product, beam.pvalue.AsList(side)),
                  equal_to([('a', 'x'), ('b', 'x'), ('c', 'x'),
                            ('a', 'y'), ('b', 'y'), ('c', 'y')]))

      # Now with some windowing.
      pcoll = p | beam.Create(range(10)) | beam.Map(
          lambda t: window.TimestampedValue(t, t))
      # Intentionally choosing non-aligned windows to highlight the transition.
      main = pcoll | 'WindowMain' >> beam.WindowInto(window.FixedWindows(5))
      side = pcoll | 'WindowSide' >> beam.WindowInto(window.FixedWindows(7))
      res = main | beam.Map(lambda x, s: (x, sorted(s)),
                            beam.pvalue.AsList(side))
      assert_that(
          res,
          equal_to([
              # The window [0, 5) maps to the window [0, 7).
              (0, range(7)),
              (1, range(7)),
              (2, range(7)),
              (3, range(7)),
              (4, range(7)),
              # The window [5, 10) maps to the window [7, 14).
              (5, range(7, 10)),
              (6, range(7, 10)),
              (7, range(7, 10)),
              (8, range(7, 10)),
              (9, range(7, 10))]),
          label='windowed')
Example #14
    def test_basic_execution_batch_sideinputs(self):
        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=options)

        main_stream = (p
                       | 'main TestStream' >> TestStream()
                       .advance_watermark_to(10)
                       .add_elements(['e'])
                       .advance_watermark_to_infinity())  # yapf: disable
        side = (p
                | beam.Create([2, 1, 4])
                | beam.Map(lambda t: window.TimestampedValue(t, t)))

        class RecordFn(beam.DoFn):
            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, sorted(side))

        records = (
            main_stream  # pylint: disable=unused-variable
            | beam.ParDo(RecordFn(), beam.pvalue.AsList(side)))

        assert_that(records, equal_to([('e', Timestamp(10), [1, 2, 4])]))

        p.run()
Example #15
 def test_pardo_windowed_side_inputs(self):
     with self.create_pipeline() as p:
         # Now with some windowing.
         pcoll = p | beam.Create(list(
             range(10))) | beam.Map(lambda t: window.TimestampedValue(t, t))
         # Intentionally choosing non-aligned windows to highlight the transition.
         main = pcoll | 'WindowMain' >> beam.WindowInto(
             window.FixedWindows(5))
         side = pcoll | 'WindowSide' >> beam.WindowInto(
             window.FixedWindows(7))
         res = main | beam.Map(lambda x, s:
                               (x, sorted(s)), beam.pvalue.AsList(side))
         assert_that(
             res,
             equal_to([
                 # The window [0, 5) maps to the window [0, 7).
                 (0, list(range(7))),
                 (1, list(range(7))),
                 (2, list(range(7))),
                 (3, list(range(7))),
                 (4, list(range(7))),
                 # The window [5, 10) maps to the window [7, 14).
                 (5, list(range(7, 10))),
                 (6, list(range(7, 10))),
                 (7, list(range(7, 10))),
                 (8, list(range(7, 10))),
                 (9, list(range(7, 10)))
             ]),
             label='windowed')
Example #16
 def read_from_file(self):
     return (
         self.pipeline
         | 'reading_from_file' >> beam.io.ReadFromText(self.args.input)
         | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEventFn())
         | 'timestamping' >>
         beam.Map(lambda e: window.TimestampedValue(e, e.date_time)))
Example #17
def assign_timevalue(v):
    # Attach a timestamp to each element of the PCollection.
    # Downstream windows are split based on this timestamp.
    # Here the timestamp is set roughly, using the current time plus a random offset.
    import apache_beam.transforms.window as window
    import random
    import time
    return window.TimestampedValue(v, int(time.time()) + random.randint(0, 1))
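
A minimal usage sketch (hypothetical wiring, not part of the original source): a function like assign_timevalue is typically applied with beam.Map before WindowInto, so that downstream fixed windows split on the attached timestamps. The pipeline, input elements, and window size below are illustrative assumptions and presume the usual imports (apache_beam as beam, apache_beam.transforms.window as window).

    # Hypothetical usage: attach timestamps, then window on them.
    with beam.Pipeline() as p:
        _ = (
            p
            | beam.Create(['a', 'b', 'c'])              # illustrative input
            | beam.Map(assign_timevalue)                # timestamps come from assign_timevalue
            | beam.WindowInto(window.FixedWindows(1)))  # buckets elements by those timestamps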
Example #18
 def process(self, element, init_result):
     bundle = element
     writer = self.sink.open_writer(init_result, str(uuid.uuid4()))
     for e in bundle[1]:  # values
         writer.write(e)
     return [
         window.TimestampedValue(writer.close(), timestamp.MAX_TIMESTAMP)
     ]
Example #19
    def test_deduplication_with_event_time(self):
        deduplicate_duration = 60
        with self.create_pipeline() as p:
            test_stream = (TestStream(coder=coders.StrUtf8Coder(
            )).with_output_types(str).advance_watermark_to(0).add_elements([
                window.TimestampedValue('k1', 0),
                window.TimestampedValue('k2', 20),
                window.TimestampedValue('k3', 30)
            ]).advance_watermark_to(30).add_elements([
                window.TimestampedValue('k1', 40),
                window.TimestampedValue('k2', 50),
                window.TimestampedValue('k3', 60)
            ]).advance_watermark_to(deduplicate_duration).add_elements(
                [window.TimestampedValue('k1',
                                         70)]).advance_watermark_to_infinity())
            res = (p
                   | test_stream
                   | deduplicate.Deduplicate(
                       event_time_duration=Duration(deduplicate_duration))
                   | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

            assert_that(
                res,
                equal_to([('k1', Timestamp(0)), ('k2', Timestamp(20)),
                          ('k3', Timestamp(30)), ('k1', Timestamp(70))]))
Example #20
 def test_windowing(self):
     with self.create_pipeline() as p:
         res = (p
                | beam.Create([1, 2, 100, 101, 102])
                | beam.Map(lambda t: window.TimestampedValue(('k', t), t))
                | beam.WindowInto(beam.transforms.window.Sessions(10))
                | beam.GroupByKey()
                | beam.Map(lambda k_vs1: (k_vs1[0], sorted(k_vs1[1]))))
         assert_that(res, equal_to([('k', [1, 2]), ('k', [100, 101, 102])]))
Example #21
  def _run_pardo_state_timers(self, windowed):
    state_spec = userstate.BagStateSpec('state', beam.coders.StrUtf8Coder())
    timer_spec = userstate.TimerSpec('timer', userstate.TimeDomain.WATERMARK)
    elements = list('abcdefgh')
    buffer_size = 3

    class BufferDoFn(beam.DoFn):
      def process(self,
                  kv,
                  ts=beam.DoFn.TimestampParam,
                  timer=beam.DoFn.TimerParam(timer_spec),
                  state=beam.DoFn.StateParam(state_spec)):
        _, element = kv
        state.add(element)
        buffer = state.read()
        # For real use, we'd keep track of this size separately.
        if len(list(buffer)) >= 3:
          state.clear()
          yield buffer
        else:
          timer.set(ts + 1)

      @userstate.on_timer(timer_spec)
      def process_timer(self, state=beam.DoFn.StateParam(state_spec)):
        buffer = state.read()
        state.clear()
        yield buffer

    def is_buffered_correctly(actual):
      # Pickling self in the closure for asserts gives errors (only on jenkins).
      self = FnApiRunnerTest('__init__')
      # Actual should be a grouping of the inputs into batches of size
      # at most buffer_size, but the actual batching is nondeterministic
      # based on ordering and trigger firing timing.
      self.assertEqual(sorted(sum((list(b) for b in actual), [])), elements)
      self.assertEqual(max(len(list(buffer)) for buffer in actual), buffer_size)
      if windowed:
        # Elements were assigned to windows based on their parity.
        # Assert that each grouping consists of elements belonging to the
        # same window to ensure states and timers were properly partitioned.
        for b in actual:
          parity = set(ord(e) % 2 for e in b)
          self.assertEqual(1, len(parity), b)

    with self.create_pipeline() as p:
      actual = (
          p
          | beam.Create(elements)
          # Send even and odd elements to different windows.
          | beam.Map(lambda e: window.TimestampedValue(e, ord(e) % 2))
          | beam.WindowInto(window.FixedWindows(1) if windowed
                            else window.GlobalWindows())
          | beam.Map(lambda x: ('key', x))
          | beam.ParDo(BufferDoFn()))

      assert_that(actual, is_buffered_correctly)
Example #22
  def test_basic_execution_sideinputs(self):

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result     # pylint: disable=global-variable-undefined
    result = []

    def recorded_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    main_stream = (p
                   | 'main TestStream' >> TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['e']))
    side_stream = (p
                   | 'side TestStream' >> TestStream()
                   .add_elements([window.TimestampedValue(2, 2)])
                   .add_elements([window.TimestampedValue(1, 1)])
                   .add_elements([window.TimestampedValue(7, 7)])
                   .add_elements([window.TimestampedValue(4, 4)])
                  )

    class RecordFn(beam.DoFn):
      def process(self,
                  elm=beam.DoFn.ElementParam,
                  ts=beam.DoFn.TimestampParam,
                  side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (main_stream        # pylint: disable=unused-variable
               | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream))
               | beam.Map(recorded_elements))

    p.run()

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([('e', Timestamp(10), [2, 1, 7, 4])], result)
Example #23
  def test_basic_execution_sideinputs(self):
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    main_stream = (p
                   | 'main TestStream' >> TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['e']))
    side_stream = (p
                   | 'side TestStream' >> TestStream()
                   .add_elements([window.TimestampedValue(2, 2)])
                   .add_elements([window.TimestampedValue(1, 1)])
                   .add_elements([window.TimestampedValue(7, 7)])
                   .add_elements([window.TimestampedValue(4, 4)])
                  )

    class RecordFn(beam.DoFn):
      def process(self,
                  elm=beam.DoFn.ElementParam,
                  ts=beam.DoFn.TimestampParam,
                  side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (main_stream        # pylint: disable=unused-variable
               | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [
            ('e', Timestamp(10), [2, 1, 7, 4]),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        custom_windowing=window.FixedWindows(15),
        label='assert per window')

    assert_that(records, equal_to([('e', Timestamp(10), [2, 1, 7, 4])]))
    p.run()
Example #24
def _finalize_write(_, sink, init_result, write_results, min_shards):
  write_results = list(write_results)
  extra_shards = []
  if len(write_results) < min_shards:
    logging.debug(
        'Creating %s empty shard(s).', min_shards - len(write_results))
    for _ in range(min_shards - len(write_results)):
      writer = sink.open_writer(init_result, str(uuid.uuid4()))
      extra_shards.append(writer.close())
  outputs = sink.finalize_write(init_result, write_results + extra_shards)
  if outputs:
    return (window.TimestampedValue(v, window.MAX_TIMESTAMP) for v in outputs)
Example #25
    def _success_write(self, _, outputs):
        """
        Writes a success file to the final dir
        :param sink: apache_beam.io.filebasedsink.FileBasedSink
        :return:
        """
        main_dir = os.path.dirname(self.sink.file_path_prefix.get())
        success_filename = '/'.join([main_dir, '_SUCCESS'])
        FileSystems.create(success_filename, 'text/plain').close()

        for v in outputs:
            yield v

        yield window.TimestampedValue(success_filename, window.MAX_TIMESTAMP)
Example #26
 def test_windowed_batches(self):
   # Assumes a single bundle, in order...
   with TestPipeline() as p:
     res = (
         p
         | beam.Create(range(47))
         | beam.Map(lambda t: window.TimestampedValue(t, t))
         | beam.WindowInto(window.FixedWindows(30))
         | util.BatchElements(
             min_batch_size=5, max_batch_size=10, clock=FakeClock())
         | beam.Map(len))
     assert_that(res, equal_to([
         5, 5, 10, 10,  # elements in [0, 30)
         10, 7,         # elements in [30, 47)
     ]))
Example #27
  def test_deduplication_in_different_windows(self):
    with self.create_pipeline() as p:
      test_stream = (
          TestStream(
              coder=coders.StrUtf8Coder()).advance_watermark_to(0).add_elements(
                  [
                      window.TimestampedValue('k1', 0),
                      window.TimestampedValue('k2', 10),
                      window.TimestampedValue('k3', 20),
                      window.TimestampedValue('k1', 30),
                      window.TimestampedValue('k2', 40),
                      window.TimestampedValue('k3', 50),
                      window.TimestampedValue('k4', 60),
                      window.TimestampedValue('k5', 70),
                      window.TimestampedValue('k6', 80)
                  ]).advance_watermark_to_infinity())

      res = (
          p
          | test_stream
          | beam.WindowInto(window.FixedWindows(30))
          | deduplicate.Deduplicate(processing_time_duration=10 * 60)
          | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))
      # Deduplication should happen per window.
      expect_unique_keys_per_window = {
          window.IntervalWindow(0, 30): [('k1', Timestamp(0)),
                                         ('k2', Timestamp(10)),
                                         ('k3', Timestamp(20))],
          window.IntervalWindow(30, 60): [('k1', Timestamp(30)),
                                          ('k2', Timestamp(40)),
                                          ('k3', Timestamp(50))],
          window.IntervalWindow(60, 90): [('k4', Timestamp(60)),
                                          ('k5', Timestamp(70)),
                                          ('k6', Timestamp(80))],
      }
      assert_that(
          res,
          equal_to_per_window(expect_unique_keys_per_window),
          use_global_window=False,
          label='assert per window')
Example #28
  def test_basic_execution_batch_sideinputs_fixed_windows(self):
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)

    main_stream = (
        p
        |
        'main TestStream' >> TestStream().advance_watermark_to(2).add_elements(
            ['a']).advance_watermark_to(4).add_elements(
                ['b']).advance_watermark_to_infinity()
        | 'main window' >> beam.WindowInto(window.FixedWindows(1)))
    side = (
        p
        | beam.Create([2, 1, 4])
        | beam.Map(lambda t: window.TimestampedValue(t, t))
        | beam.WindowInto(window.FixedWindows(2)))

    class RecordFn(beam.DoFn):
      def process(
          self,
          elm=beam.DoFn.ElementParam,
          ts=beam.DoFn.TimestampParam,
          side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    records = (
        main_stream  # pylint: disable=unused-variable
        | beam.ParDo(RecordFn(), beam.pvalue.AsList(side)))

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(2, 3): [('a', Timestamp(2), [2])],
        window.IntervalWindow(4, 5): [('b', Timestamp(4), [4])]
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        label='assert per window')

    p.run()
Example #29
 def run_windowed_side_inputs(self, elements, main_window_fn,
                              side_window_fn=None,
                              side_input_type=beam.pvalue.AsList,
                              combine_fn=None,
                              expected=None):
   with self.create_pipeline() as p:
     pcoll = p | beam.Create(elements) | beam.Map(
         lambda t: window.TimestampedValue(t, t))
     main = pcoll | 'WindowMain' >> beam.WindowInto(main_window_fn)
     side = pcoll | 'WindowSide' >> beam.WindowInto(
         side_window_fn or main_window_fn)
     kw = {}
     if combine_fn is not None:
       side |= beam.CombineGlobally(combine_fn).without_defaults()
       kw['default_value'] = 0
     elif side_input_type == beam.pvalue.AsDict:
       side |= beam.Map(lambda x: ('k%s' % x, 'v%s' % x))
     res = main | beam.Map(lambda x, s: (x, s), side_input_type(side, **kw))
     if side_input_type in (beam.pvalue.AsIter, beam.pvalue.AsList):
       # Tuple-parameter lambdas are invalid in Python 3; unpack explicitly.
       res |= beam.Map(lambda x_s: (x_s[0], sorted(x_s[1])))
     assert_that(res, equal_to(expected))
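
A hedged invocation sketch for this helper (the elements, window size, and expected pairs are illustrative assumptions, not copied from the test suite): with FixedWindows(10) on both the main and side inputs, main elements 1 and 2 fall into [0, 10) and see side elements [1, 2], while 11 falls into [10, 20) and sees only [11].

     # Hypothetical call from a test method on the same class:
     self.run_windowed_side_inputs(
         [1, 2, 11],                       # elements; the values double as timestamps
         window.FixedWindows(10),          # same windowing for main and side inputs
         expected=[(1, [1, 2]), (2, [1, 2]), (11, [11])])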
Example #30
    def test_sessions_combine(self):
        with TestPipeline() as p:
            input = (
                p
                | beam.Create([('c', 1), ('c', 9), ('c', 12), ('d', 2),
                               ('d', 4)])
                |
                beam.MapTuple(lambda k, v: window.TimestampedValue((k, v), v))
                | beam.WindowInto(window.Sessions(4)))

            global_sum = (input
                          | beam.Values()
                          | beam.CombineGlobally(sum).without_defaults())
            sum_per_key = input | beam.CombinePerKey(sum)

            # The first window has 3 elements: ('c', 1), ('d', 2), ('d', 4).
            # The second window has 2 elements: ('c', 9), ('c', 12).
            assert_that(global_sum, equal_to([7, 21]), label='global sum')
            assert_that(sum_per_key,
                        equal_to([('c', 1), ('c', 21), ('d', 6)]),
                        label='sum per key')