Ejemplo n.º 1
0
 def finish_bundle(self):
     """Emit the result of closing the writer, if there is a writer.

     Yields nothing when ``self.writer`` is ``None``. Otherwise yields one
     ``WindowedValue`` wrapping the return value of ``self.writer.close()``,
     timestamped with the global window's ``max_timestamp()`` and assigned to
     a single ``GlobalWindow``.
     """
     if self.writer is None:
         return
     close_result = self.writer.close()
     end_of_global_window = window.GlobalWindow().max_timestamp()
     yield WindowedValue(close_result, end_of_global_window,
                         [window.GlobalWindow()])
Ejemplo n.º 2
0
 def windowed_value(cls, value, timestamp=MIN_TIMESTAMP):
     """Wrap ``value`` in a ``WindowedValue`` assigned to a ``GlobalWindow``.

     Args:
       value: payload to wrap.
       timestamp: timestamp for the wrapped value; defaults to
         ``MIN_TIMESTAMP``.

     Returns:
       A ``WindowedValue`` whose windows are a one-element tuple containing a
       new ``GlobalWindow``.
     """
     global_windows = (GlobalWindow(), )
     return WindowedValue(value, timestamp, global_windows)
Ejemplo n.º 3
0
 def invoke_user_timer(self, timer_spec, key, window, timestamp):
     """Run the timer callback for ``timer_spec`` and process what it returns.

     The callback registered under ``timer_spec`` in
     ``self.signature.timer_methods`` is invoked with the user state context,
     ``key``, ``window`` and ``timestamp``. Its result is handed to the output
     processor together with a ``WindowedValue`` that has no payload but
     carries ``timestamp`` and ``window``.
     """
     # self.output_processor is Optional, but in practice it won't be None here
     process_outputs = self.output_processor.process_outputs  # type: ignore[union-attr]
     timer_element = WindowedValue(None, timestamp, (window, ))
     timer_method = self.signature.timer_methods[timer_spec]
     process_outputs(
         timer_element,
         timer_method.invoke_timer_callback(
             self.user_state_context, key, window, timestamp))
Ejemplo n.º 4
0
 def finish_bundle(self):
     """Flush every cached accumulator as a windowed ``(key, value)`` pair.

     For each ``(key, window)`` entry in ``self._cache`` this yields a
     ``WindowedValue`` holding ``(key, compacted_accumulator)``, timestamped
     at ``window.end`` and assigned to that one window.
     """
     for cache_key, accumulator in self._cache.items():
         key, win = cache_key
         # We compact the accumulator since a GBK (which necessitates encoding)
         # will follow.
         compacted = self._combine_fn.compact(accumulator)
         yield WindowedValue((key, compacted), win.end, (win, ))
Ejemplo n.º 5
0
    def test_wordcount(self):
        """Run a word count on the interactive runner and verify its output.

        The counts are checked three ways: as a set of ``(word, count)``
        pairs, as the DataFrame returned by ``ib.collect`` with window info,
        and as the reified ``WindowedValue`` list returned by ``result.get``.
        """
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                return element.strip().split()

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        # Count the occurrences of each word.
        counts = (p
                  | beam.Create(['to be or not to be that is the question'])
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda word: (word, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda kv: (kv[0], sum(kv[1]))))

        # Watch the local scope for Interactive Beam so that counts will be cached.
        # The set of local names above this call is deliberately left as-is,
        # since the whole local scope is what gets passed in here.
        ib.watch(locals())

        result = p.run()
        result.wait_until_finish()

        actual = list(result.get(counts))
        expected_pairs = {
            ('or', 1),
            ('that', 1),
            ('be', 2),
            ('is', 1),
            ('question', 1),
            ('to', 2),
            ('the', 1),
            ('not', 1),
        }
        self.assertSetEqual(set(actual), expected_pairs)

        # Truncate the precision to millis because the window coder uses millis
        # as units then gets upcast to micros.
        max_micros = GlobalWindow().max_timestamp().micros
        end_of_window = (max_micros // 1000) * 1000
        df_counts = ib.collect(counts, include_window_info=True)
        expected_columns = {
            0: [word for word, _ in actual],
            1: [count for _, count in actual],
            'event_time': [end_of_window] * len(actual),
            'windows': [[GlobalWindow()] for _ in actual],
            'pane_info': [
                PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                for _ in actual
            ],
        }
        df_expected = pd.DataFrame(
            expected_columns,
            columns=[0, 1, 'event_time', 'windows', 'pane_info'])

        pd.testing.assert_frame_equal(df_expected, df_counts)

        actual_reified = result.get(counts, include_window_info=True)
        expected_reified = []
        for pair in actual:
            expected_reified.append(
                WindowedValue(
                    pair, Timestamp(micros=end_of_window), [GlobalWindow()],
                    PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)))
        self.assertEqual(actual_reified, expected_reified)
Ejemplo n.º 6
0
 def process(self, element):
     """Re-emit ``element`` tagged with the expected timestamp and window.

     ``expected_timestamp`` and ``expected_window`` are read from the
     enclosing scope; the output is assigned to that single window.
     """
     stamped = WindowedValue(element, expected_timestamp, [expected_window])
     yield stamped
Ejemplo n.º 7
0
 def as_windowed_value(element):
     """Wrap ``element`` in a ``WindowedValueHolder``.

     The inner ``WindowedValue`` is built with timestamp ``0`` and an empty
     list of windows.
     """
     unwindowed = WindowedValue(element, 0, [])
     return WindowedValueHolder(unwindowed)
Ejemplo n.º 8
0
    def process(self,
                element,
                timestamp=beam.DoFn.TimestampParam,
                window=beam.DoFn.WindowParam,
                *args,
                **kwargs):
        """Process one keyed input, or one timer firing, of a splittable DoFn.

        Args:
          element: either a ``KeyedWorkItem`` (treated as a timer firing) or a
            ``(key, values)`` pair whose ``values`` iterable must contain
            exactly one item.
          timestamp: timestamp used when the windowed element has to be
            rebuilt here.
          window: window used for all state access and for rebuilt elements.
          *args: unused.
          **kwargs: unused.

        Yields:
          Every output produced by ``invoke_process_element`` up to, but not
          including, the terminating ``SDFProcessElementInvoker.Result``.

        Raises:
          AssertionError: if ``values`` does not hold exactly one item, if the
            first invocation is not given an ``ElementAndRestriction``, if
            the process-element invoker is missing or of the wrong type, or
            if no ``SDFProcessElementInvoker.Result`` was produced.
          ValueError: if ``values`` does not hold exactly one item and
            assertions are disabled.
        """
        # Stays None for timer firings, which deliver no new value. Binding it
        # here avoids an UnboundLocalError if a timer fires while the stored
        # element state is empty.
        value = None
        if isinstance(element, KeyedWorkItem):
            # Must be a timer firing.
            key = element.encoded_key
        else:
            key, values = element
            values = list(values)
            # Value here will either be a WindowedValue or an ElementAndRestriction
            # object.
            # TODO: handle key collisions here.
            collision_message = ('Internal error. Processing of splittable '
                                 'DoFn cannot continue since elements did not '
                                 'have unique keys.')
            assert len(values) == 1, collision_message
            if len(values) != 1:
                # Only reachable when assertions are stripped (python -O);
                # checked before indexing so an empty list reports the real
                # problem instead of an IndexError.
                raise ValueError(collision_message)
            value = values[0]

        state = self._step_context.get_keyed_state(key)
        element_state = state.get_state(window, self._element_tag)
        # Initially element_state is an empty list.
        is_seed_call = not element_state

        if not is_seed_call:
            element = state.get_state(window, self._element_tag)
            restriction = state.get_state(window, self._restriction_tag)
            windowed_element = WindowedValue(element, timestamp, [window])
        else:
            # After values iterator is expanded above we should have gotten a list
            # with a single ElementAndRestriction object.
            assert isinstance(value, ElementAndRestriction)
            element_and_restriction = value
            element = element_and_restriction.element
            restriction = element_and_restriction.restriction

            if isinstance(value, WindowedValue):
                windowed_element = WindowedValue(element, value.timestamp,
                                                 value.windows)
            else:
                windowed_element = WindowedValue(element, timestamp, [window])

        tracker = self.sdf_invoker.invoke_create_tracker(restriction)
        assert self._process_element_invoker
        assert isinstance(self._process_element_invoker,
                          SDFProcessElementInvoker)

        output_values = self._process_element_invoker.invoke_process_element(
            self.sdf_invoker, windowed_element, tracker)

        sdf_result = None
        for output in output_values:
            if isinstance(output, SDFProcessElementInvoker.Result):
                # SDFProcessElementInvoker.Result should be the last item yielded.
                sdf_result = output
                break
            yield output

        assert sdf_result, (
            'SDFProcessElementInvoker must return a '
            'SDFProcessElementInvoker.Result object as the last '
            'value of a SDF invoke_process_element() invocation.')

        if not sdf_result.residual_restriction:
            # All work for current residual and restriction pair is complete.
            state.clear_state(window, self._element_tag)
            state.clear_state(window, self._restriction_tag)
            # Releasing output watermark by setting it to positive infinity.
            state.add_state(window, self.watermark_hold_tag,
                            WatermarkManager.WATERMARK_POS_INF)
        else:
            state.add_state(window, self._element_tag, element)
            state.add_state(window, self._restriction_tag,
                            sdf_result.residual_restriction)
            # Holding output watermark by setting it to negative infinity.
            state.add_state(window, self.watermark_hold_tag,
                            WatermarkManager.WATERMARK_NEG_INF)

            # Setting a timer to be reinvoked to continue processing the element.
            # Currently Python SDK only supports setting timers based on watermark. So
            # forcing a reinvocation by setting a timer for watermark negative
            # infinity.
            # TODO(chamikara): update this by setting a timer for the proper
            # processing time when Python SDK supports that.
            state.set_timer(window, '', TimeDomain.WATERMARK,
                            WatermarkManager.WATERMARK_NEG_INF)
Ejemplo n.º 9
0
 def finish_bundle(self, context=None):
     """Emit the cached items as one ``WindowedValue`` if any are cached.

     Yields nothing when ``self._cached`` has length zero. Otherwise yields
     ``self._cached`` with timestamp ``-1`` in a single ``GlobalWindow``.
     ``context`` is accepted but not used.
     """
     from apache_beam.transforms import window
     from apache_beam.utils.windowed_value import WindowedValue
     if len(self._cached) == 0:
         return
     global_windows = [window.GlobalWindow()]
     yield WindowedValue(self._cached, -1, global_windows)
Ejemplo n.º 10
0
 def finish_bundle(self):
     """Flush every cached value as a windowed ``(key, value)`` pair.

     For each ``(key, window)`` entry in ``self._cache`` this yields a
     ``WindowedValue`` holding ``(key, cached_value)``, timestamped at
     ``window.end`` and assigned to that one window.
     """
     for cache_key, cached_value in self._cache.items():
         key, win = cache_key
         yield WindowedValue((key, cached_value), win.end, (win, ))
Ejemplo n.º 11
0
    def invoke_process(self,
                       windowed_value,
                       restriction_tracker=None,
                       output_processor=None,
                       additional_args=None,
                       additional_kwargs=None):
        """Invoke the DoFn's process method for ``windowed_value``.

        Args:
          windowed_value: the element to process; it is also set on
            ``self.context``.
          restriction_tracker: optional tracker. When falsy and the DoFn is
            splittable, one is created from the initial restriction.
          output_processor: optional override; falls back to
            ``self.output_processor`` when falsy.
          additional_args: extra positional arguments; a falsy value becomes
            an empty list.
          additional_kwargs: extra keyword arguments; a falsy value becomes
            an empty dict. The tracker view and watermark estimator are added
            to this dict when a tracker is in use.

        Returns:
          The result of the per-window invocation when a restriction tracker
          is in use, otherwise ``None``.

        Raises:
          NotImplementedError: if a tracker is in use, the value is in more
            than one window, and the DoFn has windowed inputs.
          ValueError: if a tracker is in use but the process method declares
            no restriction provider argument.
        """
        additional_args = additional_args or []
        additional_kwargs = additional_kwargs or {}
        output_processor = output_processor or self.output_processor

        self.context.set_element(windowed_value)
        # Call for the process function for each window if has windowed side inputs
        # or if the process accesses the window parameter. We can just call it once
        # otherwise as none of the arguments are changing

        if self.is_splittable and not restriction_tracker:
            restriction_tracker = self.invoke_create_tracker(
                self.invoke_initial_restriction(windowed_value.value))

        if not restriction_tracker:
            if self.has_windowed_inputs and len(windowed_value.windows) != 1:
                # Process the value once per window it belongs to.
                for single_window in windowed_value.windows:
                    per_window_value = WindowedValue(
                        windowed_value.value, windowed_value.timestamp,
                        (single_window, ))
                    self._invoke_process_per_window(
                        per_window_value, additional_args, additional_kwargs,
                        output_processor)
            else:
                self._invoke_process_per_window(
                    windowed_value, additional_args, additional_kwargs,
                    output_processor)
            return None

        in_several_windows = len(windowed_value.windows) > 1
        if in_several_windows and self.has_windowed_inputs:
            # Should never get here due to window explosion in
            # the upstream pair-with-restriction.
            raise NotImplementedError(
                'SDFs in multiply-windowed values with windowed arguments.')

        tracker_kwarg_name = (
            self.signature.process_method.restriction_provider_arg_name)
        if not tracker_kwarg_name:
            raise ValueError(
                'A RestrictionTracker %r was provided but DoFn does not have a '
                'RestrictionTrackerParam defined' % restriction_tracker)

        from apache_beam.io import iobase
        self.threadsafe_restriction_tracker = (
            iobase.ThreadsafeRestrictionTracker(restriction_tracker))
        additional_kwargs[tracker_kwarg_name] = iobase.RestrictionTrackerView(
            self.threadsafe_restriction_tracker)

        if self.watermark_estimator:
            # The watermark estimator needs to be reset for every element.
            self.watermark_estimator.reset()
            additional_kwargs[self.watermark_estimator_param] = (
                self.watermark_estimator)

        try:
            self.current_windowed_value = windowed_value
            return self._invoke_process_per_window(
                windowed_value, additional_args, additional_kwargs,
                output_processor)
        finally:
            self.threadsafe_restriction_tracker = None
            self.current_windowed_value = windowed_value
Ejemplo n.º 12
0
def test_invoker_normal(init_beam, fn):
    """Build an invoker for ``fn`` and run it once on a placeholder value.

    Args:
      init_beam: callable that takes ``fn`` and returns an invoker.
      fn: the function under test, passed to ``init_beam``.
    """
    invoker = init_beam(fn)
    banner = "Normal testing {} with {} invoker.".format(fn, invoker)
    print(banner)
    # Placeholder element: value False, timestamp 0, one window that is None.
    placeholder = WindowedValue(False, 0, [None])
    invoker.invoke_process(placeholder)
Ejemplo n.º 13
0
 def finish_bundle(self):
   """Serialize the accumulated XML tree and emit it in the global window.

   ``self._root`` is rendered with ``etree.tostring(..., pretty_print=True)``
   and then reset to ``None``. The serialized result is yielded with the
   global window's ``max_timestamp()`` in a single ``GlobalWindow``.
   """
   serialized = etree.tostring(self._root, pretty_print=True)
   self._root = None
   end_of_global_window = GlobalWindow().max_timestamp()
   yield WindowedValue(serialized, end_of_global_window, [GlobalWindow()])
Ejemplo n.º 14
0
 def invoke_user_timer(self, timer_spec, key, window, timestamp):
     """Run the timer callback for ``timer_spec`` and process what it returns.

     The callback is looked up in ``self.signature.timer_methods`` and called
     with the user state context, ``key``, ``window`` and ``timestamp``. Its
     result goes to ``self.output_processor.process_outputs`` along with a
     payload-less ``WindowedValue`` carrying ``timestamp`` and ``window``.
     """
     emit = self.output_processor.process_outputs
     firing = WindowedValue(None, timestamp, (window, ))
     callback_outputs = (
         self.signature.timer_methods[timer_spec].invoke_timer_callback(
             self.user_state_context, key, window, timestamp))
     emit(firing, callback_outputs)
Ejemplo n.º 15
0
 def finish_bundle(self, context=None):
   """Emit the cached items as one ``WindowedValue`` if any are cached.

   Yields nothing when ``self._cached`` has length zero. Otherwise yields
   ``self._cached`` with timestamp ``-1`` in a single ``GlobalWindow``.
   ``context`` is accepted but not used.
   """
   if len(self._cached) == 0:  # pylint: disable=g-explicit-length-test
     return
   global_windows = [window.GlobalWindow()]
   yield WindowedValue(self._cached, -1, global_windows)
Ejemplo n.º 16
0
 def finish_bundle(self):
     """Emit the result of closing the writer, if there is a writer.

     Yields nothing when ``self.writer`` is ``None``. Otherwise yields one
     ``WindowedValue`` wrapping the return value of ``self.writer.close()``,
     timestamped with ``window.MAX_TIMESTAMP`` in a single ``GlobalWindow``.
     """
     if self.writer is None:
         return
     close_result = self.writer.close()
     yield WindowedValue(close_result, window.MAX_TIMESTAMP,
                         [window.GlobalWindow()])
Ejemplo n.º 17
0
def windowed_value(e):
    """Wrap ``e`` in a ``WindowedValue`` with timestamp 1 in a ``GlobalWindow``.

    ``GlobalWindow`` is imported inside the function on each call.
    """
    from apache_beam.transforms.window import GlobalWindow
    global_windows = [GlobalWindow()]
    return WindowedValue(e, 1, global_windows)
Ejemplo n.º 18
0
 def finish_bundle(self, context=None):
     """Flush any pending batch at the end of the bundle.

     Yields nothing when ``self._batch`` is falsy. Otherwise yields the return
     value of ``self._flush_batch()`` with timestamp ``-1`` in a single
     ``GlobalWindow``. ``context`` is accepted but not used.
     """
     if not self._batch:
         return
     flushed = self._flush_batch()
     yield WindowedValue(flushed, -1, [window.GlobalWindow()])
Ejemplo n.º 19
0
    def finish_bundle(self, element=None):
        """Emit the cached items as one ``WindowedValue`` if any are cached.

        Yields nothing when ``self._cached`` has length zero. Otherwise yields
        ``self._cached`` with timestamp ``-1`` in a single ``GlobalWindow``.
        ``element`` is accepted but not used.
        """
        from apache_beam.transforms import window
        from apache_beam.utils.windowed_value import WindowedValue

        if len(self._cached) == 0:  # pylint: disable=g-explicit-length-test
            return
        global_windows = [window.GlobalWindow()]
        yield WindowedValue(self._cached, -1, global_windows)
Ejemplo n.º 20
0
 def output_key(self, wkey, value):
     """Output ``(key, value)`` in the windows recorded in ``wkey``.

     Args:
       wkey: a ``(windows, key)`` pair. ``windows`` is either the integer
         ``0`` or an indexable collection of windows.
       value: the value to pair with ``key``.

     When ``windows`` is ``0`` the pair is output through
     ``_globally_windowed_value.with_value`` (presumably ``0`` is a sentinel
     for globally windowed keys; confirm against the code that builds
     ``wkey``). Otherwise it is output as a ``WindowedValue`` timestamped at
     the end of the first window and assigned to all of ``windows``.
     """
     windows, key = wkey
     # Compare by value: identity comparison with an int literal depends on
     # CPython's small-int caching and triggers a SyntaxWarning on 3.8+.
     if windows == 0:
         self.output(_globally_windowed_value.with_value((key, value)))
     else:
         self.output(WindowedValue((key, value), windows[0].end, windows))
Ejemplo n.º 21
0
 def finish_bundle(self, *args, **kwargs):
     """Emit any pending batch at the end of the bundle.

     Yields nothing when ``self._batch`` is falsy. Otherwise yields
     ``self._batch`` itself with timestamp ``-1`` in a single
     ``GlobalWindow``. Positional and keyword arguments are ignored.
     """
     if not self._batch:
         return
     global_windows = [window.GlobalWindow()]
     yield WindowedValue(self._batch, -1, global_windows)
Ejemplo n.º 22
0
 def windowed_value(cls,
                    value,
                    timestamp=MIN_TIMESTAMP,
                    pane_info=windowed_value.PANE_INFO_UNKNOWN):
     """Wrap ``value`` in a ``WindowedValue`` assigned to a ``GlobalWindow``.

     Args:
       value: payload to wrap.
       timestamp: timestamp for the wrapped value; defaults to
         ``MIN_TIMESTAMP``.
       pane_info: pane info for the wrapped value; defaults to
         ``windowed_value.PANE_INFO_UNKNOWN``.

     Returns:
       A ``WindowedValue`` whose windows are a one-element tuple containing a
       new ``GlobalWindow``.
     """
     global_windows = (GlobalWindow(), )
     return WindowedValue(value, timestamp, global_windows, pane_info)
Ejemplo n.º 23
0
  def invoke_process(self,
                     windowed_value,  # type: WindowedValue
                     restriction_tracker=None, # type: Optional[RestrictionTracker]
                     watermark_estimator=None, # type: Optional[WatermarkEstimator]
                     additional_args=None,
                     additional_kwargs=None
                    ):
    # type: (...) -> Optional[SplitResultResidual]
    """Invoke the DoFn's process method for ``windowed_value``.

    Args:
      windowed_value: the element to process; it is also set on
        ``self.context``.
      restriction_tracker: optional tracker. When falsy and the DoFn is
        splittable, one is created from the initial restriction.
      watermark_estimator: optional estimator; it is wrapped in a
        ``ThreadsafeWatermarkEstimator`` when a tracker is in use.
      additional_args: extra positional arguments; a falsy value becomes an
        empty list.
      additional_kwargs: extra keyword arguments; a falsy value becomes an
        empty dict. The tracker view (and the watermark estimator, if the
        process method names a parameter for it) are added to this dict.

    Returns:
      The result of the per-window invocation when a restriction tracker is
      in use, otherwise ``None``.

    Raises:
      NotImplementedError: if a tracker is in use, the value is in more than
        one window, and the DoFn has windowed inputs.
      ValueError: if a tracker is in use but the process method declares no
        restriction provider argument.
    """
    if not additional_args:
      additional_args = []
    if not additional_kwargs:
      additional_kwargs = {}

    self.context.set_element(windowed_value)
    # Call for the process function for each window if has windowed side inputs
    # or if the process accesses the window parameter. We can just call it once
    # otherwise as none of the arguments are changing

    # A splittable DoFn invoked without a tracker gets one built from its
    # initial restriction.
    if self.is_splittable and not restriction_tracker:
      restriction = self.invoke_initial_restriction(windowed_value.value)
      restriction_tracker = self.invoke_create_tracker(restriction)

    if restriction_tracker is not None:
      if len(windowed_value.windows) > 1 and self.has_windowed_inputs:
        # Should never get here due to window explosion in
        # the upstream pair-with-restriction.
        raise NotImplementedError(
            'SDFs in multiply-windowed values with windowed arguments.')
      restriction_tracker_param = (
          self.signature.process_method.restriction_provider_arg_name)
      if not restriction_tracker_param:
        raise ValueError(
            'A RestrictionTracker %r was provided but DoFn does not have a '
            'RestrictionTrackerParam defined' % restriction_tracker)
      # Keep the thread-safe wrapper on self for the duration of the call and
      # pass the process method a view of it.
      self.threadsafe_restriction_tracker = ThreadsafeRestrictionTracker(
          restriction_tracker)
      additional_kwargs[restriction_tracker_param] = (
          RestrictionTrackerView(self.threadsafe_restriction_tracker))

      self.threadsafe_watermark_estimator = (
          ThreadsafeWatermarkEstimator(watermark_estimator))
      watermark_param = (
          self.signature.process_method.watermark_estimator_provider_arg_name)
      # When the watermark_estimator is a NoOpWatermarkEstimator, the system
      # will not add watermark_param into the DoFn param list.
      if watermark_param is not None:
        additional_kwargs[watermark_param] = self.threadsafe_watermark_estimator
      try:
        self.current_windowed_value = windowed_value
        return self._invoke_process_per_window(
            windowed_value, additional_args, additional_kwargs)
      finally:
        # Drop the per-call wrappers whether or not the invocation raised.
        # NOTE(review): current_windowed_value is re-assigned to the same
        # value here rather than cleared; confirm that this is intended.
        self.threadsafe_restriction_tracker = None
        self.threadsafe_watermark_estimator = None
        self.current_windowed_value = windowed_value

    elif self.has_windowed_inputs and len(windowed_value.windows) != 1:
      # Process the value once per window, each time with a single-window
      # copy of it.
      for w in windowed_value.windows:
        self._invoke_process_per_window(
            WindowedValue(
                windowed_value.value, windowed_value.timestamp, (w, )),
            additional_args,
            additional_kwargs)
    else:
      self._invoke_process_per_window(
          windowed_value, additional_args, additional_kwargs)
    return None