コード例 #1
0
 def test_implicit_payload_builder_with_bytes(self):
   """Check the payload built implicitly from ``PayloadBase.bytes_values``.

   On Python 2 the string-valued entries are expected to carry the bytes
   coder URN (their payloads are still produced with ``StrUtf8Coder``); on
   Python 3 the expectation is simply ``PayloadBase.args``.
   """
   values = PayloadBase.bytes_values
   result = ImplicitSchemaPayloadBuilder(values).build()

   if sys.version_info[0] >= 3:
     expected = get_payload(PayloadBase.args)
   else:
     # in python 2.x bytes coder will be inferred
     def encode(coder, name):
       # Nested-encode the fixture value stored under `name`.
       return coder.get_impl().encode_nested(values[name])

     expected = get_payload({
         'integer_example': ConfigValue(
             coder_urn=['beam:coder:varint:v1'],
             payload=encode(VarIntCoder(), 'integer_example')),
         'string_example': ConfigValue(
             coder_urn=['beam:coder:bytes:v1'],
             payload=encode(StrUtf8Coder(), 'string_example')),
         'list_of_strings': ConfigValue(
             coder_urn=['beam:coder:iterable:v1', 'beam:coder:bytes:v1'],
             payload=encode(IterableCoder(StrUtf8Coder()), 'list_of_strings')),
         'optional_kv': ConfigValue(
             coder_urn=[
                 'beam:coder:kv:v1',
                 'beam:coder:bytes:v1',
                 'beam:coder:double:v1',
             ],
             payload=encode(
                 TupleCoder([StrUtf8Coder(), FloatCoder()]), 'optional_kv')),
     })

   self.assertEqual(result, expected)
コード例 #2
0
  def get_restriction_coder(self):
    # type: () -> Optional[TupleCoder]

    """Return the coder used for restrictions when processing an SDF.

    For a splittable DoFn the result is a ``TupleCoder`` pairing the
    restriction provider's restriction coder with the watermark estimator
    provider's estimator state coder. Otherwise ``None`` is returned.
    """
    if not self.is_splittable_dofn():
      return None

    restriction_coder = self.get_restriction_provider().restriction_coder()
    estimator_state_coder = (
        self.get_watermark_estimator_provider().estimator_state_coder())
    return TupleCoder([restriction_coder, estimator_state_coder])
コード例 #3
0
ファイル: kafka.py プロジェクト: yktq/Apache-Beam-
def _encode_map(dict_obj):
    """Encode a mapping of text keys to text values as a ``ConfigValue``.

    Each key and value is UTF-8 encoded, and the resulting list of pairs is
    serialized with an iterable coder over tuples of length-prefixed bytes.
    """
    # Keep this a concrete list: it is handed to the coder as-is.
    encoded_pairs = []
    for name, text in dict_obj.items():
        encoded_pairs.append((name.encode('utf-8'), text.encode('utf-8')))

    key_coder = LengthPrefixCoder(BytesCoder())
    value_coder = LengthPrefixCoder(BytesCoder())
    map_coder = IterableCoder(TupleCoder([key_coder, value_coder]))

    return ConfigValue(
        coder_urn=[
            'beam:coder:iterable:v1',
            'beam:coder:kv:v1',
            'beam:coder:bytes:v1',
            'beam:coder:bytes:v1',
        ],
        payload=map_coder.encode(encoded_pairs))
コード例 #4
0
class PayloadBase(object):
    """Shared fixtures and test cases for payload builders.

    Concrete test classes provide ``get_payload_from_typing_hints`` and
    ``get_payload_from_beam_typehints``; the ``test_*`` methods below compare
    what those return with the payload derived from ``args``.
    """

    # Fixture values written with unicode literals.
    values = {
        'integer_example': 1,
        'boolean': True,
        'string_example': u'thing',
        'list_of_strings': [u'foo', u'bar'],
        'optional_kv': (u'key', 1.1),
        'optional_integer': None,
    }

    # The same fixture written with plain string literals.
    bytes_values = {
        'integer_example': 1,
        'boolean': True,
        'string_example': 'thing',
        'list_of_strings': ['foo', 'bar'],
        'optional_kv': ('key', 1.1),
        'optional_integer': None,
    }

    # Expected ConfigValue for every entry of `values` that has an entry here
    # ('optional_integer' has none).
    args = {
        'integer_example': ConfigValue(
            coder_urn=['beam:coder:varint:v1'],
            payload=VarIntCoder().get_impl().encode_nested(
                values['integer_example'])),
        'boolean': ConfigValue(
            coder_urn=['beam:coder:bool:v1'],
            payload=BooleanCoder().get_impl().encode_nested(
                values['boolean'])),
        'string_example': ConfigValue(
            coder_urn=['beam:coder:string_utf8:v1'],
            payload=StrUtf8Coder().get_impl().encode_nested(
                values['string_example'])),
        'list_of_strings': ConfigValue(
            coder_urn=['beam:coder:iterable:v1', 'beam:coder:string_utf8:v1'],
            payload=IterableCoder(StrUtf8Coder()).get_impl().encode_nested(
                values['list_of_strings'])),
        'optional_kv': ConfigValue(
            coder_urn=[
                'beam:coder:kv:v1',
                'beam:coder:string_utf8:v1',
                'beam:coder:double:v1',
            ],
            payload=TupleCoder(
                [StrUtf8Coder(), FloatCoder()]
            ).get_impl().encode_nested(values['optional_kv'])),
    }

    def get_payload_from_typing_hints(self, values):
        """Build the payload for `values` from python typing hints.

        Must be overridden; this base version always raises.
        """
        raise NotImplementedError

    def get_payload_from_beam_typehints(self, values):
        """Build the payload for `values` from beam typehints.

        Must be overridden; this base version always raises.
        """
        raise NotImplementedError

    def test_typing_payload_builder(self):
        """The typing-hints payload for `values` equals the one from `args`."""
        self.assertEqual(
            self.get_payload_from_typing_hints(self.values),
            get_payload(self.args))

    def test_typing_payload_builder_with_bytes(self):
        """The typing-hints payload for `bytes_values` equals the one from `args`.

        string_utf8 coder will be used even if values are not unicode in
        python 2.x
        """
        self.assertEqual(
            self.get_payload_from_typing_hints(self.bytes_values),
            get_payload(self.args))

    def test_typehints_payload_builder(self):
        """The beam-typehints payload for `values` equals the one from `args`."""
        self.assertEqual(
            self.get_payload_from_beam_typehints(self.values),
            get_payload(self.args))

    def test_typehints_payload_builder_with_bytes(self):
        """The beam-typehints payload for `bytes_values` equals the one from `args`.

        string_utf8 coder will be used even if values are not unicode in
        python 2.x
        """
        self.assertEqual(
            self.get_payload_from_beam_typehints(self.bytes_values),
            get_payload(self.args))

    def test_optional_error(self):
        """A RuntimeError is expected when every value is None.

        value can only be None if typehint is Optional
        """
        all_none = dict.fromkeys(self.values)
        with self.assertRaises(RuntimeError):
            self.get_payload_from_typing_hints(all_none)
コード例 #5
0
class GeneralTriggerManagerDoFn(DoFn):
    """A trigger manager that supports all windowing / triggering cases.

    This implements a DoFn that manages triggering in a per-key basis. All
    elements for a single key are processed together. Per-key state holds data
    related to all windows.

    Elements are buffered in state as (window, value) pairs and are emitted by
    ``_fire_eligible_windows`` when the configured trigger function reports
    that a window should fire, either while processing new elements or from
    one of the two timer callbacks.
    """

    # TODO(BEAM-12026) Add support for Global and custom window fns.
    # Set states keyed by interval window: windows that have received elements
    # and windows whose trigger reported itself finished on firing.
    KNOWN_WINDOWS = SetStateSpec('known_windows', IntervalWindowCoder())
    FINISHED_WINDOWS = SetStateSpec('finished_windows', IntervalWindowCoder())
    # Combining states that fold every added timestamp with `max`.
    LAST_KNOWN_TIME = CombiningValueStateSpec('last_known_time',
                                              combine_fn=max)
    LAST_KNOWN_WATERMARK = CombiningValueStateSpec('last_known_watermark',
                                                   combine_fn=max)

    # TODO(pabloem) What's the coder for the elements/keys here?
    # Bag of (window, buffered element) pairs; the element half is pickled.
    WINDOW_ELEMENT_PAIRS = BagStateSpec(
        'all_elements', TupleCoder([IntervalWindowCoder(),
                                    PickleCoder()]))
    # Bag of (window, string, integer) triples. It is only handed to the
    # trigger context here -- presumably per-window, per-tag trigger state;
    # confirm against FnRunnerStatefulTriggerContext.
    WINDOW_TAG_VALUES = BagStateSpec(
        'per_window_per_tag_value_state',
        TupleCoder([IntervalWindowCoder(),
                    StrUtf8Coder(),
                    VarIntCoder()]))

    PROCESSING_TIME_TIMER = TimerSpec('processing_time_timer',
                                      TimeDomain.REAL_TIME)
    WATERMARK_TIMER = TimerSpec('watermark_timer', TimeDomain.WATERMARK)

    def __init__(self, windowing: Windowing):
        """Store `windowing` and cache whether its window fn is merging."""
        self.windowing = windowing
        # Only session windows are merging. Other windows are non-merging.
        self.merging_windows = self.windowing.windowfn.is_merging()

    def process(
            self,
            element: typing.Tuple[
                K, typing.Iterable[windowed_value.WindowedValue]],
            all_elements: BagRuntimeState = DoFn.StateParam(
                WINDOW_ELEMENT_PAIRS),  # type: ignore
            latest_processing_time: AccumulatingRuntimeState = DoFn.StateParam(
                LAST_KNOWN_TIME),  # type: ignore
            latest_watermark: AccumulatingRuntimeState = DoFn.
        StateParam(  # type: ignore
            LAST_KNOWN_WATERMARK),
            window_tag_values: BagRuntimeState = DoFn.StateParam(
                WINDOW_TAG_VALUES),  # type: ignore
            windows_state: SetRuntimeState = DoFn.StateParam(
                KNOWN_WINDOWS),  # type: ignore
            finished_windows_state: SetRuntimeState = DoFn.
        StateParam(  # type: ignore
            FINISHED_WINDOWS),
            processing_time_timer=DoFn.TimerParam(PROCESSING_TIME_TIMER),
            watermark_timer=DoFn.TimerParam(WATERMARK_TIMER),
            *args,
            **kwargs):
        """Buffer the values of one key and fire the windows that are ready.

        Args:
          element: a ``(key, windowed_values)`` pair, where
            ``windowed_values`` is an iterable of ``WindowedValue``.
          all_elements, latest_processing_time, latest_watermark,
            window_tag_values, windows_state, finished_windows_state:
            state handles declared through ``DoFn.StateParam`` defaults.
          processing_time_timer, watermark_timer: timer handles declared
            through ``DoFn.TimerParam`` defaults.
          *args, **kwargs: accepted but not used by this method.

        Returns:
          The generator created by ``_fire_eligible_windows``, restricted to
          the windows that received elements in this call.
        """
        context = FnRunnerStatefulTriggerContext(
            processing_time_timer=processing_time_timer,
            watermark_timer=watermark_timer,
            latest_processing_time=latest_processing_time,
            latest_watermark=latest_watermark,
            all_elements_state=all_elements,
            window_tag_values=window_tag_values,
            finished_windows_state=finished_windows_state)
        key, windowed_values = element
        # read_watermark is defined elsewhere; presumably it yields the
        # latest watermark stored in the state -- confirm.
        watermark = read_watermark(latest_watermark)

        # Group the incoming values by window, as timestamped values.
        windows_to_elements = collections.defaultdict(list)
        for wv in windowed_values:
            for window in wv.windows:
                # ignore expired windows
                if watermark > window.end + self.windowing.allowed_lateness:
                    continue
                # Skip windows already recorded as finished.
                if window in finished_windows_state.read():
                    continue
                windows_to_elements[window].append(
                    TimestampedValue(wv.value, wv.timestamp))

        # Processing merging of windows
        if self.merging_windows:
            old_windows = set(windows_state.read())
            all_windows = old_windows.union(list(windows_to_elements))
            # Only run the merge when this call introduced windows that were
            # not already in state.
            if all_windows != old_windows:
                merge_context = TriggerMergeContext(all_windows, context,
                                                    self.windowing)
                self.windowing.windowfn.merge(merge_context)

                merged_windows_to_elements = collections.defaultdict(list)
                for window, values in windows_to_elements.items():
                    # Follow the merged_away chain until reaching a window
                    # that was not itself merged away.
                    while window in merge_context.merged_away:
                        window = merge_context.merged_away[window]
                    merged_windows_to_elements[window].extend(values)
                windows_to_elements = merged_windows_to_elements

            # Record the (possibly merged) windows that received elements.
            for w in windows_to_elements:
                windows_state.add(w)
        # Done processing merging of windows

        seen_windows = set()
        for w in windows_to_elements:
            window_context = context.for_window(w)
            seen_windows.add(w)
            for value_w_timestamp in windows_to_elements[w]:
                _LOGGER.debug(value_w_timestamp)
                all_elements.add((w, value_w_timestamp))
                # NOTE(review): on_element is called once per buffered value
                # but receives the whole `windowed_values` iterable rather
                # than the single value -- confirm this is intended.
                self.windowing.triggerfn.on_element(windowed_values, w,
                                                    window_context)

        # Evaluate triggers in the watermark domain at the watermark read
        # above, but only for the windows touched by this call.
        return self._fire_eligible_windows(key, TimeDomain.WATERMARK,
                                           watermark, None, context,
                                           seen_windows)

    def _fire_eligible_windows(self,
                               key: K,
                               time_domain,
                               timestamp: Timestamp,
                               timer_tag: typing.Optional[str],
                               context: 'FnRunnerStatefulTriggerContext',
                               windows_of_interest: typing.Optional[
                                   typing.Set[BoundedWindow]] = None):
        """Yield ``(key, elements)`` for every window whose trigger fires.

        This is a generator function: none of its body runs until the
        returned generator is iterated, and the write-back loop at the end
        only runs once the generator has been iterated past its last yield.

        Args:
          key: the key the buffered elements belong to; emitted unchanged.
          time_domain: time domain passed to the trigger's ``should_fire``.
          timestamp: timestamp passed to ``should_fire`` and ``on_fire``.
          timer_tag: only used in the debug log line.
          context: trigger context giving access to the per-key state.
          windows_of_interest: when not None, only these windows are
            considered for firing.

        Yields:
          ``(key, elems)`` where ``elems`` is a list of ``WindowedValue``
          carrying the fired window as their only window.
        """
        # Snapshot the buffered elements and empty the state. Whatever must be
        # kept is written back at the end of this generator.
        windows_to_elements = context.windows_to_elements_map()
        context.all_elements_state.clear()

        fired_windows = set()
        _LOGGER.debug('%s - tag %s - timestamp %s', time_domain, timer_tag,
                      timestamp)
        for w, elems in windows_to_elements.items():
            if windows_of_interest is not None and w not in windows_of_interest:
                # windows_of_interest=None means that we care about all windows.
                # If we care only about some windows, and this window is not one of
                # them, then we do not intend to fire this window.
                continue
            window_context = context.for_window(w)
            if self.windowing.triggerfn.should_fire(time_domain, timestamp, w,
                                                    window_context):
                finished = self.windowing.triggerfn.on_fire(
                    timestamp, w, window_context)
                _LOGGER.debug('Firing on window %s. Finished: %s', w, finished)
                fired_windows.add(w)
                if finished:
                    context.finished_windows_state.add(w)
                # TODO(pabloem): Format the output: e.g. pane info
                # Rebinding `elems` only affects this iteration; the snapshot
                # in `windows_to_elements` keeps the original elements.
                elems = [
                    WindowedValue(e.value, e.timestamp, (w, )) for e in elems
                ]
                yield (key, elems)

        finished_windows: typing.Set[BoundedWindow] = set(
            context.finished_windows_state.read())
        # Add elements that were not fired back into state.
        for w, elems in windows_to_elements.items():
            for e in elems:
                # Elements are dropped when their window is finished, or when
                # it fired and the accumulation mode is DISCARDING.
                if (w in finished_windows or
                    (w in fired_windows and self.windowing.accumulation_mode
                     == AccumulationMode.DISCARDING)):
                    continue
                context.all_elements_state.add((w, e))

    @on_timer(PROCESSING_TIME_TIMER)
    def processing_time_trigger(
        self,
        key=DoFn.KeyParam,
        timer_tag=DoFn.DynamicTimerTagParam,
        timestamp=DoFn.TimestampParam,
        latest_processing_time=DoFn.StateParam(LAST_KNOWN_TIME),
        all_elements=DoFn.StateParam(WINDOW_ELEMENT_PAIRS),
        processing_time_timer=DoFn.TimerParam(PROCESSING_TIME_TIMER),
        window_tag_values: BagRuntimeState = DoFn.StateParam(
            WINDOW_TAG_VALUES),  # type: ignore
        finished_windows_state: SetRuntimeState = DoFn.
        StateParam(  # type: ignore
            FINISHED_WINDOWS),
        watermark_timer=DoFn.TimerParam(WATERMARK_TIMER)):
        """Timer callback for ``PROCESSING_TIME_TIMER``.

        Evaluates triggers for all buffered windows in the ``REAL_TIME``
        domain at the timer's timestamp and records that timestamp in the
        last-known-processing-time state.

        Returns:
          The generator created by ``_fire_eligible_windows``.
        """
        # No watermark state is available to this callback's context.
        context = FnRunnerStatefulTriggerContext(
            processing_time_timer=processing_time_timer,
            watermark_timer=watermark_timer,
            latest_processing_time=latest_processing_time,
            latest_watermark=None,
            all_elements_state=all_elements,
            window_tag_values=window_tag_values,
            finished_windows_state=finished_windows_state)
        # Calling the generator function does not run its body yet, so the
        # state update below happens before any window is actually fired.
        result = self._fire_eligible_windows(key, TimeDomain.REAL_TIME,
                                             timestamp, timer_tag, context)
        latest_processing_time.add(timestamp)
        return result

    @on_timer(WATERMARK_TIMER)
    def watermark_trigger(
        self,
        key=DoFn.KeyParam,
        timer_tag=DoFn.DynamicTimerTagParam,
        timestamp=DoFn.TimestampParam,
        latest_watermark=DoFn.StateParam(LAST_KNOWN_WATERMARK),
        all_elements=DoFn.StateParam(WINDOW_ELEMENT_PAIRS),
        processing_time_timer=DoFn.TimerParam(PROCESSING_TIME_TIMER),
        window_tag_values: BagRuntimeState = DoFn.StateParam(
            WINDOW_TAG_VALUES),  # type: ignore
        finished_windows_state: SetRuntimeState = DoFn.
        StateParam(  # type: ignore
            FINISHED_WINDOWS),
        watermark_timer=DoFn.TimerParam(WATERMARK_TIMER)):
        """Timer callback for ``WATERMARK_TIMER``.

        Evaluates triggers for all buffered windows in the ``WATERMARK``
        domain at the timer's timestamp and records that timestamp in the
        last-known-watermark state.

        Returns:
          The generator created by ``_fire_eligible_windows``.
        """
        # No processing-time state is available to this callback's context.
        context = FnRunnerStatefulTriggerContext(
            processing_time_timer=processing_time_timer,
            watermark_timer=watermark_timer,
            latest_processing_time=None,
            latest_watermark=latest_watermark,
            all_elements_state=all_elements,
            window_tag_values=window_tag_values,
            finished_windows_state=finished_windows_state)
        # Calling the generator function does not run its body yet, so the
        # state update below happens before any window is actually fired.
        result = self._fire_eligible_windows(key, TimeDomain.WATERMARK,
                                             timestamp, timer_tag, context)
        latest_watermark.add(timestamp)
        return result