Esempio n. 1
0
def extract_data_stream_stateless_function(udf_proto):
    """
    Unpickles the payload of ``udf_proto`` and picks the callable that matches
    the proto's function type.

    :param udf_proto: the proto representation of the Python :class:`Function`
    :return: a ``(func, user_defined_func)`` pair, where ``user_defined_func`` is
             the unpickled object and ``func`` is the callable to invoke per
             input value; ``func`` is ``None`` if the function type matches none
             of the kinds handled here
    """
    kind = udf_proto.function_type
    kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    udf = pickle.loads(udf_proto.payload)

    if kind == kinds.MAP:
        return udf.map, udf

    if kind == kinds.FLAT_MAP:
        return udf.flat_map, udf

    if kind == kinds.REDUCE:
        reducer = udf.reduce

        def apply_reduce(pair):
            # The two operands arrive packed in a single input value.
            return reducer(pair[0], pair[1])

        return apply_reduce, udf

    if kind == kinds.CO_MAP:
        def apply_co_map(record):
            # record[0] is the input flag: truthy selects map1 on record[1],
            # falsy selects map2 on record[2].
            if record[0]:
                return Row(CoMapFunctionOutputFlag.LEFT.value, udf.map1(record[1]))
            return Row(CoMapFunctionOutputFlag.RIGHT.value, udf.map2(record[2]))

        return apply_co_map, udf

    if kind == kinds.CO_FLAT_MAP:
        def apply_co_flat_map(record):
            flags = CoFlatMapFunctionOutputFlag
            if record[0]:
                outputs = udf.flat_map1(record[1])
                data_flag, end_flag = flags.LEFT, flags.LEFT_END
            else:
                outputs = udf.flat_map2(record[2])
                data_flag, end_flag = flags.RIGHT, flags.RIGHT_END
            if outputs:
                for item in outputs:
                    yield Row(data_flag.value, item)
            # An end marker row is always emitted, even when there was no output.
            yield Row(end_flag.value, None)

        return apply_co_flat_map, udf

    if kind == kinds.TIMESTAMP_ASSIGNER:
        assign_timestamp = udf.extract_timestamp

        def apply_timestamp_assigner(record):
            previous_timestamp, element = record[0], record[1]
            return assign_timestamp(element, previous_timestamp)

        return apply_timestamp_assigner, udf

    return None, udf
Esempio n. 2
0
def load_aggregate_function(payload):
    """
    Creates an aggregate function instance from ``payload``.

    When ``is_built_in_function(payload)`` holds, the bytes after the first one
    are decoded as UTF-8 and used as the name of a class looked up on
    ``functions``, which is then instantiated without arguments. Otherwise the
    payload is unpickled.

    :param payload: the serialized function
    :return: the aggregate function instance
    """
    if not is_built_in_function(payload):
        # NOTE: unpickling executes arbitrary code; payload must be trusted.
        return pickle.loads(payload)

    class_name = payload[1:].decode("utf-8")
    return getattr(functions, class_name)()
Esempio n. 3
0
def extract_user_defined_function(user_defined_function_proto, pandas_udaf=False)\
        -> Tuple[str, Dict, List]:
    """
    Extracts user-defined-function from the proto representation of a
    :class:`UserDefinedFunction`.

    :param user_defined_function_proto: the proto representation of the Python
                                        :class:`UserDefinedFunction`
    :param pandas_udaf: whether the user_defined_function_proto is pandas udaf
    :return: a tuple of the call expression string (``"fN(args)"``), the dict of
             variables that expression refers to, and the list of function
             objects involved (this function first, then those of its inputs)
    """
    global _func_num

    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    udf = pickle.loads(user_defined_function_proto.payload)
    if pandas_udaf:
        udf = PandasAggregateFunctionWrapper(udf)

    # Every extracted function gets a unique name from the module-level counter.
    _func_num = _func_num + 1
    func_name = 'f%s' % _func_num

    # Delegating wrappers expose the callable as ``func``, everything else as ``eval``.
    is_delegating = isinstance(udf, DelegatingScalarFunction) \
        or isinstance(udf, DelegationTableFunction)
    callee = udf.func if is_delegating else udf.eval

    func_args, input_variables, input_funcs = _extract_input(
        user_defined_function_proto.inputs)

    variable_dict = {func_name: callee}
    variable_dict.update(input_variables)
    return "%s(%s)" % (func_name, func_args), variable_dict, [udf, *input_funcs]
Esempio n. 4
0
def extract_user_defined_function(user_defined_function_proto, pandas_udaf=False)\
        -> Tuple[str, Dict, List]:
    """
    Extracts user-defined-function from the proto representation of a
    :class:`UserDefinedFunction`.

    :param user_defined_function_proto: the proto representation of the Python
                                        :class:`UserDefinedFunction`
    :param pandas_udaf: whether the user_defined_function_proto is pandas udaf
    :return: a tuple of the call expression string (``"fN(args)"``), the dict of
             variables that expression refers to, and the list of function
             objects involved (this function first, then those of its inputs)
    """
    global _func_num

    def _extract_input(args) -> Tuple[str, Dict, List]:
        # Translates each input into an expression string and gathers the
        # variables and functions those expressions depend on.
        expressions = []
        variables = {}
        funcs = []
        for arg in args:
            if arg.HasField("udf"):
                # chained Python UDF: the argument is itself a function call
                nested_call, nested_variables, nested_funcs = \
                    extract_user_defined_function(arg.udf)
                expressions.append(nested_call)
                variables.update(nested_variables)
                funcs.extend(nested_funcs)
                continue
            if arg.HasField("inputOffset"):
                # the argument is a column of the input row
                expressions.append("value[%s]" % arg.inputOffset)
                continue
            # otherwise the argument is a constant value
            constant_name, constant = _parse_constant_value(arg.inputConstant)
            expressions.append(constant_name)
            variables[constant_name] = constant
        return ",".join(expressions), variables, funcs

    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    udf = pickle.loads(user_defined_function_proto.payload)
    if pandas_udaf:
        udf = PandasAggregateFunctionWrapper(udf)

    # Every extracted function gets a unique name from the module-level counter.
    _func_num = _func_num + 1
    func_name = 'f%s' % _func_num

    # Delegating wrappers expose the callable as ``func``, everything else as ``eval``.
    is_delegating = isinstance(udf, DelegatingScalarFunction) \
        or isinstance(udf, DelegationTableFunction)
    callee = udf.func if is_delegating else udf.eval

    func_args, input_variables, input_funcs = _extract_input(
        user_defined_function_proto.inputs)

    variable_dict = {func_name: callee}
    variable_dict.update(input_variables)
    return "%s(%s)" % (func_name, func_args), variable_dict, [udf, *input_funcs]
Esempio n. 5
0
def extract_data_stream_stateless_function(udf_proto, runtime_context):
    """
    Unpickles the payload of ``udf_proto`` and builds the per-value callable for
    its function type together with open/close hooks.

    :param udf_proto: the proto representation of the Python :class:`Function`
    :param runtime_context: the streaming runtime context, handed to the user
                            function's ``open`` method if it has one
    :return: a ``(func, open_func, close_func)`` triple; ``func`` is ``None`` if
             the function type matches none of the kinds handled here
    """
    kind = udf_proto.function_type
    kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    udf = pickle.loads(udf_proto.payload)

    def open_func():
        if not hasattr(udf, "open"):
            return
        udf.open(runtime_context)

    def close_func():
        if not hasattr(udf, "close"):
            return
        udf.close()

    if kind == kinds.MAP:
        return udf.map, open_func, close_func

    if kind == kinds.FLAT_MAP:
        return udf.flat_map, open_func, close_func

    if kind == kinds.CO_MAP:
        def apply_co_map(record):
            # record[0] is the input flag: truthy selects map1 on record[1],
            # falsy selects map2 on record[2].
            if record[0]:
                return udf.map1(record[1])
            return udf.map2(record[2])

        return apply_co_map, open_func, close_func

    if kind == kinds.CO_FLAT_MAP:
        def apply_co_flat_map(record):
            if record[0]:
                outputs = udf.flat_map1(record[1])
            else:
                outputs = udf.flat_map2(record[2])
            yield from outputs

        return apply_co_flat_map, open_func, close_func

    if kind == kinds.TIMESTAMP_ASSIGNER:
        assign_timestamp = udf.extract_timestamp

        def apply_timestamp_assigner(record):
            previous_timestamp, element = record[0], record[1]
            return assign_timestamp(element, previous_timestamp)

        return apply_timestamp_assigner, open_func, close_func

    return None, open_func, close_func
Esempio n. 6
0
def extract_data_stream_stateless_function(udf_proto):
    """
    Unpickles the payload of ``udf_proto`` and picks the callable that matches
    the proto's function type.

    :param udf_proto: the proto representation of the Python :class:`Function`
    :return: a ``(func, user_defined_func)`` pair, where ``user_defined_func`` is
             the unpickled object and ``func`` is the callable to invoke per
             input value; ``func`` is ``None`` if the function type matches none
             of the kinds handled here
    """
    kind = udf_proto.function_type
    kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    udf = pickle.loads(udf_proto.payload)

    if kind == kinds.MAP:
        return udf.map, udf

    if kind == kinds.FLAT_MAP:
        return udf.flat_map, udf

    if kind == kinds.REDUCE:
        reducer = udf.reduce

        def apply_reduce(pair):
            # The two operands arrive packed in a single input value.
            return reducer(pair[0], pair[1])

        return apply_reduce, udf

    if kind == kinds.CO_MAP:
        def apply_co_map(record):
            # record[0] is the input flag: truthy selects map1 on record[1],
            # falsy selects map2 on record[2].
            if record[0]:
                return udf.map1(record[1])
            return udf.map2(record[2])

        return apply_co_map, udf

    if kind == kinds.CO_FLAT_MAP:
        def apply_co_flat_map(record):
            # Unlike a generator, this hands back the user function's result as-is.
            if record[0]:
                return udf.flat_map1(record[1])
            return udf.flat_map2(record[2])

        return apply_co_flat_map, udf

    if kind == kinds.TIMESTAMP_ASSIGNER:
        assign_timestamp = udf.extract_timestamp

        def apply_timestamp_assigner(record):
            previous_timestamp, element = record[0], record[1]
            # The element is returned alongside its newly extracted timestamp.
            return Row(assign_timestamp(element, previous_timestamp), element)

        return apply_timestamp_assigner, udf

    return None, udf
Esempio n. 7
0
def extract_process_function(user_defined_function_proto, ctx):
    """
    Unpickles a process function and wraps its ``process_element`` method.

    :param user_defined_function_proto: proto whose ``payload`` holds the pickled
                                        process function
    :param ctx: context object updated with timestamp and watermark before each
                element is processed, and passed on to ``process_element``
    :return: a ``(wrapped_process_function, process_function)`` pair
    """
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    process_function = pickle.loads(user_defined_function_proto.payload)
    handle_element = process_function.process_element

    def wrapped_process_function(value):
        # value layout: [CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
        ctx.set_timestamp(value[0])
        timer_service = ctx.timer_service()
        timer_service.set_current_watermark(value[1])
        return handle_element(value[2], ctx)

    return wrapped_process_function, process_function
Esempio n. 8
0
def extract_keyed_process_function(user_defined_function_proto, ctx,
                                   on_timer_ctx, collector,
                                   keyed_state_backend):
    """
    Unpickles a keyed process function and wraps its ``process_element`` and
    ``on_timer`` methods into a single generator function.

    :param user_defined_function_proto: proto whose ``payload`` holds the pickled
                                        process function
    :param ctx: context passed to ``process_element``
    :param on_timer_ctx: context passed to ``on_timer``
    :param collector: object whose ``buf`` entries are emitted after each input
                      and which is cleared afterwards
    :param keyed_state_backend: state backend whose current key is set per input
    :return: a ``(wrapped_keyed_process_function, process_function)`` pair
    """
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    process_function = pickle.loads(user_defined_function_proto.payload)
    handle_element = process_function.process_element
    handle_timer = process_function.on_timer

    def fire_timer(record):
        # record: TIMER_FLAG, TIMESTAMP_OF_TIMER, CURRENT_WATERMARK, CURRENT_KEY_OF_TIMER, None
        timer_flag = record[0]
        timestamp = record[1]
        on_timer_ctx.set_timestamp(timestamp)
        on_timer_ctx.timer_service().set_current_watermark(record[2])
        key = record[3]
        on_timer_ctx.set_current_key(key)
        keyed_state_backend.set_current_key(key)
        if timer_flag == KeyedProcessFunctionInputFlag.EVENT_TIME_TIMER.value:
            domain = TimeDomain.EVENT_TIME
        elif timer_flag == KeyedProcessFunctionInputFlag.PROC_TIME_TIMER.value:
            domain = TimeDomain.PROCESSING_TIME
        else:
            raise TypeError("TimeCharacteristic[%s] is not supported." %
                            str(timer_flag))
        on_timer_ctx.set_time_domain(domain)
        return handle_timer(timestamp, on_timer_ctx)

    def process_data(record):
        # record: TIMER_FLAG, CURRENT_TIMESTAMP, CURRENT_WATERMARK, None, NORMAL_DATA
        # NORMAL_DATA: CURRENT_KEY, DATA
        ctx.set_timestamp(record[1])
        ctx.timer_service().set_current_watermark(record[2])
        key = record[4][0]
        ctx.set_current_key(key)
        # The state backend receives the key wrapped in a Row here.
        keyed_state_backend.set_current_key(Row(key))
        return handle_element(record[4][1], ctx)

    def wrapped_keyed_process_function(value):
        # A timer flag of None marks normal data; anything else is a timer.
        outputs = process_data(value) if value[0] is None else fire_timer(value)

        if outputs:
            for item in outputs:
                yield Row(None, None, None, item)

        # Forward whatever was buffered in the collector while processing.
        # row layout: [TIMER_FLAG, TIMER TYPE, TIMER_KEY, RESULT_DATA]
        for buffered in collector.buf:
            yield Row(buffered[0], buffered[1], buffered[2], None)

        collector.clear()

    return wrapped_keyed_process_function, process_function
Esempio n. 9
0
def extract_process_function(user_defined_function_proto, ctx, collector):
    """
    Unpickles a process function and wraps its ``process_element`` method into a
    generator function that emits what was buffered in ``collector``.

    :param user_defined_function_proto: proto whose ``payload`` holds the pickled
                                        process function
    :param ctx: context object updated with timestamp and watermark before each
                element is processed, and passed on to ``process_element``
    :param collector: passed to ``process_element``; afterwards the second item
                      of every entry in its ``buf`` is yielded and it is cleared
    :return: a ``(wrapped_process_function, process_function)`` pair
    """
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    process_function = pickle.loads(user_defined_function_proto.payload)
    handle_element = process_function.process_element

    def wrapped_process_function(value):
        # value layout: [CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
        ctx.set_timestamp(value[0])
        timer_service = ctx.timer_service()
        timer_service.set_current_watermark(value[1])
        handle_element(value[2], ctx, collector)

        yield from (entry[1] for entry in collector.buf)
        collector.clear()

    return wrapped_process_function, process_function
Esempio n. 10
0
def extract_user_defined_process_function(user_defined_function_proto, ctx,
                                          on_timer_ctx, collector,
                                          keyed_state_backend):
    """
    Unpickles a process function and wraps its ``process_element`` and
    ``on_timer`` methods into a single generator function.

    :param user_defined_function_proto: proto whose ``payload`` holds the pickled
                                        process function
    :param ctx: context passed to ``process_element``
    :param on_timer_ctx: context passed to ``on_timer``
    :param collector: passed to both user methods; its ``buf`` entries are
                      emitted after each input and it is cleared afterwards
    :param keyed_state_backend: state backend whose current key is set per input
    :return: a ``(wrapped_func, proc_func)`` pair
    """
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    proc_func = pickle.loads(user_defined_function_proto.payload)
    handle_element = proc_func.process_element
    handle_timer = proc_func.on_timer

    def wrapped_func(value):
        # value layout: [TIMER_FLAG, TIMER_VALUE, CURRENT_WATERMARK, TIMER_KEY, NORMAL_DATA]
        watermark = value[2]
        for context in (ctx, on_timer_ctx):
            context.timer_service()._current_watermark = watermark

        timer_flag = value[0]
        if timer_flag is None:
            # normal data; NORMAL_DATA layout: [CURRENT_KEY, DATA]
            keyed_state_backend.set_current_key(Row(value[4][0]))
            handle_element(value[4][1], ctx, collector)
        else:
            # timer data
            keyed_state_backend.set_current_key(value[3])
            if timer_flag == 0:
                domain = TimeDomain.EVENT_TIME
            elif timer_flag == 1:
                domain = TimeDomain.PROCESSING_TIME
            else:
                raise TypeError("TimeCharacteristic[%s] is not supported." %
                                str(timer_flag))
            on_timer_ctx._time_domain = domain
            handle_timer(value[1], on_timer_ctx, collector)

        # Output row layout: [TIMER_FLAG, TIMER TYPE, TIMER_KEY, RESULT_DATA].
        # Buffer entries tagged 2 carry result data; all others are forwarded
        # with their first three fields and no result data.
        for entry in collector.buf:
            if entry[0] == 2:
                yield Row(None, None, None, entry[1])
            else:
                yield Row(entry[0], entry[1], entry[2], None)
        collector.clear()

    return wrapped_func, proc_func
Esempio n. 11
0
def extract_user_defined_aggregate_function(user_defined_function_proto):
    """
    Unpickles an aggregate function and builds the callable that extracts its
    arguments from an input value.

    :param user_defined_function_proto: proto carrying the pickled
                                        :class:`AggregateFunction` in ``payload``
                                        and its argument descriptions in ``inputs``
    :return: a pair of the aggregate function and a function mapping ``value`` to
             the list of arguments (input columns and constants)
    """
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    aggregate = pickle.loads(user_defined_function_proto.payload)
    assert isinstance(aggregate, AggregateFunction)

    constants = {}
    arg_exprs = []
    for arg in user_defined_function_proto.inputs:
        if not arg.HasField("inputOffset"):
            # a constant: referenced by name, with the value supplied via globals
            name, constant = _parse_constant_value(arg.inputConstant)
            constants[name] = constant
            arg_exprs.append(name)
            continue
        # a column of the input row
        arg_exprs.append("value[%s]" % arg.inputOffset)

    # The expressions are assembled from offsets and generated constant names.
    args_extractor = eval("lambda value : [%s]" % ",".join(arg_exprs), constants)
    return aggregate, args_extractor
Esempio n. 12
0
def extract_user_defined_aggregate_function(
        current_index, user_defined_function_proto,
        distinct_info_dict: Dict[Tuple[List[str]], Tuple[List[int],
                                                         List[int]]]):
    """
    Unpickles an aggregate function, builds the callable that extracts its
    arguments from an input value and records distinct-aggregation bookkeeping.

    :param current_index: index of this aggregate function
    :param user_defined_function_proto: proto carrying the pickled
                                        :class:`AggregateFunction` together with
                                        its ``inputs``, ``distinct`` and
                                        ``filter_arg`` fields
    :param distinct_info_dict: maps a tuple of argument expressions to a pair of
                               lists (aggregate indexes, filter args); updated in
                               place when the function is distinct
    :return: a tuple of the aggregate function, a function mapping ``value`` to
             the tuple of arguments, the filter arg, and the distinct index
             (``-1`` when the function is not distinct, otherwise the index of
             the first aggregate registered with the same argument expressions)
    """
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    aggregate = pickle.loads(user_defined_function_proto.payload)
    assert isinstance(aggregate, AggregateFunction)

    constants = {}
    arg_exprs = []
    for arg in user_defined_function_proto.inputs:
        if arg.HasField("inputOffset"):
            # a column of the input row
            arg_exprs.append("value[%s]" % arg.inputOffset)
            continue
        # a constant: reuse the name of an equal constant seen earlier, if any
        fresh_name, constant = _parse_constant_value(arg.inputConstant)
        name = next(
            (known for known, known_value in constants.items() if known_value == constant),
            fresh_name)
        if name not in constants:
            constants[name] = constant
        arg_exprs.append(name)

    distinct_index = -1
    if user_defined_function_proto.distinct:
        signature = tuple(arg_exprs)
        if signature in distinct_info_dict:
            entry = distinct_info_dict[signature]
            entry[0].append(current_index)
            entry[1].append(user_defined_function_proto.filter_arg)
            distinct_index = entry[0][0]
        else:
            distinct_info_dict[signature] = \
                ([current_index], [user_defined_function_proto.filter_arg])
            distinct_index = current_index

    # The expressions are assembled from offsets and generated constant names.
    args_extractor = eval("lambda value : (%s,)" % ",".join(arg_exprs), constants)
    return aggregate, \
        args_extractor, \
        user_defined_function_proto.filter_arg, \
        distinct_index
Esempio n. 13
0
def extract_process_function(user_defined_function_proto, ctx,
                             runtime_context):
    """
    Unpickles a process function and wraps its ``process_element`` method,
    together with open/close hooks.

    :param user_defined_function_proto: proto whose ``payload`` holds the pickled
                                        process function
    :param ctx: context object updated with timestamp and watermark before each
                element is processed, and passed on to ``process_element``
    :param runtime_context: handed to the process function's ``open`` method if
                            it has one
    :return: a ``(wrapped_process_function, open_func, close_func)`` triple
    """
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    process_function = pickle.loads(user_defined_function_proto.payload)
    handle_element = process_function.process_element

    def open_func():
        if not hasattr(process_function, "open"):
            return
        process_function.open(runtime_context)

    def close_func():
        if not hasattr(process_function, "close"):
            return
        process_function.close()

    def wrapped_process_function(value):
        # value layout: [CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
        ctx.set_timestamp(value[0])
        timer_service = ctx.timer_service()
        timer_service.advance_watermark(value[1])
        return handle_element(value[2], ctx)

    return wrapped_process_function, open_func, close_func
Esempio n. 14
0
def extract_one_input_process_function(function_urn,
                                       user_defined_function_proto,
                                       runtime_context, function_context):
    """
    Unpickles a one-input process function and builds its open, close and
    per-element callables.

    :param function_urn: URN identifying the kind of function; only
                         ``DATA_STREAM_STATELESS_FUNCTION_URN`` is handled
    :param user_defined_function_proto: proto whose ``payload`` holds the pickled
                                        user function
    :param runtime_context: handed to the user function's ``open`` method if it
                            has one
    :param function_context: wrapped into an
                             :class:`InternalProcessFunctionContext` that is
                             passed to ``process_element``
    :return: an ``(open_func, close_func, process_element_func)`` triple
    :raises ValueError: if ``function_urn`` is not a handled URN
    """
    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    user_defined_func = pickle.loads(user_defined_function_proto.payload)

    def open_func():
        if hasattr(user_defined_func, "open"):
            user_defined_func.open(runtime_context)

    def close_func():
        if hasattr(user_defined_func, "close"):
            user_defined_func.close()

    process_element = user_defined_func.process_element

    if function_urn == DATA_STREAM_STATELESS_FUNCTION_URN:
        context = InternalProcessFunctionContext(function_context)
    else:
        # Without this, ``context`` would be unbound and the failure would only
        # surface as a NameError once the first element is processed.
        raise ValueError("Unknown function urn {0}.".format(function_urn))

    def process_element_func(value):
        yield from process_element(value, context)

    return open_func, close_func, process_element_func
Esempio n. 15
0
def extract_process_function(
        user_defined_function_proto, j_runtime_context, j_function_context, j_timer_context,
        j_side_output_context, job_parameters, j_keyed_state_backend, j_operator_state_backend):
    """
    Unpickles the user function carried by ``user_defined_function_proto`` and
    builds the operation object matching its function type.

    Results of the process/keyed/co-process/window handlers are routed through an
    inner ``process_func``: when a side output context is given, ``(OutputTag,
    value)`` tuples are sent to it and everything else is yielded. A user
    function that returns ``None`` is treated as having produced no output.

    :param user_defined_function_proto: proto with ``payload``, ``function_type``
                                        and (for keyed kinds) ``key_type_info``
    :param j_runtime_context: used to create the :class:`StreamingRuntimeContext`
    :param j_function_context: wrapped into the context passed to the element
                               handlers
    :param j_timer_context: wrapped into the context passed to timer handlers
    :param j_side_output_context: if truthy, tagged outputs are collected into it
    :param job_parameters: used to create the :class:`StreamingRuntimeContext`
    :param j_keyed_state_backend: wrapped into a :class:`KeyedStateBackend` for
                                  keyed and window functions
    :param j_operator_state_backend: passed to the broadcast contexts
    :return: a :class:`OneInputOperation` or :class:`TwoInputOperation`
    :raises Exception: if the function type is not one of the handled kinds
    """
    from pyflink.fn_execution import flink_fn_execution_pb2
    UserDefinedDataStreamFunction = flink_fn_execution_pb2.UserDefinedDataStreamFunction

    # NOTE: the payload is unpickled as-is; it must come from a trusted source.
    user_defined_func = pickle.loads(user_defined_function_proto.payload)
    func_type = user_defined_function_proto.function_type

    runtime_context = StreamingRuntimeContext.of(j_runtime_context, job_parameters)

    if j_side_output_context:
        side_output_context = SideOutputContext(j_side_output_context)

        def process_func(values):
            # A user function that emits nothing returns None, not an iterable.
            if values is None:
                return
            for value in values:
                if isinstance(value, tuple) and isinstance(value[0], OutputTag):
                    output_tag = value[0]  # type: OutputTag
                    side_output_context.collect(output_tag.tag_id, value[1])
                else:
                    yield value
    else:
        def process_func(values):
            # A user function that emits nothing returns None, not an iterable.
            if values is None:
                return
            yield from values

    def open_func():
        if hasattr(user_defined_func, "open"):
            user_defined_func.open(runtime_context)

    def close_func():
        if hasattr(user_defined_func, "close"):
            user_defined_func.close()

    if func_type == UserDefinedDataStreamFunction.PROCESS:
        function_context = InternalProcessFunctionContext(j_function_context)
        process_element = user_defined_func.process_element

        def process_element_func(value):
            yield from process_func(process_element(value, function_context))

        return OneInputOperation(open_func, close_func, process_element_func)

    elif func_type == UserDefinedDataStreamFunction.KEYED_PROCESS:
        function_context = InternalKeyedProcessFunctionContext(
            j_function_context, user_defined_function_proto.key_type_info)
        timer_context = InternalKeyedProcessFunctionOnTimerContext(
            j_timer_context, user_defined_function_proto.key_type_info)

        keyed_state_backend = KeyedStateBackend(
            function_context,
            j_keyed_state_backend)
        runtime_context.set_keyed_state_backend(keyed_state_backend)

        process_element = user_defined_func.process_element
        on_timer = user_defined_func.on_timer

        def process_element_func(value):
            # Only the second item of the keyed input is handed to the user function.
            yield from process_func(process_element(value[1], function_context))

        def on_timer_func(timestamp):
            yield from process_func(on_timer(timestamp, timer_context))

        return OneInputOperation(open_func, close_func, process_element_func, on_timer_func)

    elif func_type == UserDefinedDataStreamFunction.CO_PROCESS:
        function_context = InternalProcessFunctionContext(j_function_context)

        process_element1 = user_defined_func.process_element1
        process_element2 = user_defined_func.process_element2

        def process_element_func1(value):
            yield from process_func(process_element1(value, function_context))

        def process_element_func2(value):
            yield from process_func(process_element2(value, function_context))

        return TwoInputOperation(
            open_func, close_func, process_element_func1, process_element_func2)

    elif func_type == UserDefinedDataStreamFunction.CO_BROADCAST_PROCESS:
        broadcast_ctx = InternalBroadcastProcessFunctionContext(
            j_function_context, j_operator_state_backend)
        read_only_broadcast_ctx = InternalBroadcastProcessFunctionReadOnlyContext(
            j_function_context, j_operator_state_backend)

        process_element = user_defined_func.process_element
        process_broadcast_element = user_defined_func.process_broadcast_element

        # The broadcast handlers yield results directly, bypassing process_func.
        def process_element_func1(value):
            elements = process_element(value, read_only_broadcast_ctx)
            if elements:
                yield from elements

        def process_element_func2(value):
            elements = process_broadcast_element(value, broadcast_ctx)
            if elements:
                yield from elements

        return TwoInputOperation(
            open_func, close_func, process_element_func1, process_element_func2)

    elif func_type == UserDefinedDataStreamFunction.KEYED_CO_PROCESS:
        function_context = InternalKeyedProcessFunctionContext(
            j_function_context, user_defined_function_proto.key_type_info)
        timer_context = InternalKeyedProcessFunctionOnTimerContext(
            j_timer_context, user_defined_function_proto.key_type_info)

        keyed_state_backend = KeyedStateBackend(
            function_context,
            j_keyed_state_backend)
        runtime_context.set_keyed_state_backend(keyed_state_backend)

        process_element1 = user_defined_func.process_element1
        process_element2 = user_defined_func.process_element2
        on_timer = user_defined_func.on_timer

        def process_element_func1(value):
            yield from process_func(process_element1(value[1], function_context))

        def process_element_func2(value):
            yield from process_func(process_element2(value[1], function_context))

        def on_timer_func(timestamp):
            yield from process_func(on_timer(timestamp, timer_context))

        return TwoInputOperation(
            open_func, close_func, process_element_func1, process_element_func2, on_timer_func)

    elif func_type == UserDefinedDataStreamFunction.KEYED_CO_BROADCAST_PROCESS:
        broadcast_ctx = InternalKeyedBroadcastProcessFunctionContext(
            j_function_context, j_operator_state_backend)
        read_only_broadcast_ctx = InternalKeyedBroadcastProcessFunctionReadOnlyContext(
            j_function_context, user_defined_function_proto.key_type_info, j_operator_state_backend)
        timer_context = InternalKeyedBroadcastProcessFunctionOnTimerContext(
            j_timer_context, user_defined_function_proto.key_type_info, j_operator_state_backend)

        process_element = user_defined_func.process_element
        process_broadcast_element = user_defined_func.process_broadcast_element
        on_timer = user_defined_func.on_timer

        # The broadcast handlers yield results directly, bypassing process_func.
        def process_element_func1(value):
            elements = process_element(value[1], read_only_broadcast_ctx)
            if elements:
                yield from elements

        def process_element_func2(value):
            elements = process_broadcast_element(value, broadcast_ctx)
            if elements:
                yield from elements

        def on_timer_func(timestamp):
            # Guarded like the element handlers above: on_timer may emit nothing.
            elements = on_timer(timestamp, timer_context)
            if elements:
                yield from elements

        return TwoInputOperation(
            open_func, close_func, process_element_func1, process_element_func2, on_timer_func)

    elif func_type == UserDefinedDataStreamFunction.WINDOW:

        # For window functions the unpickled payload is the operation descriptor.
        window_operation_descriptor = (
            user_defined_func
        )  # type: WindowOperationDescriptor

        def user_key_selector(normal_data):
            return normal_data

        window_assigner = window_operation_descriptor.assigner
        window_trigger = window_operation_descriptor.trigger
        allowed_lateness = window_operation_descriptor.allowed_lateness
        late_data_output_tag = window_operation_descriptor.late_data_output_tag
        window_state_descriptor = window_operation_descriptor.window_state_descriptor
        internal_window_function = window_operation_descriptor.internal_window_function
        window_serializer = window_operation_descriptor.window_serializer
        window_coder = window_serializer._get_coder()

        # The converter is chosen from the coder type; anything that is neither a
        # time nor a count window coder falls back to the global window converter.
        if isinstance(window_coder, TimeWindowCoder):
            window_converter = TimeWindowConverter()
        elif isinstance(window_coder, CountWindowCoder):
            window_converter = CountWindowConverter()
        else:
            window_converter = GlobalWindowConverter()

        internal_timer_service = InternalTimerServiceImpl(
            j_timer_context.timerService(), window_converter)

        function_context = InternalKeyedProcessFunctionContext(
            j_function_context,
            user_defined_function_proto.key_type_info)
        window_timer_context = InternalWindowTimerContext(
            j_timer_context,
            user_defined_function_proto.key_type_info,
            window_converter)

        keyed_state_backend = KeyedStateBackend(
            function_context,
            j_keyed_state_backend,
            j_function_context.getWindowSerializer(),
            window_converter)
        runtime_context.set_keyed_state_backend(keyed_state_backend)

        window_operator = WindowOperator(
            window_assigner,
            keyed_state_backend,
            user_key_selector,
            window_state_descriptor,
            internal_window_function,
            window_trigger,
            allowed_lateness,
            late_data_output_tag)

        # The window operator replaces the generic open/close hooks defined above.
        def open_func():
            window_operator.open(runtime_context, internal_timer_service)

        def close_func():
            window_operator.close()

        def process_element_func(value):
            yield from process_func(
                window_operator.process_element(value[1], function_context.timestamp()))

        if window_assigner.is_event_time():
            def on_timer_func(timestamp):
                window = window_timer_context.window()
                key = window_timer_context.get_current_key()
                yield from process_func(window_operator.on_event_time(timestamp, key, window))
        else:
            def on_timer_func(timestamp):
                window = window_timer_context.window()
                key = window_timer_context.get_current_key()
                yield from process_func(window_operator.on_processing_time(timestamp, key, window))

        return OneInputOperation(open_func, close_func, process_element_func, on_timer_func)

    else:
        raise Exception("Unknown function type {0}.".format(func_type))
Esempio n. 16
0
def extract_stateful_function(user_defined_function_proto,
                              runtime_context: RuntimeContext,
                              keyed_state_backend: RemoteKeyedStateBackend):
    """
    Builds the callbacks which drive a keyed (stateful) DataStream function.

    The payload of ``user_defined_function_proto`` is unpickled and, depending on the
    function type, wrapped as a keyed process function, a keyed co-process function or
    a window operator.

    :param user_defined_function_proto: the proto representation of the Python function;
                                        its ``function_type`` and ``payload`` are read
    :param runtime_context: the runtime context handed over when the function is opened
    :param keyed_state_backend: the keyed state backend; its current key is switched
                                before an element or a timer is processed
    :return: the tuple ``(open_func, close_func, process_element_func,
             process_timer_func, internal_timer_service)``
    :raises Exception: if the function type is none of KEYED_PROCESS, KEYED_CO_PROCESS
                       and WINDOW
    """
    func_type = user_defined_function_proto.function_type
    user_defined_func = pickle.loads(user_defined_function_proto.payload)
    internal_timer_service = InternalTimerServiceImpl(keyed_state_backend)

    # The selectors below operate on keyed records laid out as [KEY, VALUE].

    def state_key_selector(normal_data):
        # the key handed to the state backend is the user key wrapped into a Row
        return Row(normal_data[0])

    def user_key_selector(normal_data):
        return normal_data[0]

    def input_selector(normal_data):
        return normal_data[1]

    UserDefinedDataStreamFunction = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    if func_type in (UserDefinedDataStreamFunction.KEYED_PROCESS,
                     UserDefinedDataStreamFunction.KEYED_CO_PROCESS):
        timer_service = TimerServiceImpl(internal_timer_service)
        # ctx is passed to the element callbacks, on_timer_ctx to on_timer
        ctx = InternalKeyedProcessFunctionContext(timer_service)
        on_timer_ctx = InternalKeyedProcessFunctionOnTimerContext(
            timer_service)
        process_function = user_defined_func
        internal_timer_service.set_namespace_serializer(
            VoidNamespaceSerializer())

        def open_func():
            if hasattr(process_function, "open"):
                process_function.open(runtime_context)

        def close_func():
            if hasattr(process_function, "close"):
                process_function.close()

        # The namespace argument of the two timer callbacks is not used in this branch.

        def on_event_time(timestamp: int, key, namespace):
            keyed_state_backend.set_current_key(key)
            return _on_timer(TimeDomain.EVENT_TIME, timestamp, key)

        def on_processing_time(timestamp: int, key, namespace):
            keyed_state_backend.set_current_key(key)
            return _on_timer(TimeDomain.PROCESSING_TIME, timestamp, key)

        def _on_timer(time_domain: TimeDomain, timestamp: int, key):
            # key is the key given to the state backend; its first field is the user key
            user_current_key = user_key_selector(key)

            on_timer_ctx.set_timestamp(timestamp)
            on_timer_ctx.set_current_key(user_current_key)
            on_timer_ctx.set_time_domain(time_domain)

            return process_function.on_timer(timestamp, on_timer_ctx)

        if func_type == UserDefinedDataStreamFunction.KEYED_PROCESS:

            def process_element(normal_data, timestamp: int):
                ctx.set_timestamp(timestamp)
                ctx.set_current_key(user_key_selector(normal_data))
                keyed_state_backend.set_current_key(
                    state_key_selector(normal_data))
                return process_function.process_element(
                    input_selector(normal_data), ctx)

        elif func_type == UserDefinedDataStreamFunction.KEYED_CO_PROCESS:

            def process_element(normal_data, timestamp: int):
                # normal_data: [IS_LEFT, LEFT_INPUT, RIGHT_INPUT]
                is_left = normal_data[0]
                if is_left:
                    user_input = normal_data[1]
                else:
                    user_input = normal_data[2]

                ctx.set_timestamp(timestamp)
                current_user_key = user_key_selector(user_input)
                # ctx is the context passed to process_element1/process_element2 below,
                # so the current key has to be set on it; previously only on_timer_ctx
                # was updated and ctx never received the key of the element.
                ctx.set_current_key(current_user_key)
                on_timer_ctx.set_current_key(current_user_key)
                keyed_state_backend.set_current_key(
                    state_key_selector(user_input))

                if is_left:
                    return process_function.process_element1(
                        input_selector(user_input), ctx)
                else:
                    return process_function.process_element2(
                        input_selector(user_input), ctx)

        else:
            raise Exception("Unsupported func_type: " + str(func_type))

    elif func_type == UserDefinedDataStreamFunction.WINDOW:
        window_operation_descriptor = user_defined_func
        window_assigner = window_operation_descriptor.assigner
        window_trigger = window_operation_descriptor.trigger
        allowed_lateness = window_operation_descriptor.allowed_lateness
        window_state_descriptor = window_operation_descriptor.window_state_descriptor
        internal_window_function = window_operation_descriptor.internal_window_function
        window_serializer = window_operation_descriptor.window_serializer
        # the window serves as the namespace, so the state backend and the timer
        # service are both configured with the window coder / serializer
        window_coder = window_serializer._get_coder()
        keyed_state_backend.namespace_coder = window_coder
        keyed_state_backend._namespace_coder_impl = window_coder.get_impl()
        window_operator = WindowOperator(window_assigner, keyed_state_backend,
                                         user_key_selector,
                                         window_state_descriptor,
                                         internal_window_function,
                                         window_trigger, allowed_lateness)
        internal_timer_service.set_namespace_serializer(window_serializer)

        def open_func():
            window_operator.open(runtime_context, internal_timer_service)

        def close_func():
            window_operator.close()

        def process_element(normal_data, timestamp: int):
            keyed_state_backend.set_current_key(
                state_key_selector(normal_data))
            return window_operator.process_element(input_selector(normal_data),
                                                   timestamp)

        def on_event_time(timestamp: int, key, namespace):
            keyed_state_backend.set_current_key(key)
            return window_operator.on_event_time(timestamp, key, namespace)

        def on_processing_time(timestamp: int, key, namespace):
            keyed_state_backend.set_current_key(key)
            return window_operator.on_processing_time(timestamp, key,
                                                      namespace)

    else:
        raise Exception("Unsupported function_type: " + str(func_type))

    # Elements and timers reach the callbacks defined above through these two handlers.
    input_handler = RunnerInputHandler(internal_timer_service, process_element)
    process_element_func = input_handler.process_element

    timer_handler = TimerHandler(internal_timer_service, on_event_time,
                                 on_processing_time,
                                 keyed_state_backend._namespace_coder_impl)
    process_timer_func = timer_handler.process_timer

    return open_func, close_func, process_element_func, process_timer_func, internal_timer_service
Esempio n. 17
0
def extract_stateless_function(user_defined_function_proto,
                               runtime_context: RuntimeContext):
    """
    Builds the open, close and per-element callbacks of a non-keyed DataStream function.

    :param user_defined_function_proto: the proto representation of the Python :class:`Function`
    :param runtime_context: the streaming runtime context passed to the ``open`` method
                            of the user function, if it has one
    :return: the tuple ``(open_func, close_func, process_element_func)``
    :raises Exception: if the function type is none of REVISE_OUTPUT, PROCESS and CO_PROCESS
    """
    kind = user_defined_function_proto.function_type
    function_kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction

    if kind == function_kinds.REVISE_OUTPUT:
        # The payload is not read for this type, hence open and close have nothing to do.

        def open_func():
            return None

        def close_func():
            return None

        def revise_output(record):
            # record layout: [CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
            # the watermark is dropped, only timestamp and data are forwarded
            event_time = record[0]
            data = record[2]
            yield Row(event_time, data)

        return open_func, close_func, revise_output

    udf = pickle.loads(user_defined_function_proto.payload)

    def open_func():
        if hasattr(udf, "open"):
            udf.open(runtime_context)

    def close_func():
        if hasattr(udf, "close"):
            udf.close()

    # Pick how the data part of a record is handed to the user function.
    if kind == function_kinds.PROCESS:
        on_element = udf.process_element

        def invoke(data, context):
            return on_element(data, context)

    elif kind == function_kinds.CO_PROCESS:
        on_left = udf.process_element1
        on_right = udf.process_element2

        def invoke(data, context):
            # data layout: [isLeft, leftInput, rightInput]
            if data[0]:
                return on_left(data[1], context)
            return on_right(data[2], context)

    else:
        raise Exception("Unsupported function_type: " + str(kind))

    function_context = InternalProcessFunctionContext(NonKeyedTimerServiceImpl())

    def wrapped_func(record):
        # record layout: [CURRENT_TIMESTAMP, CURRENT_WATERMARK, DATA]
        event_time = record[0]
        watermark = record[1]
        # the context is refreshed before the user function gets to see it
        function_context.set_timestamp(event_time)
        function_context.timer_service().advance_watermark(watermark)
        yield from _emit_results(event_time, watermark, invoke(record[2], function_context))

    return open_func, close_func, wrapped_func
Esempio n. 18
0
def extract_stateless_function(user_defined_function_proto,
                               runtime_context: RuntimeContext):
    """
    Unpickles a stateless DataStream function and wraps it according to its type.

    :param user_defined_function_proto: the proto representation of the Python :class:`Function`
    :param runtime_context: the streaming runtime context passed to the ``open`` method
                            of the user function, if it has one
    :return: the tuple ``(process_element_func, open_func, close_func)``;
             ``process_element_func`` is None for a function type which is not handled here
    """
    kind = user_defined_function_proto.function_type
    udf = pickle.loads(user_defined_function_proto.payload)
    function_kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction

    def open_func():
        if hasattr(udf, "open"):
            udf.open(runtime_context)

    def close_func():
        if hasattr(udf, "close"):
            udf.close()

    # remains None if none of the branches below matches
    process_element_func = None

    if kind == function_kinds.MAP:
        process_element_func = udf.map

    elif kind == function_kinds.FLAT_MAP:
        process_element_func = udf.flat_map

    elif kind == function_kinds.CO_MAP:
        left_map = udf.map1
        right_map = udf.map2

        def co_map(record):
            # record[0] is the input flag: truthy for the left stream, whose value is
            # record[1]; otherwise the value of the right stream is read from record[2]
            if record[0]:
                return left_map(record[1])
            return right_map(record[2])

        process_element_func = co_map

    elif kind == function_kinds.CO_FLAT_MAP:
        left_flat_map = udf.flat_map1
        right_flat_map = udf.flat_map2

        def co_flat_map(record):
            # same record layout as for CO_MAP
            yield from (left_flat_map(record[1]) if record[0] else right_flat_map(record[2]))

        process_element_func = co_flat_map

    elif kind == function_kinds.TIMESTAMP_ASSIGNER:
        assign = udf.extract_timestamp

        def assign_timestamp(record):
            # record layout: [PREVIOUS_TIMESTAMP, DATA]
            previous_timestamp, data = record[0], record[1]
            return assign(data, previous_timestamp)

        process_element_func = assign_timestamp

    elif kind == function_kinds.PROCESS:
        on_element = udf.process_element
        function_context = InternalProcessFunctionContext(NonKeyedTimerServiceImpl())

        def process(record):
            # record layout: [CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
            function_context.set_timestamp(record[0])
            function_context.timer_service().advance_watermark(record[1])
            return on_element(record[2], function_context)

        process_element_func = process

    return process_element_func, open_func, close_func
Esempio n. 19
0
def extract_user_defined_function(user_defined_function_proto, pandas_udaf=False,
                                  one_arg_optimization=False)\
        -> Tuple[str, Dict, List]:
    """
    Translates the proto representation of a :class:`UserDefinedFunction` into a call
    expression over the variable ``value``.

    :param user_defined_function_proto: the proto representation of the Python
                                        :class:`UserDefinedFunction`
    :param pandas_udaf: whether the user_defined_function_proto is pandas udaf
    :param one_arg_optimization: whether an input column is referenced as ``value``
                                 instead of ``value[offset]``
    :return: the tuple of the call expression, the dict mapping each name used in the
             expression to its object, and the list of the unpickled function objects
             (this function first, followed by the functions of its inputs)
    """
    def _new_func_name():
        # the names are numbered with a module level counter
        global _func_num
        _func_num = _func_num + 1
        return 'f%s' % _func_num

    def _render_argument(arg) -> Tuple[str, Dict, List]:
        # Returns the expression, the variables and the functions of one input argument.
        if arg.HasField("udf"):
            # for chaining Python UDF input: the input argument is a Python ScalarFunction
            return extract_user_defined_function(
                arg.udf, one_arg_optimization=one_arg_optimization)
        if arg.HasField("inputOffset"):
            # the input argument is a column of the input row
            column = "value" if one_arg_optimization else "value[%s]" % arg.inputOffset
            return column, {}, []
        # the input argument is a constant value
        constant_name, constant = _parse_constant_value(arg.inputConstant)
        return constant_name, {constant_name: constant}, []

    def _extract_input(args) -> Tuple[str, Dict, List]:
        expressions = []
        variables = {}
        funcs = []
        for arg in args:
            expression, arg_variables, arg_funcs = _render_argument(arg)
            expressions.append(expression)
            variables.update(arg_variables)
            funcs.extend(arg_funcs)
        return ",".join(expressions), variables, funcs

    udf = pickle.loads(user_defined_function_proto.payload)
    if pandas_udaf:
        udf = PandasAggregateFunctionWrapper(udf)
    func_name = _new_func_name()

    if isinstance(udf, (DelegatingScalarFunction, DelegationTableFunction)):
        # delegating functions expose the wrapped callable as ``func``
        if user_defined_function_proto.is_pandas_udf:
            target = partial(check_pandas_udf_result, udf.func)
        else:
            target = udf.func
    else:
        target = udf.eval

    variable_dict = {func_name: target}
    user_defined_funcs = [udf]

    func_args, input_variable_dict, input_funcs = _extract_input(
        user_defined_function_proto.inputs)
    variable_dict.update(input_variable_dict)
    user_defined_funcs.extend(input_funcs)

    # The row can only be passed as a whole if no argument is a constant or another udf,
    # i.e. if the inputs did not contribute any variable.
    whole_row = user_defined_function_proto.takes_row_as_input and not input_variable_dict
    if not whole_row:
        func_str = "%s(%s)" % (func_name, func_args)
    elif user_defined_function_proto.is_pandas_udf or pandas_udaf:
        # for pandas udf/udaf, the input data structure is a List of Pandas.Series
        # we need to merge these Pandas.Series into a Pandas.DataFrame
        variable_dict['wrap_input_series_as_dataframe'] = wrap_input_series_as_dataframe
        func_str = "%s(wrap_input_series_as_dataframe(%s))" % (func_name, func_args)
    else:
        # directly use `value` as input argument, e.g.
        #   lambda value: Row(value[0], value[1])
        # can be optimized to
        #   lambda value: value
        func_str = "%s(value)" % func_name
    return func_str, variable_dict, user_defined_funcs
Esempio n. 20
0
def extract_keyed_process_function(user_defined_function_proto, ctx,
                                   on_timer_ctx, collector,
                                   keyed_state_backend):
    """
    Unpickles a keyed (co-)process function and wraps it into a per-row callable.

    :param user_defined_function_proto: the proto representation of the Python function
    :param ctx: the context handed to the user function when normal data is processed
    :param on_timer_ctx: the context handed to ``on_timer`` when a timer fires
    :param collector: its ``buf`` is iterated and then cleared after every row;
                      presumably it holds the timers registered meanwhile - TODO confirm
    :param keyed_state_backend: the state backend whose current key is switched per row
    :return: the tuple ``(func, process_function)``; ``func`` is None if the function
             type is neither KEYED_PROCESS nor KEYED_CO_PROCESS
    """
    kind = user_defined_function_proto.function_type
    function_kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    process_function = pickle.loads(user_defined_function_proto.payload)
    fire_timer = process_function.on_timer

    if kind == function_kinds.KEYED_PROCESS:
        handle_element = process_function.process_element

        def _on_timer_row(row):
            # row layout:
            # TIMER_FLAG, TIMESTAMP_OF_TIMER, CURRENT_WATERMARK, CURRENT_KEY_OF_TIMER, None
            timer_flag = row[0]
            on_timer_ctx.set_timestamp(row[1])
            on_timer_ctx.timer_service().set_current_watermark(row[2])
            state_key = row[3]
            # the first field of the state key is the key the user sees
            on_timer_ctx.set_current_key(state_key[0])
            keyed_state_backend.set_current_key(state_key)
            if timer_flag == KeyedProcessFunctionInputFlag.EVENT_TIME_TIMER.value:
                time_domain = TimeDomain.EVENT_TIME
            elif timer_flag == KeyedProcessFunctionInputFlag.PROC_TIME_TIMER.value:
                time_domain = TimeDomain.PROCESSING_TIME
            else:
                raise TypeError(
                    "TimeCharacteristic[%s] is not supported." % str(timer_flag))
            on_timer_ctx.set_time_domain(time_domain)
            return fire_timer(row[1], on_timer_ctx)

        def _on_data_row(row):
            # row layout: TIMER_FLAG, CURRENT_TIMESTAMP, CURRENT_WATERMARK, None, NORMAL_DATA
            # NORMAL_DATA: CURRENT_KEY, DATA
            ctx.set_timestamp(row[1])
            ctx.timer_service().set_current_watermark(row[2])
            user_key = row[4][0]
            state_key = Row(user_key)
            ctx.set_current_key(user_key)
            keyed_state_backend.set_current_key(state_key)
            return handle_element(row[4][1], ctx)

        def wrapped_keyed_process_function(value):
            # a timer flag of None marks normal data, anything else is timer data
            handler = _on_data_row if value[0] is None else _on_timer_row
            produced = handler(value)

            # output row layout: [TIMER_FLAG, TIMER TYPE, TIMER_KEY, RESULT_DATA]
            # results of the user function only fill the last field ...
            if produced:
                for item in produced:
                    yield Row(None, None, None, item)

            # ... whereas the entries buffered in the collector fill the first three
            for entry in collector.buf:
                yield Row(entry[0], entry[1], entry[2], None)

            collector.clear()

        return wrapped_keyed_process_function, process_function

    if kind == function_kinds.KEYED_CO_PROCESS:
        row_handler = KeyedTwoInputTimerRowHandler(ctx, on_timer_ctx,
                                                   collector,
                                                   keyed_state_backend,
                                                   process_function)
        return row_handler.process_element, process_function

    return None, process_function
Esempio n. 21
0
def extract_keyed_stateful_function(
        user_defined_function_proto,
        keyed_state_backend: RemoteKeyedStateBackend,
        runtime_context: RuntimeContext):
    """
    Builds the callbacks of a keyed process function, a keyed co-process function or a
    window operation from its proto representation.

    :param user_defined_function_proto: the proto representation of the Python function;
                                        its ``function_type`` and ``payload`` are read
    :param keyed_state_backend: the keyed state backend shared with the timer service
                                and the input handler
    :param runtime_context: the runtime context handed over when the function is opened
    :return: the tuple ``(process_element_func, open_func, close_func)``
    :raises Exception: if the function type is none of KEYED_PROCESS, KEYED_CO_PROCESS
                       and WINDOW
    """
    kind = user_defined_function_proto.function_type
    function_kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    payload = pickle.loads(user_defined_function_proto.payload)
    internal_timer_service = InternalTimerServiceImpl(keyed_state_backend)

    # The selectors below operate on keyed records laid out as [KEY, VALUE].

    def state_key_selector(normal_data):
        return Row(normal_data[0])

    def user_key_selector(normal_data):
        return normal_data[0]

    def input_selector(normal_data):
        return normal_data[1]

    if kind in (function_kinds.KEYED_PROCESS, function_kinds.KEYED_CO_PROCESS):
        timer_service = TimerServiceImpl(internal_timer_service)
        on_timer_ctx = InternalKeyedProcessFunctionOnTimerContext(
            timer_service)
        ctx = InternalKeyedProcessFunctionContext(timer_service)
        process_function = payload
        output_factory = RowWithTimerOutputFactory(VoidNamespaceSerializer())

        def open_func():
            if hasattr(process_function, "open"):
                process_function.open(runtime_context)

        def close_func():
            if hasattr(process_function, "close"):
                process_function.close()

        if kind == function_kinds.KEYED_PROCESS:

            def process_element(normal_data, timestamp: int):
                ctx.set_timestamp(timestamp)
                ctx.set_current_key(user_key_selector(normal_data))
                return process_function.process_element(
                    input_selector(normal_data), ctx)

            def _timer_callback(time_domain):
                # Creates the callback invoked for a fired timer of the given domain.
                def fire(internal_timer: InternalTimerImpl):
                    fired_at = internal_timer.get_timestamp()
                    # the timer carries the state key; its first field is the user key
                    user_key = user_key_selector(internal_timer.get_key())

                    # NOTE(review): the timestamp is only passed to on_timer, it is
                    # not stored on on_timer_ctx here - confirm this is intended
                    on_timer_ctx.set_current_key(user_key)
                    on_timer_ctx.set_time_domain(time_domain)

                    return process_function.on_timer(fired_at, on_timer_ctx)

                return fire

            row_handler = OneInputRowWithTimerHandler(
                internal_timer_service, keyed_state_backend,
                state_key_selector, process_element,
                _timer_callback(TimeDomain.EVENT_TIME),
                _timer_callback(TimeDomain.PROCESSING_TIME),
                output_factory)

        elif kind == function_kinds.KEYED_CO_PROCESS:
            row_handler = TwoInputRowWithTimerHandler(
                ctx, on_timer_ctx, timer_service, keyed_state_backend,
                process_function, output_factory)

        else:
            raise Exception("Unsupported func_type: " + str(kind))

        return row_handler.accept, open_func, close_func

    if kind == function_kinds.WINDOW:
        descriptor = payload
        assigner = descriptor.assigner
        trigger = descriptor.trigger
        lateness = descriptor.allowed_lateness
        state_descriptor = descriptor.window_state_descriptor
        window_function = descriptor.internal_window_function
        window_serializer = descriptor.window_serializer
        # the coder of the window serializer becomes the namespace coder of the backend
        keyed_state_backend._namespace_coder_impl = window_serializer._get_coder()
        window_operator = WindowOperator(assigner, keyed_state_backend,
                                         user_key_selector, state_descriptor,
                                         window_function, trigger, lateness)
        output_factory = RowWithTimerOutputFactory(window_serializer)

        def open_func():
            window_operator.open(runtime_context, internal_timer_service)

        def close_func():
            window_operator.close()

        def process_element(normal_data, timestamp):
            # only the value part of the keyed record is given to the window operator
            return window_operator.process_element(input_selector(normal_data), timestamp)

        row_handler = OneInputRowWithTimerHandler(
            internal_timer_service, keyed_state_backend, state_key_selector,
            process_element,
            window_operator.on_event_time, window_operator.on_processing_time,
            output_factory)

        return row_handler.accept, open_func, close_func

    raise Exception("Unsupported func_type: " + str(kind))