def extract_data_stream_stateless_function(udf_proto):
    """
    Builds the per-record callable of a stateless DataStream function.

    The user function object is unpickled from ``udf_proto.payload`` and adapted according
    to ``udf_proto.function_type``.

    :param udf_proto: the proto representation of the Python :class:`Function`
    :return: a tuple ``(func, user_defined_func)``; ``func`` is ``None`` when the function
             type is none of the kinds handled below
    """
    func_type = udf_proto.function_type
    kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    user_defined_func = pickle.loads(udf_proto.payload)

    if func_type == kinds.MAP:
        return user_defined_func.map, user_defined_func

    if func_type == kinds.FLAT_MAP:
        return user_defined_func.flat_map, user_defined_func

    if func_type == kinds.REDUCE:
        reduce_func = user_defined_func.reduce

        def apply_reduce(value):
            # The two operands arrive packed in one record.
            return reduce_func(value[0], value[1])

        return apply_reduce, user_defined_func

    if func_type == kinds.CO_MAP:

        def apply_co_map(value):
            # value[0] is the input flag: truthy for the left stream (payload in value[1]),
            # falsy for the right stream (payload in value[2]).
            if value[0]:
                return Row(CoMapFunctionOutputFlag.LEFT.value,
                           user_defined_func.map1(value[1]))
            return Row(CoMapFunctionOutputFlag.RIGHT.value,
                       user_defined_func.map2(value[2]))

        return apply_co_map, user_defined_func

    if func_type == kinds.CO_FLAT_MAP:

        def apply_co_flat_map(value):
            # Every produced element is tagged with its side; one END row is always
            # emitted afterwards, even when the user function produced nothing.
            if value[0]:
                outputs = user_defined_func.flat_map1(value[1])
                if outputs:
                    for output in outputs:
                        yield Row(CoFlatMapFunctionOutputFlag.LEFT.value, output)
                yield Row(CoFlatMapFunctionOutputFlag.LEFT_END.value, None)
            else:
                outputs = user_defined_func.flat_map2(value[2])
                if outputs:
                    for output in outputs:
                        yield Row(CoFlatMapFunctionOutputFlag.RIGHT.value, output)
                yield Row(CoFlatMapFunctionOutputFlag.RIGHT_END.value, None)

        return apply_co_flat_map, user_defined_func

    if func_type == kinds.TIMESTAMP_ASSIGNER:
        extract_timestamp = user_defined_func.extract_timestamp

        def assign_timestamp(value):
            # value is indexed as [previous_timestamp, element].
            return extract_timestamp(value[1], value[0])

        return assign_timestamp, user_defined_func

    return None, user_defined_func
def load_aggregate_function(payload):
    """
    Creates the aggregate function described by ``payload``.

    When ``is_built_in_function(payload)`` holds, the bytes after the first one are decoded
    as the UTF-8 name of a class in ``functions`` and that class is instantiated without
    arguments. Otherwise the payload is unpickled.

    :param payload: the serialized function (bytes)
    :return: the aggregate function instance
    """
    if not is_built_in_function(payload):
        return pickle.loads(payload)

    class_name = payload[1:].decode("utf-8")
    return getattr(functions, class_name)()
def extract_user_defined_function(
        user_defined_function_proto, pandas_udaf=False) -> Tuple[str, Dict, List]:
    """
    Extracts user-defined-function from the proto representation of a
    :class:`UserDefinedFunction`.

    :param user_defined_function_proto: the proto representation of the Python
                                        :class:`UserDefinedFunction`
    :param pandas_udaf: whether the user_defined_function_proto is pandas udaf
    :return: a tuple of the call expression ``"fN(<args>)"``, the dict binding the names
             used in that expression, and the list of function objects (this function
             first, followed by those reported by ``_extract_input``)
    """
    global _func_num

    udf = pickle.loads(user_defined_function_proto.payload)
    if pandas_udaf:
        udf = PandasAggregateFunctionWrapper(udf)

    # The module-level counter supplies the numeric suffix of the generated name.
    _func_num = _func_num + 1
    func_name = 'f%s' % _func_num

    # Delegating wrappers expose the callable as ``func``; everything else as ``eval``.
    is_delegate = isinstance(udf, (DelegatingScalarFunction, DelegationTableFunction))
    variable_dict = {func_name: udf.func if is_delegate else udf.eval}
    user_defined_funcs = [udf]

    func_args, input_variable_dict, input_funcs = _extract_input(
        user_defined_function_proto.inputs)
    variable_dict.update(input_variable_dict)
    user_defined_funcs.extend(input_funcs)

    return "%s(%s)" % (func_name, func_args), variable_dict, user_defined_funcs
def extract_user_defined_function(
        user_defined_function_proto, pandas_udaf=False) -> Tuple[str, Dict, List]:
    """
    Extracts user-defined-function from the proto representation of a
    :class:`UserDefinedFunction`.

    :param user_defined_function_proto: the proto representation of the Python
                                        :class:`UserDefinedFunction`
    :param pandas_udaf: whether the user_defined_function_proto is pandas udaf
    :return: a tuple of the call expression ``"fN(<args>)"``, the dict binding every name
             used in that expression, and the list of function objects (this function
             first, followed by the functions of chained inputs)
    """
    global _func_num

    def _extract_input(args) -> Tuple[str, Dict, List]:
        """Renders the argument list and collects the names and functions it needs."""
        rendered_args = []
        bindings = {}
        chained_funcs = []
        for arg in args:
            if arg.HasField("udf"):
                # for chaining Python UDF input: the input argument is itself a Python
                # function, extracted recursively
                call_expr, nested_bindings, nested_funcs = extract_user_defined_function(
                    arg.udf)
                rendered_args.append(call_expr)
                bindings.update(nested_bindings)
                chained_funcs.extend(nested_funcs)
                continue

            if arg.HasField("inputOffset"):
                # the input argument is a column of the input row
                rendered_args.append("value[%s]" % arg.inputOffset)
                continue

            # the input argument is a constant value
            constant_name, constant = _parse_constant_value(arg.inputConstant)
            rendered_args.append(constant_name)
            bindings[constant_name] = constant
        return ",".join(rendered_args), bindings, chained_funcs

    udf = pickle.loads(user_defined_function_proto.payload)
    if pandas_udaf:
        udf = PandasAggregateFunctionWrapper(udf)

    # The module-level counter supplies the numeric suffix of the generated name.
    _func_num = _func_num + 1
    func_name = 'f%s' % _func_num

    # Delegating wrappers expose the callable as ``func``; everything else as ``eval``.
    is_delegate = isinstance(udf, (DelegatingScalarFunction, DelegationTableFunction))
    variable_dict = {func_name: udf.func if is_delegate else udf.eval}
    user_defined_funcs = [udf]

    func_args, input_variable_dict, input_funcs = _extract_input(
        user_defined_function_proto.inputs)
    variable_dict.update(input_variable_dict)
    user_defined_funcs.extend(input_funcs)

    return "%s(%s)" % (func_name, func_args), variable_dict, user_defined_funcs
def extract_data_stream_stateless_function(udf_proto, runtime_context):
    """
    Builds the per-record callable and the lifecycle hooks of a stateless DataStream
    function unpickled from ``udf_proto.payload``.

    :param udf_proto: the proto representation of the Python :class:`Function`
    :param runtime_context: the streaming runtime context
    :return: a tuple ``(func, open_func, close_func)``; ``func`` is ``None`` when the
             function type is none of the kinds handled below
    """
    func_type = udf_proto.function_type
    kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    user_defined_func = pickle.loads(udf_proto.payload)

    def open_func():
        # ``open`` is optional on the user function.
        if hasattr(user_defined_func, "open"):
            user_defined_func.open(runtime_context)

    def close_func():
        # ``close`` is optional on the user function.
        if hasattr(user_defined_func, "close"):
            user_defined_func.close()

    func = None
    if func_type == kinds.MAP:
        func = user_defined_func.map
    elif func_type == kinds.FLAT_MAP:
        func = user_defined_func.flat_map
    elif func_type == kinds.CO_MAP:

        def apply_co_map(value):
            # value[0] is the input flag: truthy for the left stream (payload in value[1]),
            # falsy for the right stream (payload in value[2]).
            if value[0]:
                return user_defined_func.map1(value[1])
            return user_defined_func.map2(value[2])

        func = apply_co_map
    elif func_type == kinds.CO_FLAT_MAP:

        def apply_co_flat_map(value):
            if value[0]:
                outputs = user_defined_func.flat_map1(value[1])
            else:
                outputs = user_defined_func.flat_map2(value[2])
            yield from outputs

        func = apply_co_flat_map
    elif func_type == kinds.TIMESTAMP_ASSIGNER:
        extract_timestamp = user_defined_func.extract_timestamp

        def assign_timestamp(value):
            # value is indexed as [previous_timestamp, element].
            return extract_timestamp(value[1], value[0])

        func = assign_timestamp

    return func, open_func, close_func
def extract_data_stream_stateless_function(udf_proto):
    """
    Builds the per-record callable of a stateless DataStream function unpickled from
    ``udf_proto.payload``.

    :param udf_proto: the proto representation of the Python :class:`Function`
    :return: a tuple ``(func, user_defined_func)``; ``func`` is ``None`` when the function
             type is none of the kinds handled below
    """
    func_type = udf_proto.function_type
    kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    user_defined_func = pickle.loads(udf_proto.payload)

    if func_type == kinds.MAP:
        return user_defined_func.map, user_defined_func

    if func_type == kinds.FLAT_MAP:
        return user_defined_func.flat_map, user_defined_func

    if func_type == kinds.REDUCE:
        reduce_func = user_defined_func.reduce

        def apply_reduce(value):
            # The two operands arrive packed in one record.
            return reduce_func(value[0], value[1])

        return apply_reduce, user_defined_func

    if func_type == kinds.CO_MAP:

        def apply_co_map(value):
            # value[0] truthy: left input in value[1]; otherwise right input in value[2].
            if value[0]:
                return user_defined_func.map1(value[1])
            return user_defined_func.map2(value[2])

        return apply_co_map, user_defined_func

    if func_type == kinds.CO_FLAT_MAP:

        def apply_co_flat_map(value):
            # Returns (does not iterate) whatever the selected flat_map produces.
            if value[0]:
                return user_defined_func.flat_map1(value[1])
            return user_defined_func.flat_map2(value[2])

        return apply_co_flat_map, user_defined_func

    if func_type == kinds.TIMESTAMP_ASSIGNER:
        extract_timestamp = user_defined_func.extract_timestamp

        def assign_timestamp(value):
            # value is indexed as [previous_timestamp, element]; the element is returned
            # paired with its newly extracted timestamp.
            element = value[1]
            return Row(extract_timestamp(element, value[0]), element)

        return assign_timestamp, user_defined_func

    return None, user_defined_func
def extract_process_function(user_defined_function_proto, ctx):
    """
    Unpickles a process function and wraps its ``process_element`` method.

    :param user_defined_function_proto: proto whose ``payload`` holds the pickled function
    :param ctx: context updated with timestamp and watermark before every element and
                passed on to ``process_element``
    :return: a tuple ``(wrapped_process_function, process_function)``
    """
    process_function = pickle.loads(user_defined_function_proto.payload)
    process_element = process_function.process_element

    def wrapped_process_function(value):
        # VALUE[CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
        ctx.set_timestamp(value[0])
        timer_service = ctx.timer_service()
        timer_service.set_current_watermark(value[1])
        return process_element(value[2], ctx)

    return wrapped_process_function, process_function
def extract_keyed_process_function(user_defined_function_proto, ctx, on_timer_ctx, collector,
                                   keyed_state_backend):
    """
    Unpickles a keyed process function and wraps ``process_element`` / ``on_timer`` into a
    single generator function driven by flagged input records.

    :param user_defined_function_proto: proto whose ``payload`` holds the pickled function
    :param ctx: context handed to ``process_element``
    :param on_timer_ctx: context handed to ``on_timer``
    :param collector: its ``buf`` entries are emitted after the user results and it is
                      cleared at the end of every record
    :param keyed_state_backend: receives the current key before the user code runs
    :return: a tuple ``(wrapped_keyed_process_function, process_function)``
    """
    process_function = pickle.loads(user_defined_function_proto.payload)
    process_element = process_function.process_element
    on_timer = process_function.on_timer

    def fire_timer(value):
        # VALUE: TIMER_FLAG, TIMESTAMP_OF_TIMER, CURRENT_WATERMARK, CURRENT_KEY_OF_TIMER, None
        on_timer_ctx.set_timestamp(value[1])
        on_timer_ctx.timer_service().set_current_watermark(value[2])
        timer_key = value[3]
        on_timer_ctx.set_current_key(timer_key)
        keyed_state_backend.set_current_key(timer_key)

        timer_flag = value[0]
        if timer_flag == KeyedProcessFunctionInputFlag.EVENT_TIME_TIMER.value:
            on_timer_ctx.set_time_domain(TimeDomain.EVENT_TIME)
        elif timer_flag == KeyedProcessFunctionInputFlag.PROC_TIME_TIMER.value:
            on_timer_ctx.set_time_domain(TimeDomain.PROCESSING_TIME)
        else:
            raise TypeError("TimeCharacteristic[%s] is not supported." % str(value[0]))
        return on_timer(value[1], on_timer_ctx)

    def handle_element(value):
        # VALUE: TIMER_FLAG, CURRENT_TIMESTAMP, CURRENT_WATERMARK, None, NORMAL_DATA
        # NORMAL_DATA: CURRENT_KEY, DATA
        ctx.set_timestamp(value[1])
        ctx.timer_service().set_current_watermark(value[2])
        keyed_record = value[4]
        element_key = keyed_record[0]
        ctx.set_current_key(element_key)
        # The state backend is given the key wrapped in a Row here, whereas the timer
        # path above passes the timer key through unchanged.
        keyed_state_backend.set_current_key(Row(element_key))
        return process_element(keyed_record[1], ctx)

    def wrapped_keyed_process_function(value):
        # A non-None flag marks timer data, None marks normal data.
        if value[0] is not None:
            produced = fire_timer(value)
        else:
            produced = handle_element(value)

        if produced:
            for item in produced:
                yield Row(None, None, None, item)

        for buffered in collector.buf:
            # 0: proc time timer data
            # 1: event time timer data
            # 2: normal data
            # result_row: [TIMER_FLAG, TIMER TYPE, TIMER_KEY, RESULT_DATA]
            yield Row(buffered[0], buffered[1], buffered[2], None)

        collector.clear()

    return wrapped_keyed_process_function, process_function
def extract_process_function(user_defined_function_proto, ctx, collector):
    """
    Unpickles a process function and wraps its ``process_element`` method into a generator
    that replays what the collector buffered.

    :param user_defined_function_proto: proto whose ``payload`` holds the pickled function
    :param ctx: context updated with timestamp and watermark before every element
    :param collector: passed to ``process_element``; the second item of each ``buf`` entry
                      is emitted, then the collector is cleared
    :return: a tuple ``(wrapped_process_function, process_function)``
    """
    process_function = pickle.loads(user_defined_function_proto.payload)
    process_element = process_function.process_element

    def wrapped_process_function(value):
        # VALUE[CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
        ctx.set_timestamp(value[0])
        timer_service = ctx.timer_service()
        timer_service.set_current_watermark(value[1])

        # The return value is ignored; results are read from the collector buffer.
        process_element(value[2], ctx, collector)
        for buffered in collector.buf:
            yield buffered[1]
        collector.clear()

    return wrapped_process_function, process_function
def extract_user_defined_process_function(user_defined_function_proto, ctx, on_timer_ctx,
                                          collector, keyed_state_backend):
    """
    Unpickles a process function and wraps ``process_element`` / ``on_timer`` into a single
    generator function driven by flagged input records.

    :param user_defined_function_proto: proto whose ``payload`` holds the pickled function
    :param ctx: context handed to ``process_element``
    :param on_timer_ctx: context handed to ``on_timer``
    :param collector: handed to the user code; its ``buf`` entries are emitted and it is
                      cleared at the end of every record
    :param keyed_state_backend: receives the current key before the user code runs
    :return: a tuple ``(wrapped_func, proc_func)``
    """
    proc_func = pickle.loads(user_defined_function_proto.payload)
    process_element_func = proc_func.process_element
    on_timer_func = proc_func.on_timer

    def wrapped_func(value):
        # VALUE[TIMER_FLAG, TIMER_VALUE, CURRENT_WATERMARK, TIMER_KEY, NORMAL_DATA]
        watermark = value[2]
        ctx.timer_service()._current_watermark = watermark
        on_timer_ctx.timer_service()._current_watermark = watermark

        timer_flag = value[0]
        if timer_flag is None:
            # it is normal data
            # NORMAL_DATA[CURRENT_KEY, DATA]
            keyed_record = value[4]
            keyed_state_backend.set_current_key(Row(keyed_record[0]))
            process_element_func(keyed_record[1], ctx, collector)
        else:
            # it is timer data
            keyed_state_backend.set_current_key(value[3])
            if timer_flag == 0:
                domain = TimeDomain.EVENT_TIME
            elif timer_flag == 1:
                domain = TimeDomain.PROCESSING_TIME
            else:
                raise TypeError("TimeCharacteristic[%s] is not supported." % str(value[0]))
            on_timer_ctx._time_domain = domain
            on_timer_func(value[1], on_timer_ctx, collector)

        for buffered in collector.buf:
            # Buffer entries whose first item is 2 carry normal data in the second item;
            # all other entries are forwarded as timer rows.
            # result_row: [TIMER_FLAG, TIMER TYPE, TIMER_KEY, RESULT_DATA]
            if buffered[0] == 2:
                yield Row(None, None, None, buffered[1])
            else:
                yield Row(buffered[0], buffered[1], buffered[2], None)
        collector.clear()

    return wrapped_func, proc_func
def extract_user_defined_aggregate_function(user_defined_function_proto):
    """
    Unpickles an aggregate function and builds the callable that extracts its arguments
    from an input row.

    :param user_defined_function_proto: proto holding the pickled function in ``payload``
                                        and the argument descriptions in ``inputs``
    :return: a tuple ``(user_defined_agg, args_func)`` where ``args_func(value)`` returns
             the list of argument values for ``value``
    """
    user_defined_agg = pickle.loads(user_defined_function_proto.payload)
    assert isinstance(user_defined_agg, AggregateFunction)

    arg_exprs = []
    constants = {}
    for arg in user_defined_function_proto.inputs:
        if arg.HasField("inputOffset"):
            # the input argument is a column of the input row
            arg_exprs.append("value[%s]" % arg.inputOffset)
            continue

        # the input argument is a constant value, bound by name in the lambda's globals
        constant_name, constant = _parse_constant_value(arg.inputConstant)
        arg_exprs.append(constant_name)
        constants[constant_name] = constant

    lambda_source = "lambda value : [%s]" % ",".join(arg_exprs)
    return user_defined_agg, eval(lambda_source, constants)
def extract_user_defined_aggregate_function(
        current_index,
        user_defined_function_proto,
        distinct_info_dict: Dict[Tuple[List[str]], Tuple[List[int], List[int]]]):
    """
    Unpickles an aggregate function and builds the callable that extracts its arguments
    from an input row, registering distinct aggregations along the way.

    :param current_index: index of this aggregate; recorded in ``distinct_info_dict`` when
                          the aggregate is distinct
    :param user_defined_function_proto: proto providing ``payload``, ``inputs``,
                                        ``distinct`` and ``filter_arg``
    :param distinct_info_dict: updated in place; maps the tuple of rendered argument
                               expressions to ``([aggregate indexes], [filter args])``
    :return: a tuple ``(user_defined_agg, args_func, filter_arg, distinct_index)``;
             ``args_func(value)`` returns the argument values as a tuple and
             ``distinct_index`` is ``-1`` for non-distinct aggregates, otherwise the index
             of the first aggregate registered for the same arguments
    """
    user_defined_agg = pickle.loads(user_defined_function_proto.payload)
    assert isinstance(user_defined_agg, AggregateFunction)

    args_str = []
    local_variable_dict = {}
    for arg in user_defined_function_proto.inputs:
        if arg.HasField("inputOffset"):
            # the input argument is a column of the input row
            args_str.append("value[%s]" % arg.inputOffset)
            continue

        # the input argument is a constant value
        constant_value_name, parsed_constant_value = \
            _parse_constant_value(arg.inputConstant)
        # Reuse the name of an equal constant seen earlier so that identical argument
        # lists render identically (this is what the distinct lookup below keys on).
        for known_name, known_value in local_variable_dict.items():
            if known_value == parsed_constant_value:
                constant_value_name = known_name
                break
        if constant_value_name not in local_variable_dict:
            local_variable_dict[constant_value_name] = parsed_constant_value
        args_str.append(constant_value_name)

    if user_defined_function_proto.distinct:
        distinct_key = tuple(args_str)
        if distinct_key in distinct_info_dict:
            indexes, filter_args = distinct_info_dict[distinct_key]
            indexes.append(current_index)
            filter_args.append(user_defined_function_proto.filter_arg)
            distinct_index = indexes[0]
        else:
            distinct_info_dict[distinct_key] = \
                ([current_index], [user_defined_function_proto.filter_arg])
            distinct_index = current_index
    else:
        distinct_index = -1

    if args_str:
        args_func = eval("lambda value : (%s,)" % ",".join(args_str), local_variable_dict)
    else:
        # Without inputs the template would render "(,)", which is not valid Python;
        # an aggregate without arguments simply receives an empty tuple.
        def args_func(value):
            return ()

    return user_defined_agg, \
        args_func, \
        user_defined_function_proto.filter_arg, \
        distinct_index
def extract_process_function(user_defined_function_proto, ctx, runtime_context):
    """
    Unpickles a process function and wraps its ``process_element`` method together with
    its optional lifecycle hooks.

    :param user_defined_function_proto: proto whose ``payload`` holds the pickled function
    :param ctx: context updated with timestamp and watermark before every element and
                passed on to ``process_element``
    :param runtime_context: passed to the function's ``open`` method, if it has one
    :return: a tuple ``(wrapped_process_function, open_func, close_func)``
    """
    process_function = pickle.loads(user_defined_function_proto.payload)
    process_element = process_function.process_element

    def open_func():
        # ``open`` is optional on the user function.
        if hasattr(process_function, "open"):
            process_function.open(runtime_context)

    def close_func():
        # ``close`` is optional on the user function.
        if hasattr(process_function, "close"):
            process_function.close()

    def wrapped_process_function(value):
        # VALUE[CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
        ctx.set_timestamp(value[0])
        timer_service = ctx.timer_service()
        timer_service.advance_watermark(value[1])
        return process_element(value[2], ctx)

    return wrapped_process_function, open_func, close_func
def extract_one_input_process_function(function_urn, user_defined_function_proto,
                                       runtime_context, function_context):
    """
    Unpickles a one-input process function and builds its lifecycle hooks and per-element
    generator.

    :param function_urn: the URN identifying the kind of function; only
                         ``DATA_STREAM_STATELESS_FUNCTION_URN`` is handled
    :param user_defined_function_proto: proto whose ``payload`` holds the pickled function
    :param runtime_context: passed to the function's ``open`` method, if it has one
    :param function_context: wrapped in an :class:`InternalProcessFunctionContext` and
                             handed to ``process_element``
    :return: a tuple ``(open_func, close_func, process_element_func)``
    :raises Exception: if ``function_urn`` is not supported
    """
    user_defined_func = pickle.loads(user_defined_function_proto.payload)

    def open_func():
        # ``open`` is optional on the user function.
        if hasattr(user_defined_func, "open"):
            user_defined_func.open(runtime_context)

    def close_func():
        # ``close`` is optional on the user function.
        if hasattr(user_defined_func, "close"):
            user_defined_func.close()

    process_element = user_defined_func.process_element

    # Fail with an explicit message instead of an unbound-name error further down when the
    # URN is not the one this function knows how to wrap.
    if function_urn != DATA_STREAM_STATELESS_FUNCTION_URN:
        raise Exception("Unsupported function_urn: " + str(function_urn))

    context = InternalProcessFunctionContext(function_context)

    def process_element_func(value):
        yield from process_element(value, context)

    return open_func, close_func, process_element_func
def extract_process_function(
        user_defined_function_proto, j_runtime_context, j_function_context, j_timer_context,
        j_side_output_context, job_parameters, j_keyed_state_backend, j_operator_state_backend):
    """
    Unpickles a DataStream function and wraps it into a :class:`OneInputOperation` or
    :class:`TwoInputOperation` according to ``user_defined_function_proto.function_type``.

    :param user_defined_function_proto: proto providing ``payload``, ``function_type`` and
                                        ``key_type_info``
    :param j_runtime_context: used together with ``job_parameters`` to create the
                              :class:`StreamingRuntimeContext`
    :param j_function_context: handed to the internal function context constructors
    :param j_timer_context: handed to the internal timer context constructors
    :param j_side_output_context: when truthy, results of the form ``(OutputTag, value)``
                                  are collected through it instead of being emitted
    :param job_parameters: see ``j_runtime_context``
    :param j_keyed_state_backend: handed to :class:`KeyedStateBackend` for keyed functions
    :param j_operator_state_backend: handed to the broadcast contexts
    :return: the operation wrapping the user function
    :raises Exception: if the function type is unknown
    """
    from pyflink.fn_execution import flink_fn_execution_pb2

    UserDefinedDataStreamFunction = flink_fn_execution_pb2.UserDefinedDataStreamFunction

    user_defined_func = pickle.loads(user_defined_function_proto.payload)

    func_type = user_defined_function_proto.function_type

    runtime_context = StreamingRuntimeContext.of(j_runtime_context, job_parameters)

    if j_side_output_context:
        side_output_context = SideOutputContext(j_side_output_context)

        def process_func(values):
            # A user function that neither returns nor yields anything produces None.
            if values is None:
                return
            for value in values:
                # The non-empty check keeps an empty tuple from raising IndexError; it is
                # emitted as a normal element.
                if isinstance(value, tuple) and value and isinstance(value[0], OutputTag):
                    output_tag = value[0]  # type: OutputTag
                    side_output_context.collect(output_tag.tag_id, value[1])
                else:
                    yield value
    else:
        def process_func(values):
            # A user function that neither returns nor yields anything produces None.
            if values is None:
                return
            yield from values

    def open_func():
        if hasattr(user_defined_func, "open"):
            user_defined_func.open(runtime_context)

    def close_func():
        if hasattr(user_defined_func, "close"):
            user_defined_func.close()

    if func_type == UserDefinedDataStreamFunction.PROCESS:
        function_context = InternalProcessFunctionContext(j_function_context)
        process_element = user_defined_func.process_element

        def process_element_func(value):
            yield from process_func(process_element(value, function_context))

        return OneInputOperation(open_func, close_func, process_element_func)

    elif func_type == UserDefinedDataStreamFunction.KEYED_PROCESS:
        function_context = InternalKeyedProcessFunctionContext(
            j_function_context, user_defined_function_proto.key_type_info)
        timer_context = InternalKeyedProcessFunctionOnTimerContext(
            j_timer_context, user_defined_function_proto.key_type_info)

        keyed_state_backend = KeyedStateBackend(
            function_context,
            j_keyed_state_backend)
        runtime_context.set_keyed_state_backend(keyed_state_backend)

        process_element = user_defined_func.process_element
        on_timer = user_defined_func.on_timer

        def process_element_func(value):
            # Only value[1] is handed to the user function for keyed inputs.
            yield from process_func(process_element(value[1], function_context))

        def on_timer_func(timestamp):
            yield from process_func(on_timer(timestamp, timer_context))

        return OneInputOperation(open_func, close_func, process_element_func, on_timer_func)

    elif func_type == UserDefinedDataStreamFunction.CO_PROCESS:
        function_context = InternalProcessFunctionContext(j_function_context)

        process_element1 = user_defined_func.process_element1
        process_element2 = user_defined_func.process_element2

        def process_element_func1(value):
            yield from process_func(process_element1(value, function_context))

        def process_element_func2(value):
            yield from process_func(process_element2(value, function_context))

        return TwoInputOperation(
            open_func, close_func, process_element_func1, process_element_func2)

    elif func_type == UserDefinedDataStreamFunction.CO_BROADCAST_PROCESS:
        broadcast_ctx = InternalBroadcastProcessFunctionContext(
            j_function_context, j_operator_state_backend)
        read_only_broadcast_ctx = InternalBroadcastProcessFunctionReadOnlyContext(
            j_function_context, j_operator_state_backend)

        process_element = user_defined_func.process_element
        process_broadcast_element = user_defined_func.process_broadcast_element

        # NOTE: the broadcast branches emit results directly and do not route them
        # through process_func, so side outputs are not separated here.
        def process_element_func1(value):
            elements = process_element(value, read_only_broadcast_ctx)
            if elements:
                yield from elements

        def process_element_func2(value):
            elements = process_broadcast_element(value, broadcast_ctx)
            if elements:
                yield from elements

        return TwoInputOperation(
            open_func, close_func, process_element_func1, process_element_func2)

    elif func_type == UserDefinedDataStreamFunction.KEYED_CO_PROCESS:
        function_context = InternalKeyedProcessFunctionContext(
            j_function_context, user_defined_function_proto.key_type_info)
        timer_context = InternalKeyedProcessFunctionOnTimerContext(
            j_timer_context, user_defined_function_proto.key_type_info)

        keyed_state_backend = KeyedStateBackend(
            function_context,
            j_keyed_state_backend)
        runtime_context.set_keyed_state_backend(keyed_state_backend)

        process_element1 = user_defined_func.process_element1
        process_element2 = user_defined_func.process_element2
        on_timer = user_defined_func.on_timer

        def process_element_func1(value):
            yield from process_func(process_element1(value[1], function_context))

        def process_element_func2(value):
            yield from process_func(process_element2(value[1], function_context))

        def on_timer_func(timestamp):
            yield from process_func(on_timer(timestamp, timer_context))

        return TwoInputOperation(
            open_func, close_func, process_element_func1, process_element_func2, on_timer_func)

    elif func_type == UserDefinedDataStreamFunction.KEYED_CO_BROADCAST_PROCESS:
        broadcast_ctx = InternalKeyedBroadcastProcessFunctionContext(
            j_function_context, j_operator_state_backend)
        read_only_broadcast_ctx = InternalKeyedBroadcastProcessFunctionReadOnlyContext(
            j_function_context, user_defined_function_proto.key_type_info,
            j_operator_state_backend)
        timer_context = InternalKeyedBroadcastProcessFunctionOnTimerContext(
            j_timer_context, user_defined_function_proto.key_type_info,
            j_operator_state_backend)

        process_element = user_defined_func.process_element
        process_broadcast_element = user_defined_func.process_broadcast_element
        on_timer = user_defined_func.on_timer

        def process_element_func1(value):
            elements = process_element(value[1], read_only_broadcast_ctx)
            if elements:
                yield from elements

        def process_element_func2(value):
            elements = process_broadcast_element(value, broadcast_ctx)
            if elements:
                yield from elements

        def on_timer_func(timestamp):
            # Same tolerance for "nothing produced" as the element functions above.
            elements = on_timer(timestamp, timer_context)
            if elements:
                yield from elements

        return TwoInputOperation(
            open_func, close_func, process_element_func1, process_element_func2, on_timer_func)

    elif func_type == UserDefinedDataStreamFunction.WINDOW:
        window_operation_descriptor = (
            user_defined_func
        )  # type: WindowOperationDescriptor

        def user_key_selector(normal_data):
            return normal_data

        window_assigner = window_operation_descriptor.assigner
        window_trigger = window_operation_descriptor.trigger
        allowed_lateness = window_operation_descriptor.allowed_lateness
        late_data_output_tag = window_operation_descriptor.late_data_output_tag
        window_state_descriptor = window_operation_descriptor.window_state_descriptor
        internal_window_function = window_operation_descriptor.internal_window_function
        window_serializer = window_operation_descriptor.window_serializer
        window_coder = window_serializer._get_coder()

        # The converter is chosen from the coder type of the window serializer.
        if isinstance(window_coder, TimeWindowCoder):
            window_converter = TimeWindowConverter()
        elif isinstance(window_coder, CountWindowCoder):
            window_converter = CountWindowConverter()
        else:
            window_converter = GlobalWindowConverter()

        internal_timer_service = InternalTimerServiceImpl(
            j_timer_context.timerService(), window_converter)

        function_context = InternalKeyedProcessFunctionContext(
            j_function_context,
            user_defined_function_proto.key_type_info)
        window_timer_context = InternalWindowTimerContext(
            j_timer_context,
            user_defined_function_proto.key_type_info,
            window_converter)

        keyed_state_backend = KeyedStateBackend(
            function_context,
            j_keyed_state_backend,
            j_function_context.getWindowSerializer(),
            window_converter)
        runtime_context.set_keyed_state_backend(keyed_state_backend)

        window_operator = WindowOperator(
            window_assigner,
            keyed_state_backend,
            user_key_selector,
            window_state_descriptor,
            internal_window_function,
            window_trigger,
            allowed_lateness,
            late_data_output_tag)

        # For windows the lifecycle hooks drive the window operator, replacing the
        # generic hooks defined above.
        def open_func():
            window_operator.open(runtime_context, internal_timer_service)

        def close_func():
            window_operator.close()

        def process_element_func(value):
            yield from process_func(
                window_operator.process_element(value[1], function_context.timestamp()))

        if window_assigner.is_event_time():
            def on_timer_func(timestamp):
                window = window_timer_context.window()
                key = window_timer_context.get_current_key()
                yield from process_func(window_operator.on_event_time(timestamp, key, window))
        else:
            def on_timer_func(timestamp):
                window = window_timer_context.window()
                key = window_timer_context.get_current_key()
                yield from process_func(
                    window_operator.on_processing_time(timestamp, key, window))

        return OneInputOperation(open_func, close_func, process_element_func, on_timer_func)

    else:
        raise Exception("Unknown function type {0}.".format(func_type))
def extract_stateful_function(user_defined_function_proto,
                              runtime_context: RuntimeContext,
                              keyed_state_backend: RemoteKeyedStateBackend):
    """
    Unpickles a keyed (co-)process function or a window operation descriptor and builds
    the callables that drive it.

    :param user_defined_function_proto: proto providing ``payload`` and ``function_type``
    :param runtime_context: passed to the ``open`` hook of the function / window operator
    :param keyed_state_backend: state backend whose current key is set before user code
                                runs; for windows its namespace coder is replaced as well
    :return: a tuple ``(open_func, close_func, process_element_func, process_timer_func,
             internal_timer_service)``
    :raises Exception: if the function type is not supported
    """
    func_type = user_defined_function_proto.function_type
    user_defined_func = pickle.loads(user_defined_function_proto.payload)
    internal_timer_service = InternalTimerServiceImpl(keyed_state_backend)

    def state_key_selector(normal_data):
        # The state backend is keyed by the key wrapped in a Row.
        return Row(normal_data[0])

    def user_key_selector(normal_data):
        return normal_data[0]

    def input_selector(normal_data):
        return normal_data[1]

    UserDefinedDataStreamFunction = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    if func_type in (UserDefinedDataStreamFunction.KEYED_PROCESS,
                     UserDefinedDataStreamFunction.KEYED_CO_PROCESS):
        timer_service = TimerServiceImpl(internal_timer_service)
        ctx = InternalKeyedProcessFunctionContext(timer_service)
        on_timer_ctx = InternalKeyedProcessFunctionOnTimerContext(timer_service)
        process_function = user_defined_func
        internal_timer_service.set_namespace_serializer(VoidNamespaceSerializer())

        def open_func():
            if hasattr(process_function, "open"):
                process_function.open(runtime_context)

        def close_func():
            if hasattr(process_function, "close"):
                process_function.close()

        def on_event_time(timestamp: int, key, namespace):
            keyed_state_backend.set_current_key(key)
            return _on_timer(TimeDomain.EVENT_TIME, timestamp, key)

        def on_processing_time(timestamp: int, key, namespace):
            keyed_state_backend.set_current_key(key)
            return _on_timer(TimeDomain.PROCESSING_TIME, timestamp, key)

        def _on_timer(time_domain: TimeDomain, timestamp: int, key):
            user_current_key = user_key_selector(key)

            on_timer_ctx.set_timestamp(timestamp)
            on_timer_ctx.set_current_key(user_current_key)
            on_timer_ctx.set_time_domain(time_domain)

            return process_function.on_timer(timestamp, on_timer_ctx)

        if func_type == UserDefinedDataStreamFunction.KEYED_PROCESS:

            def process_element(normal_data, timestamp: int):
                ctx.set_timestamp(timestamp)
                ctx.set_current_key(user_key_selector(normal_data))
                keyed_state_backend.set_current_key(state_key_selector(normal_data))
                return process_function.process_element(input_selector(normal_data), ctx)

        elif func_type == UserDefinedDataStreamFunction.KEYED_CO_PROCESS:

            def process_element(normal_data, timestamp: int):
                # normal_data is indexed as [is_left, left_input, right_input].
                is_left = normal_data[0]
                if is_left:
                    user_input = normal_data[1]
                else:
                    user_input = normal_data[2]

                ctx.set_timestamp(timestamp)
                current_user_key = user_key_selector(user_input)
                # ``ctx`` is the context handed to process_element1/2 below, so it must
                # carry the key of the current element (as in the KEYED_PROCESS branch).
                ctx.set_current_key(current_user_key)
                on_timer_ctx.set_current_key(current_user_key)
                keyed_state_backend.set_current_key(state_key_selector(user_input))

                if is_left:
                    return process_function.process_element1(input_selector(user_input), ctx)
                else:
                    return process_function.process_element2(input_selector(user_input), ctx)

        else:
            raise Exception("Unsupported func_type: " + str(func_type))

    elif func_type == UserDefinedDataStreamFunction.WINDOW:
        window_operation_descriptor = user_defined_func
        window_assigner = window_operation_descriptor.assigner
        window_trigger = window_operation_descriptor.trigger
        allowed_lateness = window_operation_descriptor.allowed_lateness
        window_state_descriptor = window_operation_descriptor.window_state_descriptor
        internal_window_function = window_operation_descriptor.internal_window_function
        window_serializer = window_operation_descriptor.window_serializer
        window_coder = window_serializer._get_coder()
        # State namespaces are windows, so the backend uses the window coder for them.
        keyed_state_backend.namespace_coder = window_coder
        keyed_state_backend._namespace_coder_impl = window_coder.get_impl()
        window_operator = WindowOperator(
            window_assigner,
            keyed_state_backend,
            user_key_selector,
            window_state_descriptor,
            internal_window_function,
            window_trigger,
            allowed_lateness)
        internal_timer_service.set_namespace_serializer(window_serializer)

        def open_func():
            window_operator.open(runtime_context, internal_timer_service)

        def close_func():
            window_operator.close()

        def process_element(normal_data, timestamp: int):
            keyed_state_backend.set_current_key(state_key_selector(normal_data))
            return window_operator.process_element(input_selector(normal_data), timestamp)

        def on_event_time(timestamp: int, key, namespace):
            keyed_state_backend.set_current_key(key)
            return window_operator.on_event_time(timestamp, key, namespace)

        def on_processing_time(timestamp: int, key, namespace):
            keyed_state_backend.set_current_key(key)
            return window_operator.on_processing_time(timestamp, key, namespace)

    else:
        raise Exception("Unsupported function_type: " + str(func_type))

    input_handler = RunnerInputHandler(internal_timer_service, process_element)
    process_element_func = input_handler.process_element

    timer_handler = TimerHandler(
        internal_timer_service,
        on_event_time,
        on_processing_time,
        keyed_state_backend._namespace_coder_impl)
    process_timer_func = timer_handler.process_timer

    return open_func, close_func, process_element_func, process_timer_func, \
        internal_timer_service
def extract_stateless_function(user_defined_function_proto, runtime_context: RuntimeContext):
    """
    Builds the lifecycle hooks and the per-record generator of a stateless function.

    For ``REVISE_OUTPUT`` no payload is unpickled; every other supported type unpickles
    the user function from ``user_defined_function_proto.payload``.

    :param user_defined_function_proto: the proto representation of the Python
                                        :class:`Function`
    :param runtime_context: the streaming runtime context
    :return: a tuple ``(open_func, close_func, process_element_func)``
    :raises Exception: if the function type is not supported
    """
    func_type = user_defined_function_proto.function_type
    kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction

    if func_type == kinds.REVISE_OUTPUT:
        def open_func():
            pass

        def close_func():
            pass

        def revise_output(value):
            # VALUE[CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
            # Only the timestamp and the element are kept; the watermark is dropped.
            yield Row(value[0], value[2])

        return open_func, close_func, revise_output

    user_defined_func = pickle.loads(user_defined_function_proto.payload)

    def open_func():
        # ``open`` is optional on the user function.
        if hasattr(user_defined_func, "open"):
            user_defined_func.open(runtime_context)

    def close_func():
        # ``close`` is optional on the user function.
        if hasattr(user_defined_func, "close"):
            user_defined_func.close()

    if func_type == kinds.PROCESS:
        process_element = user_defined_func.process_element
        ctx = InternalProcessFunctionContext(NonKeyedTimerServiceImpl())

        def process_record(value):
            # VALUE[CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
            timestamp = value[0]
            watermark = value[1]
            ctx.set_timestamp(timestamp)
            ctx.timer_service().advance_watermark(watermark)
            results = process_element(value[2], ctx)
            yield from _emit_results(timestamp, watermark, results)

    elif func_type == kinds.CO_PROCESS:
        process_element1 = user_defined_func.process_element1
        process_element2 = user_defined_func.process_element2
        ctx = InternalProcessFunctionContext(NonKeyedTimerServiceImpl())

        def process_record(value):
            # VALUE[CURRENT_TIMESTAMP, CURRENT_WATERMARK, [isLeft, leftInput, rightInput]]
            timestamp = value[0]
            watermark = value[1]
            ctx.set_timestamp(timestamp)
            ctx.timer_service().advance_watermark(watermark)

            tagged_input = value[2]
            if tagged_input[0]:
                results = process_element1(tagged_input[1], ctx)
            else:
                results = process_element2(tagged_input[2], ctx)
            yield from _emit_results(timestamp, watermark, results)

    else:
        raise Exception("Unsupported function_type: " + str(func_type))

    return open_func, close_func, process_record
def extract_stateless_function(user_defined_function_proto, runtime_context: RuntimeContext):
    """
    Builds the per-element callable and the lifecycle callables of a stateless DataStream
    function from the proto representation of a :class:`Function`.

    :param user_defined_function_proto: the proto representation of the Python
                                        :class:`Function`
    :param runtime_context: the streaming runtime context, handed to the user function's
                            ``open`` method when it defines one
    :return: a tuple of ``(process_element_func, open_func, close_func)``; the first item is
             ``None`` when the function type matches none of the handled types
    """
    func_type = user_defined_function_proto.function_type
    udf = pickle.loads(user_defined_function_proto.payload)
    function_kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction

    def open_func():
        # "open" is optional on the user function
        if not hasattr(udf, "open"):
            return
        udf.open(runtime_context)

    def close_func():
        # "close" is optional on the user function
        if not hasattr(udf, "close"):
            return
        udf.close()

    if func_type == function_kinds.MAP:
        process_element_func = udf.map
    elif func_type == function_kinds.FLAT_MAP:
        process_element_func = udf.flat_map
    elif func_type == function_kinds.CO_MAP:
        map1 = udf.map1
        map2 = udf.map2

        def wrapped_func(value):
            # value carries the input flag at index 0: truthy for the left stream (payload
            # at index 1), falsy for the right stream (payload at index 2)
            if value[0]:
                return map1(value[1])
            return map2(value[2])

        process_element_func = wrapped_func
    elif func_type == function_kinds.CO_FLAT_MAP:
        flat_map1 = udf.flat_map1
        flat_map2 = udf.flat_map2

        def wrapped_func(value):
            # same layout as CO_MAP: flag at index 0, left at index 1, right at index 2
            selected = flat_map1(value[1]) if value[0] else flat_map2(value[2])
            yield from selected

        process_element_func = wrapped_func
    elif func_type == function_kinds.TIMESTAMP_ASSIGNER:
        extract_timestamp = udf.extract_timestamp

        def wrapped_func(value):
            # value is [PREVIOUS_TIMESTAMP, REAL_DATA]; the user function takes the data first
            return extract_timestamp(value[1], value[0])

        process_element_func = wrapped_func
    elif func_type == function_kinds.PROCESS:
        process_element = udf.process_element
        ctx = InternalProcessFunctionContext(NonKeyedTimerServiceImpl())

        def wrapped_func(value):
            # VALUE[CURRENT_TIMESTAMP, CURRENT_WATERMARK, NORMAL_DATA]
            ctx.set_timestamp(value[0])
            ctx.timer_service().advance_watermark(value[1])
            return process_element(value[2], ctx)

        process_element_func = wrapped_func
    else:
        # unknown types are not rejected here; the caller receives None
        process_element_func = None

    return process_element_func, open_func, close_func
def extract_user_defined_function(user_defined_function_proto, pandas_udaf=False,
                                  one_arg_optimization=False) -> Tuple[str, Dict, List]:
    """
    Extracts user-defined-function from the proto representation of a
    :class:`UserDefinedFunction`.

    :param user_defined_function_proto: the proto representation of the Python
                                        :class:`UserDefinedFunction`
    :param pandas_udaf: whether the user_defined_function_proto is pandas udaf
    :param one_arg_optimization: whether the optimization enabled
    :return: a tuple of the call expression as source text, the dict of names that
             expression refers to, and the list of the deserialized function objects
             (this function first, then any chained input functions)
    """
    global _func_num

    def _extract_input(args) -> Tuple[str, Dict, List]:
        # Renders each input argument as source text and gathers what that text refers to.
        rendered_args = []
        referenced_names = {}
        chained_funcs = []
        for arg in args:
            if arg.HasField("udf"):
                # for chaining Python UDF input: the input argument is a Python ScalarFunction
                nested_call, nested_names, nested_funcs = extract_user_defined_function(
                    arg.udf, one_arg_optimization=one_arg_optimization)
                rendered_args.append(nested_call)
                referenced_names.update(nested_names)
                chained_funcs.extend(nested_funcs)
            elif arg.HasField("inputOffset"):
                # the input argument is a column of the input row, or the whole `value`
                # when the one-arg optimization is enabled
                rendered_args.append(
                    "value" if one_arg_optimization else "value[%s]" % arg.inputOffset)
            else:
                # the input argument is a constant value
                constant_name, constant_value = _parse_constant_value(arg.inputConstant)
                rendered_args.append(constant_name)
                referenced_names[constant_name] = constant_value
        return ",".join(rendered_args), referenced_names, chained_funcs

    user_defined_func = pickle.loads(user_defined_function_proto.payload)
    if pandas_udaf:
        user_defined_func = PandasAggregateFunctionWrapper(user_defined_func)

    # every extracted function gets its own name from the module-wide counter
    _func_num = _func_num + 1
    func_name = 'f%s' % _func_num

    if isinstance(user_defined_func, (DelegatingScalarFunction, DelegationTableFunction)):
        if user_defined_function_proto.is_pandas_udf:
            target = partial(check_pandas_udf_result, user_defined_func.func)
        else:
            target = user_defined_func.func
    else:
        target = user_defined_func.eval

    func_args, input_variable_dict, input_funcs = _extract_input(
        user_defined_function_proto.inputs)

    variable_dict = {func_name: target}
    variable_dict.update(input_variable_dict)
    user_defined_funcs = [user_defined_func]
    user_defined_funcs.extend(input_funcs)

    if not user_defined_function_proto.takes_row_as_input or input_variable_dict:
        # either the function does not take the row as input, or it has constants or
        # other udfs as input arguments: pass the rendered arguments as they are
        func_str = "%s(%s)" % (func_name, func_args)
    elif user_defined_function_proto.is_pandas_udf or pandas_udaf:
        # for pandas udf/udaf, the input data structure is a List of Pandas.Series
        # we need to merge these Pandas.Series into a Pandas.DataFrame
        variable_dict['wrap_input_series_as_dataframe'] = wrap_input_series_as_dataframe
        func_str = "%s(wrap_input_series_as_dataframe(%s))" % (func_name, func_args)
    else:
        # directly use `value` as input argument
        # e.g.
        # lambda value: Row(value[0], value[1])
        # can be optimized to
        # lambda value: value
        func_str = "%s(value)" % func_name

    return func_str, variable_dict, user_defined_funcs
def extract_keyed_process_function(user_defined_function_proto, ctx, on_timer_ctx, collector,
                                   keyed_state_backend):
    """
    Builds the per-element callable of a keyed process function from its proto
    representation.

    :param user_defined_function_proto: the proto carrying the function type and the pickled
                                        process function
    :param ctx: the context passed to ``process_element`` for normal data
    :param on_timer_ctx: the context passed to ``on_timer`` for timer data
    :param collector: the buffer (``buf``) whose entries are emitted after every input and
                      which is cleared afterwards
    :param keyed_state_backend: the state backend whose current key is set per input
    :return: a tuple of ``(func, process_function)``; ``func`` is ``None`` when the function
             type is neither KEYED_PROCESS nor KEYED_CO_PROCESS
    """
    func_type = user_defined_function_proto.function_type
    function_kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    process_function = pickle.loads(user_defined_function_proto.payload)
    # looked up for every function type, so the payload has to provide on_timer
    on_timer = process_function.on_timer

    if func_type == function_kinds.KEYED_PROCESS:
        process_element = process_function.process_element

        def _fire_timer(value):
            # VALUE:
            # TIMER_FLAG, TIMESTAMP_OF_TIMER, CURRENT_WATERMARK, CURRENT_KEY_OF_TIMER, None
            timer_flag = value[0]
            on_timer_ctx.set_timestamp(value[1])
            on_timer_ctx.timer_service().set_current_watermark(value[2])
            state_current_key = value[3]
            # the user-facing key is the first field of the state key
            on_timer_ctx.set_current_key(state_current_key[0])
            keyed_state_backend.set_current_key(state_current_key)
            if timer_flag == KeyedProcessFunctionInputFlag.EVENT_TIME_TIMER.value:
                on_timer_ctx.set_time_domain(TimeDomain.EVENT_TIME)
            elif timer_flag == KeyedProcessFunctionInputFlag.PROC_TIME_TIMER.value:
                on_timer_ctx.set_time_domain(TimeDomain.PROCESSING_TIME)
            else:
                raise TypeError(
                    "TimeCharacteristic[%s] is not supported." % str(timer_flag))
            return on_timer(value[1], on_timer_ctx)

        def _process_record(value):
            # VALUE: TIMER_FLAG, CURRENT_TIMESTAMP, CURRENT_WATERMARK, None, NORMAL_DATA
            # NORMAL_DATA: CURRENT_KEY, DATA
            ctx.set_timestamp(value[1])
            ctx.timer_service().set_current_watermark(value[2])
            user_current_key = value[4][0]
            # the state backend is keyed by the user key wrapped into a Row
            state_current_key = Row(user_current_key)
            ctx.set_current_key(user_current_key)
            keyed_state_backend.set_current_key(state_current_key)
            return process_element(value[4][1], ctx)

        def wrapped_keyed_process_function(value):
            # a TIMER_FLAG of None marks normal data, anything else marks timer data
            if value[0] is None:
                output_result = _process_record(value)
            else:
                output_result = _fire_timer(value)

            if output_result:
                for user_output in output_result:
                    yield Row(None, None, None, user_output)

            for buffered in collector.buf:
                # 0: proc time timer data
                # 1: event time timer data
                # 2: normal data
                # result_row: [TIMER_FLAG, TIMER TYPE, TIMER_KEY, RESULT_DATA]
                yield Row(buffered[0], buffered[1], buffered[2], None)

            collector.clear()

        func = wrapped_keyed_process_function
    elif func_type == function_kinds.KEYED_CO_PROCESS:
        func = KeyedTwoInputTimerRowHandler(
            ctx, on_timer_ctx, collector, keyed_state_backend,
            process_function).process_element
    else:
        # unknown types are not rejected here; the caller receives None
        func = None

    return func, process_function
def extract_keyed_stateful_function(
        user_defined_function_proto,
        keyed_state_backend: RemoteKeyedStateBackend,
        runtime_context: RuntimeContext):
    """
    Builds the per-element callable and the lifecycle callables of a keyed stateful
    DataStream function (keyed process, keyed co-process or window) from its proto
    representation.

    :param user_defined_function_proto: the proto carrying the function type and the pickled
                                        payload (a process function, or a window operation
                                        descriptor for the WINDOW type)
    :param keyed_state_backend: the keyed state backend shared by the timer service and the
                                input handler
    :param runtime_context: the streaming runtime context passed on when opening
    :return: a tuple of ``(process_element_func, open_func, close_func)``
    :raises Exception: if the function type is none of KEYED_PROCESS, KEYED_CO_PROCESS
                       or WINDOW
    """
    func_type = user_defined_function_proto.function_type
    function_kinds = flink_fn_execution_pb2.UserDefinedDataStreamFunction
    payload = pickle.loads(user_defined_function_proto.payload)
    internal_timer_service = InternalTimerServiceImpl(keyed_state_backend)

    # normal data has the key at index 0 and the element at index 1
    def state_key_selector(normal_data):
        return Row(normal_data[0])

    def user_key_selector(normal_data):
        return normal_data[0]

    def input_selector(normal_data):
        return normal_data[1]

    if func_type == function_kinds.KEYED_PROCESS or \
            func_type == function_kinds.KEYED_CO_PROCESS:
        timer_service = TimerServiceImpl(internal_timer_service)
        on_timer_ctx = InternalKeyedProcessFunctionOnTimerContext(timer_service)
        ctx = InternalKeyedProcessFunctionContext(timer_service)
        process_function = payload
        output_factory = RowWithTimerOutputFactory(VoidNamespaceSerializer())

        def open_func():
            # "open" is optional on the process function
            if not hasattr(process_function, "open"):
                return
            process_function.open(runtime_context)

        def close_func():
            # "close" is optional on the process function
            if not hasattr(process_function, "close"):
                return
            process_function.close()

        if func_type == function_kinds.KEYED_PROCESS:

            def process_element(normal_data, timestamp: int):
                ctx.set_timestamp(timestamp)
                ctx.set_current_key(user_key_selector(normal_data))
                return process_function.process_element(input_selector(normal_data), ctx)

            def _fire_timer(internal_timer, time_domain):
                # shared by both timer callbacks, which differ only in the time domain
                timestamp = internal_timer.get_timestamp()
                state_current_key = internal_timer.get_key()
                # the timer key is the state key, whose first field is the user key
                on_timer_ctx.set_current_key(user_key_selector(state_current_key))
                on_timer_ctx.set_time_domain(time_domain)
                return process_function.on_timer(timestamp, on_timer_ctx)

            def on_event_time(internal_timer: InternalTimerImpl):
                return _fire_timer(internal_timer, TimeDomain.EVENT_TIME)

            def on_processing_time(internal_timer: InternalTimerImpl):
                return _fire_timer(internal_timer, TimeDomain.PROCESSING_TIME)

            input_handler = OneInputRowWithTimerHandler(
                internal_timer_service,
                keyed_state_backend,
                state_key_selector,
                process_element,
                on_event_time,
                on_processing_time,
                output_factory)
        else:
            # only KEYED_CO_PROCESS can reach this point given the enclosing condition
            input_handler = TwoInputRowWithTimerHandler(
                ctx,
                on_timer_ctx,
                timer_service,
                keyed_state_backend,
                process_function,
                output_factory)

        return input_handler.accept, open_func, close_func

    if func_type == function_kinds.WINDOW:
        window_operation_descriptor = payload
        window_assigner = window_operation_descriptor.assigner
        window_trigger = window_operation_descriptor.trigger
        allowed_lateness = window_operation_descriptor.allowed_lateness
        window_state_descriptor = window_operation_descriptor.window_state_descriptor
        internal_window_function = window_operation_descriptor.internal_window_function
        window_serializer = window_operation_descriptor.window_serializer
        # the state backend's namespace coder is replaced by the window serializer's coder
        keyed_state_backend._namespace_coder_impl = window_serializer._get_coder()
        window_operator = WindowOperator(
            window_assigner,
            keyed_state_backend,
            user_key_selector,
            window_state_descriptor,
            internal_window_function,
            window_trigger,
            allowed_lateness)
        output_factory = RowWithTimerOutputFactory(window_serializer)

        def open_func():
            window_operator.open(runtime_context, internal_timer_service)

        def close_func():
            window_operator.close()

        input_handler = OneInputRowWithTimerHandler(
            internal_timer_service,
            keyed_state_backend,
            state_key_selector,
            # strip the key so that the window operator only sees the element
            lambda n, t: window_operator.process_element(input_selector(n), t),
            window_operator.on_event_time,
            window_operator.on_processing_time,
            output_factory)

        return input_handler.accept, open_func, close_func

    raise Exception("Unsupported func_type: " + str(func_type))