def _get_java_python_function_operator(self, func: Union[Function, FunctionWrapper],
                                       type_info: TypeInformation, func_name: str,
                                       func_type: int):
    """
    Build the Flink Java operator that executes the given user-defined Python
    function, together with the output type information of that operator.

    :param func: a function object that implements the Function interface.
    :param type_info: the data type of the function output data; ``None`` falls
                      back to pickled byte arrays.
    :param func_name: function name.
    :param func_type: function type, supports MAP, FLAT_MAP, etc.
    :return: a tuple ``(java_operator, output_type_info)`` where the operator is
             responsible for executing the user-defined Python function.
    """
    import cloudpickle

    gateway = get_gateway()
    pickled_function = cloudpickle.dumps(func)
    j_input_types = self._j_data_stream.getTransformation().getOutputType()

    # Resolve the declared output type; no declaration means pickled bytes.
    if type_info is None:
        output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
    elif isinstance(type_info, list):
        output_type_info = RowTypeInfo(type_info)
    else:
        output_type_info = type_info

    # Wrap the serialized Python function into the Java-side function/info objects.
    j_python_function = gateway.jvm.org.apache.flink.datastream.runtime.functions \
        .python.DataStreamPythonFunction(
            func_name, bytearray(pickled_function), _get_python_env())
    j_function_info = gateway.jvm.org.apache.flink.datastream.runtime.functions \
        .python.DataStreamPythonFunctionInfo(j_python_function, func_type)

    # Merge the environment configuration into the operator configuration.
    j_env = self._j_data_stream.getExecutionEnvironment()
    j_conf = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil \
        .getMergedConfig(j_env)

    j_operator = gateway.jvm.org.apache.flink.datastream.runtime.operators \
        .python.DataStreamPythonStatelessFunctionOperator(
            j_conf, j_input_types, output_type_info.get_java_type_info(),
            j_function_info)
    return j_operator, output_type_info
def _get_connected_stream_operator(self, func: Union[Function, FunctionWrapper],
                                   type_info: TypeInformation, func_name: str,
                                   func_type: int):
    """
    Build the two-input Flink Java operator that executes the given user-defined
    Python function over the two connected streams.

    :param func: a function object that implements the Function interface.
    :param type_info: the data type of the function output data; ``None`` falls
                      back to pickled byte arrays.
    :param func_name: function name.
    :param func_type: function type, supports MAP, FLAT_MAP, etc.
    :return: a tuple ``(java_operator, java_output_type_info)``.
    """
    import cloudpickle

    gateway = get_gateway()
    pickled_function = cloudpickle.dumps(func)
    # Output types of both upstream transformations feed the two-input operator.
    j_input_types1 = self.stream1._j_data_stream.getTransformation().getOutputType()
    j_input_types2 = self.stream2._j_data_stream.getTransformation().getOutputType()

    # Resolve the declared output type; no declaration means pickled bytes.
    if type_info is None:
        output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
    elif isinstance(type_info, list):
        output_type_info = RowTypeInfo(type_info)
    else:
        output_type_info = type_info

    # Wrap the serialized Python function into the Java-side function/info objects.
    j_python_function = gateway.jvm.org.apache.flink.datastream.runtime.functions \
        .python.DataStreamPythonFunction(
            func_name, bytearray(pickled_function), _get_python_env())
    j_function_info = gateway.jvm.org.apache.flink.datastream.runtime.functions \
        .python.DataStreamPythonFunctionInfo(j_python_function, func_type)

    j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()
    j_operator = gateway.jvm.org.apache.flink.datastream.runtime.operators \
        .python.DataStreamTwoInputPythonStatelessFunctionOperator(
            j_conf, j_input_types1, j_input_types2,
            output_type_info.get_java_type_info(), j_function_info,
            self._is_keyed_stream())
    return j_operator, output_type_info.get_java_type_info()
def assign_timestamps_and_watermarks(self, watermark_strategy: WatermarkStrategy) -> \
        'DataStream':
    """
    Assigns timestamps to the elements in the data stream and generates watermarks
    to signal event time progress. The given WatermarkStrategy is used to create a
    TimestampAssigner and WatermarkGenerator.

    :param watermark_strategy: The strategy to generate watermarks based on event
                               timestamps.
    :return: The stream after the transformation, with assigned timestamps and
             watermarks.
    """
    if watermark_strategy._timestamp_assigner is None:
        # No Python TimestampAssigner was supplied: delegate timestamp/watermark
        # generation entirely to the Java-side strategy.
        return DataStream(self._j_data_stream.assignTimestampsAndWatermarks(
            watermark_strategy._j_watermark_strategy))

    # A Python TimestampAssigner was supplied, so timestamps must be extracted
    # and watermarks generated by a dedicated Python operator.
    import cloudpickle
    from pyflink.fn_execution import flink_fn_execution_pb2 as ffpb2

    gateway = get_gateway()
    pickled_assigner = cloudpickle.dumps(watermark_strategy._timestamp_assigner)
    j_python_function = gateway.jvm.DataStreamPythonFunction(
        bytearray(pickled_assigner), _get_python_env())
    j_function_info = gateway.jvm.DataStreamPythonFunctionInfo(
        j_python_function,
        ffpb2.UserDefinedDataStreamFunction.TIMESTAMP_ASSIGNER)  # type: ignore

    j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()
    j_output_type = self._j_data_stream.getType()
    j_operator = gateway.jvm \
        .org.apache.flink.streaming.api.operators.python \
        .PythonTimestampsAndWatermarksOperator(
            j_conf, j_output_type, j_function_info,
            watermark_strategy._j_watermark_strategy)
    return DataStream(self._j_data_stream.transform(
        "TIMESTAMP_AND_WATERMARK", j_output_type, j_operator))
def _get_java_python_function_operator(self, func: Union[Function, FunctionWrapper],
                                       type_info: TypeInformation, func_name: str,
                                       func_type: int):
    """
    Build the Flink Java operator that executes the given user-defined Python
    function, together with the Java type information of its output.

    :param func: a function object that implements the Function interface.
    :param type_info: the data type of the function output data; ``None`` falls
                      back to pickled byte arrays.
    :param func_name: function name.
    :param func_type: function type, supports MAP, FLAT_MAP, etc.
    :return: a tuple ``(java_operator, java_output_type_info)`` where the operator
             is responsible for executing the user-defined Python function.
    """
    import cloudpickle
    from pyflink.fn_execution.flink_fn_execution_pb2 import UserDefinedDataStreamFunction

    gateway = get_gateway()
    pickled_function = cloudpickle.dumps(func)
    j_input_types = self._j_data_stream.getTransformation().getOutputType()

    # Resolve the declared output type; no declaration means pickled bytes.
    if type_info is None:
        output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
    elif isinstance(type_info, list):
        output_type_info = RowTypeInfo(type_info)
    else:
        output_type_info = type_info

    # Wrap the serialized Python function into the Java-side function/info objects.
    j_python_function = gateway.jvm.org.apache.flink.datastream.runtime.functions \
        .python.DataStreamPythonFunction(
            func_name, bytearray(pickled_function), _get_python_env())
    j_function_info = gateway.jvm.org.apache.flink.datastream.runtime.functions \
        .python.DataStreamPythonFunctionInfo(j_python_function, func_type)

    j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()

    if func_type == UserDefinedDataStreamFunction.REDUCE:
        # Force a bundle size of 1 so reduce functions are processed synchronously.
        j_conf.setInteger(
            gateway.jvm.org.apache.flink.python.PythonOptions.MAX_BUNDLE_SIZE, 1)
        # The reduce output type is the type of the second field of the keyed input.
        j_output_type_info = j_input_types.getTypeAt(1)
        j_operator = gateway.jvm.org.apache.flink.datastream.runtime.operators \
            .python.DataStreamPythonReduceFunctionOperator(
                j_conf, j_input_types, j_output_type_info, j_function_info)
        return j_operator, j_output_type_info

    # Dispatch on the sentinel name used for custom-partition map functions.
    if str(func) == '_Flink_PartitionCustomMapFunction':
        operator_clazz = gateway.jvm.org.apache.flink.datastream.runtime.operators \
            .python.DataStreamPythonPartitionCustomFunctionOperator
    else:
        operator_clazz = gateway.jvm.org.apache.flink.datastream.runtime.operators \
            .python.DataStreamPythonStatelessFunctionOperator
    j_operator = operator_clazz(
        j_conf, j_input_types, output_type_info.get_java_type_info(),
        j_function_info)
    return j_operator, output_type_info.get_java_type_info()