Example #1
    def _get_java_python_function_operator(self, func: Union[Function,
                                                             FunctionWrapper],
                                           type_info: TypeInformation,
                                           func_name: str, func_type: int):
        """
        Create a Flink operator according to the user-provided function object, data types,
        function name and function type.

        :param func: a function object that implements the Function interface.
        :param type_info: the data type of the function output data.
        :param func_name: function name.
        :param func_type: function type, supports MAP, FLAT_MAP, etc.
        :return: A Flink Java operator which is responsible for executing the user-defined
                 Python function.
        """

        gateway = get_gateway()
        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        j_input_types = self._j_data_stream.getTransformation().getOutputType()

        if type_info is None:
            output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
        else:
            if isinstance(type_info, list):
                output_type_info = RowTypeInfo(type_info)
            else:
                output_type_info = type_info

        DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
            .python.DataStreamPythonFunction
        j_python_data_stream_scalar_function = DataStreamPythonFunction(
            func_name, bytearray(serialized_func), _get_python_env())

        DataStreamPythonFunctionInfo = gateway.jvm. \
            org.apache.flink.datastream.runtime.functions.python \
            .DataStreamPythonFunctionInfo

        j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
            j_python_data_stream_scalar_function, func_type)

        j_env = self._j_data_stream.getExecutionEnvironment()
        PythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
        j_conf = PythonConfigUtil.getMergedConfig(j_env)

        DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
            .operators.python.DataStreamPythonStatelessFunctionOperator

        j_python_data_stream_scalar_function_operator = DataStreamPythonFunctionOperator(
            j_conf, j_input_types, output_type_info.get_java_type_info(),
            j_python_data_stream_function_info)
        return j_python_data_stream_scalar_function_operator, output_type_info
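
The helper above never executes the user function itself; it only ships a cloudpickle payload to the Java operator, which later hands it to a Python worker process. Below is a minimal, self-contained sketch of that serialization round trip (plain Python, no Flink gateway involved; the to_upper function and the "worker side" framing are illustrative assumptions):

    import cloudpickle

    def to_upper(value):
        return value.upper()

    # Driver side: pickle the callable, exactly as the helper does before
    # handing the bytes to DataStreamPythonFunction.
    payload = bytearray(cloudpickle.dumps(to_upper))

    # Worker side (hypothetical): restore the callable and apply it to a record.
    restored = cloudpickle.loads(bytes(payload))
    assert restored("flink") == "FLINK"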
Example #2
    def _get_connected_stream_operator(self, func: Union[Function, FunctionWrapper],
                                       type_info: TypeInformation, func_name: str,
                                       func_type: int):
        gateway = get_gateway()
        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        j_input_types1 = self.stream1._j_data_stream.getTransformation().getOutputType()
        j_input_types2 = self.stream2._j_data_stream.getTransformation().getOutputType()

        if type_info is None:
            output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
        else:
            if isinstance(type_info, list):
                output_type_info = RowTypeInfo(type_info)
            else:
                output_type_info = type_info

        DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
            .python.DataStreamPythonFunction
        j_python_data_stream_scalar_function = DataStreamPythonFunction(
            func_name,
            bytearray(serialized_func),
            _get_python_env())

        DataStreamPythonFunctionInfo = gateway.jvm. \
            org.apache.flink.datastream.runtime.functions.python \
            .DataStreamPythonFunctionInfo

        j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
            j_python_data_stream_scalar_function,
            func_type)

        j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()
        DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
            .operators.python.DataStreamTwoInputPythonStatelessFunctionOperator
        j_python_data_stream_function_operator = DataStreamPythonFunctionOperator(
            j_conf,
            j_input_types1,
            j_input_types2,
            output_type_info.get_java_type_info(),
            j_python_data_stream_function_info,
            self._is_keyed_stream())

        return j_python_data_stream_function_operator, output_type_info.get_java_type_info()
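
For context, here is a hedged sketch of the user-facing call path that ends in this two-input operator: connecting two streams and applying a CoMapFunction. The class and method names follow the public PyFlink API of this version; treating the second positional argument of map() as the output type is an assumption that may not hold in other releases.

    from pyflink.common.typeinfo import Types
    from pyflink.datastream import StreamExecutionEnvironment
    from pyflink.datastream.functions import CoMapFunction

    class MyCoMap(CoMapFunction):
        def map1(self, value):
            return "left: " + str(value)

        def map2(self, value):
            return "right: " + str(value)

    env = StreamExecutionEnvironment.get_execution_environment()
    ds1 = env.from_collection([1, 2, 3], type_info=Types.INT())
    ds2 = env.from_collection(["a", "b"], type_info=Types.STRING())

    # connect() + map() is the user-facing call that leads to the
    # DataStreamTwoInputPythonStatelessFunctionOperator built above.
    ds1.connect(ds2).map(MyCoMap(), Types.STRING()).print()
    env.execute("co_map_sketch")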
Example #3
    def assign_timestamps_and_watermarks(self, watermark_strategy: WatermarkStrategy) -> \
            'DataStream':
        """
        Assigns timestamps to the elements in the data stream and generates watermarks to signal
        event time progress. The given WatermarkStrategy is used to create a
        TimestampAssigner and a WatermarkGenerator.

        :param watermark_strategy: The strategy to generate watermarks based on event timestamps.
        :return: The stream after the transformation, with assigned timestamps and watermarks.
        """
        if watermark_strategy._timestamp_assigner is not None:
            # the user implemented a TimestampAssigner, so we need to extract timestamps and
            # generate watermarks with a custom operator.
            from pyflink.fn_execution import flink_fn_execution_pb2 as ffpb2
            gateway = get_gateway()
            import cloudpickle
            serialized_func = cloudpickle.dumps(watermark_strategy._timestamp_assigner)
            JDataStreamPythonFunction = gateway.jvm.DataStreamPythonFunction
            j_data_stream_python_function = JDataStreamPythonFunction(
                bytearray(serialized_func),
                _get_python_env())

            JDataStreamPythonFunctionInfo = gateway.jvm.DataStreamPythonFunctionInfo
            j_data_stream_python_function_info = JDataStreamPythonFunctionInfo(
                j_data_stream_python_function,
                ffpb2.UserDefinedDataStreamFunction.TIMESTAMP_ASSIGNER)  # type: ignore
            j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()
            j_output_type = self._j_data_stream.getType()
            j_operator = gateway.jvm\
                .org.apache.flink.streaming.api.operators.python\
                .PythonTimestampsAndWatermarksOperator(
                    j_conf,
                    j_output_type,
                    j_data_stream_python_function_info,
                    watermark_strategy._j_watermark_strategy)
            return DataStream(self._j_data_stream.transform(
                "TIMESTAMP_AND_WATERMARK",
                j_output_type,
                j_operator))
        else:
            # if the user did not specify a TimestampAssigner, directly assign the Java
            # watermark strategy.
            return DataStream(self._j_data_stream.assignTimestampsAndWatermarks(
                watermark_strategy._j_watermark_strategy))
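
A hedged usage sketch of this method from the user side, with a custom TimestampAssigner so that the PythonTimestampsAndWatermarksOperator branch above is taken. The assigner class and the tuple-shaped sample records are illustrative assumptions; WatermarkStrategy and TimestampAssigner come from the public pyflink.common.watermark_strategy module.

    from pyflink.common.typeinfo import Types
    from pyflink.common.watermark_strategy import TimestampAssigner, WatermarkStrategy
    from pyflink.datastream import StreamExecutionEnvironment

    class ElementTimestampAssigner(TimestampAssigner):
        def extract_timestamp(self, value, record_timestamp):
            # each element is assumed to be a (payload, epoch_millis) tuple
            return value[1]

    env = StreamExecutionEnvironment.get_execution_environment()
    ds = env.from_collection(
        [("a", 1000), ("b", 2000)],
        type_info=Types.TUPLE([Types.STRING(), Types.LONG()]))

    # A Python TimestampAssigner is present, so the custom operator branch
    # in the method above is used instead of the pure-Java path.
    watermarked = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_monotonous_timestamps()
                         .with_timestamp_assigner(ElementTimestampAssigner()))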
Example #4
    def _get_java_python_function_operator(self, func: Union[Function,
                                                             FunctionWrapper],
                                           type_info: TypeInformation,
                                           func_name: str, func_type: int):
        """
        Create a Flink operator according to the user-provided function object, data types,
        function name and function type.

        :param func: a function object that implements the Function interface.
        :param type_info: the data type of the function output data.
        :param func_name: function name.
        :param func_type: function type, supports MAP, FLAT_MAP, etc.
        :return: A Flink Java operator which is responsible for executing the user-defined
                 Python function.
        """

        gateway = get_gateway()
        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        j_input_types = self._j_data_stream.getTransformation().getOutputType()

        if type_info is None:
            output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
        else:
            if isinstance(type_info, list):
                output_type_info = RowTypeInfo(type_info)
            else:
                output_type_info = type_info

        DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
            .python.DataStreamPythonFunction
        j_python_data_stream_scalar_function = DataStreamPythonFunction(
            func_name, bytearray(serialized_func), _get_python_env())

        DataStreamPythonFunctionInfo = gateway.jvm. \
            org.apache.flink.datastream.runtime.functions.python \
            .DataStreamPythonFunctionInfo

        j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
            j_python_data_stream_scalar_function, func_type)

        j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()

        # set max bundle size to 1 to force synchronous processing for the reduce function.
        from pyflink.fn_execution.flink_fn_execution_pb2 import UserDefinedDataStreamFunction
        if func_type == UserDefinedDataStreamFunction.REDUCE:
            j_conf.setInteger(
                gateway.jvm.org.apache.flink.python.PythonOptions.
                MAX_BUNDLE_SIZE, 1)
            DataStreamPythonReduceFunctionOperator = gateway.jvm.org.apache.flink.datastream \
                .runtime.operators.python.DataStreamPythonReduceFunctionOperator

            j_output_type_info = j_input_types.getTypeAt(1)
            j_python_data_stream_function_operator = DataStreamPythonReduceFunctionOperator(
                j_conf, j_input_types, j_output_type_info,
                j_python_data_stream_function_info)
            return j_python_data_stream_function_operator, j_output_type_info
        else:
            if str(func) == '_Flink_PartitionCustomMapFunction':
                DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
                    .operators.python.DataStreamPythonPartitionCustomFunctionOperator
            else:
                DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
                    .operators.python.DataStreamPythonStatelessFunctionOperator

            j_python_data_stream_function_operator = DataStreamPythonFunctionOperator(
                j_conf, j_input_types, output_type_info.get_java_type_info(),
                j_python_data_stream_function_info)

            return (j_python_data_stream_function_operator,
                    output_type_info.get_java_type_info())
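
The REDUCE branch above derives the operator output type from getTypeAt(1), i.e. the value part of the (key, value) pair carried by a keyed stream, and caps the bundle size at 1. Below is a hedged sketch of a job that would exercise that branch; only public API calls are used, and the sample data and job name are made up.

    from pyflink.common.typeinfo import Types
    from pyflink.datastream import StreamExecutionEnvironment

    env = StreamExecutionEnvironment.get_execution_environment()
    ds = env.from_collection(
        [("a", 1), ("a", 2), ("b", 5)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

    # key_by() + reduce() routes through the REDUCE branch above, which also
    # forces MAX_BUNDLE_SIZE to 1 so records are processed synchronously.
    (ds.key_by(lambda record: record[0])
       .reduce(lambda a, b: (a[0], a[1] + b[1]))
       .print())

    env.execute("keyed_reduce_sketch")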