Example 1
    def _get_connected_stream_operator(self, func: Union[Function, FunctionWrapper],
                                       type_info: TypeInformation, func_name: str,
                                       func_type: int):
        """
        Creates a two-input Flink Java operator that executes a user-defined
        Python function over both inputs of a pair of connected streams.
        """
        gateway = get_gateway()
        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        # The input types are taken from the upstream transformations of the
        # two connected streams.
        j_input_types1 = self.stream1._j_data_stream.getTransformation().getOutputType()
        j_input_types2 = self.stream2._j_data_stream.getTransformation().getOutputType()

        if type_info is None:
            output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
        elif isinstance(type_info, list):
            output_type_info = RowTypeInfo(type_info)
        else:
            output_type_info = type_info

        DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
            .python.DataStreamPythonFunction
        j_python_data_stream_scalar_function = DataStreamPythonFunction(
            func_name,
            bytearray(serialized_func),
            _get_python_env())

        DataStreamPythonFunctionInfo = gateway.jvm. \
            org.apache.flink.datastream.runtime.functions.python \
            .DataStreamPythonFunctionInfo

        j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
            j_python_data_stream_scalar_function,
            func_type)

        j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()
        DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
            .operators.python.DataStreamTwoInputPythonStatelessFunctionOperator
        j_python_data_stream_function_operator = DataStreamPythonFunctionOperator(
            j_conf,
            j_input_types1,
            j_input_types2,
            output_type_info.get_java_type_info(),
            j_python_data_stream_function_info,
            self._is_keyed_stream())

        return j_python_data_stream_function_operator, output_type_info.get_java_type_info()
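
This two-input helper backs operations on connected streams. Below is a minimal public-API sketch that would route through it, assuming a standard PyFlink environment; the exact parameter name (output_type) may vary across versions.

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.functions import CoMapFunction

class MyCoMap(CoMapFunction):
    def map1(self, value):
        # Called for elements of the first stream.
        return str(value)

    def map2(self, value):
        # Called for elements of the second stream.
        return value.upper()

env = StreamExecutionEnvironment.get_execution_environment()
s1 = env.from_collection([1, 2, 3], type_info=Types.INT())
s2 = env.from_collection(['a', 'b'], type_info=Types.STRING())
s1.connect(s2).map(MyCoMap(), output_type=Types.STRING()).print()
env.execute("connected-streams-example")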
Example 2
    def _get_java_python_function_operator(self, func: Union[Function,
                                                             FunctionWrapper],
                                           type_info: TypeInformation,
                                           func_name: str, func_type: int):
        """
        Create a flink operator according to user provided function object, data types,
        function name and function type.

        :param func: a function object that implements the Function interface.
        :param type_info: the data type of the function output data.
        :param func_name: function name.
        :param func_type: function type, supports MAP, FLAT_MAP, etc.
        :return: A flink java operator which is responsible for execution user defined python
                 function.
        """

        gateway = get_gateway()
        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        j_input_types = self._j_data_stream.getTransformation().getOutputType()

        if type_info is None:
            output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
        elif isinstance(type_info, list):
            output_type_info = RowTypeInfo(type_info)
        else:
            output_type_info = type_info

        DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
            .python.DataStreamPythonFunction
        j_python_data_stream_scalar_function = DataStreamPythonFunction(
            func_name, bytearray(serialized_func), _get_python_env())

        DataStreamPythonFunctionInfo = gateway.jvm. \
            org.apache.flink.datastream.runtime.functions.python \
            .DataStreamPythonFunctionInfo

        j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
            j_python_data_stream_scalar_function, func_type)

        j_env = self._j_data_stream.getExecutionEnvironment()
        PythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
        j_conf = PythonConfigUtil.getMergedConfig(j_env)

        DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
            .operators.python.DataStreamPythonStatelessFunctionOperator

        j_python_data_stream_scalar_function_operator = DataStreamPythonFunctionOperator(
            j_conf, j_input_types, output_type_info.get_java_type_info(),
            j_python_data_stream_function_info)
        return j_python_data_stream_scalar_function_operator, output_type_info
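
This helper is what single-input transformations ultimately call. A minimal public-API sketch that would exercise it, assuming a standard PyFlink environment:

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_collection([1, 2, 3], type_info=Types.INT())
# map() serializes the lambda with cloudpickle and builds the Java operator
# through a helper like _get_java_python_function_operator.
ds.map(lambda x: x * 2, output_type=Types.INT()).print()
env.execute("map-example")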
Example 3
    def _build_orc_job(self, row_type: RowType, row_type_info: RowTypeInfo, data: List[Row]):
        jvm = get_gateway().jvm
        sink = FileSink.for_bulk_format(
            self.orc_dir_name, OrcBulkWriters.for_row_type(row_type)
        ).build()
        # Convert the Python rows into a Java list so that fromCollection can
        # consume them on the JVM side.
        j_list = jvm.java.util.ArrayList()
        for d in data:
            j_list.add(to_java_data_structure(d))
        ds = DataStream(self.env._j_stream_execution_environment.fromCollection(
            j_list,
            row_type_info.get_java_type_info()
        ))
        ds.sink_to(sink)
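
A sketch of how this builder might be driven from a test in the same class; the test method name and the 'name'/'score' fields are assumptions, while DataTypes, Types and Row are the real PyFlink APIs.

from pyflink.common import Row
from pyflink.common.typeinfo import Types
from pyflink.table.types import DataTypes

def test_orc_sink(self):
    row_type = DataTypes.ROW([
        DataTypes.FIELD('name', DataTypes.STRING()),
        DataTypes.FIELD('score', DataTypes.INT()),
    ])
    row_type_info = Types.ROW_NAMED(['name', 'score'],
                                    [Types.STRING(), Types.INT()])
    data = [Row(name='alice', score=1), Row(name='bob', score=2)]
    self._build_orc_job(row_type, row_type_info, data)
    self.env.execute('test_orc_sink')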
Example 4
class OutputTag(object):
    """
    An :class:`OutputTag` is a typed and named tag to use for tagging side outputs of an operator.

    Example:
    ::

        # Explicitly specify output type
        >>> info = OutputTag("late-data", Types.TUPLE([Types.STRING(), Types.LONG()]))
        # Implicitly wrap list to Types.ROW
        >>> info_row = OutputTag("row", [Types.STRING(), Types.LONG()])
        # Implicitly use pickle serialization
        >>> info_side = OutputTag("side")
        # ERROR: tag id cannot be empty string (extra requirement for Python API)
        >>> info_error = OutputTag("")

    """
    def __init__(self,
                 tag_id: str,
                 type_info: Optional[Union[TypeInformation, list]] = None):
        if not tag_id:
            raise ValueError("OutputTag tag_id cannot be None or empty string")
        self.tag_id = tag_id
        if type_info is None:
            self.type_info = Types.PICKLED_BYTE_ARRAY()
        elif isinstance(type_info, list):
            self.type_info = RowTypeInfo(type_info)
        elif not isinstance(type_info, TypeInformation):
            raise TypeError(
                "OutputTag type_info must be None, list or TypeInformation")
        else:
            self.type_info = type_info

    def get_java_output_tag(self):
        gateway = get_gateway()
        j_obj = gateway.jvm.org.apache.flink.util.OutputTag(
            self.tag_id, self.type_info.get_java_type_info())
        # The Java TypeInformation handle is not picklable, so drop it before
        # this OutputTag instance is serialized and shipped to workers.
        self.type_info._j_typeinfo = None
        return j_obj
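
A usage sketch for OutputTag with a side output, assuming a PyFlink version whose DataStream API supports side outputs; the routing condition is illustrative.

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment, OutputTag
from pyflink.datastream.functions import ProcessFunction

late_data = OutputTag("late-data", Types.TUPLE([Types.STRING(), Types.LONG()]))

class RouteLate(ProcessFunction):
    def process_element(self, value, ctx):
        if value[1] < 0:
            # Yielding (tag, value) sends the record to the side output.
            yield late_data, value
        else:
            yield value

env = StreamExecutionEnvironment.get_execution_environment()
tuple_type = Types.TUPLE([Types.STRING(), Types.LONG()])
main = env.from_collection([("a", 1), ("b", -1)], type_info=tuple_type) \
    .process(RouteLate(), output_type=tuple_type)
side = main.get_side_output(late_data)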
Example 5
    def _get_java_python_function_operator(self, func: Union[Function,
                                                             FunctionWrapper],
                                           type_info: TypeInformation,
                                           func_name: str, func_type: int):
        """
        Create a flink operator according to user provided function object, data types,
        function name and function type.

        :param func: a function object that implements the Function interface.
        :param type_info: the data type of the function output data.
        :param func_name: function name.
        :param func_type: function type, supports MAP, FLAT_MAP, etc.
        :return: A flink java operator which is responsible for execution user defined python
                 function.
        """

        gateway = get_gateway()
        import cloudpickle
        serialized_func = cloudpickle.dumps(func)

        j_input_types = self._j_data_stream.getTransformation().getOutputType()

        if type_info is None:
            output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
        elif isinstance(type_info, list):
            output_type_info = RowTypeInfo(type_info)
        else:
            output_type_info = type_info

        DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
            .python.DataStreamPythonFunction
        j_python_data_stream_scalar_function = DataStreamPythonFunction(
            func_name, bytearray(serialized_func), _get_python_env())

        DataStreamPythonFunctionInfo = gateway.jvm. \
            org.apache.flink.datastream.runtime.functions.python \
            .DataStreamPythonFunctionInfo

        j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
            j_python_data_stream_scalar_function, func_type)

        j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()

        # Set the max bundle size to 1 to force synchronous processing for the
        # reduce function.
        from pyflink.fn_execution.flink_fn_execution_pb2 import UserDefinedDataStreamFunction
        if func_type == UserDefinedDataStreamFunction.REDUCE:
            j_conf.setInteger(
                gateway.jvm.org.apache.flink.python.PythonOptions.
                MAX_BUNDLE_SIZE, 1)
            DataStreamPythonReduceFunctionOperator = gateway.jvm.org.apache.flink.datastream \
                .runtime.operators.python.DataStreamPythonReduceFunctionOperator

            # For a keyed stream the input type is a (key, value) composite;
            # the reduce output type is the value type at index 1.
            j_output_type_info = j_input_types.getTypeAt(1)
            j_python_data_stream_function_operator = DataStreamPythonReduceFunctionOperator(
                j_conf, j_input_types, j_output_type_info,
                j_python_data_stream_function_info)
            return j_python_data_stream_function_operator, j_output_type_info
        else:
            # The partitionCustom() code path wraps the user partitioner in a
            # map function carrying this sentinel name.
            if str(func) == '_Flink_PartitionCustomMapFunction':
                DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
                    .operators.python.DataStreamPythonPartitionCustomFunctionOperator
            else:
                DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
                    .operators.python.DataStreamPythonStatelessFunctionOperator

            j_python_data_stream_function_operator = DataStreamPythonFunctionOperator(
                j_conf, j_input_types, output_type_info.get_java_type_info(),
                j_python_data_stream_function_info)

            return j_python_data_stream_function_operator, output_type_info.get_java_type_info()
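
The REDUCE branch above is what a keyed reduce ultimately instantiates. A minimal public-API sketch, assuming a standard PyFlink environment:

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_collection(
    [('a', 1), ('a', 2), ('b', 3)],
    type_info=Types.TUPLE([Types.STRING(), Types.INT()]))
# key_by produces the (key, value) shape whose value type the REDUCE branch
# extracts with getTypeAt(1).
ds.key_by(lambda v: v[0]) \
    .reduce(lambda a, b: (a[0], a[1] + b[1])) \
    .print()
env.execute("keyed-reduce-example")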