def _get_java_python_function_operator(self,
                                       func: Union[Function, FunctionWrapper],
                                       type_info: TypeInformation,
                                       func_name: str,
                                       func_type: int):
    """
    Create a Flink operator according to the user-provided function object,
    data types, function name and function type.

    :param func: a function object that implements the Function interface.
    :param type_info: the data type of the function output data.
    :param func_name: function name.
    :param func_type: function type, supports MAP, FLAT_MAP, etc.
    :return: A Flink Java operator which is responsible for executing the
             user-defined Python function.
    """
    gateway = get_gateway()
    import cloudpickle
    serialized_func = cloudpickle.dumps(func)

    j_input_types = self._j_data_stream.getTransformation().getOutputType()
    if type_info is None:
        output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
    elif isinstance(type_info, list):
        output_type_info = RowTypeInfo(type_info)
    else:
        output_type_info = type_info

    DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
        .python.DataStreamPythonFunction
    j_python_data_stream_scalar_function = DataStreamPythonFunction(
        func_name,
        bytearray(serialized_func),
        _get_python_env())

    DataStreamPythonFunctionInfo = gateway.jvm. \
        org.apache.flink.datastream.runtime.functions.python \
        .DataStreamPythonFunctionInfo
    j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
        j_python_data_stream_scalar_function,
        func_type)

    j_env = self._j_data_stream.getExecutionEnvironment()
    PythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
    j_conf = PythonConfigUtil.getMergedConfig(j_env)

    DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
        .operators.python.DataStreamPythonStatelessFunctionOperator
    j_python_data_stream_scalar_function_operator = DataStreamPythonFunctionOperator(
        j_conf,
        j_input_types,
        output_type_info.get_java_type_info(),
        j_python_data_stream_function_info)
    return j_python_data_stream_scalar_function_operator, output_type_info
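# The helper above ships the user function to the Python worker by pickling it
# with cloudpickle. A minimal, self-contained sketch of that round trip; the
# function name `my_map` is illustrative, not part of the library:
import cloudpickle

def my_map(value):
    return value * 2

serialized_func = cloudpickle.dumps(my_map)         # bytes embedded in the Java operator
restored_func = cloudpickle.loads(serialized_func)  # what the worker does at runtime
assert restored_func(21) == 42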
def test_row_type(self):
    self.assertEqual(
        RowTypeInfo([Types.STRING(), Types.STRING()]).get_field_names(),
        ['f0', 'f1'])
    self.assertEqual(
        RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']).get_field_names(),
        ['a', 'b'])

    self.assertTrue(
        RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b'])
        == RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']))
    self.assertFalse(
        RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b'])
        == RowTypeInfo([Types.STRING(), Types.INT()], ['a', 'b']))
    self.assertEqual(
        str(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b'])),
        "RowTypeInfo(a: String, b: String)")

    self.assertEqual(
        Types.ROW([Types.STRING(), Types.STRING()]),
        RowTypeInfo([Types.STRING(), Types.STRING()]))
    self.assertEqual(
        Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()]).get_field_names(),
        ['a', 'b'])
    self.assertEqual(
        Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()]).get_field_types(),
        [Types.STRING(), Types.STRING()])
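# Hedged usage sketch: declaring a named row type when building a stream, the
# typical consumer of the Types.ROW_NAMED behavior exercised by the test above
# (from_collection and its type_info parameter are standard PyFlink API; the
# data values are illustrative):
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_collection(
    [('a', 1), ('b', 2)],
    type_info=Types.ROW_NAMED(['name', 'count'], [Types.STRING(), Types.INT()]))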
def _get_connected_stream_operator(self,
                                   func: Union[Function, FunctionWrapper],
                                   type_info: TypeInformation,
                                   func_name: str,
                                   func_type: int):
    """
    Create a two-input Flink operator for the connected streams according to
    the user-provided function object, data types, function name and function type.
    """
    gateway = get_gateway()
    import cloudpickle
    serialized_func = cloudpickle.dumps(func)

    j_input_types1 = self.stream1._j_data_stream.getTransformation().getOutputType()
    j_input_types2 = self.stream2._j_data_stream.getTransformation().getOutputType()
    if type_info is None:
        output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
    elif isinstance(type_info, list):
        output_type_info = RowTypeInfo(type_info)
    else:
        output_type_info = type_info

    DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
        .python.DataStreamPythonFunction
    j_python_data_stream_scalar_function = DataStreamPythonFunction(
        func_name,
        bytearray(serialized_func),
        _get_python_env())

    DataStreamPythonFunctionInfo = gateway.jvm. \
        org.apache.flink.datastream.runtime.functions.python \
        .DataStreamPythonFunctionInfo
    j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
        j_python_data_stream_scalar_function,
        func_type)

    j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()
    DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream.runtime \
        .operators.python.DataStreamTwoInputPythonStatelessFunctionOperator
    j_python_data_stream_function_operator = DataStreamPythonFunctionOperator(
        j_conf,
        j_input_types1,
        j_input_types2,
        output_type_info.get_java_type_info(),
        j_python_data_stream_function_info,
        self._is_keyed_stream())
    return j_python_data_stream_function_operator, output_type_info.get_java_type_info()
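# Hedged usage sketch of the call path that reaches the helper above:
# connecting two streams and applying a CoMapFunction. connect() and
# CoMapFunction are standard PyFlink API; the class and data are illustrative.
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.functions import CoMapFunction

class MyCoMap(CoMapFunction):
    def map1(self, value):
        return str(value)     # applied to elements of the first stream

    def map2(self, value):
        return value.upper()  # applied to elements of the second stream

env = StreamExecutionEnvironment.get_execution_environment()
ds1 = env.from_collection([1, 2, 3], type_info=Types.INT())
ds2 = env.from_collection(['a', 'b'], type_info=Types.STRING())
ds1.connect(ds2).map(MyCoMap(), Types.STRING())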
def _build_orc_job(self, row_type: RowType, row_type_info: RowTypeInfo, data: List[Row]):
    """Build a job that writes ``data`` to ``self.orc_dir_name`` as ORC files."""
    jvm = get_gateway().jvm
    sink = FileSink.for_bulk_format(
        self.orc_dir_name, OrcBulkWriters.for_row_type(row_type)
    ).build()
    j_list = jvm.java.util.ArrayList()
    for d in data:
        j_list.add(to_java_data_structure(d))
    ds = DataStream(self.env._j_stream_execution_environment.fromCollection(
        j_list, row_type_info.get_java_type_info()))
    ds.sink_to(sink)
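# Hedged usage sketch for the helper above. The two type descriptions must
# stay in sync: row_type drives the ORC writer schema, while row_type_info
# types the Java source collection. DataTypes, Types and Row are standard
# PyFlink API; the field names and values are illustrative.
from pyflink.common import Row
from pyflink.common.typeinfo import Types
from pyflink.table.types import DataTypes

row_type = DataTypes.ROW([
    DataTypes.FIELD('name', DataTypes.STRING()),
    DataTypes.FIELD('score', DataTypes.INT())])
row_type_info = Types.ROW_NAMED(['name', 'score'], [Types.STRING(), Types.INT()])
data = [Row('alice', 1), Row('bob', 2)]
# self._build_orc_job(row_type, row_type_info, data)  # called from the test case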
class OutputTag(object):
    """
    An :class:`OutputTag` is a typed and named tag to use for tagging side outputs of an
    operator.

    Example:
    ::

        # Explicitly specify output type
        >>> info = OutputTag("late-data", Types.TUPLE([Types.STRING(), Types.LONG()]))
        # Implicitly wrap list to Types.ROW
        >>> info_row = OutputTag("row", [Types.STRING(), Types.LONG()])
        # Implicitly use pickle serialization
        >>> info_side = OutputTag("side")
        # ERROR: tag id cannot be empty string (extra requirement for Python API)
        >>> info_error = OutputTag("")
    """

    def __init__(self, tag_id: str, type_info: Optional[Union[TypeInformation, list]] = None):
        if not tag_id:
            raise ValueError("OutputTag tag_id cannot be None or empty string")
        self.tag_id = tag_id
        if type_info is None:
            self.type_info = Types.PICKLED_BYTE_ARRAY()
        elif isinstance(type_info, list):
            self.type_info = RowTypeInfo(type_info)
        elif not isinstance(type_info, TypeInformation):
            raise TypeError(
                "OutputTag type_info must be None, list or TypeInformation")
        else:
            self.type_info = type_info

    def get_java_output_tag(self):
        gateway = get_gateway()
        j_obj = gateway.jvm.org.apache.flink.util.OutputTag(
            self.tag_id, self.type_info.get_java_type_info())
        # Drop the cached Java type info so the OutputTag itself stays
        # serializable (the Py4J handle cannot be pickled).
        self.type_info._j_typeinfo = None
        return j_obj
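# Hedged usage sketch: how an OutputTag is typically produced and consumed.
# In current PyFlink a ProcessFunction emits to a side output by yielding
# (tag, value), and the side stream is obtained with get_side_output(); this
# assumes those APIs are available alongside the class above.
from pyflink.common.typeinfo import Types
from pyflink.datastream import ProcessFunction, StreamExecutionEnvironment

late_tag = OutputTag("late-data", Types.STRING())

class Splitter(ProcessFunction):
    def process_element(self, value, ctx):
        if value < 0:
            yield late_tag, str(value)  # routed to the side output
        else:
            yield value                 # main output

env = StreamExecutionEnvironment.get_execution_environment()
main = env.from_collection([1, -2, 3], type_info=Types.INT()) \
    .process(Splitter(), Types.INT())
side = main.get_side_output(late_tag)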
def sink(sql: str,
         type_info: RowTypeInfo,
         jdbc_connection_options: 'JdbcConnectionOptions',
         jdbc_execution_options: 'JdbcExecutionOptions' = None):
    """
    Create a JDBC sink.

    :param sql: arbitrary DML query (e.g. insert, update, upsert)
    :param type_info: A RowTypeInfo for the query field types.
    :param jdbc_connection_options: parameters of connection, such as the JDBC URL.
    :param jdbc_execution_options: parameters of execution, such as batch size and
                                   maximum retries.
    :return: A JdbcSink.
    """
    sql_types = []
    gateway = get_gateway()
    JJdbcTypeUtil = gateway.jvm.org.apache.flink.connector.jdbc.utils.JdbcTypeUtil
    for field_type in type_info.get_field_types():
        sql_types.append(
            JJdbcTypeUtil.typeInformationToSqlType(field_type.get_java_type_info()))
    j_sql_types = to_jarray(gateway.jvm.int, sql_types)

    # The statement builder factory is not public, so it is obtained via reflection.
    output_format_clz = gateway.jvm.Class \
        .forName('org.apache.flink.connector.jdbc.internal.JdbcBatchingOutputFormat',
                 False,
                 get_gateway().jvm.Thread.currentThread().getContextClassLoader())
    j_int_array_type = to_jarray(gateway.jvm.int, []).getClass()
    j_builder_method = output_format_clz.getDeclaredMethod(
        'createRowJdbcStatementBuilder',
        to_jarray(gateway.jvm.Class, [j_int_array_type]))
    j_builder_method.setAccessible(True)
    j_statement_builder = j_builder_method.invoke(
        None, to_jarray(gateway.jvm.Object, [j_sql_types]))

    jdbc_execution_options = jdbc_execution_options if jdbc_execution_options is not None \
        else JdbcExecutionOptions.defaults()
    j_jdbc_sink = gateway.jvm.org.apache.flink.connector.jdbc.JdbcSink \
        .sink(sql,
              j_statement_builder,
              jdbc_execution_options._j_jdbc_execution_options,
              jdbc_connection_options._j_jdbc_connection_options)
    return JdbcSink(j_jdbc_sink=j_jdbc_sink)
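# Hedged usage sketch for JdbcSink.sink(). The builder names follow the
# PyFlink JDBC connector; JdbcSink, JdbcConnectionOptions and
# JdbcExecutionOptions are assumed importable from the surrounding connector
# module, and the URL, driver and credentials are placeholders.
from pyflink.common.typeinfo import Types

jdbc_sink = JdbcSink.sink(
    "INSERT INTO users (id, name) VALUES (?, ?)",
    Types.ROW([Types.INT(), Types.STRING()]),
    JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
        .with_url('jdbc:postgresql://localhost:5432/mydb')
        .with_driver_name('org.postgresql.Driver')
        .with_user_name('user')
        .with_password('password')
        .build(),
    JdbcExecutionOptions.builder()
        .with_batch_size(200)
        .with_max_retries(3)
        .build())
# ds.add_sink(jdbc_sink)  # attach to an existing DataStream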
def _get_java_python_function_operator(self,
                                       func: Union[Function, FunctionWrapper],
                                       type_info: TypeInformation,
                                       func_name: str,
                                       func_type: int):
    """
    Create a Flink operator according to the user-provided function object,
    data types, function name and function type.

    :param func: a function object that implements the Function interface.
    :param type_info: the data type of the function output data.
    :param func_name: function name.
    :param func_type: function type, supports MAP, FLAT_MAP, etc.
    :return: A Flink Java operator which is responsible for executing the
             user-defined Python function.
    """
    gateway = get_gateway()
    import cloudpickle
    serialized_func = cloudpickle.dumps(func)

    j_input_types = self._j_data_stream.getTransformation().getOutputType()
    if type_info is None:
        output_type_info = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO()
    elif isinstance(type_info, list):
        output_type_info = RowTypeInfo(type_info)
    else:
        output_type_info = type_info

    DataStreamPythonFunction = gateway.jvm.org.apache.flink.datastream.runtime.functions \
        .python.DataStreamPythonFunction
    j_python_data_stream_scalar_function = DataStreamPythonFunction(
        func_name,
        bytearray(serialized_func),
        _get_python_env())

    DataStreamPythonFunctionInfo = gateway.jvm. \
        org.apache.flink.datastream.runtime.functions.python \
        .DataStreamPythonFunctionInfo
    j_python_data_stream_function_info = DataStreamPythonFunctionInfo(
        j_python_data_stream_scalar_function,
        func_type)

    j_conf = gateway.jvm.org.apache.flink.configuration.Configuration()

    from pyflink.fn_execution.flink_fn_execution_pb2 import UserDefinedDataStreamFunction
    if func_type == UserDefinedDataStreamFunction.REDUCE:
        # Set the max bundle size to 1 to force synchronous processing for
        # the reduce function.
        j_conf.setInteger(
            gateway.jvm.org.apache.flink.python.PythonOptions.MAX_BUNDLE_SIZE, 1)

        DataStreamPythonReduceFunctionOperator = gateway.jvm.org.apache.flink.datastream \
            .runtime.operators.python.DataStreamPythonReduceFunctionOperator
        # A reduce emits the same type as its input values, taken from
        # position 1 of the keyed input row.
        j_output_type_info = j_input_types.getTypeAt(1)
        j_python_data_stream_function_operator = DataStreamPythonReduceFunctionOperator(
            j_conf,
            j_input_types,
            j_output_type_info,
            j_python_data_stream_function_info)
        return j_python_data_stream_function_operator, j_output_type_info
    else:
        if str(func) == '_Flink_PartitionCustomMapFunction':
            DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream \
                .runtime.operators.python.DataStreamPythonPartitionCustomFunctionOperator
        else:
            DataStreamPythonFunctionOperator = gateway.jvm.org.apache.flink.datastream \
                .runtime.operators.python.DataStreamPythonStatelessFunctionOperator
        j_python_data_stream_function_operator = DataStreamPythonFunctionOperator(
            j_conf,
            j_input_types,
            output_type_info.get_java_type_info(),
            j_python_data_stream_function_info)
        return j_python_data_stream_function_operator, output_type_info.get_java_type_info()
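# Hedged usage sketch: a keyed reduce, the call path that takes the REDUCE
# branch above. key_by(), reduce() and print() are standard PyFlink API; the
# lambdas and data are illustrative.
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
env.from_collection(
    [('a', 1), ('a', 2), ('b', 3)],
    type_info=Types.TUPLE([Types.STRING(), Types.INT()])) \
    .key_by(lambda x: x[0]) \
    .reduce(lambda a, b: (a[0], a[1] + b[1])) \
    .print()
env.execute('reduce_demo')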