def _create_judf(self):
    """
    Creates the Java ``PythonScalarFunction`` that wraps this Python scalar function.

    The Python function (wrapped in a ``DelegatingScalarFunction`` when it is not already
    a ``UserDefinedFunction``) is pickled with cloudpickle and passed to the Java
    constructor together with the converted input/result types and the function kind.

    :return: The Java ``PythonScalarFunction`` object.
    :raises TypeError: If ``self._udf_type`` is neither ``"general"`` nor ``"pandas"``.
    """
    gw = get_gateway()

    if isinstance(self._func, UserDefinedFunction):
        py_func = self._func
    else:
        py_func = DelegatingScalarFunction(self._func)

    import cloudpickle
    pickled = cloudpickle.dumps(py_func)

    java_input_types = utils.to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(input_type) for input_type in self._input_types])
    java_result_type = _to_java_type(self._result_type)

    # Map the Python-side udf type name onto the Java PythonFunctionKind enum.
    udf_type = self._udf_type
    kind_enum = gw.jvm.org.apache.flink.table.functions.python.PythonFunctionKind
    if udf_type == "general":
        java_kind = kind_enum.GENERAL
    elif udf_type == "pandas":
        java_kind = kind_enum.PANDAS
    else:
        raise TypeError("Unsupported udf_type: %s." % udf_type)

    scalar_function_class = \
        gw.jvm.org.apache.flink.table.functions.python.PythonScalarFunction
    return scalar_function_class(
        self._name,
        bytearray(pickled),
        java_input_types,
        java_result_type,
        java_kind,
        self._deterministic,
        _get_python_env())
def _create_judtf(self):
    """
    Creates the Java ``PythonTableFunction`` that wraps this Python table function.

    The function is pickled with cloudpickle; the declared result types are combined
    into a Java ``RowTypeInfo`` and the function kind is always ``GENERAL``.

    :return: The Java ``PythonTableFunction`` object.
    """
    if isinstance(self._func, UserDefinedFunction):
        table_func = self._func
    else:
        table_func = DelegationTableFunction(self._func)

    import cloudpickle
    pickled = cloudpickle.dumps(table_func)

    gw = get_gateway()

    # Input types are optional; a missing declaration is passed to Java as null.
    java_input_types = None
    if self._input_types is not None:
        java_input_types = utils.to_jarray(
            gw.jvm.TypeInformation,
            [_to_java_type(input_type) for input_type in self._input_types])

    java_result_types = utils.to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(result_type) for result_type in self._result_types])
    java_row_type = gw.jvm.org.apache.flink.api.java.typeutils.RowTypeInfo(
        java_result_types)

    java_kind = \
        gw.jvm.org.apache.flink.table.functions.python.PythonFunctionKind.GENERAL
    table_function_class = \
        gw.jvm.org.apache.flink.table.functions.python.PythonTableFunction
    return table_function_class(
        self._name,
        bytearray(pickled),
        java_input_types,
        java_row_type,
        java_kind,
        self._deterministic,
        _get_python_env())
def _create_judf(self, is_blink_planner, table_config):
    """
    Creates the Java scalar function through the planner-specific ``PythonTableUtils``.

    :param is_blink_planner: Whether the blink planner's ``PythonTableUtils`` should be
                             used; it additionally receives ``table_config``.
    :param table_config: The table config forwarded to the blink planner utility. It is
                         not used for the old planner.
    :return: The Java scalar function object.
    """
    if isinstance(self._func, UserDefinedFunction):
        py_func = self._func
    else:
        py_func = DelegatingScalarFunction(self._func)

    import cloudpickle
    pickled = cloudpickle.dumps(py_func)

    gw = get_gateway()
    java_input_types = utils.to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(input_type) for input_type in self._input_types])
    java_result_type = _to_java_type(self._result_type)

    if not is_blink_planner:
        # Old planner: PythonTableUtils is resolved directly on the JVM view.
        return gw.jvm.PythonTableUtils.createPythonScalarFunction(
            self._name,
            bytearray(pickled),
            java_input_types,
            java_result_type,
            self._deterministic,
            _get_python_env())

    blink_utils = gw.jvm.org.apache.flink.table.planner.utils.python.PythonTableUtils
    return blink_utils.createPythonScalarFunction(
        table_config,
        self._name,
        bytearray(pickled),
        java_input_types,
        java_result_type,
        self._deterministic,
        _get_python_env())
def __init__(self, hostname=None, port=None, line_delimiter=None, field_delimiter=None,
             field_names=None, field_types=None, append_proctime=None):
    """
    Builds the Java ``SocketTableSource`` from the options that were supplied.

    Every argument is optional; an option left as ``None`` is not set on the Java builder.

    :param hostname: Passed to ``withHostname`` when given.
    :param port: Passed to ``withPort`` when given.
    :param line_delimiter: Passed to ``withLineDelimiter`` when given.
    :param field_delimiter: Passed to ``withFieldDelimiter`` when given.
    :param field_names: The field names; only applied together with ``field_types``.
    :param field_types: The field data types; only applied together with ``field_names``.
    :param append_proctime: Passed to ``appendProctime`` when given.
    """
    gw = get_gateway()
    builder = gw.jvm.org.apache.flink.python.connector.SocketTableSource.Builder()

    # Simple one-argument options, applied in the same order as they are declared.
    simple_options = (
        (hostname, "withHostname"),
        (port, "withPort"),
        (line_delimiter, "withLineDelimiter"),
        (field_delimiter, "withFieldDelimiter"),
    )
    for value, setter_name in simple_options:
        if value is not None:
            getattr(builder, setter_name)(value)

    # The schema is only set when both names and types are available.
    if not (field_names is None or field_types is None):
        java_names = to_jarray(gw.jvm.String, field_names)
        java_types = to_jarray(
            gw.jvm.TypeInformation,
            [_to_java_type(data_type) for data_type in field_types])
        builder.withSchema(java_names, java_types)

    if append_proctime is not None:
        builder.appendProctime(append_proctime)

    super(SocketTableSource, self).__init__(builder.build())
def __init__(self, field_names, field_types, path, field_delimiter=',', num_files=-1,
             write_mode=None):
    """
    Creates a Java ``CsvTableSink`` and configures it with the given schema.

    :param field_names: The names of the fields written by the sink.
    :param field_types: The data types of the fields, converted with ``_to_java_type``.
    :param path: The output path handed to the Java sink.
    :param field_delimiter: The field delimiter handed to the Java sink.
    :param num_files: The number of files handed to the Java sink.
    :param write_mode: ``WriteMode.NO_OVERWRITE``, ``WriteMode.OVERWRITE`` or ``None``
                       (passed to Java as null).
    :raises Exception: If ``write_mode`` is none of the supported values.
    """
    gw = get_gateway()

    if write_mode is None:
        java_write_mode = None
    elif write_mode == WriteMode.NO_OVERWRITE:
        java_write_mode = gw.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.NO_OVERWRITE
    elif write_mode == WriteMode.OVERWRITE:
        java_write_mode = gw.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE
    else:
        raise Exception('Unsupported write_mode: %s' % write_mode)

    unconfigured_sink = gw.jvm.CsvTableSink(
        path, field_delimiter, num_files, java_write_mode)

    java_names = utils.to_jarray(gw.jvm.String, field_names)
    java_types = utils.to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(data_type) for data_type in field_types])

    # configure() returns the sink that carries the schema; that one is wrapped.
    super(CsvTableSink, self).__init__(
        unconfigured_sink.configure(java_names, java_types))
def _from_file(self, filename, schema):
    """
    Creates a :class:`Table` from a file of serialized Python objects.

    :param filename: The file handed to ``PythonBridgeUtils.createDataSetFromFile``.
    :param schema: The data type describing the rows, converted with ``_to_java_type``.
    :return: The resulting :class:`Table`.
    """
    gw = get_gateway()
    java_data_set = gw.jvm.PythonBridgeUtils.createDataSetFromFile(
        self._j_tenv.execEnv(), filename, True)
    java_table = gw.jvm.PythonTableUtils.fromDataSet(
        self._j_tenv, java_data_set, _to_java_type(schema))
    return Table(java_table)
def java_user_defined_function(self):
    """
    Returns the Java user-defined function, creating and caching it on first use.

    The Java object is stored in ``self._judf_placeholder`` so that later calls return
    the same instance without serializing the Python function again.

    :return: The Java user-defined function object.
    :raises TypeError: If ``self._func_type`` is neither ``"general"`` nor ``"pandas"``.
    """
    if self._judf_placeholder is not None:
        return self._judf_placeholder

    gw = get_gateway()

    # Input types are optional; a missing declaration is passed to Java as null.
    java_input_types = None
    if self._input_types is not None:
        java_input_types = utils.to_jarray(
            gw.jvm.TypeInformation,
            [_to_java_type(input_type) for input_type in self._input_types])

    # Map the Python-side function type name onto the Java PythonFunctionKind enum.
    kind_enum = gw.jvm.org.apache.flink.table.functions.python.PythonFunctionKind
    if self._func_type == "general":
        java_kind = kind_enum.GENERAL
    elif self._func_type == "pandas":
        java_kind = kind_enum.PANDAS
    else:
        raise TypeError("Unsupported func_type: %s." % self._func_type)

    if isinstance(self._func, UserDefinedFunction):
        py_func = self._func
    else:
        py_func = self._create_delegate_function()

    import cloudpickle
    pickled = cloudpickle.dumps(py_func)

    self._judf_placeholder = self._create_judf(pickled, java_input_types, java_kind)
    return self._judf_placeholder
def _create_judf(self, serialized_func, j_input_types, j_function_kind):
    """
    Creates the Java ``PythonAggregateFunction`` for this aggregate function.

    For the ``"pandas"`` function type the accumulator type is overwritten with an
    array of the result type before the types are converted.

    :param serialized_func: The pickled Python function.
    :param j_input_types: The Java input types (may be ``None``).
    :param j_function_kind: The Java ``PythonFunctionKind`` value.
    :return: The Java ``PythonAggregateFunction`` object.
    """
    is_pandas = self._func_type == "pandas"
    if is_pandas:
        from pyflink.table.types import DataTypes
        self._accumulator_type = DataTypes.ARRAY(self._result_type)

    java_result_type = _to_java_type(self._result_type)
    java_accumulator_type = _to_java_type(self._accumulator_type)

    aggregate_function_class = get_gateway().jvm \
        .org.apache.flink.table.functions.python.PythonAggregateFunction
    return aggregate_function_class(
        self._name,
        bytearray(serialized_func),
        j_input_types,
        java_result_type,
        java_accumulator_type,
        j_function_kind,
        self._deterministic,
        _get_python_env())
def __init__(self, field_names, field_types, path, field_delimiter=',', num_files=1,
             write_mode=None):
    # type: (list[str], list[DataType], str, str, int, int) -> None
    """
    Creates a Java ``CsvTableSink`` whose optional arguments are ``scala.Option`` values.

    :param field_names: The names of the fields written by the sink.
    :param field_types: The data types of the fields, converted with ``_to_java_type``.
    :param path: The output path handed to the Java sink.
    :param field_delimiter: The field delimiter, wrapped in ``scala.Option``.
    :param num_files: The number of files, wrapped in ``scala.Option``.
    :param write_mode: ``WriteMode.NO_OVERWRITE``, ``WriteMode.OVERWRITE`` or ``None``
                       (passed as an empty ``scala.Option``).
    :raises Exception: If ``write_mode`` is none of the supported values.
    """
    gw = get_gateway()

    if write_mode is None:
        java_write_mode = gw.jvm.scala.Option.empty()
    elif write_mode == WriteMode.NO_OVERWRITE:
        java_write_mode = gw.jvm.scala.Option.apply(
            gw.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.NO_OVERWRITE)
    elif write_mode == WriteMode.OVERWRITE:
        java_write_mode = gw.jvm.scala.Option.apply(
            gw.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE)
    else:
        raise Exception('Unsupported write_mode: %s' % write_mode)

    delimiter_option = gw.jvm.scala.Option.apply(field_delimiter)
    num_files_option = gw.jvm.scala.Option.apply(num_files)
    unconfigured_sink = gw.jvm.CsvTableSink(
        path, delimiter_option, num_files_option, java_write_mode)

    java_names = utils.to_jarray(gw.jvm.String, field_names)
    java_types = utils.to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(data_type) for data_type in field_types])

    # configure() returns the sink that carries the schema; that one is wrapped.
    super(CsvTableSink, self).__init__(
        unconfigured_sink.configure(java_names, java_types))
def _from_elements(self, elements, schema):
    """
    Creates a table from a collection of elements.

    The elements are serialized to a temporary file which is read back on the Java
    side. The file and the private temporary directory that holds it are removed
    before this method returns, whether or not the conversion succeeds.

    :param elements: The elements to create a table from.
    :param schema: The data type describing the elements; it is converted with
                   ``_to_java_type`` to obtain the row type of the table source.
    :return: The result :class:`Table`.
    """
    import shutil

    serializer = BatchedSerializer(self._serializer)

    # serializes to a file, and we read the file in java
    temp_dir = tempfile.mkdtemp()
    temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
    try:
        try:
            serializer.dump_to_stream(elements, temp_file)
        finally:
            temp_file.close()

        row_type_info = _to_java_type(schema)
        execution_config = self._get_execution_config(temp_file.name, schema)
        gateway = get_gateway()
        j_objs = gateway.jvm.PythonBridgeUtils.readPythonObjects(temp_file.name, True)
        j_input_format = gateway.jvm.PythonTableUtils.getInputFormat(
            j_objs, row_type_info, execution_config)
        j_table_source = gateway.jvm.PythonInputFormatTableSource(
            j_input_format, row_type_info)

        return Table(self._j_tenv.fromTableSource(j_table_source))
    finally:
        try:
            os.unlink(temp_file.name)
        finally:
            # mkdtemp() leaves removal of the directory to the caller; without this
            # every call would leave an empty temporary directory behind.
            shutil.rmtree(temp_dir, ignore_errors=True)
def __init__(self, j_table_sink, field_names, field_types):
    """
    Wraps a Java table sink after configuring it with the given schema.

    :param j_table_sink: The Java table sink to configure.
    :param field_names: The names of the fields.
    :param field_types: The data types of the fields, converted with ``_to_java_type``.
    """
    gw = get_gateway()
    java_names = utils.to_jarray(gw.jvm.String, field_names)
    java_types = utils.to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(data_type) for data_type in field_types])
    configured_sink = j_table_sink.configure(java_names, java_types)
    super(TestTableSink, self).__init__(configured_sink)
def __init__(self, source_path, field_names, field_types):
    # type: (str, list[str], list[DataType]) -> None
    """
    Creates the Java ``CsvTableSource`` for the given path and schema.

    :param source_path: The path handed to the Java ``CsvTableSource``.
    :param field_names: The names of the fields.
    :param field_types: The data types of the fields, converted with ``_to_java_type``.
    """
    gw = get_gateway()
    java_names = utils.to_jarray(gw.jvm.String, field_names)
    java_types = utils.to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(data_type) for data_type in field_types])
    java_source = gw.jvm.CsvTableSource(source_path, java_names, java_types)
    super(CsvTableSource, self).__init__(java_source)
def schema(self, schema_data_type):
    """
    Sets the format schema with field names and the types. Required if schema is not
    derived.

    :param schema_data_type: Data type from :class:`DataTypes` that describes the schema.
    :return: This :class:`Csv` object.
    """
    java_schema_type = _to_java_type(schema_data_type)
    # The Java descriptor call returns the descriptor to keep using from now on.
    self._j_csv = self._j_csv.schema(java_schema_type)
    return self
def __init__(self, field_names, field_types, out_row=100000):
    """
    Creates the Java ``com.alibaba.flink.sink.PrintTableSink`` for the given schema.

    :param field_names: The names of the fields.
    :param field_types: The data types of the fields, converted with ``_to_java_type``.
    :param out_row: Forwarded unchanged as the third constructor argument of the Java
                    sink.
    """
    gw = get_gateway()
    java_names = utils.to_jarray(gw.jvm.String, field_names)
    java_types = utils.to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(data_type) for data_type in field_types])
    java_sink = gw.jvm.com.alibaba.flink.sink.PrintTableSink(
        java_names, java_types, out_row)
    super(PrintTableSink, self).__init__(java_sink)
def __init__(self, source_path, field_names, field_types):
    # type: (str, list[str], list[DataType]) -> None
    """
    Wraps a newly created Java ``CsvTableSource``.

    :param source_path: The path handed to the Java ``CsvTableSource``.
    :param field_names: The names of the fields.
    :param field_types: The data types of the fields, converted with ``_to_java_type``.
    """
    jvm_gateway = get_gateway()
    names_array = utils.to_jarray(jvm_gateway.jvm.String, field_names)
    types_array = utils.to_jarray(
        jvm_gateway.jvm.TypeInformation,
        [_to_java_type(data_type) for data_type in field_types])
    csv_source = jvm_gateway.jvm.CsvTableSource(source_path, names_array, types_array)
    super(CsvTableSource, self).__init__(csv_source)
def __init__(
    self,
    source_path,
    field_names,
    field_types,
    field_delim=None,
    line_delim=None,
    quote_character=None,
    ignore_first_line=None,
    ignore_comments=None,
    lenient=None,
    empty_column_as_null=None,
):
    """
    Builds the Java ``CsvTableSource`` through its builder.

    :param source_path: The path of the CSV source.
    :param field_names: The field names, paired positionally with ``field_types``.
    :param field_types: The field data types, converted with ``_to_java_type``.
    :param field_delim: The field delimiter; not set when ``None``.
    :param line_delim: The line delimiter; not set when ``None``.
    :param quote_character: A single quote character; not set when ``None``.
    :param ignore_first_line: When truthy, the builder's ``ignoreFirstLine`` is called.
    :param ignore_comments: The comment prefix; not set when ``None``.
    :param lenient: When truthy, the builder's ``ignoreParseErrors`` is called.
    :param empty_column_as_null: When truthy, the builder's ``emptyColumnAsNull`` is
                                 called.
    :raises ValueError: If ``quote_character`` is given but is not exactly one character.
    """
    csv_builder = get_gateway().jvm.CsvTableSource.builder()
    csv_builder.path(source_path)

    for name, data_type in zip(field_names, field_types):
        csv_builder.field(name, _to_java_type(data_type))

    if field_delim is not None:
        csv_builder.fieldDelimiter(field_delim)

    if line_delim is not None:
        csv_builder.lineDelimiter(line_delim)

    if quote_character is not None:
        # The Java API takes a Character here. At time of writing, Py4J converts a
        # Python str to a Java Character by taking only its first character, so
        # without this check:
        # - a str longer than one character would be silently truncated, with no
        #   type error from either Py4J or the Java CsvTableSource;
        # - an empty str would fail inside Py4J with
        #   java.lang.StringIndexOutOfBoundsException.
        # Raising here gives a friendlier error for both cases.
        if len(quote_character) != 1:
            message = "Expected a single CSV quote character but got '{}'".format(
                quote_character)
            raise ValueError(message)
        csv_builder.quoteCharacter(quote_character)

    if ignore_first_line:
        csv_builder.ignoreFirstLine()

    if ignore_comments is not None:
        csv_builder.commentPrefix(ignore_comments)

    if lenient:
        csv_builder.ignoreParseErrors()

    if empty_column_as_null:
        csv_builder.emptyColumnAsNull()

    super(CsvTableSource, self).__init__(csv_builder.build())
def __init__(self, field_names, field_types):
    """
    Creates the Java ``org.apache.flink.python.connector.PrintTableSink`` and
    configures it with the given schema.

    :param field_names: The names of the fields.
    :param field_types: The data types of the fields, converted with ``_to_java_type``.
    """
    gw = get_gateway()
    unconfigured_sink = gw.jvm.org.apache.flink.python.connector.PrintTableSink()
    java_names = to_jarray(gw.jvm.String, field_names)
    java_types = to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(data_type) for data_type in field_types])
    # configure() returns the sink that carries the schema; that one is wrapped.
    super(PrintTableSink, self).__init__(
        unconfigured_sink.configure(java_names, java_types))
def test_multiset_type(self):
    """MULTISET types, including nested ones, survive a Python -> Java -> Python round trip."""
    bigint_multiset = DataTypes.MULTISET(DataTypes.BIGINT())
    string_multiset = DataTypes.MULTISET(DataTypes.STRING())
    nested_bigint_multiset = DataTypes.MULTISET(DataTypes.MULTISET(DataTypes.BIGINT()))
    nested_string_multiset = DataTypes.MULTISET(DataTypes.MULTISET(DataTypes.STRING()))
    test_types = [
        bigint_multiset,
        string_multiset,
        nested_bigint_multiset,
        nested_string_multiset,
    ]

    java_types = list(map(_to_java_type, test_types))
    round_tripped = list(map(_from_java_type, java_types))

    self.assertEqual(test_types, round_tripped)
def __init__(self, field_names=None, data_types=None, j_table_schema=None):
    """
    Wraps an existing Java ``TableSchema`` or builds a new one from names and types.

    :param field_names: The field names; only used when ``j_table_schema`` is ``None``.
    :param data_types: The field data types, converted with ``_to_java_type``; only used
                       when ``j_table_schema`` is ``None``.
    :param j_table_schema: An existing Java table schema to wrap as-is.
    """
    if j_table_schema is not None:
        self._j_table_schema = j_table_schema
        return

    gw = get_gateway()
    java_names = to_jarray(gw.jvm.String, field_names)
    java_types = to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(data_type) for data_type in data_types])
    self._j_table_schema = gw.jvm.TableSchema(java_names, java_types)
def test_list_view_type(self):
    """LIST_VIEW types survive a Python -> Java -> Python round trip."""
    element_types = (DataTypes.BIGINT(), DataTypes.STRING())
    test_types = [DataTypes.LIST_VIEW(element_type) for element_type in element_types]

    java_types = list(map(_to_java_type, test_types))
    round_tripped = list(map(_from_java_type, java_types))

    self.assertEqual(test_types, round_tripped)
def test_row_type(self):
    """A nested ROW type survives a Python -> Java -> Python round trip."""
    inner_row = DataTypes.ROW([DataTypes.FIELD("c", DataTypes.STRING())])
    outer_row = DataTypes.ROW([
        DataTypes.FIELD("a", DataTypes.INT()),
        DataTypes.FIELD("b", inner_row),
    ])
    test_types = [outer_row]

    java_types = list(map(_to_java_type, test_types))
    round_tripped = list(map(_from_java_type, java_types))

    self.assertEqual(test_types, round_tripped)
def _create_judf(self, serialized_func, j_input_types, j_function_kind):
    """
    Creates the Java ``PythonScalarFunction`` for an already serialized function.

    :param serialized_func: The pickled Python function.
    :param j_input_types: The Java input types (may be ``None``).
    :param j_function_kind: The Java ``PythonFunctionKind`` value.
    :return: The Java ``PythonScalarFunction`` object.
    """
    gw = get_gateway()
    java_result_type = _to_java_type(self._result_type)
    scalar_function_class = \
        gw.jvm.org.apache.flink.table.functions.python.PythonScalarFunction
    return scalar_function_class(
        self._name,
        bytearray(serialized_func),
        j_input_types,
        java_result_type,
        j_function_kind,
        self._deterministic,
        _get_python_env())
def test_array_type(self):
    """ARRAY types, including nested ones, survive a Python -> Java -> Python round trip."""
    # nullable/not_null flag will be lost during the conversion.
    bigint_array = DataTypes.ARRAY(DataTypes.BIGINT())
    second_bigint_array = DataTypes.ARRAY(DataTypes.BIGINT())
    string_array = DataTypes.ARRAY(DataTypes.STRING())
    nested_bigint_array = DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT()))
    nested_string_array = DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))
    test_types = [
        bigint_array,
        second_bigint_array,
        string_array,
        nested_bigint_array,
        nested_string_array,
    ]

    java_types = list(map(_to_java_type, test_types))
    round_tripped = list(map(_from_java_type, java_types))

    self.assertEqual(test_types, round_tripped)
def schema(self, schema_data_type):
    """
    Sets the schema using :class:`DataTypes`.

    JSON objects are represented as ROW types. The schema might be nested.

    :param schema_data_type: Data type that describes the schema.
    :return: This object.
    """
    java_schema_type = _to_java_type(schema_data_type)
    # The Java descriptor call returns the descriptor to keep using from now on.
    self._j_json = self._j_json.schema(java_schema_type)
    return self
def test_array_type(self):
    """ARRAY types, including a not-null element type, survive a Java round trip."""
    bigint_array = DataTypes.ARRAY(DataTypes.BIGINT())
    # array type with not null basic data type means primitive array
    primitive_bigint_array = DataTypes.ARRAY(DataTypes.BIGINT().not_null())
    string_array = DataTypes.ARRAY(DataTypes.STRING())
    nested_bigint_array = DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT()))
    nested_string_array = DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))
    test_types = [
        bigint_array,
        primitive_bigint_array,
        string_array,
        nested_bigint_array,
        nested_string_array,
    ]

    java_types = list(map(_to_java_type, test_types))
    round_tripped = list(map(_from_java_type, java_types))

    self.assertEqual(test_types, round_tripped)
def _to_flink_type_string(data_type):
    """
    Returns the type string for a data type, with three type names replaced by aliases.

    :param data_type: Either an ``AlinkDataType`` (its ``to_type_string()`` is used) or a
                      data type that is converted with ``_to_java_type`` and passed to the
                      Java ``FlinkTypeConverter.getTypeString``.
    :return: The type string, where ``TINYINT``, ``SMALLINT`` and ``VARCHAR`` are
             replaced by ``BYTE``, ``SHORT`` and ``STRING`` respectively.
    """
    converter_class = get_java_class(
        "com.alibaba.alink.operator.common.io.types.FlinkTypeConverter")

    if isinstance(data_type, AlinkDataType):
        raw_type_string = data_type.to_type_string()
    else:
        raw_type_string = converter_class.getTypeString(_to_java_type(data_type))

    aliases = {
        "TINYINT": "BYTE",
        "SMALLINT": "SHORT",
        "VARCHAR": "STRING",
    }
    # Type strings without an alias are returned unchanged.
    return aliases.get(raw_type_string, raw_type_string)
def _from_file(self, filename, schema):
    """
    Creates a :class:`Table` from a file of serialized Python objects.

    :param filename: The file handed to ``PythonBridgeUtils.createDataSetFromFile``.
    :param schema: The data type describing the rows, converted with ``_to_java_type``.
    :return: The resulting :class:`Table`.
    :raises NotImplementedError: If the class of the wrapped Java table environment
                                 equals ``TableEnvironmentImpl``.
    """
    gw = get_gateway()
    table_env_impl_class = get_java_class(
        gw.jvm.org.apache.flink.table.api.internal.TableEnvironmentImpl)

    # This class check is how the code detects the blink planner in batch mode.
    if table_env_impl_class == self._j_tenv.getClass():
        raise NotImplementedError(
            "The operation 'from_elements' in batch mode is currently "
            "not supported when using blink planner.")

    java_data_set = gw.jvm.PythonBridgeUtils.createDataSetFromFile(
        self._j_tenv.execEnv(), filename, True)
    java_table = gw.jvm.PythonTableUtils.fromDataSet(
        self._j_tenv, java_data_set, _to_java_type(schema))
    return Table(java_table)
def get_function_definition(f):
    """
    Returns the Java function definition to use for the given function wrapper.

    :param f: A user-defined function wrapper.
    :return: A Java ``TableFunctionDefinition`` for a
             ``UserDefinedTableFunctionWrapper``; otherwise the wrapper's Java
             user-defined function itself.
    """
    if not isinstance(f, UserDefinedTableFunctionWrapper):
        return f.java_user_defined_function()

    # TypeInference was not supported for TableFunction in the old planner.
    # Use TableFunctionDefinition to work around this issue.
    java_result_types = to_jarray(
        gateway.jvm.TypeInformation,
        [_to_java_type(result_type) for result_type in f._result_types])
    java_row_type = gateway.jvm.org.apache.flink.api.java.typeutils.RowTypeInfo(
        java_result_types)
    return gateway.jvm.org.apache.flink.table.functions.TableFunctionDefinition(
        'f', f.java_user_defined_function(), java_row_type)
def field(self, field_name, field_type):
    """
    Adds a format field with the field name and the data type or type string. Required.

    This method can be called multiple times. The call order of this method also
    defines the order of the fields in the format.

    :param field_name: The field name.
    :param field_type: The data type or type string of the field.
    :return: This :class:`OldCsv` object.
    """
    # Type strings are passed through as-is; data types are converted to Java first.
    java_field_type = field_type if isinstance(field_type, str) \
        else _to_java_type(field_type)
    self._j_csv = self._j_csv.field(field_name, java_field_type)
    return self
def field(self, field_name, field_type):
    """
    Adds a format field with the field name and the data type or type string. Required.

    This method can be called multiple times. The call order of this method also
    defines the order of the fields in the format.

    :param field_name: The field name.
    :param field_type: The data type or type string of the field.
    :return: This :class:`OldCsv` object.
    """
    # Type strings are passed through as-is; data types are converted to Java first.
    java_field_type = field_type if isinstance(field_type, (str, unicode)) \
        else _to_java_type(field_type)
    self._j_csv = self._j_csv.field(field_name, java_field_type)
    return self
def field(self, field_name, field_type):
    """
    Adds a field with the field name and the data type or type string. Required.

    This method can be called multiple times. The call order of this method also
    defines the order of the fields in a row. Here is a document that introduces the
    type strings:
    https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connect.html#type-strings

    :param field_name: The field name.
    :param field_type: The data type or type string of the field.
    :return: This schema object.
    """
    # Type strings are passed through as-is; data types are converted to Java first.
    java_field_type = field_type if isinstance(field_type, (str, unicode)) \
        else _to_java_type(field_type)
    self._j_schema = self._j_schema.field(field_name, java_field_type)
    return self
def field(self, field_name, field_type):
    """
    Adds a field with the field name and the data type or type string. Required.

    This method can be called multiple times. The call order of this method also
    defines the order of the fields in a row. Here is a document that introduces the
    type strings:
    https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connect.html#type-strings

    :param field_name: The field name.
    :param field_type: The data type or type string of the field.
    :return: This schema object.
    """
    # Type strings are passed through as-is; data types are converted to Java first.
    java_field_type = field_type if isinstance(field_type, str) \
        else _to_java_type(field_type)
    self._j_schema = self._j_schema.field(field_name, java_field_type)
    return self
def _create_judf(self, serialized_func, j_input_types, j_function_kind):
    """
    Creates the Java ``PythonTableFunction`` for an already serialized function.

    The declared result types are combined into a Java ``RowTypeInfo``.

    :param serialized_func: The pickled Python function.
    :param j_input_types: The Java input types (may be ``None``).
    :param j_function_kind: The Java ``PythonFunctionKind`` value.
    :return: The Java ``PythonTableFunction`` object.
    """
    gw = get_gateway()
    java_result_types = utils.to_jarray(
        gw.jvm.TypeInformation,
        [_to_java_type(result_type) for result_type in self._result_types])
    java_row_type = gw.jvm.org.apache.flink.api.java.typeutils.RowTypeInfo(
        java_result_types)
    table_function_class = \
        gw.jvm.org.apache.flink.table.functions.python.PythonTableFunction
    return table_function_class(
        self._name,
        bytearray(serialized_func),
        j_input_types,
        java_row_type,
        j_function_kind,
        self._deterministic,
        _get_python_env())