def for_schema(schema: 'CsvSchema') -> 'CsvReaderFormat':
    """
    Builds a :class:`CsvReaderFormat` using `CsvSchema`.

    The Java ``CsvReaderFormat`` is instantiated reflectively: its declared constructor is
    looked up by parameter types and made accessible before it is invoked.

    :param schema: The CSV schema; its ``_j_schema`` and ``_data_type`` attributes are read.
    :return: A :class:`CsvReaderFormat` wrapping the newly created Java object.
    """
    jvm = get_gateway().jvm
    jackson = jvm.org.apache.flink.shaded.jackson2.com.fasterxml.jackson
    flink = jvm.org.apache.flink

    # Locate the constructor by its exact parameter list and lift the access restriction.
    j_format_class = get_java_class(flink.formats.csv.CsvReaderFormat)
    j_parameter_types = to_jarray(jvm.Class, [
        get_java_class(jackson.dataformat.csv.CsvMapper),
        get_java_class(jackson.dataformat.csv.CsvSchema),
        get_java_class(jvm.Class),
        get_java_class(flink.formats.common.Converter),
        get_java_class(flink.api.common.typeinfo.TypeInformation),
        get_java_class(jvm.boolean),
    ])
    j_constructor = j_format_class.getDeclaredConstructor(j_parameter_types)
    j_constructor.setAccessible(True)

    # Build the constructor arguments in the same order as the parameter types above.
    j_mapper = jackson.dataformat.csv.CsvMapper()
    j_node_class = get_java_class(jackson.databind.JsonNode)
    j_converter = flink.formats.csv.CsvToRowDataConverters(False).createRowConverter(
        _to_java_data_type(schema._data_type).getLogicalType(), True)
    j_type_info = flink.table.runtime.typeutils.InternalTypeInfo.of(
        _to_java_data_type(schema._data_type).getLogicalType())
    j_arguments = to_jarray(jvm.Object, [
        j_mapper,
        schema._j_schema,
        j_node_class,
        j_converter,
        j_type_info,
        False,
    ])

    return CsvReaderFormat(j_constructor.newInstance(j_arguments))
def _create_judf(self, serialized_func, j_input_types, j_function_kind):
    """
    Creates the Java counterpart of this Python (table) aggregate function.

    :param serialized_func: The serialized Python function; it is passed to Java as a
                            ``bytearray``.
    :param j_input_types: Java input types, or None. When it is not None it is rebuilt from
                          ``self._input_types``.
    :param j_function_kind: The Java function kind, forwarded unchanged.
    :return: The Java aggregate function object.
    """
    if self._func_type == "pandas":
        from pyflink.table.types import DataTypes
        # For pandas functions the accumulator type is an array of the result type.
        self._accumulator_type = DataTypes.ARRAY(self._result_type)

    if j_input_types is not None:
        j_input_types = java_utils.to_jarray(
            get_gateway().jvm.DataType,
            [_to_java_data_type(input_type) for input_type in self._input_types])

    j_result_type = _to_java_data_type(self._result_type)
    j_accumulator_type = _to_java_data_type(self._accumulator_type)

    # Select the Java class depending on whether this is a table aggregate function.
    j_functions_package = get_gateway().jvm.org.apache.flink.table.functions.python
    if self._is_table_aggregate:
        j_function_class = j_functions_package.PythonTableAggregateFunction
    else:
        j_function_class = j_functions_package.PythonAggregateFunction

    return j_function_class(
        self._name,
        bytearray(serialized_func),
        j_input_types,
        j_result_type,
        j_accumulator_type,
        j_function_kind,
        self._deterministic,
        self._takes_row_as_input,
        _get_python_env())
def cast(self, data_type: DataType) -> 'Expression':
    """
    Converts a value to a given data type.

    e.g. lit("42").cast(DataTypes.INT()) leads to 42.

    :param data_type: The target data type; it is converted to its Java representation.
    :return: The expression produced by the "cast" binary operation.
    """
    cast_operation = _binary_op("cast")
    j_target_type = _to_java_data_type(data_type)
    return cast_operation(self, j_target_type)
def __init__(self, field_names, field_types, path, field_delimiter=',', num_files=-1,
             write_mode=None):
    """
    Creates the Java ``CsvTableSink`` and hands it to the parent initializer.

    :param field_names: The names of the sink fields.
    :param field_types: The data types of the sink fields.
    :param path: The output path, forwarded to the Java sink.
    :param field_delimiter: The field delimiter, forwarded to the Java sink.
    :param num_files: The number of files, forwarded to the Java sink.
    :param write_mode: ``WriteMode.NO_OVERWRITE``, ``WriteMode.OVERWRITE`` or None.
    :raises Exception: If ``write_mode`` is none of the supported values.
    """
    gateway = get_gateway()

    # Translate the Python write mode into the Java enum constant (None is passed through).
    if write_mode is None:
        j_write_mode = None
    elif write_mode == WriteMode.NO_OVERWRITE:
        j_write_mode = gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.NO_OVERWRITE
    elif write_mode == WriteMode.OVERWRITE:
        j_write_mode = gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE
    else:
        raise Exception('Unsupported write_mode: %s' % write_mode)

    j_names = java_utils.to_jarray(gateway.jvm.String, field_names)
    j_types = java_utils.to_jarray(
        gateway.jvm.DataType,
        [_to_java_data_type(one_type) for one_type in field_types])

    j_sink = gateway.jvm.CsvTableSink(
        path, field_delimiter, num_files, j_write_mode, j_names, j_types)
    super(CsvTableSink, self).__init__(j_sink)
def _java_user_defined_function(self):
    """
    Returns the Java user-defined function of this wrapper, creating it on first use and
    caching it in ``self._judf_placeholder``.

    :return: The Java function object produced by ``self._create_judf``.
    :raises TypeError: If ``self._func_type`` is neither ``"general"`` nor ``"pandas"``.
    """
    # Fast path: the Java function was already created by an earlier call.
    if self._judf_placeholder is not None:
        return self._judf_placeholder

    gateway = get_gateway()

    j_input_types = None
    if self._input_types is not None:
        j_input_types = java_utils.to_jarray(
            gateway.jvm.DataType,
            [_to_java_data_type(input_type) for input_type in self._input_types])

    # Map the Python function type onto the Java PythonFunctionKind enum.
    JPythonFunctionKind = gateway.jvm.org.apache.flink.table.functions.python.PythonFunctionKind
    if self._func_type == "general":
        j_function_kind = JPythonFunctionKind.GENERAL
    elif self._func_type == "pandas":
        j_function_kind = JPythonFunctionKind.PANDAS
    else:
        raise TypeError("Unsupported func_type: %s." % self._func_type)

    # Anything that is not already a UserDefinedFunction is wrapped in a delegate first.
    if isinstance(self._func, UserDefinedFunction):
        func = self._func
    else:
        func = self._create_delegate_function()

    import cloudpickle
    serialized_func = cloudpickle.dumps(func)

    self._judf_placeholder = self._create_judf(serialized_func, j_input_types, j_function_kind)
    return self._judf_placeholder
def get_type_info(self):
    """
    Returns the type information derived from ``self._row_type``.

    The value is computed on the first call through the Java
    ``LegacyTypeInfoDataTypeConverter`` and cached in ``self._type_info`` afterwards.

    :return: The Python type information obtained via ``_from_java_type``.
    """
    if self._type_info is not None:
        return self._type_info

    j_converter = get_gateway().jvm \
        .org.apache.flink.table.types.utils.LegacyTypeInfoDataTypeConverter
    j_legacy_type_info = j_converter.toLegacyTypeInfo(_to_java_data_type(self._row_type))
    self._type_info = _from_java_type(j_legacy_type_info)
    return self._type_info
def _write_row_data_to_parquet_file(path: str, row_type: RowType, rows: List[Row]):
    """
    Writes the given rows into a Parquet file at ``path`` using Flink's Java Parquet writer.

    :param path: The local file path to write to.
    :param row_type: The row type describing the rows; it is converted to a Java data type.
    :param rows: The rows to write. Each row is converted with a Java ``RowRowConverter``
                 before being added to the bulk writer.
    """
    jvm = get_gateway().jvm
    flink = jvm.org.apache.flink

    # The Java data type is needed for both the writer factory and the converter.
    j_data_type = _to_java_data_type(row_type)

    j_output_stream = flink.core.fs.local.LocalDataOutputStream(jvm.java.io.File(path))
    try:
        j_bulk_writer = flink.formats.parquet.row.ParquetRowDataBuilder.createWriterFactory(
            j_data_type.getLogicalType(),
            create_hadoop_configuration(Configuration()),
            True,
        ).create(j_output_stream)

        row_row_converter = flink.table.data.conversion.RowRowConverter.create(j_data_type)
        row_row_converter.open(row_row_converter.getClass().getClassLoader())

        for row in rows:
            j_bulk_writer.addElement(
                row_row_converter.toInternal(to_java_data_structure(row)))
        j_bulk_writer.finish()
    finally:
        # BulkWriter.finish() is documented not to close the stream it writes to, so the
        # stream has to be closed here to release the file handle, also on failure.
        j_output_stream.close()
def apply(self, ds):
    """
    Returns ``ds`` unchanged when ``_check_if_row_data_type(ds)`` is true; otherwise returns
    a new ``DataStream`` whose Java stream is processed by a ``RowRowMapper``.

    NOTE(review): ``row_type`` is not a parameter of this method; it is presumably bound in
    an enclosing scope that is not visible here - confirm.

    :param ds: The input data stream.
    :return: ``ds`` itself or a new ``DataStream`` wrapping the processed Java stream.
    """
    jvm = get_gateway().jvm
    if not _check_if_row_data_type(ds):
        j_row_mapper = jvm.org.apache.flink.python.util.PythonConnectorUtils.RowRowMapper(
            _to_java_data_type(row_type))
        j_processed_stream = ds._j_data_stream.process(j_row_mapper)
        return DataStream(j_processed_stream)
    return ds
def for_schema(schema: 'CsvSchema') -> 'BulkWriterFactory':
    """
    Builds a :class:`BulkWriterFactory` for writing records to files in CSV format.

    :param schema: The CSV schema; its ``_j_schema`` and ``_row_type`` attributes are read.
    :return: A ``RowDataBulkWriterFactory`` wrapping the Java factory and the row type.
    """
    j_csv_utils = get_gateway().jvm.org.apache.flink.formats.csv.PythonCsvUtils
    j_row_data_type = _to_java_data_type(schema._row_type)
    j_writer_factory = j_csv_utils.createCsvBulkWriterFactory(schema._j_schema, j_row_data_type)
    return RowDataBulkWriterFactory(j_writer_factory, schema._row_type)
def __init__(self, field_names, field_types):
    """
    Creates the Java testing sink for the given fields and passes it to the parent
    initializer.

    NOTE(review): the Java class instantiated here is ``TestingSinks.TestAppendingSink``
    although this initializer belongs to ``TestRetractSink`` - confirm this is intended.

    :param field_names: The names of the sink fields.
    :param field_types: The data types of the sink fields.
    """
    TestTableSink._ensure_initialized()

    gateway = get_gateway()
    j_names = java_utils.to_jarray(gateway.jvm.String, field_names)
    j_types = java_utils.to_jarray(
        gateway.jvm.DataType,
        [_to_java_data_type(one_type) for one_type in field_types])

    j_testing_sinks = gateway.jvm.org.apache.flink.table.utils.TestingSinks
    j_sink = j_testing_sinks.TestAppendingSink(j_names, j_types)
    super(TestRetractSink, self).__init__(j_sink)
def __init__(
    self,
    source_path,
    field_names,
    field_types,
    field_delim=None,
    line_delim=None,
    quote_character=None,
    ignore_first_line=None,
    ignore_comments=None,
    lenient=None,
    empty_column_as_null=None,
):
    """
    Configures a Java ``CsvTableSource`` through its builder and passes the built source to
    the parent initializer.

    :param source_path: The path of the CSV source.
    :param field_names: The field names; paired with ``field_types`` by position.
    :param field_types: The field data types.
    :param field_delim: Optional field delimiter.
    :param line_delim: Optional line delimiter.
    :param quote_character: Optional quote character; must be exactly one character long.
    :param ignore_first_line: If truthy, the first line is ignored.
    :param ignore_comments: Optional comment prefix.
    :param lenient: If truthy, parse errors are ignored.
    :param empty_column_as_null: If truthy, empty columns are treated as null.
    :raises ValueError: If ``quote_character`` is given and is not a single character.
    """
    j_builder = get_gateway().jvm.CsvTableSource.builder()
    j_builder.path(source_path)

    for name, data_type in zip(field_names, field_types):
        j_builder.field(name, _to_java_data_type(data_type))

    if field_delim is not None:
        j_builder.fieldDelimiter(field_delim)

    if line_delim is not None:
        j_builder.lineDelimiter(line_delim)

    if quote_character is not None:
        # The Java builder takes a Character here. At the time of writing, Py4J converts a
        # Python str to a Java Character by keeping only its first character, so:
        # - a str longer than one character would be truncated silently, without any type
        #   error from Py4J or the Java CsvTableSource;
        # - an empty str would fail inside Py4J with
        #   java.lang.StringIndexOutOfBoundsException.
        # Rejecting both cases here gives a friendlier error.
        if len(quote_character) != 1:
            raise ValueError(
                "Expected a single CSV quote character but got '{}'".format(quote_character)
            )
        j_builder.quoteCharacter(quote_character)

    if ignore_first_line:
        j_builder.ignoreFirstLine()

    if ignore_comments is not None:
        j_builder.commentPrefix(ignore_comments)

    if lenient:
        j_builder.ignoreParseErrors()

    if empty_column_as_null:
        j_builder.emptyColumnAsNull()

    super(CsvTableSource, self).__init__(j_builder.build())
def for_row_type(row_type: RowType,
                 writer_properties: Optional[Configuration] = None,
                 hadoop_config: Optional[Configuration] = None) \
        -> BulkWriterFactory:
    """
    Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into
    Orc files in a batch fashion.

    Example:
    ::

        >>> row_type = DataTypes.ROW([
        ...     DataTypes.FIELD('string', DataTypes.STRING()),
        ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
        ... ])
        >>> row_type_info = Types.ROW_NAMED(
        ...     ['string', 'int_array'],
        ...     [Types.STRING(), Types.LIST(Types.INT())]
        ... )
        >>> sink = FileSink.for_bulk_format(
        ...     OUTPUT_DIR, OrcBulkWriters.for_row_type(
        ...         row_type=row_type,
        ...         writer_properties=Configuration(),
        ...         hadoop_config=Configuration(),
        ...     )
        ... ).build()
        >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

    Note that in the above example, an identity map to indicate its RowTypeInfo is necessary
    before ``sink_to`` when ``ds`` is a source stream producing **RowData** records,
    because RowDataBulkWriterFactory assumes the input record type is Row.

    :param row_type: The row type of the records to write.
    :param writer_properties: Optional writer properties; an empty ``Configuration`` is used
                              when None.
    :param hadoop_config: Optional Hadoop configuration; an empty ``Configuration`` is used
                          when None.
    :return: A ``RowDataBulkWriterFactory`` wrapping the Java ``OrcBulkWriterFactory``.
    :raises TypeError: If ``row_type`` is not an instance of ``RowType``.
    """
    if not isinstance(row_type, RowType):
        raise TypeError('row_type must be an instance of RowType')

    j_data_type = _to_java_data_type(row_type)
    jvm = get_gateway().jvm
    j_logical_row_type = j_data_type.getLogicalType()

    # The vectorizer needs the field types as a Java array and the ORC schema as a string.
    j_field_types = to_jarray(
        jvm.org.apache.flink.table.types.logical.LogicalType,
        list(j_logical_row_type.getChildren()))
    j_orc_schema = jvm.org.apache.flink.orc.OrcSplitReaderUtil.logicalTypeToOrcType(
        j_logical_row_type)

    writer_properties = Configuration() if writer_properties is None else writer_properties
    hadoop_config = Configuration() if hadoop_config is None else hadoop_config

    j_vectorizer = jvm.org.apache.flink.orc.vector.RowDataVectorizer(
        j_orc_schema.toString(), j_field_types)
    j_writer_factory = jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
        j_vectorizer,
        create_java_properties(writer_properties),
        create_hadoop_configuration(hadoop_config))
    return RowDataBulkWriterFactory(j_writer_factory, row_type)
def for_schema(schema: 'CsvSchema') -> 'CsvReaderFormat':
    """
    Builds a :class:`CsvReaderFormat` using `CsvSchema`.

    :param schema: The CSV schema; its ``_j_schema`` and ``_data_type`` attributes are read.
    :return: A :class:`CsvReaderFormat` wrapping the Java reader format created by
             ``CsvReaderFormatFactory``.
    """
    j_format_factory = get_gateway().jvm.org.apache.flink.formats.csv.CsvReaderFormatFactory
    j_schema_data_type = _to_java_data_type(schema._data_type)
    j_reader_format = j_format_factory.createCsvReaderFormat(
        schema._j_schema, j_schema_data_type)
    return CsvReaderFormat(j_reader_format)
def __init__(self,
             hadoop_config: Configuration,
             row_type: RowType,
             batch_size: int,
             is_utc_timestamp: bool,
             is_case_sensitive: bool):
    """
    Creates the Java ``ParquetColumnarRowInputFormat`` and passes it to the parent
    initializer.

    :param hadoop_config: The Hadoop configuration; converted via
                          ``self._create_hadoop_configuration``.
    :param row_type: The row type to read; its Java logical type is used.
    :param batch_size: Forwarded to the Java input format.
    :param is_utc_timestamp: Forwarded to the Java input format.
    :param is_case_sensitive: Forwarded to the Java input format.
    """
    flink = get_gateway().jvm.org.apache.flink

    j_logical_type = _to_java_data_type(row_type).getLogicalType()
    j_produced_type_info = flink.table.runtime.typeutils.InternalTypeInfo.of(j_logical_type)

    j_input_format = flink.formats.parquet.ParquetColumnarRowInputFormat(
        self._create_hadoop_configuration(hadoop_config),
        j_logical_type,
        j_produced_type_info,
        batch_size,
        is_utc_timestamp,
        is_case_sensitive)
    super().__init__(j_input_format)
def for_schema(schema: 'CsvSchema') -> 'CsvReaderFormat':
    """
    Builds a :class:`CsvReaderFormat` using `CsvSchema`.

    :param schema: The CSV schema; its ``_j_schema`` and ``_row_type`` attributes are read.
    :return: A :class:`CsvReaderFormat` wrapping the Java reader format created by
             ``PythonCsvUtils``.
    """
    from pyflink.table.types import _to_java_data_type

    j_csv_utils = get_gateway().jvm.org.apache.flink.formats.csv.PythonCsvUtils
    j_row_data_type = _to_java_data_type(schema._row_type)
    j_reader_format = j_csv_utils.createCsvReaderFormat(schema._j_schema, j_row_data_type)
    return CsvReaderFormat(j_reader_format)
def from_fields(self,
                field_names: List[str],
                field_data_types: List[DataType]) -> 'Schema.Builder':
    """
    Adopts the given field names and field data types as physical columns of the schema.

    :param field_names: The names of the fields.
    :param field_data_types: The data types of the fields.
    :return: This builder.
    """
    jvm = get_gateway().jvm
    j_names = to_jarray(jvm.String, field_names)
    j_types = to_jarray(
        jvm.AbstractDataType,
        [_to_java_data_type(data_type) for data_type in field_data_types])
    self._j_builder.fromFields(j_names, j_types)
    return self
def for_schema(schema: 'CsvSchema') -> 'BulkWriterFactory':
    """
    Creates a :class:`~pyflink.common.serialization.BulkWriterFactory` for writing records
    to files in CSV format.

    :param schema: The CSV schema; its ``_j_schema`` and ``_row_type`` attributes are read.
    :return: A ``RowDataBulkWriterFactory`` wrapping the Java factory and the row type.
    """
    from pyflink.table.types import _to_java_data_type

    j_csv_utils = get_gateway().jvm.org.apache.flink.formats.csv.PythonCsvUtils
    j_row_data_type = _to_java_data_type(schema._row_type)
    j_writer_factory = j_csv_utils.createCsvBulkWriterFactory(schema._j_schema, j_row_data_type)
    return RowDataBulkWriterFactory(j_writer_factory, schema._row_type)
def test_map_view_type(self):
    """
    Checks that MAP_VIEW types are unchanged after converting them to Java and back.
    """
    expected_types = [
        DataTypes.MAP_VIEW(DataTypes.STRING(), DataTypes.BIGINT()),
        DataTypes.MAP_VIEW(DataTypes.INT(), DataTypes.STRING()),
    ]

    # Convert everything to Java first, then convert all Java types back to Python.
    j_types = []
    for python_type in expected_types:
        j_types.append(_to_java_data_type(python_type))

    round_tripped_types = []
    for j_type in j_types:
        round_tripped_types.append(_from_java_data_type(j_type))

    self.assertEqual(expected_types, round_tripped_types)
def _create_judf(self, serialized_func, j_input_types, j_function_kind):
    """
    Creates the Java ``PythonScalarFunction`` counterpart of this Python scalar function.

    :param serialized_func: The serialized Python function; it is passed to Java as a
                            ``bytearray``.
    :param j_input_types: The Java input types, forwarded unchanged.
    :param j_function_kind: The Java function kind, forwarded unchanged.
    :return: The Java scalar function object.
    """
    jvm = get_gateway().jvm
    j_result_type = _to_java_data_type(self._result_type)
    j_scalar_function_class = jvm.org.apache.flink.table.functions.python.PythonScalarFunction
    return j_scalar_function_class(
        self._name,
        bytearray(serialized_func),
        j_input_types,
        j_result_type,
        j_function_kind,
        self._deterministic,
        self._takes_row_as_input,
        _get_python_env())
def __init__(self,
             field_names: List[str] = None,
             data_types: List[DataType] = None,
             j_table_schema=None):
    """
    Wraps an existing Java table schema or builds a new one from field names and types.

    :param field_names: The field names; only used when ``j_table_schema`` is None.
    :param data_types: The field data types; only used when ``j_table_schema`` is None.
    :param j_table_schema: An existing Java ``TableSchema`` to wrap. When given, the other
                           two arguments are ignored.
    """
    if j_table_schema is not None:
        self._j_table_schema = j_table_schema
        return

    jvm = get_gateway().jvm
    j_names = to_jarray(jvm.String, field_names)
    j_types = to_jarray(
        jvm.DataType,
        [_to_java_data_type(data_type) for data_type in data_types])
    j_schema_builder = jvm.TableSchema.builder()
    self._j_table_schema = j_schema_builder.fields(j_names, j_types).build()
def test_multiset_type(self):
    """
    Checks that MULTISET types, including nested ones, are unchanged after converting them
    to Java and back.
    """
    expected_types = [
        DataTypes.MULTISET(DataTypes.BIGINT()),
        DataTypes.MULTISET(DataTypes.STRING()),
        DataTypes.MULTISET(DataTypes.MULTISET(DataTypes.BIGINT())),
        DataTypes.MULTISET(DataTypes.MULTISET(DataTypes.STRING())),
    ]

    # Convert everything to Java first, then convert all Java types back to Python.
    j_types = []
    for python_type in expected_types:
        j_types.append(_to_java_data_type(python_type))

    round_tripped_types = []
    for j_type in j_types:
        round_tripped_types.append(_from_java_data_type(j_type))

    self.assertEqual(expected_types, round_tripped_types)
def lit(v, data_type: DataType = None) -> Expression:
    """
    Creates a SQL literal.

    The data type is derived from the object's class and its value. For example, `lit(12)`
    leads to `INT`, `lit("abc")` leads to `CHAR(3)`.

    Example:
    ::

        >>> tab.select(col("key"), lit("abc"))

    :param v: The literal value.
    :param data_type: Optional explicit data type. When given, it is converted to Java and
                      passed along with the value.
    :return: The literal expression.
    """
    if data_type is not None:
        j_literal_type = _to_java_data_type(data_type)
        return _binary_op("lit", v, j_literal_type)
    return _unary_op("lit", v)
def field(self, field_name: str, field_type: Union[DataType, str]) -> 'Schema':
    """
    Adds a field with the field name and the data type or type string. Required.
    This method can be called multiple times. The call order of this method defines
    also the order of the fields in a row. Here is a document that introduces the type
    strings:
    https://nightlies.apache.org/flink/flink-docs-stable/dev/table/connect.html#type-strings

    :param field_name: The field name.
    :param field_type: The data type or type string of the field.
    :return: This schema object.
    """
    # Type strings are handed to Java as they are; DataType objects are converted first.
    if isinstance(field_type, str):
        j_field_type = field_type
    else:
        j_field_type = _to_java_data_type(field_type)
    self._j_schema = self._j_schema.field(field_name, j_field_type)
    return self
def _create_judf(self, serialized_func, j_input_types, j_function_kind):
    """
    Creates the Java ``PythonTableFunction`` counterpart of this Python table function.

    The Java result type is a ROW built from ``self._result_types``.

    :param serialized_func: The serialized Python function; it is passed to Java as a
                            ``bytearray``.
    :param j_input_types: The Java input types, forwarded unchanged.
    :param j_function_kind: The Java function kind, forwarded unchanged.
    :return: The Java table function object.
    """
    jvm = get_gateway().jvm
    j_field_types = java_utils.to_jarray(
        jvm.DataType,
        [_to_java_data_type(result_type) for result_type in self._result_types])
    j_row_result_type = jvm.DataTypes.ROW(j_field_types)
    j_table_function_class = jvm.org.apache.flink.table.functions.python.PythonTableFunction
    return j_table_function_class(
        self._name,
        bytearray(serialized_func),
        j_input_types,
        j_row_result_type,
        j_function_kind,
        self._deterministic,
        self._takes_row_as_input,
        _get_python_env())
def test_row_type(self):
    """
    Checks that a ROW type with a nested ROW field is unchanged after converting it to Java
    and back.
    """
    inner_row = DataTypes.ROW([DataTypes.FIELD("c", DataTypes.STRING())])
    expected_types = [
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.INT()),
            DataTypes.FIELD("b", inner_row),
        ]),
    ]

    # Convert everything to Java first, then convert all Java types back to Python.
    j_types = []
    for python_type in expected_types:
        j_types.append(_to_java_data_type(python_type))

    round_tripped_types = []
    for j_type in j_types:
        round_tripped_types.append(_from_java_data_type(j_type))

    self.assertEqual(expected_types, round_tripped_types)
def test_array_type(self):
    """
    Checks that ARRAY types, including nested ones, are unchanged after converting them to
    Java and back.
    """
    # nullable/not_null flag will be lost during the conversion.
    expected_types = [
        DataTypes.ARRAY(DataTypes.BIGINT()),
        DataTypes.ARRAY(DataTypes.BIGINT()),
        DataTypes.ARRAY(DataTypes.STRING()),
        DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
        DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING())),
    ]

    # Convert everything to Java first, then convert all Java types back to Python.
    j_types = []
    for python_type in expected_types:
        j_types.append(_to_java_data_type(python_type))

    round_tripped_types = []
    for j_type in j_types:
        round_tripped_types.append(_from_java_data_type(j_type))

    self.assertEqual(expected_types, round_tripped_types)
def __init__(self,
             row_type: 'RowType',
             hadoop_config: Optional[Configuration] = None,
             batch_size: int = 2048,
             is_utc_timestamp: bool = False,
             is_case_sensitive: bool = True):
    """
    Creates the Java ``ParquetColumnarRowInputFormat`` and passes it to the parent
    initializer.

    :param row_type: The row type to read; its Java logical type is used.
    :param hadoop_config: Optional Hadoop configuration; a fresh ``Configuration`` is used
                          when the argument is falsy.
    :param batch_size: Forwarded to the Java input format.
    :param is_utc_timestamp: Forwarded to the Java input format.
    :param is_case_sensitive: Forwarded to the Java input format.
    """
    if not hadoop_config:
        hadoop_config = Configuration()

    from pyflink.table.types import _to_java_data_type

    flink = get_gateway().jvm.org.apache.flink
    j_logical_type = _to_java_data_type(row_type).getLogicalType()
    j_produced_type_info = flink.table.runtime.typeutils.InternalTypeInfo.of(j_logical_type)

    j_input_format = flink.formats.parquet.ParquetColumnarRowInputFormat(
        create_hadoop_configuration(hadoop_config),
        j_logical_type,
        j_produced_type_info,
        batch_size,
        is_utc_timestamp,
        is_case_sensitive)
    super().__init__(j_input_format)
def for_row_type(row_type: RowType,
                 hadoop_config: Optional[Configuration] = None,
                 utc_timestamp: bool = False) -> 'BulkWriterFactory':
    """
    Create a RowDataBulkWriterFactory that writes Rows records with a defined RowType into
    Parquet files in a batch fashion.

    Example:
    ::

        >>> row_type = DataTypes.ROW([
        ...     DataTypes.FIELD('string', DataTypes.STRING()),
        ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
        ... ])
        >>> row_type_info = Types.ROW_NAMED(
        ...     ['string', 'int_array'],
        ...     [Types.STRING(), Types.LIST(Types.INT())]
        ... )
        >>> sink = FileSink.for_bulk_format(
        ...     OUTPUT_DIR, ParquetBulkWriter.for_row_type(
        ...         row_type,
        ...         hadoop_config=Configuration(),
        ...         utc_timestamp=True,
        ...     )
        ... ).build()
        >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

    Note that in the above example, an identity map to indicate its RowTypeInfo is necessary
    before ``sink_to`` when ``ds`` is a source stream producing **RowData** records,
    because RowDataBulkWriterFactory assumes the input record type is **Row** .

    :param row_type: The row type of the records to write.
    :param hadoop_config: Optional Hadoop configuration; a fresh ``Configuration`` is used
                          when the argument is falsy.
    :param utc_timestamp: Forwarded to the Java writer factory.
    :return: A ``RowDataBulkWriterFactory`` wrapping the Java writer factory.
    """
    if not hadoop_config:
        hadoop_config = Configuration()

    j_row_data_builder = get_gateway().jvm \
        .org.apache.flink.formats.parquet.row.ParquetRowDataBuilder
    j_writer_factory = j_row_data_builder.createWriterFactory(
        _to_java_data_type(row_type).getLogicalType(),
        create_hadoop_configuration(hadoop_config),
        utc_timestamp)
    return RowDataBulkWriterFactory(j_writer_factory, row_type)
def column(self, column_name: str, data_type: Union[str, DataType]) -> 'Schema.Builder':
    """
    Declares a physical column that is appended to this schema.

    Physical columns are regular columns known from databases. They define the names, the
    types, and the order of fields in the physical data. Thus, physical columns represent
    the payload that is read from and written to an external system. Connectors and formats
    use these columns (in the defined order) to configure themselves. Other kinds of columns
    can be declared between physical columns but will not influence the final physical
    schema.

    :param column_name: Column name
    :param data_type: Data type of the column, either a type string or a ``DataType``
    :return: This builder.
    """
    # Type strings are handed to Java as they are; anything else is converted first.
    if isinstance(data_type, str):
        j_column_type = data_type
    else:
        j_column_type = _to_java_data_type(data_type)
    self._j_builder.column(column_name, j_column_type)
    return self
def column_by_metadata(self,
                       column_name: str,
                       data_type: Union[DataType, str],
                       metadata_key: str = None,
                       is_virtual: bool = False) -> 'Schema.Builder':
    """
    Declares a metadata column that is appended to this schema.

    Metadata columns allow to access connector and/or format specific fields for every row
    of a table. For example, a metadata column can be used to read and write the timestamp
    from and to Kafka records for time-based operations. The connector and format
    documentation lists the available metadata fields for every component.

    Every metadata field is identified by a string-based key and has a documented data
    type. The metadata key can be omitted if the column name should be used as the
    identifying metadata key. For convenience, the runtime will perform an explicit cast if
    the data type of the column differs from the data type of the metadata field. Of course,
    this requires that the two data types are compatible.

    By default, a metadata column can be used for both reading and writing. However, in
    many cases an external system provides more read-only metadata fields than writable
    fields. Therefore, it is possible to exclude metadata columns from persisting by setting
    the ``is_virtual`` flag to ``True``.

    :param column_name: Column name
    :param data_type: Data type of the column
    :param metadata_key: Identifying metadata key, if null the column name will be used as
                         metadata key
    :param is_virtual: Whether the column should be persisted or not
    :return: This builder.
    """
    # DataType objects are converted to Java; any other value is handed over unchanged.
    if isinstance(data_type, DataType):
        j_column_type = _to_java_data_type(data_type)
    else:
        j_column_type = data_type
    self._j_builder.columnByMetadata(column_name, j_column_type, metadata_key, is_virtual)
    return self