Example #1
    def for_row_type(row_type: RowType,
                     writer_properties: Optional[Configuration] = None,
                     hadoop_config: Optional[Configuration] = None) \
            -> BulkWriterFactory:
        """
        Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into Orc
        files in a batch fashion.

        Example:
        ::

            >>> row_type = DataTypes.ROW([
            ...     DataTypes.FIELD('string', DataTypes.STRING()),
            ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
            ... ])
            >>> row_type_info = Types.ROW_NAMED(
            ...     ['string', 'int_array'],
            ...     [Types.STRING(), Types.LIST(Types.INT())]
            ... )
            >>> sink = FileSink.for_bulk_format(
            ...     OUTPUT_DIR, OrcBulkWriters.for_row_type(
            ...         row_type=row_type,
            ...         writer_properties=Configuration(),
            ...         hadoop_config=Configuration(),
            ...     )
            ... ).build()
            >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

        Note that in the above example, the identity map before ``sink_to`` is necessary to
        declare the stream's RowTypeInfo when ``ds`` is a source stream producing **RowData**
        records, because RowDataBulkWriterFactory assumes that the input record type is Row.
        """
        if not isinstance(row_type, RowType):
            raise TypeError('row_type must be an instance of RowType')

        j_data_type = _to_java_data_type(row_type)
        jvm = get_gateway().jvm
        j_row_type = j_data_type.getLogicalType()
        orc_types = to_jarray(
            jvm.org.apache.flink.table.types.logical.LogicalType,
            [i for i in j_row_type.getChildren()])
        type_description = jvm.org.apache.flink.orc \
            .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
        if writer_properties is None:
            writer_properties = Configuration()
        if hadoop_config is None:
            hadoop_config = Configuration()

        return RowDataBulkWriterFactory(
            jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
                jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                    type_description.toString(), orc_types),
                create_java_properties(writer_properties),
                create_hadoop_configuration(hadoop_config)), row_type)
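
For orientation, here is a minimal, hedged end-to-end sketch of how this factory can be wired into a DataStream job. The import path for ``OrcBulkWriters`` (``pyflink.datastream.formats.orc``), the bounded in-memory source, and the ``/tmp`` output directory are assumptions for illustration only; writing ORC also requires the Flink ORC jars on the classpath.

from pyflink.common import Row
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.file_system import FileSink
from pyflink.datastream.formats.orc import OrcBulkWriters
from pyflink.table import DataTypes

env = StreamExecutionEnvironment.get_execution_environment()

row_type = DataTypes.ROW([
    DataTypes.FIELD('string', DataTypes.STRING()),
    DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT())),
])
row_type_info = Types.ROW_NAMED(
    ['string', 'int_array'],
    [Types.STRING(), Types.LIST(Types.INT())],
)

# A bounded in-memory source producing Row records with the matching RowTypeInfo.
ds = env.from_collection(
    [Row('a', [1, 2]), Row('b', [3, 4])],
    type_info=row_type_info,
)

sink = FileSink.for_bulk_format(
    '/tmp/orc-output',  # hypothetical output directory
    OrcBulkWriters.for_row_type(row_type=row_type),
).build()

# Since the source already carries the Row type info, no identity map is needed here;
# the map shown in the docstring is only required when the upstream produces RowData.
ds.sink_to(sink)
env.execute('orc_bulk_writer_example')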
Example #2
def _write_row_data_to_parquet_file(path: str, row_type: RowType, rows: List[Row]):
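    # Converts each Row to Flink's internal RowData representation and writes all of
    # them into a single Parquet file at ``path`` via the Java ParquetRowDataBuilder.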
    jvm = get_gateway().jvm
    flink = jvm.org.apache.flink

    j_output_stream = flink.core.fs.local.LocalDataOutputStream(jvm.java.io.File(path))
    j_bulk_writer = flink.formats.parquet.row.ParquetRowDataBuilder.createWriterFactory(
        _to_java_data_type(row_type).getLogicalType(),
        create_hadoop_configuration(Configuration()),
        True,
    ).create(j_output_stream)
    row_row_converter = flink.table.data.conversion.RowRowConverter.create(
        _to_java_data_type(row_type)
    )
    row_row_converter.open(row_row_converter.getClass().getClassLoader())
    for row in rows:
        j_bulk_writer.addElement(row_row_converter.toInternal(to_java_data_structure(row)))
    j_bulk_writer.finish()
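
A short usage sketch for this helper follows; the field names, values, and the ``/tmp`` path are illustrative assumptions and not part of the original code.

from pyflink.common import Row
from pyflink.table import DataTypes

row_type = DataTypes.ROW([
    DataTypes.FIELD('name', DataTypes.STRING()),
    DataTypes.FIELD('score', DataTypes.INT()),
])
rows = [Row('alice', 1), Row('bob', 2)]

# Writes both rows into a single local Parquet file.
_write_row_data_to_parquet_file('/tmp/rows.parquet', row_type, rows)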
Example #3
    def __init__(self,
                 row_type: RowType,
                 hadoop_config: Optional[Configuration] = None,
                 batch_size: int = 2048,
                 is_utc_timestamp: bool = False,
                 is_case_sensitive: bool = True):
        if not hadoop_config:
            hadoop_config = Configuration()

        jvm = get_gateway().jvm
        j_row_type = _to_java_data_type(row_type).getLogicalType()
        produced_type_info = jvm.org.apache.flink.table.runtime.typeutils. \
            InternalTypeInfo.of(j_row_type)
        j_parquet_columnar_format = jvm.org.apache.flink.formats.parquet. \
            ParquetColumnarRowInputFormat(create_hadoop_configuration(hadoop_config),
                                          j_row_type, produced_type_info, batch_size,
                                          is_utc_timestamp, is_case_sensitive)
        super().__init__(j_parquet_columnar_format)
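
Assuming the class above is PyFlink's ParquetColumnarRowInputFormat (a BulkFormat wrapper), it is typically handed to a FileSource to read Parquet files back into a DataStream. The sketch below is illustrative only; the import paths, input path, and watermark strategy are assumptions.

from pyflink.common import Configuration, WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.file_system import FileSource
from pyflink.datastream.formats.parquet import ParquetColumnarRowInputFormat
from pyflink.table import DataTypes

env = StreamExecutionEnvironment.get_execution_environment()

row_type = DataTypes.ROW([
    DataTypes.FIELD('name', DataTypes.STRING()),
    DataTypes.FIELD('score', DataTypes.INT()),
])

source = FileSource.for_bulk_file_format(
    ParquetColumnarRowInputFormat(
        row_type=row_type,
        hadoop_config=Configuration(),
        batch_size=2048,
        is_utc_timestamp=False,
        is_case_sensitive=True,
    ),
    '/tmp/rows.parquet',  # hypothetical input path
).build()

ds = env.from_source(source, WatermarkStrategy.no_watermarks(), 'parquet-source')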
Example #4
    def for_row_type(row_type: RowType,
                     hadoop_config: Optional[Configuration] = None,
                     utc_timestamp: bool = False) -> 'BulkWriterFactory':
        """
        Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into
        Parquet files in a batch fashion.

        Example:
        ::

            >>> row_type = DataTypes.ROW([
            ...     DataTypes.FIELD('string', DataTypes.STRING()),
            ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
            ... ])
            >>> row_type_info = Types.ROW_NAMED(
            ...     ['string', 'int_array'],
            ...     [Types.STRING(), Types.LIST(Types.INT())]
            ... )
            >>> sink = FileSink.for_bulk_format(
            ...     OUTPUT_DIR, ParquetBulkWriter.for_row_type(
            ...         row_type,
            ...         hadoop_config=Configuration(),
            ...         utc_timestamp=True,
            ...     )
            ... ).build()
            >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

        Note that in the above example, the identity map before ``sink_to`` is necessary to
        declare the stream's RowTypeInfo when ``ds`` is a source stream producing **RowData**
        records, because RowDataBulkWriterFactory assumes that the input record type is **Row**.
        """
        if not hadoop_config:
            hadoop_config = Configuration()

        jvm = get_gateway().jvm
        JParquetRowDataBuilder = jvm.org.apache.flink.formats.parquet.row.ParquetRowDataBuilder
        return RowDataBulkWriterFactory(
            JParquetRowDataBuilder.createWriterFactory(
                _to_java_data_type(row_type).getLogicalType(),
                create_hadoop_configuration(hadoop_config), utc_timestamp),
            row_type)
Example #5
    def for_row_type(row_type: 'RowType',
                     writer_properties: Optional[Configuration] = None,
                     hadoop_config: Optional[Configuration] = None) \
            -> BulkWriterFactory:
        """
        Create a :class:`~pyflink.common.serialization.BulkWriterFactory` that writes records
        with a predefined schema into Orc files in a batch fashion.

        :param row_type: The RowType of records; it should match the RowTypeInfo of the Row records.
        :param writer_properties: Orc writer options.
        :param hadoop_config: Hadoop configuration.
        """
        from pyflink.table.types import RowType
        if not isinstance(row_type, RowType):
            raise TypeError('row_type must be an instance of RowType')

        from pyflink.table.types import _to_java_data_type
        j_data_type = _to_java_data_type(row_type)
        jvm = get_gateway().jvm
        j_row_type = j_data_type.getLogicalType()
        orc_types = to_jarray(
            jvm.org.apache.flink.table.types.logical.LogicalType,
            [i for i in j_row_type.getChildren()])
        type_description = jvm.org.apache.flink.orc \
            .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
        if writer_properties is None:
            writer_properties = Configuration()
        if hadoop_config is None:
            hadoop_config = Configuration()

        return RowDataBulkWriterFactory(
            jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
                jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                    type_description.toString(), orc_types),
                create_java_properties(writer_properties),
                create_hadoop_configuration(hadoop_config)), row_type)
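
To illustrate the ``writer_properties`` parameter, here is a hedged usage sketch; it assumes this method is exposed as ``OrcBulkWriters.for_row_type`` and uses the standard ORC option ``orc.compress`` as one example of a writer property. The row type and output directory are illustrative.

from pyflink.common import Configuration
from pyflink.datastream.connectors.file_system import FileSink
from pyflink.datastream.formats.orc import OrcBulkWriters
from pyflink.table import DataTypes

row_type = DataTypes.ROW([
    DataTypes.FIELD('name', DataTypes.STRING()),
    DataTypes.FIELD('score', DataTypes.INT()),
])

# Pass ORC writer options through writer_properties; 'orc.compress' selects the codec.
writer_properties = Configuration()
writer_properties.set_string('orc.compress', 'SNAPPY')

sink = FileSink.for_bulk_format(
    '/tmp/orc-output',  # hypothetical output directory
    OrcBulkWriters.for_row_type(
        row_type=row_type,
        writer_properties=writer_properties,
    ),
).build()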
Example #6
    def for_row_type(row_type: 'RowType',
                     hadoop_config: Optional[Configuration] = None,
                     utc_timestamp: bool = False) -> 'BulkWriterFactory':
        """
        Create a :class:`~pyflink.common.serialization.BulkWriterFactory` that writes records
        with a predefined schema into Parquet files in a batch fashion.

        :param row_type: The RowType of records; it should match the RowTypeInfo of the Row records.
        :param hadoop_config: Hadoop configuration.
        :param utc_timestamp: Whether to use the UTC timezone or the local timezone for the
            conversion between epoch time and LocalDateTime. Hive 0.x/1.x/2.x use the local
            timezone, but Hive 3.x uses the UTC timezone.
        """
        if not hadoop_config:
            hadoop_config = Configuration()

        from pyflink.table.types import _to_java_data_type
        jvm = get_gateway().jvm
        JParquetRowDataBuilder = jvm.org.apache.flink.formats.parquet.row.ParquetRowDataBuilder
        return RowDataBulkWriterFactory(JParquetRowDataBuilder.createWriterFactory(
            _to_java_data_type(row_type).getLogicalType(),
            create_hadoop_configuration(hadoop_config),
            utc_timestamp
        ), row_type)
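
A brief usage sketch follows; it assumes the enclosing class is PyFlink's ParquetBulkWriters and that ``utc_timestamp=True`` is desired (for example, for Hive 3.x compatibility, as described above). The row type and output directory are illustrative assumptions.

from pyflink.datastream.connectors.file_system import FileSink
from pyflink.datastream.formats.parquet import ParquetBulkWriters
from pyflink.table import DataTypes

row_type = DataTypes.ROW([
    DataTypes.FIELD('name', DataTypes.STRING()),
    DataTypes.FIELD('ts', DataTypes.TIMESTAMP(3)),
])

sink = FileSink.for_bulk_format(
    '/tmp/parquet-output',  # hypothetical output directory
    ParquetBulkWriters.for_row_type(row_type, utc_timestamp=True),
).build()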