Example #1
 def for_schema(schema: 'CsvSchema') -> 'CsvReaderFormat':
     """
     Builds a :class:`CsvReaderFormat` using `CsvSchema`.
     """
     jvm = get_gateway().jvm
     jackson = jvm.org.apache.flink.shaded.jackson2.com.fasterxml.jackson
     constructor = get_java_class(jvm.org.apache.flink.formats.csv.CsvReaderFormat) \
         .getDeclaredConstructor(
         to_jarray(jvm.Class, [
             get_java_class(jackson.dataformat.csv.CsvMapper),
             get_java_class(jackson.dataformat.csv.CsvSchema),
             get_java_class(jvm.Class),
             get_java_class(jvm.org.apache.flink.formats.common.Converter),
             get_java_class(jvm.org.apache.flink.api.common.typeinfo.TypeInformation),
             get_java_class(jvm.boolean)
         ])
     )
     constructor.setAccessible(True)
     j_csv_format = constructor.newInstance(
         to_jarray(jvm.Object, [
             jackson.dataformat.csv.CsvMapper(), schema._j_schema,
             get_java_class(jackson.databind.JsonNode),
             jvm.org.apache.flink.formats.csv.CsvToRowDataConverters(
                 False).createRowConverter(
                     _to_java_data_type(schema._data_type).getLogicalType(),
                     True),
             jvm.org.apache.flink.table.runtime.typeutils.InternalTypeInfo.
             of(_to_java_data_type(
                 schema._data_type).getLogicalType()), False
         ]))
     return CsvReaderFormat(j_csv_format)
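For context, a minimal sketch of how this factory is typically used, assuming the CsvSchema builder API in pyflink.datastream.formats.csv (Flink 1.16+ module paths) and a hypothetical input path:

    from pyflink.datastream.connectors.file_system import FileSource
    from pyflink.datastream.formats.csv import CsvReaderFormat, CsvSchema

    # Build a Jackson-backed CSV schema describing the columns of the file.
    schema = CsvSchema.builder() \
        .add_number_column('id') \
        .add_string_column('name') \
        .build()

    # for_schema() wires the schema into a CsvReaderFormat via the private Java
    # constructor shown above; the format is then handed to a FileSource.
    source = FileSource.for_record_stream_format(
        CsvReaderFormat.for_schema(schema), '/tmp/input.csv').build()  # hypothetical path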
Example #2
    def __init__(self,
                 field_names,
                 field_types,
                 path,
                 field_delimiter=',',
                 num_files=-1,
                 write_mode=None):
        gateway = get_gateway()
        if write_mode == WriteMode.NO_OVERWRITE:
            j_write_mode = gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.NO_OVERWRITE
        elif write_mode == WriteMode.OVERWRITE:
            j_write_mode = gateway.jvm.org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE
        elif write_mode is None:
            j_write_mode = None
        else:
            raise Exception('Unsupported write_mode: %s' % write_mode)
        j_field_names = java_utils.to_jarray(gateway.jvm.String, field_names)
        j_field_types = java_utils.to_jarray(
            gateway.jvm.DataType,
            [_to_java_data_type(field_type) for field_type in field_types])
        j_csv_table_sink = gateway.jvm.CsvTableSink(path, field_delimiter,
                                                    num_files, j_write_mode,
                                                    j_field_names,
                                                    j_field_types)

        super(CsvTableSink, self).__init__(j_csv_table_sink)
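For context, a minimal sketch of constructing this sink, assuming CsvTableSink and WriteMode are importable from pyflink.table.sinks (the class is deprecated in newer Flink releases); the output path is hypothetical:

    from pyflink.table import DataTypes
    from pyflink.table.sinks import CsvTableSink, WriteMode

    sink = CsvTableSink(
        ['id', 'name'],                             # field_names
        [DataTypes.INT(), DataTypes.STRING()],      # field_types
        '/tmp/result.csv',                          # hypothetical path
        field_delimiter=',',
        num_files=1,
        write_mode=WriteMode.OVERWRITE)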
Example #3
def call(f: Union[str, UserDefinedFunctionWrapper], *args) -> Expression:
    """
    The first parameter `f` can be a str or a Python user-defined function.

    When it is a str, this is a call to a function that will be looked up in a catalog. There
    are two kinds of functions:

        - System functions - which are identified with one-part names
        - Catalog functions - which are always identified with three-part names
            (catalog, database, function)

    Moreover, each function can be either a temporary function or a permanent one
    (which is stored in an external catalog).

    Based on these two properties, the resolution order for looking up a function based on
    the provided `function_name` is the following:

        - Temporary system function
        - System function
        - Temporary catalog function
        - Catalog function

    :param f: the path of the function or the Python user-defined function.
    :param args: parameters of the user-defined function.
    """
    gateway = get_gateway()

    if isinstance(f, str):
        return Expression(gateway.jvm.Expressions.call(
            f, to_jarray(gateway.jvm.Object, [_get_java_expression(arg) for arg in args])))

    def get_function_definition(f):
        if isinstance(f, UserDefinedTableFunctionWrapper):
            """
            TypeInference was not supported for TableFunction in the old planner. Use
            TableFunctionDefinition to work around this issue.
            """
            j_result_types = to_jarray(gateway.jvm.TypeInformation,
                                       [_to_java_type(i) for i in f._result_types])
            j_result_type = gateway.jvm.org.apache.flink.api.java.typeutils.RowTypeInfo(
                j_result_types)
            return gateway.jvm.org.apache.flink.table.functions.TableFunctionDefinition(
                'f', f._java_user_defined_function(), j_result_type)
        else:
            return f._java_user_defined_function()

    expressions_clz = load_java_class("org.apache.flink.table.api.Expressions")
    function_definition_clz = load_java_class('org.apache.flink.table.functions.FunctionDefinition')
    j_object_array_type = to_jarray(gateway.jvm.Object, []).getClass()

    api_call_method = expressions_clz.getDeclaredMethod(
        "apiCall",
        to_jarray(gateway.jvm.Class, [function_definition_clz, j_object_array_type]))
    api_call_method.setAccessible(True)

    return Expression(api_call_method.invoke(
        None,
        to_jarray(gateway.jvm.Object,
                  [get_function_definition(f),
                   to_jarray(gateway.jvm.Object, [_get_java_expression(arg) for arg in args])])))
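For context, a minimal sketch of both call styles described in the docstring, assuming `tab` is an existing Table and a system function named 'lower' is resolvable through the lookup order above:

    from pyflink.table import DataTypes
    from pyflink.table.expressions import call, col
    from pyflink.table.udf import udf

    # 1) Call by name: 'lower' is resolved via the catalog lookup described above.
    tab.select(call('lower', col('name')))

    # 2) Call a Python user-defined function directly.
    to_upper = udf(lambda s: s.upper(), result_type=DataTypes.STRING())
    tab.select(call(to_upper, col('name')))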
Example #4
 def __init__(self, j_table_sink, field_names, field_types):
     gateway = get_gateway()
     j_field_names = java_utils.to_jarray(gateway.jvm.String, field_names)
     j_field_types = java_utils.to_jarray(
         gateway.jvm.TypeInformation,
         [_to_java_type(field_type) for field_type in field_types])
     j_table_sink = j_table_sink.configure(j_field_names, j_field_types)
     super(TestTableSink, self).__init__(j_table_sink)
Example #5
 def _get_kafka_source_configuration(source: KafkaSource):
     jvm = get_gateway().jvm
     j_source = source.get_java_function()
     j_to_configuration = j_source.getClass().getDeclaredMethod(
         'getConfiguration', to_jarray(jvm.java.lang.Class, []))
     j_to_configuration.setAccessible(True)
     j_configuration = j_to_configuration.invoke(
         j_source, to_jarray(jvm.java.lang.Object, []))
     return Configuration(j_configuration=j_configuration)
Example #6
 def __init__(self, field_names, field_types):
     TestTableSink._ensure_initialized()
     gateway = get_gateway()
     j_field_names = java_utils.to_jarray(gateway.jvm.String, field_names)
     j_field_types = java_utils.to_jarray(
         gateway.jvm.DataType,
         [_to_java_data_type(field_type) for field_type in field_types])
     super(TestRetractSink, self).__init__(
         gateway.jvm.org.apache.flink.table.utils.TestingSinks.
         TestAppendingSink(j_field_names, j_field_types))
Example #7
 def from_fields(self, field_names: List[str],
                 field_data_types: List[DataType]) -> 'Schema.Builder':
     """
     Adopts the given field names and field data types as physical columns of the schema.
     """
     gateway = get_gateway()
     j_field_names = to_jarray(gateway.jvm.String, field_names)
     j_field_data_types = to_jarray(gateway.jvm.AbstractDataType, [
         _to_java_data_type(field_data_type)
         for field_data_type in field_data_types
     ])
     self._j_builder.fromFields(j_field_names, j_field_data_types)
     return self
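For context, a minimal sketch using the Schema builder, assuming the standard pyflink.table imports:

    from pyflink.table import DataTypes, Schema

    schema = Schema.new_builder() \
        .from_fields(['id', 'name'], [DataTypes.BIGINT(), DataTypes.STRING()]) \
        .build()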
Example #8
 def __init__(self,
              field_names: List[str] = None,
              data_types: List[DataType] = None,
              j_table_schema=None):
     if j_table_schema is None:
         gateway = get_gateway()
         j_field_names = to_jarray(gateway.jvm.String, field_names)
         j_data_types = to_jarray(
             gateway.jvm.TypeInformation,
             [_to_java_type(item) for item in data_types])
         self._j_table_schema = gateway.jvm.TableSchema(
             j_field_names, j_data_types)
     else:
         self._j_table_schema = j_table_schema
Example #9
File: udf.py Project: zjureel/flink
    def _create_judf(self, serialized_func, j_input_types, j_function_kind):
        if self._func_type == "pandas":
            from pyflink.table.types import DataTypes
            self._accumulator_type = DataTypes.ARRAY(self._result_type)

        if j_input_types is not None:
            gateway = get_gateway()
            j_input_types = java_utils.to_jarray(
                gateway.jvm.DataType,
                [_to_java_data_type(i) for i in self._input_types])
        j_result_type = _to_java_data_type(self._result_type)
        j_accumulator_type = _to_java_data_type(self._accumulator_type)

        gateway = get_gateway()
        if self._is_table_aggregate:
            PythonAggregateFunction = gateway.jvm \
                .org.apache.flink.table.functions.python.PythonTableAggregateFunction
        else:
            PythonAggregateFunction = gateway.jvm \
                .org.apache.flink.table.functions.python.PythonAggregateFunction
        j_aggregate_function = PythonAggregateFunction(
            self._name, bytearray(serialized_func), j_input_types,
            j_result_type, j_accumulator_type, j_function_kind,
            self._deterministic, self._takes_row_as_input, _get_python_env())
        return j_aggregate_function
Example #10
File: udf.py Project: zjureel/flink
    def _java_user_defined_function(self):
        if self._judf_placeholder is None:
            gateway = get_gateway()

            def get_python_function_kind():
                JPythonFunctionKind = gateway.jvm.org.apache.flink.table.functions.python. \
                    PythonFunctionKind
                if self._func_type == "general":
                    return JPythonFunctionKind.GENERAL
                elif self._func_type == "pandas":
                    return JPythonFunctionKind.PANDAS
                else:
                    raise TypeError("Unsupported func_type: %s." %
                                    self._func_type)

            if self._input_types is not None:
                j_input_types = java_utils.to_jarray(
                    gateway.jvm.DataType,
                    [_to_java_data_type(i) for i in self._input_types])
            else:
                j_input_types = None
            j_function_kind = get_python_function_kind()
            func = self._func
            if not isinstance(self._func, UserDefinedFunction):
                func = self._create_delegate_function()

            import cloudpickle
            serialized_func = cloudpickle.dumps(func)
            self._judf_placeholder = \
                self._create_judf(serialized_func, j_input_types, j_function_kind)
        return self._judf_placeholder
Example #11
 def for_bulk_file_format(bulk_format: BulkFormat,
                          *paths: str) -> FileSourceBuilder:
     JPath = get_gateway().jvm.org.apache.flink.core.fs.Path
     JFileSource = get_gateway(
     ).jvm.org.apache.flink.connector.file.src.FileSource
     j_paths = to_jarray(JPath, [JPath(p) for p in paths])
     return FileSourceBuilder(
         JFileSource.forBulkFileFormat(bulk_format._j_bulk_format, j_paths))
Example #12
def or_(predicate0: Union[bool, Expression[bool]],
        predicate1: Union[bool, Expression[bool]],
        *predicates: Union[bool, Expression[bool]]) -> Expression[bool]:
    """
    Boolean OR in three-valued logic.
    """
    gateway = get_gateway()
    predicates = to_jarray(gateway.jvm.Object, [_get_java_expression(p) for p in predicates])
    return _ternary_op("or", predicate0, predicate1, predicates)
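For context, a minimal sketch, assuming `tab` is an existing Table with numeric columns a and b:

    from pyflink.table.expressions import col, or_

    # Keep rows where at least one predicate holds (NULLs follow three-valued logic).
    tab.where(or_(col('a') > 10, col('b') > 10))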
Example #13
 def partitioned_by(self,
                    *partition_keys: str) -> 'TableDescriptor.Builder':
     """
     Define which columns this table is partitioned by.
     """
     gateway = get_gateway()
     self._j_builder.partitionedBy(
         to_jarray(gateway.jvm.java.lang.String, partition_keys))
     return self
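For context, a minimal sketch of a partitioned table descriptor, assuming a filesystem connector and a hypothetical output path:

    from pyflink.table import DataTypes, Schema, TableDescriptor

    descriptor = TableDescriptor.for_connector('filesystem') \
        .schema(Schema.new_builder()
                .column('dt', DataTypes.STRING())
                .column('value', DataTypes.BIGINT())
                .build()) \
        .option('path', '/tmp/output') \
        .option('format', 'csv') \
        .partitioned_by('dt') \
        .build()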
Example #14
def concat(first: Union[str, Expression[str]],
           *others: Union[str, Expression[str]]) -> Expression[str]:
    """
    Returns the string that results from concatenating the arguments.
    Returns NULL if any argument is NULL.
    """
    gateway = get_gateway()
    return _binary_op(
        "concat", first,
        to_jarray(gateway.jvm.Object,
                  [_get_java_expression(other) for other in others]))
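For context, a minimal sketch, assuming `tab` has string columns first_name and last_name:

    from pyflink.table.expressions import col, concat

    tab.select(concat(col('first_name'), ' ', col('last_name')))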
Example #15
    def for_row_type(row_type: RowType,
                     writer_properties: Optional[Configuration] = None,
                     hadoop_config: Optional[Configuration] = None) \
            -> BulkWriterFactory:
        """
        Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into Orc
        files in a batch fashion.

        Example:
        ::

            >>> row_type = DataTypes.ROW([
            ...     DataTypes.FIELD('string', DataTypes.STRING()),
            ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
            ... ])
            >>> row_type_info = Types.ROW_NAMED(
            ...     ['string', 'int_array'],
            ...     [Types.STRING(), Types.LIST(Types.INT())]
            ... )
            >>> sink = FileSink.for_bulk_format(
            ...     OUTPUT_DIR, OrcBulkWriters.for_row_type(
            ...         row_type=row_type,
            ...         writer_properties=Configuration(),
            ...         hadoop_config=Configuration(),
            ...     )
            ... ).build()
            >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

        Note that in the above example, an identity map is needed before ``sink_to`` to declare
        the stream's RowTypeInfo when ``ds`` is a source stream producing **RowData** records,
        because RowDataBulkWriterFactory assumes the input record type is Row.
        """
        if not isinstance(row_type, RowType):
            raise TypeError('row_type must be an instance of RowType')

        j_data_type = _to_java_data_type(row_type)
        jvm = get_gateway().jvm
        j_row_type = j_data_type.getLogicalType()
        orc_types = to_jarray(
            jvm.org.apache.flink.table.types.logical.LogicalType,
            [i for i in j_row_type.getChildren()])
        type_description = jvm.org.apache.flink.orc \
            .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
        if writer_properties is None:
            writer_properties = Configuration()
        if hadoop_config is None:
            hadoop_config = Configuration()

        return RowDataBulkWriterFactory(
            jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
                jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                    type_description.toString(), orc_types),
                create_java_properties(writer_properties),
                create_hadoop_configuration(hadoop_config)), row_type)
Example #16
    def set_topics(self, *topics: str) -> 'KafkaSourceBuilder':
        """
        Set a list of topics for the KafkaSource to consume from. All the topics in the list must
        already exist in the Kafka cluster; otherwise, an exception will be thrown. To allow some
        topics to be created lazily, use :meth:`set_topic_pattern` instead.

        :param topics: the list of topics to consume from.
        :return: this KafkaSourceBuilder.
        """
        self._j_builder.setTopics(
            to_jarray(get_gateway().jvm.java.lang.String, topics))
        return self
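For context, a minimal sketch of the surrounding builder, assuming the Kafka connector jar is on the classpath; the broker address, topics, and group id are hypothetical:

    from pyflink.common.serialization import SimpleStringSchema
    from pyflink.datastream.connectors.kafka import KafkaSource

    source = KafkaSource.builder() \
        .set_bootstrap_servers('broker:9092') \
        .set_topics('topic-a', 'topic-b') \
        .set_group_id('my-group') \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()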
Example #17
def array(head, *tail) -> Expression:
    """
    Creates an array of literals.

    Example:
    ::

        >>> tab.select(array(1, 2, 3))
    """
    gateway = get_gateway()
    tail = to_jarray(gateway.jvm.Object, [_get_java_expression(t) for t in tail])
    return _binary_op("array", head, tail)
Example #18
def row(head, *tail) -> Expression:
    """
    Creates a row of expressions.

    Example:
    ::

        >>> tab.select(row("key1", 1))
    """
    gateway = get_gateway()
    tail = to_jarray(gateway.jvm.Object, [_get_java_expression(t) for t in tail])
    return _binary_op("row", head, tail)
Example #19
    def sink(sql: str,
             type_info: RowTypeInfo,
             jdbc_connection_options: 'JdbcConnectionOptions',
             jdbc_execution_options: 'JdbcExecutionOptions' = None):
        """
        Create a JDBC sink.

        :param sql: arbitrary DML query (e.g. insert, update, upsert)
        :param type_info: A RowTypeInfo for query field types.
        :param jdbc_execution_options:  parameters of execution, such as batch size and maximum
                                        retries.
        :param jdbc_connection_options: parameters of connection, such as JDBC URL.
        :return: A JdbcSink.
        """
        sql_types = []
        gateway = get_gateway()
        JJdbcTypeUtil = gateway.jvm.org.apache.flink.connector.jdbc.utils.JdbcTypeUtil
        for field_type in type_info.get_field_types():
            sql_types.append(
                JJdbcTypeUtil.typeInformationToSqlType(
                    field_type.get_java_type_info()))
        j_sql_type = to_jarray(gateway.jvm.int, sql_types)
        output_format_clz = gateway.jvm.Class\
            .forName('org.apache.flink.connector.jdbc.internal.JdbcBatchingOutputFormat', False,
                     get_gateway().jvm.Thread.currentThread().getContextClassLoader())
        j_int_array_type = to_jarray(gateway.jvm.int, []).getClass()
        j_builder_method = output_format_clz.getDeclaredMethod(
            'createRowJdbcStatementBuilder',
            to_jarray(gateway.jvm.Class, [j_int_array_type]))
        j_builder_method.setAccessible(True)
        j_statement_builder = j_builder_method.invoke(
            None, to_jarray(gateway.jvm.Object, [j_sql_type]))

        jdbc_execution_options = jdbc_execution_options if jdbc_execution_options is not None \
            else JdbcExecutionOptions.defaults()
        j_jdbc_sink = gateway.jvm.org.apache.flink.connector.jdbc.JdbcSink\
            .sink(sql, j_statement_builder, jdbc_execution_options._j_jdbc_execution_options,
                  jdbc_connection_options._j_jdbc_connection_options)
        return JdbcSink(j_jdbc_sink=j_jdbc_sink)
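For context, a minimal sketch of wiring this sink into a DataStream job; the JDBC URL, table, and credentials are hypothetical:

    from pyflink.common.typeinfo import Types
    from pyflink.datastream.connectors.jdbc import (JdbcSink, JdbcConnectionOptions,
                                                    JdbcExecutionOptions)

    type_info = Types.ROW([Types.INT(), Types.STRING()])
    ds.add_sink(JdbcSink.sink(
        "insert into users (id, name) values (?, ?)",   # hypothetical table
        type_info,
        JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
            .with_url('jdbc:postgresql://localhost:5432/mydb')
            .with_driver_name('org.postgresql.Driver')
            .with_user_name('user')
            .with_password('password')
            .build(),
        JdbcExecutionOptions.builder().with_batch_size(200).build()))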
Example #20
 def set_hosts(
         self, hosts: Union[str,
                            List[str]]) -> 'ElasticsearchSinkBuilderBase':
     """
     Sets the hosts where the Elasticsearch cluster nodes are reachable.
     """
     if not isinstance(hosts, list):
         hosts = [hosts]
     JHttpHost = self.get_http_host_class()
     j_http_hosts_list = [JHttpHost.create(x) for x in hosts]
     j_http_hosts_array = to_jarray(JHttpHost, j_http_hosts_list)
     self._j_elasticsearch_sink_builder.setHosts(j_http_hosts_array)
     return self
Example #21
    def alias(self, name: str, *extra_names: str) -> 'Expression[T]':
        """
        Specifies a name for an expression, i.e. a field.

        Example:
        ::

            >>> tab.select(col('a').alias('b'))

        :param name: name for one field.
        :param extra_names: additional names if the expression expands to multiple fields
        """
        gateway = get_gateway()
        return _ternary_op("as")(self, name, to_jarray(gateway.jvm.String, extra_names))
Example #22
 def get_function_definition(f):
     if isinstance(f, UserDefinedTableFunctionWrapper):
         """
         TypeInference was not supported for TableFunction in the old planner. Use
         TableFunctionDefinition to work around this issue.
         """
         j_result_types = to_jarray(gateway.jvm.TypeInformation,
                                    [_to_java_type(i) for i in f._result_types])
         j_result_type = gateway.jvm.org.apache.flink.api.java.typeutils.RowTypeInfo(
             j_result_types)
         return gateway.jvm.org.apache.flink.table.functions.TableFunctionDefinition(
             'f', f._java_user_defined_function(), j_result_type)
     else:
         return f._java_user_defined_function()
Example #23
def invoke_java_object_method(obj, method_name):
    clz = obj.getClass()
    j_method = None
    while clz is not None:
        try:
            j_method = clz.getDeclaredMethod(method_name, None)
            if j_method is not None:
                break
        except:
            clz = clz.getSuperclass()
    if j_method is None:
        raise Exception("No such method: " + method_name)
    j_method.setAccessible(True)
    return j_method.invoke(obj, to_jarray(get_gateway().jvm.Object, []))
Example #24
        def primary_key(self, *column_names: str) -> 'Schema.Builder':
            """
            Declares a primary key constraint for a set of given columns. A primary key uniquely
            identifies a row in a table. None of the columns in a primary key can be nullable. The
            primary key is informational only; it will not be enforced. It can be used for
            optimizations. It is the data owner's responsibility to ensure uniqueness of the data.

            The primary key will be assigned a generated name in the format ``PK_col1_col2``.

            :param column_names: Columns that form a unique primary key
            """
            gateway = get_gateway()
            self._j_builder.primaryKey(
                to_jarray(gateway.jvm.java.lang.String, column_names))
            return self
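For context, a minimal sketch, noting that primary key columns must be declared NOT NULL:

    from pyflink.table import DataTypes, Schema

    schema = Schema.new_builder() \
        .column('id', DataTypes.BIGINT().not_null()) \
        .column('name', DataTypes.STRING()) \
        .primary_key('id') \
        .build()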
Example #25
def add_jars_to_context_class_loader(jar_urls):
    """
    Add jars to the Python gateway server for local compilation and local execution (i.e.
    minicluster). There are many components in Flink which won't be added to the classpath by
    default, e.g. the Kafka connector, JDBC connector, CSV format, etc. This utility function can
    be used to hot-load the jars.

    :param jar_urls: The list of jar urls.
    """
    gateway = get_gateway()
    # validate and normalize
    jar_urls = [gateway.jvm.java.net.URL(url) for url in jar_urls]
    context_classloader = gateway.jvm.Thread.currentThread().getContextClassLoader()
    existing_urls = []
    class_loader_name = context_classloader.getClass().getName()
    if class_loader_name == "java.net.URLClassLoader":
        existing_urls = set([url.toString() for url in context_classloader.getURLs()])
    if all([url.toString() in existing_urls for url in jar_urls]):
        # if urls all existed, no need to create new class loader.
        return
    URLClassLoaderClass = load_java_class("java.net.URLClassLoader")
    addURL = URLClassLoaderClass.getDeclaredMethod(
        "addURL",
        to_jarray(
            gateway.jvm.Class,
            [load_java_class("java.net.URL")]))
    addURL.setAccessible(True)
    if class_loader_name == "org.apache.flink.runtime.execution.librarycache." \
                            "FlinkUserCodeClassLoaders$SafetyNetWrapperClassLoader":
        ensureInner = context_classloader.getClass().getDeclaredMethod("ensureInner", None)
        ensureInner.setAccessible(True)
        loader = ensureInner.invoke(context_classloader, None)
    else:
        loader = context_classloader
    for url in jar_urls:
        addURL.invoke(loader, to_jarray(get_gateway().jvm.Object, [url]))
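For context, a minimal sketch of calling this utility; the jar URL is hypothetical and must be a valid URL string:

    # Hot-load a connector jar into the running gateway's context class loader.
    add_jars_to_context_class_loader(
        ['file:///tmp/flink-sql-connector-kafka.jar'])  # hypothetical jar URL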
Example #26
def with_columns(head, *tails) -> Expression:
    """
    Creates an expression that selects a range of columns. It can be used wherever an array of
    expressions is accepted, such as function calls, projections, or groupings.

    A range can either be index-based or name-based. Indices start at 1 and boundaries are
    inclusive.

    e.g. with_columns(range_("b", "c")) or with_columns(col("*"))

    .. seealso:: :func:`~pyflink.table.expressions.range_`,
                 :func:`~pyflink.table.expressions.without_columns`
    """
    gateway = get_gateway()
    tails = to_jarray(gateway.jvm.Object, [_get_java_expression(t) for t in tails])
    return _binary_op("withColumns", head, tails)
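For context, a minimal sketch, assuming `tab` has columns a, b, c, d:

    from pyflink.table.expressions import col, range_, with_columns

    # Project the inclusive, 1-based column range b..c.
    tab.select(with_columns(range_('b', 'c')))

    # Or select all columns.
    tab.select(with_columns(col('*')))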
Example #27
File: udf.py Project: zjureel/flink
 def _create_judf(self, serialized_func, j_input_types, j_function_kind):
     gateway = get_gateway()
     j_result_types = java_utils.to_jarray(
         gateway.jvm.DataType,
         [_to_java_data_type(i) for i in self._result_types])
     j_result_type = gateway.jvm.DataTypes.ROW(j_result_types)
     PythonTableFunction = gateway.jvm \
         .org.apache.flink.table.functions.python.PythonTableFunction
     j_table_function = PythonTableFunction(self._name,
                                            bytearray(serialized_func),
                                            j_input_types, j_result_type,
                                            j_function_kind,
                                            self._deterministic,
                                            self._takes_row_as_input,
                                            _get_python_env())
     return j_table_function
Example #28
        def primary_key_named(self, constraint_name: str,
                              *column_names: str) -> 'Schema.Builder':
            """
            Declares a primary key constraint for a set of given columns. A primary key uniquely
            identifies a row in a table. None of the columns in a primary key can be nullable. The
            primary key is informational only; it will not be enforced. It can be used for
            optimizations. It is the data owner's responsibility to ensure uniqueness of the data.

            :param constraint_name: Name for the primary key, can be used to reference the
                constraint
            :param column_names: Columns that form a unique primary key
            """
            gateway = get_gateway()
            self._j_builder.primaryKeyNamed(
                constraint_name,
                to_jarray(gateway.jvm.java.lang.String, column_names))
            return self
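For context, a minimal sketch that also assigns an explicit constraint name:

    from pyflink.table import DataTypes, Schema

    schema = Schema.new_builder() \
        .column('id', DataTypes.BIGINT().not_null()) \
        .primary_key_named('PK_id', 'id') \
        .build()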
Example #29
def concat_ws(separator: Union[str, Expression[str]],
              first: Union[str, Expression[str]],
              *others: Union[str, Expression[str]]) -> Expression[str]:
    """
    Returns the string that results from concatenating the arguments with the separator.
    Returns NULL if the separator is NULL.

    .. note::

        This function does not skip empty strings. However, it does skip any NULL
        values after the separator argument.
    """
    gateway = get_gateway()
    return _ternary_op(
        "concatWs", separator, first,
        to_jarray(gateway.jvm.Object,
                  [_get_java_expression(other) for other in others]))
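For context, a minimal sketch, assuming `tab` has string columns yyyy and mm:

    from pyflink.table.expressions import col, concat_ws

    # Produces e.g. '2023-01'; NULL arguments after the separator are skipped.
    tab.select(concat_ws('-', col('yyyy'), col('mm')))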
Example #30
    def for_record_stream_format(stream_format: StreamFormat, *paths: str) -> FileSourceBuilder:
        """
        Builds a new FileSource using a :class:`~FileSource.StreamFormat` to read record-by-record
        from a file stream.

        When possible, stream-based formats are generally preferable to file-based formats,
        because they support better default behavior around I/O batching and progress
        tracking (checkpoints).

        Stream formats also automatically de-compress files based on the file extension. This
        supports files ending in ".deflate" (Deflate), ".xz" (XZ), ".bz2" (BZip2), ".gz", ".gzip"
        (GZip).
        """
        JPath = get_gateway().jvm.org.apache.flink.core.fs.Path
        JFileSource = get_gateway().jvm.org.apache.flink.connector.file.src.FileSource
        j_paths = to_jarray(JPath, [JPath(p) for p in paths])
        return FileSourceBuilder(
            JFileSource.forRecordStreamFormat(stream_format._j_stream_format, j_paths))
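For context, a minimal end-to-end sketch, assuming the built-in text-line StreamFormat and a hypothetical input directory:

    from pyflink.common.watermark_strategy import WatermarkStrategy
    from pyflink.datastream import StreamExecutionEnvironment
    from pyflink.datastream.connectors.file_system import FileSource, StreamFormat

    env = StreamExecutionEnvironment.get_execution_environment()
    source = FileSource.for_record_stream_format(
        StreamFormat.text_line_format(), '/tmp/input').build()  # hypothetical path
    ds = env.from_source(source, WatermarkStrategy.no_watermarks(), 'file-source')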