コード例 #1
0
ファイル: io.py プロジェクト: PrettyWood/polars
def read_ipc(
    file: Union[str, BinaryIO, BytesIO, Path, bytes],
    columns: Optional[Union[List[int], List[str]]] = None,
    n_rows: Optional[int] = None,
    use_pyarrow: bool = False,
    memory_map: bool = True,
    storage_options: Optional[Dict] = None,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    rechunk: bool = True,
    **kwargs: Any,
) -> DataFrame:
    """
    Read into a DataFrame from Arrow IPC (Feather v2) file.

    Parameters
    ----------
    file
        Path to a file or a file-like object.
        If ``fsspec`` is installed, it will be used to open remote files.
    columns
        Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.
    n_rows
        Stop reading from IPC file after reading ``n_rows``.
        Only valid when ``use_pyarrow=False``.
    use_pyarrow
        Use pyarrow or the native rust reader.
    memory_map
        Memory map underlying file. This will likely increase performance.
        Only used when ``use_pyarrow=True``.
    storage_options
        Extra options that make sense for ``fsspec.open()`` or a particular storage connection, e.g. host, port, username, password, etc.
    row_count_name
        If not None, this will insert a row count column with the given name into the DataFrame.
        Only valid when ``use_pyarrow=False``.
    row_count_offset
        Offset to start the row_count column (only use if the name is set)
    rechunk
        Make sure that all data is contiguous.
    **kwargs
        Legacy keyword arguments. ``stop_after_n_rows`` is mapped to ``n_rows``
        and ``projection`` is used for ``columns`` when ``columns`` is None.

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        If ``row_count_name`` or ``n_rows`` is given together with
        ``use_pyarrow=True``.
    ImportError
        If ``use_pyarrow=True`` but pyarrow is not available.
    """

    # Map legacy arguments to current ones and remove them from kwargs.
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    # Always pop ``projection`` so it really is removed from kwargs; it only
    # takes effect when ``columns`` was not given explicitly.
    legacy_projection = kwargs.pop("projection", None)
    if columns is None:
        columns = legacy_projection

    if use_pyarrow:
        if row_count_name is not None:
            raise ValueError(
                "``row_count_name`` cannot be used with ``use_pyarrow=True``."
            )
        # Compare against None rather than relying on truthiness: ``n_rows=0``
        # is an explicit row limit that the pyarrow path below would otherwise
        # silently ignore and read the whole file.
        if n_rows is not None:
            raise ValueError(
                "``n_rows`` cannot be used with ``use_pyarrow=True``."
            )

    storage_options = storage_options or {}
    with _prepare_file_arg(file, **storage_options) as data:
        if use_pyarrow:
            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    "'pyarrow' is required when using 'read_ipc(..., use_pyarrow=True)'."
                )

            tbl = pa.feather.read_table(
                data, memory_map=memory_map, columns=columns
            )
            return DataFrame._from_arrow(tbl, rechunk=rechunk)

        return DataFrame._read_ipc(
            data,
            columns=columns,
            n_rows=n_rows,
            row_count_name=row_count_name,
            row_count_offset=row_count_offset,
            rechunk=rechunk,
        )
コード例 #2
0
def from_arrow(
    a: Union["pa.Table", "pa.Array", "pa.ChunkedArray"],
    rechunk: bool = True,
) -> Union[DataFrame, Series]:
    """
    Convert an Arrow Table into a DataFrame, or an Arrow Array into a Series.

    The conversion is zero copy for the most part. Arrow types that Polars
    does not support may be cast to the closest supported type.

    Parameters
    ----------
    a : Arrow Table or Array
        The Arrow data to convert: a ``pa.Table``, ``pa.Array`` or
        ``pa.ChunkedArray``.
    rechunk : bool, default True
        Make sure that all data is contiguous.

    Returns
    -------
    DataFrame or Series
        A DataFrame when ``a`` is a Table; a Series with an empty name when
        ``a`` is an Array or ChunkedArray.

    Raises
    ------
    ImportError
        If pyarrow is not available.
    ValueError
        If ``a`` is not an Arrow Table, Array or ChunkedArray.

    Examples
    --------
    Constructing a DataFrame from an Arrow Table:

    >>> import pyarrow as pa
    >>> data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    >>> df = pl.from_arrow(data)
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 5   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 3   ┆ 6   │
    └─────┴─────┘

    Constructing a Series from an Arrow Array:

    >>> import pyarrow as pa
    >>> data = pa.array([1, 2, 3])
    >>> series = pl.from_arrow(data)
    >>> series
    shape: (3,)
    Series: '' [i64]
    [
        1
        2
        3
    ]

    """
    if not _PYARROW_AVAILABLE:
        raise ImportError(
            "'pyarrow' is required when using from_arrow()."
        )  # pragma: no cover

    # Tables become DataFrames.
    if isinstance(a, pa.Table):
        return DataFrame._from_arrow(a, rechunk=rechunk)

    # Plain and chunked arrays become Series with an empty name.
    array_types = (pa.Array, pa.ChunkedArray)
    if isinstance(a, array_types):
        series_name = ""
        return Series._from_arrow(series_name, a, rechunk)

    # Anything else is not a supported Arrow container.
    raise ValueError(f"Expected Arrow Table or Array, got {type(a)}.")