Example No. 1
def assert_frame_equal_local_categoricals(df_a: pli.DataFrame,
                                          df_b: pli.DataFrame) -> None:
    assert df_a.schema == df_b.schema
    # Compare the categorical columns by their string values...
    cat_to_str = pli.col(Categorical).cast(str)
    assert df_a.with_column(cat_to_str).frame_equal(
        df_b.with_column(cat_to_str))
    # ...and by their physical (integer) representation, so the underlying
    # categorical encodings must match as well.
    cat_to_phys = pli.col(Categorical).to_physical()
    assert df_a.with_column(cat_to_phys).frame_equal(
        df_b.with_column(cat_to_phys))
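
A minimal usage sketch (assuming ``pli`` is the polars internals alias and ``Categorical`` the polars categorical dtype imported at module level; the toy frames and the public ``pl`` alias are illustrative):

import polars as pl

# Two frames built independently, so their categorical columns carry
# separate (local) string caches; the helper asserts they still agree
# both as strings and in their physical integer encoding.
df_a = pl.DataFrame({"cat": ["x", "y", "x"]}).with_column(
    pl.col("cat").cast(pl.Categorical))
df_b = pl.DataFrame({"cat": ["x", "y", "x"]}).with_column(
    pl.col("cat").cast(pl.Categorical))
assert_frame_equal_local_categoricals(df_a, df_b)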
Example No. 2
def from_records(
    data: Sequence[Sequence[Any]],
    columns: Sequence[str] | None = None,
    orient: Literal["col", "row"] | None = None,
) -> DataFrame:
    """
    Construct a DataFrame from a numpy ndarray or sequence of sequences.

    Note that this is slower than creating from columnar memory.

    Parameters
    ----------
    data : numpy ndarray or Sequence of sequences
        Two-dimensional data represented as numpy ndarray or sequence of sequences.
    columns : Sequence of str, default None
        Column labels to use for resulting DataFrame. Must match data dimensions.
        If not specified, columns will be named `column_0`, `column_1`, etc.
    orient : {'col', 'row'}, default None
        Whether to interpret two-dimensional data as columns or as rows. If None,
        the orientation is inferred by matching the columns and data dimensions. If
        this does not yield conclusive results, column orientation is used.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> data = [[1, 2, 3], [4, 5, 6]]
    >>> df = pl.from_records(data, columns=["a", "b"])
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 5   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 3   ┆ 6   │
    └─────┴─────┘

    """
    if _NUMPY_AVAILABLE and isinstance(data, np.ndarray):
        warnings.warn(
            "using `from_records` with a numpy ndarray is deprecated, "
            "use `from_numpy` instead",
            DeprecationWarning,
        )
        return DataFrame._from_numpy(data, columns=columns, orient=orient)
    else:
        return DataFrame._from_records(data, columns=columns, orient=orient)
Example No. 3
def from_dicts(dicts: Sequence[Dict[str, Any]]) -> DataFrame:
    """
    Construct a DataFrame from a sequence of dictionaries.

    Parameters
    ----------
    dicts
        Sequence with dictionaries mapping column name to value

    Returns
    -------
    DataFrame

    Examples
    --------

    >>> data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]
    >>> df = pl.from_dicts(data)
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 5   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 3   ┆ 6   │
    └─────┴─────┘

    """
    return DataFrame._from_dicts(dicts)
Example No. 4
def read_avro(
    file: Union[str, Path, BytesIO, BinaryIO],
    columns: Optional[Union[List[int], List[str]]] = None,
    n_rows: Optional[int] = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Read into a DataFrame from Apache Avro format.

    Parameters
    ----------
    file
        Path to a file or a file-like object.
    columns
        Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.
    n_rows
        Stop reading from Apache Avro file after reading ``n_rows``.

    Returns
    -------
    DataFrame
    """
    if isinstance(file, (str, Path)):
        file = format_path(file)
    if columns is None:
        columns = kwargs.pop("projection", None)

    return DataFrame._read_avro(file, n_rows=n_rows, columns=columns)
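
A short usage sketch (the file name and column names are hypothetical):

# Select two columns by name and stop after 100 rows.
df = read_avro("data.avro", columns=["a", "b"], n_rows=100)

# Zero-based column indices are accepted as well.
df = read_avro("data.avro", columns=[0, 1])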
Example No. 5
def update_columns(df: DataFrame, new_columns: List[str]) -> DataFrame:
    # If fewer new names are given than the frame has columns, pad the
    # list with the original names of the remaining columns.
    if df.width > len(new_columns):
        cols = df.columns
        for i, name in enumerate(new_columns):
            cols[i] = name
        new_columns = cols
    df.columns = new_columns
    return df
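
A hedged sketch of the padding behavior on toy data:

import polars as pl

df = pl.DataFrame({"a": [1], "b": [2], "c": [3]})
# Only two new names for a three-column frame: the third column
# keeps its original name.
df = update_columns(df, ["x", "y"])
assert df.columns == ["x", "y", "c"]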
Example No. 6
def get_dummies(df: pli.DataFrame) -> pli.DataFrame:
    """
    Convert categorical variables into dummy/indicator variables.

    Parameters
    ----------
    df
        DataFrame to convert.
    """
    return df.to_dummies()
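
A minimal sketch on toy data (the exact dummy-column naming, e.g. ``color_red``, follows ``to_dummies`` and may vary across polars versions):

import polars as pl

df = pl.DataFrame({"color": ["red", "blue", "red"]})
# One 0/1 indicator column per distinct value of each column.
dummies = get_dummies(df)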
Example No. 7
File: io.py Project: ghuls/polars
def read_json(source: Union[str, BytesIO]) -> DataFrame:
    """
    Read into a DataFrame from JSON format.

    Parameters
    ----------
    source
        Path to a file or a file-like object.
    """
    return DataFrame._read_json(source)
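
A short usage sketch (hypothetical path; the same bytes wrapped in a ``BytesIO`` buffer are accepted as well):

from io import BytesIO

df = read_json("data.json")
with open("data.json", "rb") as f:
    df = read_json(BytesIO(f.read()))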
Example No. 8
def read_json(source: Union[str, IOBase], json_lines: bool = False) -> DataFrame:
    """
    Read into a DataFrame from JSON format.

    Parameters
    ----------
    source
        Path to a file or a file-like object.
    json_lines
        Toggle between "JSON" and "NDJSON" (newline-delimited JSON) formats.
    """
    return DataFrame._read_json(source, json_lines)
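
Usage sketch (hypothetical file; with ``json_lines=True`` the source is read as newline-delimited JSON):

# NDJSON: one JSON object per line.
df = read_json("data.ndjson", json_lines=True)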
Example No. 9
def from_numpy(
    data: np.ndarray,
    columns: Sequence[str] | None = None,
    orient: Literal["col", "row"] | None = None,
) -> DataFrame:
    """
    Construct a DataFrame from a numpy ndarray.

    Note that this is slower than creating from columnar memory.

    Parameters
    ----------
    data : numpy ndarray
        Two-dimensional data represented as a numpy ndarray.
    columns : Sequence of str, default None
        Column labels to use for resulting DataFrame. Must match data dimensions.
        If not specified, columns will be named `column_0`, `column_1`, etc.
    orient : {'col', 'row'}, default None
        Whether to interpret two-dimensional data as columns or as rows. If None,
        the orientation is inferred by matching the columns and data dimensions. If
        this does not yield conclusive results, column orientation is used.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> import numpy as np
    >>> data = np.array([[1, 2, 3], [4, 5, 6]])
    >>> df = pl.from_numpy(data, columns=["a", "b"], orient="col")
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 5   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 3   ┆ 6   │
    └─────┴─────┘

    """
    if not _NUMPY_AVAILABLE:
        raise ImportError("'numpy' is required when using from_numpy().")
    return DataFrame._from_numpy(data, columns=columns, orient=orient)
Example No. 10
def from_dict(
    data: Mapping[str, Union[Sequence, Mapping]],
    columns: Optional[Sequence[str]] = None,
) -> DataFrame:
    """
    Construct a DataFrame from a dictionary of sequences.

    Parameters
    ----------
    data : dict of sequences
        Two-dimensional data represented as a dictionary. dict must contain
        Sequences.
    columns : Sequence of str, default None
        Column labels to use for resulting DataFrame. If specified, overrides any
        labels already present in the data. Must match data dimensions.

    Returns
    -------
    DataFrame

    Examples
    --------

    >>> data = {"a": [1, 2], "b": [3, 4]}
    >>> df = pl.from_dict(data)
    >>> df
    shape: (2, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 3   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 4   │
    └─────┴─────┘

    """
    # To deal with structs, we have to modify the data, but we don't want to
    # modify `data` directly. Thus we create a separate dict, and only do so
    # for the fields that need it, to save memory.
    data_struct = dict()
    for col_name, value in data.items():
        if isinstance(value, dict):
            data_struct[col_name] = from_dict(value).to_struct(col_name)

    return DataFrame._from_dict(data=dict(data, **data_struct),
                                columns=columns)  # type: ignore
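
A hedged sketch of the nested-mapping path handled above (toy data; relies on ``to_struct`` exactly as the code does):

# The nested dict under "s" becomes a single struct column named "s"
# with fields "x" and "y"; plain sequences stay ordinary columns.
data = {"a": [1, 2], "s": {"x": [3, 4], "y": [5, 6]}}
df = from_dict(data)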
Example No. 11
def from_records(
    data: Union[np.ndarray, Sequence[Sequence[Any]]],
    columns: Optional[Sequence[str]] = None,
    orient: Optional[str] = None,
) -> DataFrame:
    """
    Construct a DataFrame from a numpy ndarray or sequence of sequences.

    Parameters
    ----------
    data : numpy ndarray or Sequence of sequences
        Two-dimensional data represented as numpy ndarray or sequence of sequences.
    columns : Sequence of str, default None
        Column labels to use for resulting DataFrame. Must match data dimensions.
        If not specified, columns will be named `column_0`, `column_1`, etc.
    orient : {'col', 'row'}, default None
        Whether to interpret two-dimensional data as columns or as rows. If None,
        the orientation is inferred by matching the columns and data dimensions. If
        this does not yield conclusive results, column orientation is used.

    Returns
    -------
    DataFrame

    Examples
    --------

    >>> data = [[1, 2, 3], [4, 5, 6]]
    >>> df = pl.from_records(data, columns=["a", "b"])
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 5   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 3   ┆ 6   │
    └─────┴─────┘

    """
    return DataFrame._from_records(data, columns=columns, orient=orient)
Example No. 12
def from_dict(
    data: Dict[str, Sequence[Any]],
    columns: Optional[Sequence[str]] = None,
) -> DataFrame:
    """
    Construct a DataFrame from a dictionary of sequences.

    Parameters
    ----------
    data : dict of sequences
        Two-dimensional data represented as a dictionary. dict must contain
        Sequences.
    columns : Sequence of str, default None
        Column labels to use for resulting DataFrame. If specified, overrides any
        labels already present in the data. Must match data dimensions.

    Returns
    -------
    DataFrame

    Examples
    --------

    >>> data = {"a": [1, 2], "b": [3, 4]}
    >>> df = pl.from_dict(data)
    >>> df
    shape: (2, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 3   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 4   │
    └─────┴─────┘

    """
    return DataFrame._from_dict(data=data, columns=columns)
Example No. 13
def from_dicts(dicts: Sequence[dict[str, Any]],
               infer_schema_length: int | None = 50) -> DataFrame:
    """
    Construct a DataFrame from a sequence of dictionaries.

    Parameters
    ----------
    dicts
        Sequence with dictionaries mapping column name to value
    infer_schema_length
        How many dictionaries/rows to scan to determine the data types.
        If set to ``None``, all rows are scanned. This will be slow.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]
    >>> df = pl.from_dicts(data)
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 5   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 3   ┆ 6   │
    └─────┴─────┘

    """
    return DataFrame._from_dicts(dicts, infer_schema_length)
Example No. 14
def read_parquet(
    source: Union[str, Path, BinaryIO, BytesIO, bytes],
    columns: Optional[Union[List[int], List[str]]] = None,
    n_rows: Optional[int] = None,
    use_pyarrow: bool = False,
    memory_map: bool = True,
    storage_options: Optional[Dict] = None,
    parallel: bool = True,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    **kwargs: Any,
) -> DataFrame:
    """
    Read into a DataFrame from a parquet file.

    Parameters
    ----------
    source
        Path to a file or a file-like object. If the path is a directory, it
        will be read with a partition-aware scan.
        If ``fsspec`` is installed, it will be used to open remote files.
    columns
        Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.
    n_rows
        Stop reading from parquet file after reading ``n_rows``.
        Only valid when `use_pyarrow=False`.
    use_pyarrow
        Use pyarrow instead of the native Rust parquet reader. The pyarrow reader is more stable.
    memory_map
        Memory map underlying file. This will likely increase performance.
        Only used when ``use_pyarrow=True``.
    storage_options
        Extra options that make sense for ``fsspec.open()`` or a particular storage connection, e.g. host, port, username, password, etc.
    parallel
        Read the parquet file in parallel. The single threaded reader consumes less memory.
    row_count_name
        If not None, this will insert a row count column with the given name into the DataFrame
    row_count_offset
        Offset to start the row_count column (only used if the name is set)
    **kwargs
        kwargs for [pyarrow.parquet.read_table](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html)

    Returns
    -------
    DataFrame
    """

    # Map legacy arguments to current ones and remove them from kwargs.
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    if columns is None:
        columns = kwargs.pop("projection", None)

    if use_pyarrow:
        if n_rows:
            raise ValueError(
                "``n_rows`` cannot be used with ``use_pyarrow=True``.")

    storage_options = storage_options or {}
    with _prepare_file_arg(source, **storage_options) as source_prep:
        if use_pyarrow:
            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    "'pyarrow' is required when using 'read_parquet(..., use_pyarrow=True)'."
                )

            return from_arrow(  # type: ignore[return-value]
                pa.parquet.read_table(
                    source_prep,
                    memory_map=memory_map,
                    columns=columns,
                    **kwargs,
                ))

        return DataFrame._read_parquet(
            source_prep,
            columns=columns,
            n_rows=n_rows,
            parallel=parallel,
            row_count_name=row_count_name,
            row_count_offset=row_count_offset,
        )
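
A short usage sketch (hypothetical file; shows both reader paths):

# Native reader: select columns, cap rows, and add a row-count column.
df = read_parquet("data.parquet", columns=["a", "b"], n_rows=1000,
                  row_count_name="row_nr")

# pyarrow reader (memory-mapped); note that n_rows is rejected here.
df = read_parquet("data.parquet", use_pyarrow=True, memory_map=True)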
Example No. 15
def read_ipc(
    file: Union[str, BinaryIO, BytesIO, Path, bytes],
    columns: Optional[Union[List[int], List[str]]] = None,
    n_rows: Optional[int] = None,
    use_pyarrow: bool = False,
    memory_map: bool = True,
    storage_options: Optional[Dict] = None,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    rechunk: bool = True,
    **kwargs: Any,
) -> DataFrame:
    """
    Read into a DataFrame from Arrow IPC (Feather v2) file.

    Parameters
    ----------
    file
        Path to a file or a file-like object.
        If ``fsspec`` is installed, it will be used to open remote files.
    columns
        Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.
    n_rows
        Stop reading from IPC file after reading ``n_rows``.
        Only valid when `use_pyarrow=False`.
    use_pyarrow
        Use pyarrow or the native rust reader.
    memory_map
        Memory map underlying file. This will likely increase performance.
        Only used when ``use_pyarrow=True``.
    storage_options
        Extra options that make sense for ``fsspec.open()`` or a particular storage connection, e.g. host, port, username, password, etc.
    row_count_name
        If not None, this will insert a row count column with the given name into the DataFrame
    row_count_offset
        Offset to start the row_count column (only used if the name is set)
    rechunk
        Make sure that all data is contiguous.

    Returns
    -------
    DataFrame
    """

    # Map legacy arguments to current ones and remove them from kwargs.
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    if columns is None:
        columns = kwargs.pop("projection", None)

    if use_pyarrow:
        if row_count_name is not None:
            raise ValueError(
                "``row_count_name`` cannot be used with ``use_pyarrow=True``.")
        if n_rows:
            raise ValueError(
                "``n_rows`` cannot be used with ``use_pyarrow=True``.")

    storage_options = storage_options or {}
    with _prepare_file_arg(file, **storage_options) as data:
        if use_pyarrow:
            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    "'pyarrow' is required when using 'read_ipc(..., use_pyarrow=True)'."
                )

            tbl = pa.feather.read_table(data,
                                        memory_map=memory_map,
                                        columns=columns)
            return DataFrame._from_arrow(tbl, rechunk=rechunk)

        return DataFrame._read_ipc(
            data,
            columns=columns,
            n_rows=n_rows,
            row_count_name=row_count_name,
            row_count_offset=row_count_offset,
            rechunk=rechunk,
        )
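
Usage sketch (hypothetical Arrow IPC / Feather v2 file):

# Native reader with a column selection and a row limit.
df = read_ipc("data.arrow", columns=["a", "b"], n_rows=500)

# pyarrow reader; n_rows and row_count_name are rejected on this path.
df = read_ipc("data.arrow", use_pyarrow=True, memory_map=True)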
Example No. 16
def read_csv(
    file: Union[str, TextIO, BytesIO, Path, BinaryIO, bytes],
    has_header: bool = True,
    columns: Optional[Union[List[int], List[str]]] = None,
    new_columns: Optional[List[str]] = None,
    sep: str = ",",
    comment_char: Optional[str] = None,
    quote_char: Optional[str] = r'"',
    skip_rows: int = 0,
    dtypes: Optional[Union[Mapping[str, Type[DataType]],
                           List[Type[DataType]]]] = None,
    null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
    ignore_errors: bool = False,
    parse_dates: bool = False,
    n_threads: Optional[int] = None,
    infer_schema_length: Optional[int] = 100,
    batch_size: int = 8192,
    n_rows: Optional[int] = None,
    encoding: str = "utf8",
    low_memory: bool = False,
    rechunk: bool = True,
    use_pyarrow: bool = False,
    storage_options: Optional[Dict] = None,
    skip_rows_after_header: int = 0,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    sample_size: int = 1024,
    **kwargs: Any,
) -> DataFrame:
    """
    Read a CSV file into a DataFrame.

    Parameters
    ----------
    file
        Path to a file or a file-like object.
        By file-like object, we refer to objects with a ``read()``
        method, such as a file handler (e.g. via builtin ``open``
        function) or ``StringIO`` or ``BytesIO``.
        If ``fsspec`` is installed, it will be used to open remote
        files.
    has_header
        Indicate if the first row of dataset is a header or not.
        If set to False, column names will be autogenerated in the
        following format: ``column_x``, with ``x`` being an
        enumeration over every column in the dataset starting at 1.
    columns
        Columns to select. Accepts a list of column indices (starting
        at zero) or a list of column names.
    new_columns
        Rename columns right after parsing the CSV file. If the given
        list is shorter than the width of the DataFrame the remaining
        columns will have their original name.
    sep
        Single byte character to use as delimiter in the file.
    comment_char
        Single byte character that indicates the start of a comment line,
        for instance ``#``.
    quote_char
        Single byte character used for csv quoting, default = ``"``.
        Set to None to turn off special handling and escaping of quotes.
    skip_rows
        Start reading after ``skip_rows`` lines.
    dtypes
        Overwrite dtypes during inference.
    null_values
        Values to interpret as null values. You can provide a:
          - ``str``: All values equal to this string will be null.
          - ``List[str]``: A null value per column.
          - ``Dict[str, str]``: A dictionary that maps column name to a
                                null value string.
    ignore_errors
        Try to keep reading lines if some lines yield errors.
        First try ``infer_schema_length=0`` to read all columns as
        ``pl.Utf8`` to check which values might cause an issue.
    parse_dates
        Try to automatically parse dates. If this does not succeed,
        the column remains of data type ``pl.Utf8``.
    n_threads
        Number of threads to use in csv parsing.
        Defaults to the number of physical CPUs in your system.
    infer_schema_length
        Maximum number of lines to read to infer schema.
        If set to 0, all columns will be read as ``pl.Utf8``.
        If set to ``None``, a full table scan will be done (slow).
    batch_size
        Number of lines to read into the buffer at once.
        Modify this to change performance.
    n_rows
        Stop reading from CSV file after reading ``n_rows``.
        During multi-threaded parsing, an upper bound of ``n_rows``
        rows cannot be guaranteed.
    encoding
        Allowed encodings: ``utf8`` or ``utf8-lossy``.
        Lossy means that invalid utf8 values are replaced with ``�``
        characters.
    low_memory
        Reduce memory usage at expense of performance.
    rechunk
        Make sure that all columns are contiguous in memory by
        aggregating the chunks into a single array.
    use_pyarrow
        Try to use pyarrow's native CSV parser.
        This is not always possible. The set of arguments given to
        this function determines if it is possible to use pyarrow's
        native parser. Note that pyarrow and polars may have a
        different strategy regarding type inference.
    storage_options
        Extra options that make sense for ``fsspec.open()`` or a
        particular storage connection.
        e.g. host, port, username, password, etc.
    skip_rows_after_header
        Skip this number of rows after the header is parsed
    row_count_name
        If not None, this will insert a row count column with the given name into the DataFrame
    row_count_offset
        Offset to start the row_count column (only used if the name is set)
    sample_size
        Set the sample size. This is used to sample statistics to estimate the allocation needed.

    Returns
    -------
    DataFrame
    """

    # Map legacy arguments to current ones and remove them from kwargs.
    has_header = kwargs.pop("has_headers", has_header)
    dtypes = kwargs.pop("dtype", dtypes)
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    if columns is None:
        columns = kwargs.pop("projection", None)

    _check_arg_is_1byte("sep", sep, False)
    _check_arg_is_1byte("comment_char", comment_char, False)
    _check_arg_is_1byte("quote_char", quote_char, True)

    projection, columns = handle_projection_columns(columns)

    if isinstance(file, bytes) and len(file) == 0:
        raise ValueError("Empty bytes data provided.")

    storage_options = storage_options or {}

    if columns and not has_header:
        for column in columns:
            if isinstance(column, str) and not column.startswith("column_"):
                raise ValueError(
                    'Specified column names do not start with "column_", '
                    "but autogenerated header names were requested.")

    if use_pyarrow and not _PYARROW_AVAILABLE:
        raise ImportError(
            "'pyarrow' is required when using 'read_csv(..., use_pyarrow=True)'."
        )

    if (use_pyarrow and dtypes is None and n_rows is None and n_threads is None
            and encoding == "utf8" and not low_memory and null_values is None
            and parse_dates):
        include_columns = None

        if columns:
            if not has_header:
                # Convert 'column_1', 'column_2', ... column names to 'f0', 'f1', ... column names for pyarrow,
                # if the CSV file does not contain a header.
                include_columns = [
                    f"f{int(column[7:]) - 1}" for column in columns
                ]
            else:
                include_columns = columns

        if not columns and projection:
            # Convert column indices from projection to 'f0', 'f1', ... column names for pyarrow.
            include_columns = [f"f{column_idx}" for column_idx in projection]

        with _prepare_file_arg(file, **storage_options) as data:
            tbl = pa.csv.read_csv(
                data,
                pa.csv.ReadOptions(skip_rows=skip_rows,
                                   autogenerate_column_names=not has_header),
                pa.csv.ParseOptions(delimiter=sep),
                pa.csv.ConvertOptions(
                    column_types=None,
                    include_columns=include_columns,
                    include_missing_columns=ignore_errors,
                ),
            )

        if not has_header:
            # Rename 'f0', 'f1', ... column names autogenerated by pyarrow to 'column_1', 'column_2', ...
            tbl = tbl.rename_columns([
                f"column_{int(column[1:]) + 1}" for column in tbl.column_names
            ])

        df = cast(DataFrame, from_arrow(tbl, rechunk))
        if new_columns:
            return update_columns(df, new_columns)
        return df

    if new_columns and dtypes and isinstance(dtypes, dict):
        current_columns = None

        # As new column names are not available yet while parsing the CSV file, rename column names in
        # dtypes to old names (if possible) so they can be used during CSV parsing.
        if columns:
            if len(columns) < len(new_columns):
                raise ValueError(
                    "More new column names are specified than there are selected columns."
                )

            # Get column names of requested columns.
            current_columns = columns[0:len(new_columns)]
        elif not has_header:
            # When there is no header, column names are autogenerated (and known).

            if projection:
                if columns and len(columns) < len(new_columns):
                    raise ValueError(
                        "More new column names are specified than there are selected columns."
                    )
                # Convert column indices from projection to 'column_1', 'column_2', ... column names.
                current_columns = [
                    f"column_{column_idx + 1}" for column_idx in projection
                ]
            else:
                # Generate autogenerated 'column_1', 'column_2', ... column names for new column names.
                current_columns = [
                    f"column_{column_idx}"
                    for column_idx in range(1,
                                            len(new_columns) + 1)
                ]
        else:
            # When a header is present, column names are not known yet.

            if len(dtypes) <= len(new_columns):
                # If the dtypes dictionary contains no more entries than there
                # are new column names, a list of dtypes can be created,
                # provided every column name listed in the dtypes dictionary
                # appears among the first consecutive new column names.
                dtype_list = [
                    dtypes[new_column_name]
                    for new_column_name in new_columns[0:len(dtypes)]
                    if new_column_name in dtypes
                ]

                if len(dtype_list) == len(dtypes):
                    dtypes = dtype_list

        if current_columns and isinstance(dtypes, dict):
            new_to_current = {
                new_column: current_column
                for new_column, current_column in zip(new_columns,
                                                      current_columns)
            }
            # Change new column names to current column names in dtype.
            dtypes = {
                new_to_current.get(column_name, column_name): column_dtype
                for column_name, column_dtype in dtypes.items()
            }

    with _prepare_file_arg(file, **storage_options) as data:
        df = DataFrame._read_csv(
            file=data,
            has_header=has_header,
            columns=columns if columns else projection,
            sep=sep,
            comment_char=comment_char,
            quote_char=quote_char,
            skip_rows=skip_rows,
            dtypes=dtypes,
            null_values=null_values,
            ignore_errors=ignore_errors,
            parse_dates=parse_dates,
            n_threads=n_threads,
            infer_schema_length=infer_schema_length,
            batch_size=batch_size,
            n_rows=n_rows,
            encoding=encoding,
            low_memory=low_memory,
            rechunk=rechunk,
            skip_rows_after_header=skip_rows_after_header,
            row_count_name=row_count_name,
            row_count_offset=row_count_offset,
            sample_size=sample_size,
        )

    if new_columns:
        return update_columns(df, new_columns)
    return df
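
A brief usage sketch (hypothetical file; illustrates the ``new_columns``/``dtypes`` remapping handled above, where the dtypes dict may be keyed by the new names):

import polars as pl

# "id" is the first renamed column; the logic above converts the dict
# into a positional dtype list before parsing.
df = read_csv(
    "data.csv",
    new_columns=["id", "value"],
    dtypes={"id": pl.Int64},
)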
Example No. 17
def from_arrow(a: Union["pa.Table", "pa.Array", "pa.ChunkedArray"],
               rechunk: bool = True) -> Union[DataFrame, Series]:
    """
    Create a DataFrame or Series from an Arrow Table or Array.

    This operation will be zero copy for the most part. Types that are not
    supported by Polars may be cast to the closest supported type.

    Parameters
    ----------
    a : Arrow Table or Array
        Data represented as Arrow Table or Array.
    rechunk : bool, default True
        Make sure that all data is contiguous.

    Returns
    -------
    DataFrame or Series

    Examples
    --------
    Constructing a DataFrame from an Arrow Table:

    >>> import pyarrow as pa
    >>> data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    >>> df = pl.from_arrow(data)
    >>> df
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 4   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 5   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 3   ┆ 6   │
    └─────┴─────┘

    Constructing a Series from an Arrow Array:

    >>> import pyarrow as pa
    >>> data = pa.array([1, 2, 3])
    >>> series = pl.from_arrow(data)
    >>> series
    shape: (3,)
    Series: '' [i64]
    [
        1
        2
        3
    ]

    """
    if not _PYARROW_AVAILABLE:
        raise ImportError("'pyarrow' is required when using from_arrow()."
                          )  # pragma: no cover
    if isinstance(a, pa.Table):
        return DataFrame._from_arrow(a, rechunk=rechunk)
    elif isinstance(a, (pa.Array, pa.ChunkedArray)):
        return Series._from_arrow("", a, rechunk)
    else:
        raise ValueError(f"Expected Arrow Table or Array, got {type(a)}.")
Example No. 18
def from_pandas(
    df: Union["pd.DataFrame", "pd.Series", "pd.DatetimeIndex"],
    rechunk: bool = True,
    nan_to_none: bool = True,
) -> Union[DataFrame, Series]:
    """
    Construct a Polars DataFrame or Series from a pandas DataFrame or Series.

    Requires the pandas package to be installed.

    Parameters
    ----------
    df : pandas DataFrame, Series, or DatetimeIndex
        Data represented as a pandas DataFrame, Series, or DatetimeIndex.
    rechunk : bool, default True
        Make sure that all data is contiguous.
    nan_to_none : bool, default True
        If the data contains NaN values, PyArrow will convert them to None.

    Returns
    -------
    DataFrame or Series

    Examples
    --------
    Constructing a DataFrame from a pandas DataFrame:

    >>> import pandas as pd
    >>> pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    >>> df = pl.from_pandas(pd_df)
    >>> df
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 2   ┆ 3   │
    ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
    │ 4   ┆ 5   ┆ 6   │
    └─────┴─────┴─────┘

    Constructing a Series from a pandas Series:

    >>> import pandas as pd
    >>> pd_series = pd.Series([1, 2, 3], name="pd")
    >>> series = pl.from_pandas(pd_series)
    >>> series
    shape: (3,)
    Series: 'pd' [i64]
    [
        1
        2
        3
    ]

    """
    try:
        import pandas as pd
    except ImportError as e:  # pragma: no cover
        raise ImportError(
            "'pandas' is required when using from_pandas().") from e

    if isinstance(df, (pd.Series, pd.DatetimeIndex)):
        return Series._from_pandas("", df, nan_to_none=nan_to_none)
    elif isinstance(df, pd.DataFrame):
        return DataFrame._from_pandas(df,
                                      rechunk=rechunk,
                                      nan_to_none=nan_to_none)
    else:
        raise ValueError(
            f"Expected pandas DataFrame or Series, got {type(df)}.")