Example #1
    def _read_parquet_columns(path, columns, num_splits,
                              kwargs):  # pragma: no cover
        """Use a Ray task to read columns from Parquet into a Pandas DataFrame.

        Note: Ray functions are not detected by codecov (thus pragma: no cover)

        Args:
            path: The path of the Parquet file.
            columns: The list of column names to read.
            num_splits: The number of partitions to split the columns into.
            kwargs: Keyword arguments to pass to ``pyarrow.parquet.read_table``.

        Returns:
            A list containing the split Pandas DataFrames, followed by the length of
                the index and the dtypes as the last two elements. The index length
                is used to determine the total length of the DataFrame when building
                a default Index.
        """
        import pyarrow.parquet as pq

        kwargs["use_pandas_metadata"] = True
        df = pq.read_table(path, columns=columns, **kwargs).to_pandas()
        df = df[columns]
        # Append the length of the index here to build it externally
        return _split_result_for_readers(0, num_splits,
                                         df) + [len(df.index), df.dtypes]
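
All of the readers on this page hand the actual partitioning to _split_result_for_readers, which is not shown here. A minimal sketch of what such a helper could look like, assuming it simply slices the frame into num_splits roughly equal chunks along the given axis; the real Modin helper may handle more edge cases:

import numpy as np


def _split_result_for_readers(axis, num_splits, df):
    # Sketch only: split ``df`` into ``num_splits`` roughly equal chunks.
    # axis=0 slices by rows (the Parquet and Feather readers pass 0),
    # axis=1 slices by columns (the SQL readers pass 1).
    if axis == 0:
        chunks = np.array_split(np.arange(len(df)), num_splits)
        return [df.iloc[chunk] for chunk in chunks]
    chunks = np.array_split(np.arange(len(df.columns)), num_splits)
    return [df.iloc[:, chunk] for chunk in chunks]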
Example #2
def _read_parquet_columns(path, columns, num_splits,
                          kwargs):  # pragma: no cover
    """
    Read columns from Parquet file into a ``pandas.DataFrame`` using Ray task.

    Parameters
    ----------
    path : str or List[str]
        The path of the Parquet file.
    columns : List[str]
        The list of column names to read.
    num_splits : int
        The number of partitions to split the columns into.
    kwargs : dict
        Keyword arguments to pass to the ``pyarrow.parquet.ParquetDataset`` constructor.

    Returns
    -------
    list
        A list containing the split ``pandas.DataFrame`` objects and the length of
        the index as the last element.

    Notes
    -----
    ``pyarrow.parquet.ParquetDataset.read`` is used internally as the parse function.
    """
    import pyarrow.parquet as pq

    df = (
        pq.ParquetDataset(path, **kwargs)
        .read(columns=columns, use_pandas_metadata=True)
        .to_pandas()
    )
    df = df[columns]
    # Append the length of the index here to build it externally
    return _split_result_for_readers(0, num_splits, df) + [len(df.index)]
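
In Modin these readers run as Ray remote functions; the dispatch code is omitted from the examples. A hedged illustration of how such a task might be launched; the decorator option, split count, file path, and column names below are invented for the example and are not Modin's actual driver code:

import ray

ray.init(ignore_reinit_error=True)

num_splits = 4  # illustrative value
# Wrap the plain function as a Ray task that yields num_splits + 1 objects:
# the DataFrame splits plus the index length appended by the reader.
reader = ray.remote(num_returns=num_splits + 1)(_read_parquet_columns)

*split_refs, length_ref = reader.remote(
    "example.parquet",   # hypothetical path
    ["col_a", "col_b"],  # hypothetical column subset
    num_splits,
    {},                  # extra keyword arguments for pyarrow
)
partitions = ray.get(split_refs)  # list of pandas.DataFrame blocks
total_rows = ray.get(length_ref)  # used to build a default Index externally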
Example #3
def _read_sql_with_offset_pandas_on_ray(
    partition_column,
    start,
    end,
    num_splits,
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):  # pragma: no cover
    """Use a Ray task to read a chunk of SQL source.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)
    """

    from .sql import query_put_bounders

    query_with_bounders = query_put_bounders(sql, partition_column, start, end)
    pandas_df = pandas.read_sql(
        query_with_bounders,
        con,
        index_col=index_col,
        coerce_float=coerce_float,
        params=params,
        parse_dates=parse_dates,
        columns=columns,
        chunksize=chunksize,
    )
    index = len(pandas_df)
    return _split_result_for_readers(1, num_splits, pandas_df) + [index]
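
query_put_bounders is imported from the package's sql module and is not shown on this page. Judging from how it is called, it plausibly wraps the original query with a range filter on the partition column; a minimal sketch under that assumption (the SQL shape is illustrative, not necessarily Modin's exact implementation):

def query_put_bounders(query, partition_column, start, end):
    # Sketch only: keep the rows whose partition_column value falls inside
    # the [start, end] window assigned to this worker.
    return (
        f"SELECT * FROM ({query}) AS bounded_query "
        f"WHERE {partition_column} >= {start} AND {partition_column} <= {end}"
    )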
Example #4
    def _read_sql_with_limit_offset(num_splits, sql, con, index_col,
                                    kwargs):  # pragma: no cover
        """Use a Ray task to read a chunk of SQL source.

        Note: Ray functions are not detected by codecov (thus pragma: no cover)
        """
        pandas_df = pandas.read_sql(sql, con, index_col=index_col, **kwargs)
        if index_col is None:
            index = len(pandas_df)
        else:
            index = pandas_df.index
        return _split_result_for_readers(1, num_splits, pandas_df) + [index]
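
Unlike the offset-based variant in Example #3, this reader receives a sql string that already carries its row window. A hedged sketch of how a caller might prepare such per-partition queries; the helper name and SQL shape are hypothetical:

def put_limit_offset(query, limit, offset):
    # Hypothetical helper: page through the source query so that each Ray
    # task reads a disjoint window of rows.
    return f"SELECT * FROM ({query}) AS windowed_query LIMIT {limit} OFFSET {offset}"


# Three partitions of 1000 rows each over a made-up query.
per_partition_sql = [
    put_limit_offset("SELECT * FROM some_table", 1000, i * 1000) for i in range(3)
]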
Example #5
    def _read_feather_columns(path, columns, num_splits):  # pragma: no cover
        """Use a Ray task to read columns from Feather into a Pandas DataFrame.

        Note: Ray functions are not detected by codecov (thus pragma: no cover)

        Args:
            path: The path of the Feather file.
            columns: The list of column names to read.
            num_splits: The number of partitions to split the columns into.

        Returns:
            A list containing the split Pandas DataFrames and the length of the index
                as the last element. The length is used to determine the total length
                of the DataFrame when building a default Index.
        """
        from pyarrow import feather

        df = feather.read_feather(path, columns=columns)
        # Append the length of the index here to build it externally
        return _split_result_for_readers(0, num_splits, df) + [len(df.index)]
Example #6
def _read_sql_with_offset_pandas_on_ray(
    partition_column,
    start,
    end,
    num_splits,
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):  # pragma: no cover
    """
    Read a chunk of SQL query or table into a pandas DataFrame using Ray task.

    Parameters
    ----------
    partition_column : str
        Column name used for data partitioning between the workers.
    start : int
        Lowest value to request from the `partition_column`.
    end : int
        Highest value to request from the `partition_column`.
    num_splits : int
        The number of partitions to split the column into.
    sql : str or SQLAlchemy Selectable (select or text object)
        SQL query to be executed or a table name.
    con : SQLAlchemy connectable or str
        Connection to database (sqlite3 connections are not supported).
    index_col : str or list of str, optional
        Column(s) to set as index (MultiIndex).
    coerce_float : bool, default: True
        Attempts to convert values of non-string, non-numeric objects
        (like decimal.Decimal) to floating point, useful for SQL result sets.
    params : list, tuple or dict, optional
        List of parameters to pass to ``execute`` method. The syntax used
        to pass parameters is database driver dependent. Check your
        database driver documentation for which of the five syntax styles,
        described in PEP 249's paramstyle, is supported.
    parse_dates : list or dict, optional
        The behavior is as follows:

        - List of column names to parse as dates.
        - Dict of `{column_name: format string}` where format string is
          strftime compatible in case of parsing string times, or is one of
          (D, s, ns, ms, us) in case of parsing integer timestamps.
        - Dict of `{column_name: arg dict}`, where the arg dict corresponds
          to the keyword arguments of ``pandas.to_datetime``.
          Especially useful with databases without native Datetime support,
          such as SQLite.
    columns : list, optional
        List of column names to select from SQL table (only used when reading a
        table).
    chunksize : int, optional
        If specified, return an iterator where `chunksize` is the number of rows
        to include in each chunk.

    Returns
    -------
    list
        List with the split read results and their metadata (index, dtypes, etc.).
    """
    from .sql import query_put_bounders

    query_with_bounders = query_put_bounders(sql, partition_column, start, end)
    pandas_df = pandas.read_sql(
        query_with_bounders,
        con,
        index_col=index_col,
        coerce_float=coerce_float,
        params=params,
        parse_dates=parse_dates,
        columns=columns,
        chunksize=chunksize,
    )
    index = len(pandas_df)
    return _split_result_for_readers(1, num_splits, pandas_df) + [index]
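
Across all of these readers the trailing list element carries metadata the driver needs to assemble the global frame; when no index column is set, it is the per-task row count. A hedged sketch of how a caller might build a default index from those counts (names are illustrative, not Modin's actual driver code):

import pandas


def build_default_index(lengths):
    # Illustrative only: sum the row counts reported by each reader task and
    # build one RangeIndex covering the whole frame.
    return pandas.RangeIndex(sum(lengths))


# e.g. three SQL chunks that reported 1000, 1000 and 512 rows
index = build_default_index([1000, 1000, 512])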