def _read_parquet_columns(path, columns, num_splits, kwargs):  # pragma: no cover
    """Use a Ray task to read columns from Parquet into a Pandas DataFrame.

    Note: Ray functions are not detected by codecov (thus pragma: no cover).

    Args:
        path: The path of the Parquet file.
        columns: The list of column names to read.
        num_splits: The number of partitions to split the column into.
        kwargs: Keyword arguments to pass to ``pyarrow.parquet.read_table``.

    Returns:
        A list containing the split Pandas DataFrames, with the length of
        the index and the dtypes appended as the last two elements. The
        length is used to determine the total length of the DataFrame to
        build a default Index.
    """
    import pyarrow.parquet as pq

    kwargs["use_pandas_metadata"] = True
    df = pq.read_table(path, columns=columns, **kwargs).to_pandas()
    df = df[columns]
    # Append the length of the index here to build it externally
    return _split_result_for_readers(0, num_splits, df) + [len(df.index), df.dtypes]
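# ``_split_result_for_readers`` is called by every reader here but defined
# elsewhere in the I/O layer. A minimal sketch of its plausible behavior,
# reconstructed from the call sites (the name below is a hypothetical
# stand-in, and the real helper may handle more edge cases): slice the
# freshly read frame into ``num_splits`` roughly equal pieces along ``axis``.
def _split_result_for_readers_sketch(axis, num_splits, df):
    """Hypothetical stand-in: split ``df`` into ``num_splits`` frames."""
    import numpy as np

    if axis == 0:
        # Split row-wise: partition the positional index into chunks.
        chunks = np.array_split(np.arange(len(df)), num_splits)
        return [df.iloc[chunk] for chunk in chunks]
    # Split column-wise.
    chunks = np.array_split(np.arange(len(df.columns)), num_splits)
    return [df.iloc[:, chunk] for chunk in chunks]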
def _read_parquet_columns(path, columns, num_splits, kwargs):  # pragma: no cover
    """
    Read columns from Parquet file into a ``pandas.DataFrame`` using Ray task.

    Parameters
    ----------
    path : str or List[str]
        The path of the Parquet file.
    columns : List[str]
        The list of column names to read.
    num_splits : int
        The number of partitions to split the column into.
    kwargs : dict
        Keyword arguments to pass into ``pyarrow.parquet.ParquetDataset``.

    Returns
    -------
    list
        A list containing the split ``pandas.DataFrame``-s and the Index
        as the last element.

    Notes
    -----
    ``pyarrow.parquet.ParquetDataset.read`` is used internally as the parse
    function.
    """
    import pyarrow.parquet as pq

    df = (
        pq.ParquetDataset(path, **kwargs)
        .read(columns=columns, use_pandas_metadata=True)
        .to_pandas()
    )
    df = df[columns]
    # Append the length of the index here to build it externally
    return _split_result_for_readers(0, num_splits, df) + [len(df.index)]
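# A minimal dispatch sketch, not Modin's actual driver wiring: the reader
# above is a plain function until wrapped with ``ray.remote``, and it
# yields ``num_splits`` partitions plus the index length, so the caller
# asks Ray for that many object refs via ``options(num_returns=...)``
# (``num_return_vals`` on older Ray releases). The helper name is
# hypothetical.
def _deploy_parquet_read_sketch(path, columns, num_splits, kwargs):
    """Hypothetical helper: launch _read_parquet_columns as a Ray task."""
    import ray

    reader = ray.remote(_read_parquet_columns)
    # num_splits partition refs + one ref for len(df.index) at the end.
    return reader.options(num_returns=num_splits + 1).remote(
        path, columns, num_splits, kwargs
    )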
def _read_sql_with_limit_offset(
    num_splits, sql, con, index_col, kwargs
):  # pragma: no cover
    """Use a Ray task to read a chunk of SQL source.

    Note: Ray functions are not detected by codecov (thus pragma: no cover).
    """
    pandas_df = pandas.read_sql(sql, con, index_col=index_col, **kwargs)
    if index_col is None:
        index = len(pandas_df)
    else:
        index = pandas_df.index
    return _split_result_for_readers(1, num_splits, pandas_df) + [index]
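# How the windowed queries are produced is not shown in this section. A
# driver sketch under stated assumptions (``read_sql_in_windows`` and the
# subquery template are hypothetical, not Modin's API): the table is carved
# into LIMIT/OFFSET windows and each window becomes one Ray task.
def read_sql_in_windows(sql, con, row_count, num_windows, num_splits):
    """Hypothetical driver: one _read_sql_with_limit_offset task per window."""
    import ray

    reader = ray.remote(_read_sql_with_limit_offset)
    window = (row_count + num_windows - 1) // num_windows  # ceiling division
    refs = []
    for i in range(num_windows):
        windowed_sql = "SELECT * FROM ({}) AS t LIMIT {} OFFSET {}".format(
            sql, window, i * window
        )
        refs.append(
            reader.options(num_returns=num_splits + 1).remote(
                num_splits, windowed_sql, con, None, {}
            )
        )
    return refs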
def _read_feather_columns(path, columns, num_splits):  # pragma: no cover
    """Use a Ray task to read columns from Feather into a Pandas DataFrame.

    Note: Ray functions are not detected by codecov (thus pragma: no cover).

    Args:
        path: The path of the Feather file.
        columns: The list of column names to read.
        num_splits: The number of partitions to split the column into.

    Returns:
        A list containing the split Pandas DataFrames and the length of
        the index as the last element. The length is used to determine the
        total length of the DataFrame to build a default Index.
    """
    from pyarrow import feather

    df = feather.read_feather(path, columns=columns)
    # Append the length of the index here to build it externally
    return _split_result_for_readers(0, num_splits, df) + [len(df.index)]
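# Illustrative local call, no Ray involved ("data.feather" and the column
# names are placeholders, not part of this module): the result is num_splits
# DataFrames followed by the row count, which the caller uses to build a
# default RangeIndex.
#
#   parts = _read_feather_columns("data.feather", ["a", "b"], num_splits=4)
#   *splits, row_count = parts
#   assert sum(len(split) for split in splits) == row_count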
def _read_sql_with_offset_pandas_on_ray(
    partition_column,
    start,
    end,
    num_splits,
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):  # pragma: no cover
    """
    Read a chunk of SQL query or table into a pandas DataFrame using Ray task.

    Parameters
    ----------
    partition_column : str
        Column name used for data partitioning between the workers.
    start : int
        Lowest value to request from the `partition_column`.
    end : int
        Highest value to request from the `partition_column`.
    num_splits : int
        The number of partitions to split the column into.
    sql : str or SQLAlchemy Selectable (select or text object)
        SQL query to be executed or a table name.
    con : SQLAlchemy connectable or str
        Connection to database (sqlite3 connections are not supported).
    index_col : str or list of str, optional
        Column(s) to set as index (MultiIndex).
    coerce_float : bool, default: True
        Attempts to convert values of non-string, non-numeric objects
        (like decimal.Decimal) to floating point, useful for SQL result
        sets.
    params : list, tuple or dict, optional
        List of parameters to pass to the ``execute`` method. The syntax
        used to pass parameters is database driver dependent. Check your
        database driver documentation for which of the five syntax styles,
        described in PEP 249's paramstyle, is supported.
    parse_dates : list or dict, optional
        The behavior is as follows:

        - List of column names to parse as dates.
        - Dict of `{column_name: format string}` where format string is
          strftime compatible in case of parsing string times, or is one of
          (D, s, ns, ms, us) in case of parsing integer timestamps.
        - Dict of `{column_name: arg dict}`, where the arg dict corresponds
          to the keyword arguments of ``pandas.to_datetime``. Especially
          useful with databases without native Datetime support, such as
          SQLite.
    columns : list, optional
        List of column names to select from SQL table (only used when
        reading a table).
    chunksize : int, optional
        If specified, return an iterator where `chunksize` is the number of
        rows to include in each chunk.

    Returns
    -------
    list
        List with split read results and their metadata (index, dtypes,
        etc.).
    """
    from .sql import query_put_bounders

    query_with_bounders = query_put_bounders(sql, partition_column, start, end)
    pandas_df = pandas.read_sql(
        query_with_bounders,
        con,
        index_col=index_col,
        coerce_float=coerce_float,
        params=params,
        parse_dates=parse_dates,
        columns=columns,
        chunksize=chunksize,
    )
    index = len(pandas_df)
    return _split_result_for_readers(1, num_splits, pandas_df) + [index]
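# ``query_put_bounders`` is imported from .sql above but not shown in this
# section. A sketch of its likely shape, reconstructed from how it is
# called (the real helper may quote identifiers and handle dialect
# differences): wrap the original query so each worker only fetches rows
# whose partition_column value falls in [start, end].
def query_put_bounders_sketch(query, partition_column, start, end):
    """Hypothetical stand-in for .sql.query_put_bounders."""
    return (
        "SELECT * FROM ({0}) AS tmp "
        "WHERE tmp.{1} >= {2} AND tmp.{1} <= {3}".format(
            query, partition_column, start, end
        )
    )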