Example #1
0
def _read_json(fname, num_splits, start, end, kwargs):  # pragma: no cover
    """Use a Ray task to read a chunk of a JSON into a Pandas dataframe.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the Pandas `read_json` function. Must contain a
            "columns" key holding the expected column Index; may contain
            "compression" (defaults to "infer").

    Returns:
        A list containing the split Pandas DataFrames, followed by the row
        count, the dtypes, and the original column labels.

    Raises:
        NotImplementedError: If this chunk parses to columns different from
            the expected `columns` popped from `kwargs`.
    """
    # Pop "compression" so pandas doesn't try to decompress again: `file_open`
    # already yields uncompressed bytes.
    bio = file_open(fname, "rb", kwargs.pop("compression", "infer"))
    try:
        to_read = bio.read(end - start) if bio.seek(start) is None else bio.read(end - start)
    finally:
        # Close the handle even when seek/read raises, so it is never leaked.
        bio.close()
    columns = kwargs.pop("columns")
    pandas_df = pandas.read_json(BytesIO(to_read), **kwargs)
    if not pandas_df.columns.equals(columns):
        raise NotImplementedError("Columns must be the same across all rows.")
    # Partitions carry a positional RangeIndex for columns; the real labels
    # are returned separately as the last list element.
    partition_columns = pandas_df.columns
    pandas_df.columns = pandas.RangeIndex(len(pandas_df.columns))
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        len(pandas_df),
        pandas_df.dtypes,
        partition_columns,
    ]
Example #2
0
def _read_csv_with_offset_pandas_on_ray(
    fname, num_splits, start, end, kwargs, header
):  # pragma: no cover
    """Use a Ray task to read a chunk of a CSV into a Pandas DataFrame.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the Pandas `read_csv` function. May contain
            "index_col" and "compression" (defaults to "infer").
        header: The header bytes of the file, prepended to the chunk so each
            partition parses with the same column names.

    Returns:
         A list containing the split Pandas DataFrames and the Index as the last
            element. If there is not `index_col` set, then we just return the length.
            This is used to determine the total length of the DataFrame to build a
            default Index.
    """
    index_col = kwargs.get("index_col", None)
    # pop "compression" from kwargs because bio is uncompressed
    bio = file_open(fname, "rb", kwargs.pop("compression", "infer"))
    try:
        bio.seek(start)
        to_read = header + bio.read(end - start)
    finally:
        # Always release the file handle, even if seek/read raises.
        bio.close()
    pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
    # Columns become a positional RangeIndex; the caller restores real labels.
    pandas_df.columns = pandas.RangeIndex(len(pandas_df.columns))
    if index_col is not None:
        index = pandas_df.index
        # Partitions must have RangeIndex
        pandas_df.index = pandas.RangeIndex(0, len(pandas_df))
    else:
        # We will use the lengths to build the index if we are not given an
        # `index_col`.
        index = len(pandas_df)
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        index,
        pandas_df.dtypes,
    ]