def _read_json(fname, num_splits, start, end, kwargs):  # pragma: no cover
    """Use a Ray task to read a chunk of a JSON file into a Pandas DataFrame.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the Pandas `read_json` function.

    Returns:
        A list containing the split Pandas DataFrames, followed by the number of
            rows, the dtypes, and the column Index.
    """
    bio = file_open(fname, "rb", kwargs.pop("compression", "infer"))
    bio.seek(start)
    to_read = b"" + bio.read(end - start)
    bio.close()
    columns = kwargs.pop("columns")
    pandas_df = pandas.read_json(BytesIO(to_read), **kwargs)
    if not pandas_df.columns.equals(columns):
        raise NotImplementedError("Columns must be the same across all rows.")
    partition_columns = pandas_df.columns
    pandas_df.columns = pandas.RangeIndex(len(pandas_df.columns))
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        len(pandas_df),
        pandas_df.dtypes,
        partition_columns,
    ]
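

# Illustrative sketch only, not the actual Modin helper: the real
# `_split_result_for_readers` is defined elsewhere in this package. The hypothetical
# version below shows the splitting behavior the readers in this module rely on,
# under the assumption that axis=1 means splitting the parsed chunk into
# column-wise blocks (the byte range already determines the rows). The name
# `_split_result_for_readers_sketch` is made up for illustration.
import numpy as np


def _split_result_for_readers_sketch(axis, num_splits, df):
    """Hypothetical splitter: cut ``df`` into ``num_splits`` blocks along ``axis``."""
    # Divide the positional index along ``axis`` into exactly ``num_splits`` groups
    # (some may be empty) and slice the DataFrame accordingly.
    length = df.shape[1] if axis == 1 else df.shape[0]
    index_groups = np.array_split(np.arange(length), num_splits)
    if axis == 1:
        return [df.iloc[:, group] for group in index_groups]
    return [df.iloc[group] for group in index_groups]
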
def _read_csv_with_offset_pandas_on_ray(
    fname, num_splits, start, end, kwargs, header
):  # pragma: no cover
    """Use a Ray task to read a chunk of a CSV file into a Pandas DataFrame.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the Pandas `read_csv` function.
        header: The header of the file.

    Returns:
        A list containing the split Pandas DataFrames, followed by the Index and
            the dtypes. If `index_col` is not set, the length of the chunk is
            returned in place of the Index; it is used to determine the total
            length of the DataFrame when building a default Index.
    """
    index_col = kwargs.get("index_col", None)
    # pop "compression" from kwargs because bio is uncompressed
    bio = file_open(fname, "rb", kwargs.pop("compression", "infer"))
    bio.seek(start)
    to_read = header + bio.read(end - start)
    bio.close()
    pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
    pandas_df.columns = pandas.RangeIndex(len(pandas_df.columns))
    if index_col is not None:
        index = pandas_df.index
        # Partitions must have RangeIndex
        pandas_df.index = pandas.RangeIndex(0, len(pandas_df))
    else:
        # We will use the lengths to build the index if we are not given an
        # `index_col`.
        index = len(pandas_df)
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        index,
        pandas_df.dtypes,
    ]
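

# Hypothetical driver-side sketch, not part of this module's API: the readers above
# expect byte offsets that start and end on full rows, plus the raw header bytes to
# prepend to every chunk. One way to produce those inputs is to seek to evenly
# spaced positions and advance to the next newline, as below. The name
# `_compute_row_chunks_sketch` and the chunking policy are assumptions for
# illustration only; the real partitioning logic lives in the file dispatcher.
import os


def _compute_row_chunks_sketch(fname, num_chunks):
    """Return the header bytes and roughly ``num_chunks`` (start, end) row-aligned offsets."""
    file_size = os.path.getsize(fname)
    with open(fname, "rb") as f:
        header = f.readline()  # raw header bytes, prepended to each chunk by the reader
        start = f.tell()
        target = max(1, (file_size - start) // num_chunks)
        offsets = []
        while start < file_size:
            # Jump roughly one chunk ahead, then finish the current row so no row
            # is split across two chunks.
            f.seek(min(start + target, file_size))
            f.readline()
            end = min(f.tell(), file_size)
            offsets.append((start, end))
            start = end
    return header, offsets


# Each (start, end) pair, together with ``header``, could then be handed to
# `_read_csv_with_offset_pandas_on_ray` (e.g. via a Ray remote call) to parse one
# row chunk per task.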