Example #1
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimal_cols_as_float=None,
    timestamp_type=None,
    use_python_file_object=True,
    **kwargs,
):
    """{docstring}"""
    if decimal_cols_as_float is not None:
        warnings.warn(
            "`decimal_cols_as_float` is deprecated and will be removed in "
            "the future",
            FutureWarning,
        )
    from cudf import DataFrame

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Each source must have a corresponding stripe list. If a single stripe
    # list is provided rather than a list of lists of stripes, apply that
    # stripe list to all input sources.
    if stripes is not None:
        if any(not isinstance(stripe, list) for stripe in stripes):
            stripes = [stripes]

        # Ensure a stripe list is specified for each input source
        if len(stripes) != len(filepath_or_buffer):
            raise ValueError(
                "A list of stripes must be provided for each input source"
            )

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(
                passed_filesystem=None, path=source, **kwargs,
            )
            source = stringify_path(source)
            source = fs.sep.join([source, "*.orc"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            use_python_file_object=use_python_file_object,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        selected_stripes = _filter_stripes(
            filters, filepaths_or_buffers, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepaths_or_buffers[0], columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        return DataFrame._from_data(
            *liborc.read_orc(
                filepaths_or_buffers,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimal_cols_as_float,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        if len(filepath_or_buffer) > 1:
            raise NotImplementedError(
                "Using CPU via PyArrow only supports a single input source"
            )

        orc_file = orc.ORCFile(filepath_or_buffer[0])
        if stripes is not None and len(stripes) > 0:
            for stripe_source_file in stripes:
                pa_tables = [
                    read_orc_stripe(orc_file, i, columns)
                    for i in stripe_source_file
                ]
                pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
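
A minimal usage sketch for read_orc, assuming cudf is installed with GPU support; the file path, column names, and stripe selection below are hypothetical.

import cudf

# Hypothetical ORC file; columns and stripes narrow what is read.
df = cudf.read_orc(
    "data/example.orc",
    columns=["id", "value"],  # read only these columns
    stripes=[[0]],            # first stripe of the single input source
)
print(df.head())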
Example #2
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skiprows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # A list of row groups per source should be passed. Build the list of
    # lists that is expected for multiple sources.
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = _ensure_filesystem(passed_filesystem=None, path=source)
            source = stringify_path(source)
            source = fs.sep.join([source, "*.parquet"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported")
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(filepaths_or_buffers,
                             format="parquet",
                             partitioning="hive")

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.split_by_row_group(filters):
                for rg_info in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_info.id)

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                row_groups[i] = [
                    rg_id
                    for rg_id in filtered_rg_ids[file]
                    if rg_id in row_groups[i]
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skiprows=skiprows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs))
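
A hedged usage sketch for read_parquet; the file path, columns, and predicate are illustrative. The filters argument uses the (column, op, value) tuple form that pq._filters_to_expression accepts, which prunes row groups before the GPU read.

import cudf

# Hypothetical Parquet file; the filter restricts which row groups are read.
df = cudf.read_parquet(
    "data/example.parquet",
    columns=["a", "b"],
    filters=[("a", ">", 10)],  # keep only row groups that may satisfy a > 10
)
print(len(df))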
Example #3
def read_json(
    path_or_buf,
    engine="auto",
    dtype=True,
    lines=False,
    compression="infer",
    byte_range=None,
    *args,
    **kwargs,
):
    """{docstring}"""

    if engine == "cudf" and not lines:
        raise ValueError("cudf engine only supports JSON Lines format")
    if engine == "auto":
        engine = "cudf" if lines else "pandas"
    if engine == "cudf":
        # Multiple sources are passed as a list. If a single source is passed,
        # wrap it in a list for unified processing downstream.
        if not is_list_like(path_or_buf):
            path_or_buf = [path_or_buf]

        filepaths_or_buffers = []
        for source in path_or_buf:
            if ioutils.is_directory(source, **kwargs):
                fs = ioutils._ensure_filesystem(passed_filesystem=None,
                                                path=source)
                source = ioutils.stringify_pathlike(source)
                source = fs.sep.join([source, "*.json"])

            tmp_source, compression = ioutils.get_filepath_or_buffer(
                path_or_data=source,
                compression=compression,
                iotypes=(BytesIO, StringIO),
                **kwargs,
            )
            if isinstance(tmp_source, list):
                filepaths_or_buffers.extend(tmp_source)
            else:
                filepaths_or_buffers.append(tmp_source)

        return cudf.DataFrame._from_data(*libjson.read_json(
            filepaths_or_buffers, dtype, lines, compression, byte_range))
    else:
        warnings.warn("Using CPU via Pandas to read JSON dataset, this may "
                      "be GPU accelerated in the future")

        if not ioutils.ensure_single_filepath_or_buffer(
                path_or_data=path_or_buf,
                **kwargs,
        ):
            raise NotImplementedError(
                "`read_json` does not yet support reading "
                "multiple files via pandas")

        path_or_buf, compression = ioutils.get_filepath_or_buffer(
            path_or_data=path_or_buf,
            compression=compression,
            iotypes=(BytesIO, StringIO),
            **kwargs,
        )

        if kwargs.get("orient") == "table":
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                compression=compression,
                *args,
                **kwargs,
            )
        else:
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                dtype=dtype,
                compression=compression,
                *args,
                **kwargs,
            )
        df = cudf.from_pandas(pd_value)

    return df
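
A usage sketch for read_json, assuming a newline-delimited JSON file at a hypothetical path; with lines=True the "auto" engine resolves to the cudf reader, while regular JSON is routed through pandas and converted back to a cudf DataFrame.

import cudf

# Hypothetical JSON Lines file, one record per line -> GPU reader.
gdf = cudf.read_json("data/records.jsonl", engine="auto", lines=True)

# Regular (non-lines) JSON must use the pandas engine.
pdf_backed = cudf.read_json("data/records.json", engine="pandas", lines=False)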
Example #4
def _process_dataset(
    paths,
    fs,
    filters=None,
    row_groups=None,
    categorical_partitions=True,
):
    # Returns:
    #     file_list - Expanded/filtered list of paths
    #     row_groups - Filtered list of row-group selections
    #     partition_keys - list of partition keys for each file
    #     partition_categories - Categories for each partition

    # The general purpose of this function is to (1) expand directory
    # input into a list of paths (using the pyarrow dataset API),
    # (2) apply row-group filters, and (3) discover
    # directory-partitioning information

    # Deal with the case where the user passed in a directory name
    file_list = paths
    if len(paths) == 1 and ioutils.is_directory(paths[0]):
        paths = ioutils.stringify_pathlike(paths[0])

    # Convert filters to ds.Expression
    if filters is not None:
        filters = pq._filters_to_expression(filters)

    # Initialize ds.FilesystemDataset
    dataset = ds.dataset(
        paths,
        filesystem=fs,
        format="parquet",
        partitioning="hive",
    )
    file_list = dataset.files
    if len(file_list) == 0:
        raise FileNotFoundError(f"{paths} could not be resolved to any files")

    # Deal with directory partitioning
    # Get all partition keys (without filters)
    partition_categories = defaultdict(list)
    file_fragment = None
    for file_fragment in dataset.get_fragments():
        keys = ds._get_partition_keys(file_fragment.partition_expression)
        if not (keys or partition_categories):
            # Bail - This is not a directory-partitioned dataset
            break
        for k, v in keys.items():
            if v not in partition_categories[k]:
                partition_categories[k].append(v)
        if not categorical_partitions:
            # Bail - We don't need to discover all categories.
            # We only need to save the partition keys from this
            # first `file_fragment`
            break

    if partition_categories and file_fragment is not None:
        # Check/correct order of `categories` using last file_frag,
        # because `_get_partition_keys` does NOT preserve the
        # partition-hierarchy order of the keys.
        cat_keys = [
            part.split("=")[0] for part in file_fragment.path.split(fs.sep)
            if "=" in part
        ]
        if set(partition_categories) == set(cat_keys):
            partition_categories = {
                k: partition_categories[k]
                for k in cat_keys if k in partition_categories
            }

    # If we do not have partitioned data and
    # are not filtering, we can return here
    if filters is None and not partition_categories:
        return file_list, row_groups, [], {}

    # Record initial row_groups input
    row_groups_map = {}
    if row_groups is not None:
        # Make sure paths and row_groups map 1:1
        # and save the initial mapping
        if len(paths) != len(file_list):
            raise ValueError(
                "Cannot specify a row_group selection for a directory path.")
        row_groups_map = {path: rgs for path, rgs in zip(paths, row_groups)}

    # Apply filters and discover partition columns
    partition_keys = []
    if partition_categories or filters is not None:
        file_list = []
        if filters is not None:
            row_groups = []
        for file_fragment in dataset.get_fragments(filter=filters):
            path = file_fragment.path

            # Extract hive-partition keys, and make sure they
            # are ordered the same as they are in `partition_categories`
            if partition_categories:
                raw_keys = ds._get_partition_keys(
                    file_fragment.partition_expression)
                partition_keys.append([(name, raw_keys[name])
                                       for name in partition_categories.keys()
                                       ])

            # Apply row-group filtering
            selection = row_groups_map.get(path, None)
            if selection is not None or filters is not None:
                filtered_row_groups = [
                    rg_info.id
                    for rg_fragment in file_fragment.split_by_row_group(
                        filters,
                        schema=dataset.schema,
                    ) for rg_info in rg_fragment.row_groups
                ]
            file_list.append(path)
            if filters is not None:
                if selection is None:
                    row_groups.append(filtered_row_groups)
                else:
                    row_groups.append([
                        rg_id for rg_id in filtered_row_groups
                        if rg_id in selection
                    ])

    return (
        file_list,
        row_groups,
        partition_keys,
        partition_categories if categorical_partitions else {},
    )
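
A sketch of how this private helper might be exercised directly, assuming an fsspec local filesystem and a hypothetical Hive-partitioned directory layout; in cudf it is normally invoked internally by read_parquet rather than called by users.

import fsspec

fs = fsspec.filesystem("file")  # local filesystem handle

# Hypothetical layout: dataset/year=2021/part-0.parquet, dataset/year=2022/...
file_list, row_groups, partition_keys, partition_categories = _process_dataset(
    paths=["dataset"],
    fs=fs,
    filters=[("year", "=", 2021)],  # prune by partition key and row-group stats
    categorical_partitions=True,
)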