コード例 #1
0
ファイル: avro.py プロジェクト: vyasr/cudf
def read_avro(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    skiprows=None,
    num_rows=None,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the docstring above is a format placeholder and is kept
    # verbatim; presumably it is filled in by a decorator that is not visible
    # in this snippet -- confirm before editing it.

    from cudf import DataFrame

    # Only a single path or buffer is accepted; anything else is rejected
    # before any data is resolved.
    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer,
        **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_avro` does not yet support reading multiple files")

    # Resolve the input; the helper also hands back a compression value.
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs)
    if compression is not None:
        # The original built this exception without raising it, so the
        # unsupported case was silently ignored and reading went ahead.
        raise ValueError(
            "URL content-encoding decompression is not supported")

    if engine == "cudf":
        return DataFrame._from_table(
            libcudf.avro.read_avro(filepath_or_buffer, columns, skiprows,
                                   num_rows))
    else:
        raise NotImplementedError("read_avro currently only supports cudf")
コード例 #2
0
def polyline_bounding_boxes(poly_offsets, xs, ys, expansion_radius):
    """Return the minimum bounding box of every polyline in a set.

    Parameters
    ----------
    poly_offsets
        Begin indices of the first ring in each polyline (i.e. prefix-sum).
        Converted to an ``int32`` column before use.
    xs
        x-coordinates of the polyline points
    ys
        y-coordinates of the polyline points
    expansion_radius
        radius of each polyline point

    Returns
    -------
    result : cudf.DataFrame
        one minimum bounding box per polyline

        x_min : cudf.Series
            smallest x-coordinate of each bounding box
        y_min : cudf.Series
            smallest y-coordinate of each bounding box
        x_max : cudf.Series
            largest x-coordinate of each bounding box
        y_max : cudf.Series
            largest y-coordinate of each bounding box
    """
    offsets_col = as_column(poly_offsets, dtype="int32")
    x_col, y_col = normalize_point_columns(as_column(xs), as_column(ys))
    bbox_table = cpp_polyline_bounding_boxes(
        offsets_col, x_col, y_col, expansion_radius
    )
    return DataFrame._from_table(bbox_table)
コード例 #3
0
def _cubic_spline_coefficients(x, y, ids, prefix_sums):
    """Wrap the table from ``cubicspline_coefficients`` in a DataFrame.

    Each argument is unwrapped to its underlying ``_column`` and the four
    columns are passed, in order, to ``cubicspline_coefficients``.
    """
    columns = [series._column for series in (x, y, ids, prefix_sums)]
    return DataFrame._from_table(cubicspline_coefficients(*columns))
コード例 #4
0
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    timestamp_type=None,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the docstring above is a format placeholder and is kept
    # verbatim; presumably it is filled in by a decorator that is not visible
    # in this snippet -- confirm before editing it.

    from cudf import DataFrame

    # Resolve the input; the helper also hands back a compression value.
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs)
    if compression is not None:
        # The original built this exception without raising it, so the
        # unsupported case was silently ignored and reading went ahead.
        raise ValueError(
            "URL content-encoding decompression is not supported")

    if engine == "cudf":
        df = DataFrame._from_table(
            libcudf.orc.read_orc(
                filepath_or_buffer,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimals_as_float,
                force_decimal_scale,
                timestamp_type,
            ))
    else:
        # Any engine other than "cudf" goes through PyArrow.

        def read_orc_stripe(orc_file, stripe, columns):
            # Read one stripe and make sure the result is a pyarrow Table,
            # wrapping it when a RecordBatch comes back.
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripes is not None and len(stripes) > 0:
            # Read only the requested stripes and concatenate them.
            pa_tables = [
                read_orc_stripe(orc_file, i, columns) for i in stripes
            ]
            pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
コード例 #5
0
def points_in_spatial_window(min_x, max_x, min_y, max_y, xs, ys):
    """Keep only the coordinates that lie inside a rectangular window.

    The window is described by its minimum and maximum x and y
    coordinates. A point `(x, y)` lies inside it if and only if
    ``min_x < x < max_x AND min_y < y < max_y``

    Parameters
    ----------
    min_x
        lower x-coordinate of the query window
    max_x
        upper x-coordinate of the query window
    min_y
        lower y-coordinate of the query window
    max_y
        upper y-coordinate of the query window
    xs
        column of candidate x-coordinates
    ys
        column of candidate y-coordinates

    Returns
    -------
    result : cudf.DataFrame
        the `(x, y)` pairs from the inputs that fall within the window

    Notes
    -----
    * Swaps ``min_x`` and ``max_x`` if ``min_x > max_x``
    * Swaps ``min_y`` and ``max_y`` if ``min_y > max_y``
    """
    x_col, y_col = normalize_point_columns(as_column(xs), as_column(ys))
    window_table = spatial_window.points_in_spatial_window(
        min_x, max_x, min_y, max_y, x_col, y_col
    )
    return DataFrame._from_table(window_table)
コード例 #6
0
def read_avro(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    skip_rows=None,
    num_rows=None,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the docstring above is a format placeholder and is kept
    # verbatim; presumably it is filled in by a decorator that is not visible
    # in this snippet -- confirm before editing it.

    from cudf import DataFrame

    # Resolve the input; the helper also hands back a compression value.
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs)
    if compression is not None:
        # The original built this exception without raising it, so the
        # unsupported case was silently ignored and reading went ahead.
        raise ValueError(
            "URL content-encoding decompression is not supported")

    if engine == "cudf":
        return DataFrame._from_table(
            libcudfxx.avro.read_avro(filepath_or_buffer, columns, skip_rows,
                                     num_rows))
    else:
        raise NotImplementedError("read_avro currently only supports cudf")
コード例 #7
0
ファイル: orc.py プロジェクト: gerashegalov/cudf
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    timestamp_type=None,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the docstring above is a format placeholder and is kept
    # verbatim; presumably it is filled in by a decorator that is not visible
    # in this snippet -- confirm before editing it.

    from cudf import DataFrame

    # Only a single path or buffer is accepted; anything else is rejected
    # before any data is resolved.
    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_orc` does not yet support reading multiple files"
        )

    # Resolve the input; the helper also hands back a compression value.
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        # The original built this exception without raising it, so the
        # unsupported case was silently ignored and reading went ahead.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if filters is not None:
        # Narrow the stripe selection using the filters before reading.
        selected_stripes = _filter_stripes(
            filters, filepath_or_buffer, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepath_or_buffer, columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        df = DataFrame._from_table(
            libcudf.orc.read_orc(
                filepath_or_buffer,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimals_as_float,
                force_decimal_scale,
                timestamp_type,
            )
        )
    else:
        # Any engine other than "cudf" goes through PyArrow.

        def read_orc_stripe(orc_file, stripe, columns):
            # Read one stripe and make sure the result is a pyarrow Table,
            # wrapping it when a RecordBatch comes back.
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripes is not None and len(stripes) > 0:
            # Read only the requested stripes and concatenate them.
            pa_tables = [
                read_orc_stripe(orc_file, i, columns) for i in stripes
            ]
            pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
コード例 #8
0
def quadtree_point_in_polygon(
    poly_quad_pairs,
    quadtree,
    point_indices,
    points_x,
    points_y,
    poly_offsets,
    ring_offsets,
    poly_points_x,
    poly_points_y,
):
    """ Test whether the given points lie inside any of the given polygons.

    The (polygon, quadrant) pairs produced by
    ``cuspatial.join_quadtree_and_bounding_boxes`` restrict the test so that
    each polygon is only checked against points in the same quadrant.

    That pre-filtering can greatly reduce the number of points tested per
    polygon, giving faster intersection tests in exchange for the extra
    memory needed to hold the quadtree and the sorted point_indices.

    Parameters
    ----------
    poly_quad_pairs: cudf.DataFrame
        Table of (polygon, quadrant) index pairs returned by
        ``cuspatial.join_quadtree_and_bounding_boxes``.
    quadtree : cudf.DataFrame
        A complete quadtree for a given area-of-interest bounding box.
    point_indices : cudf.Series
        Sorted point indices returned by ``cuspatial.quadtree_on_points``
    points_x : cudf.Series
        x-coordinates of points used to construct the quadtree.
    points_y : cudf.Series
        y-coordinates of points used to construct the quadtree.
    poly_offsets : cudf.Series
        Begin index of the first ring in each polygon.
    ring_offsets : cudf.Series
        Begin index of the first point in each ring.
    poly_points_x : cudf.Series
        Polygon point x-coordinates.
    poly_points_y : cudf.Series
        Polygon point y-coordinates.

    Returns
    -------
    result : cudf.DataFrame
        Indices for each intersecting point and polygon pair.

        point_offset : cudf.Series
            Indices of each point that intersects with a polygon.
        polygon_offset : cudf.Series
            Indices of each polygon with which a point intersected.
    """

    # Normalize all four coordinate columns together in one call.
    normalized = normalize_point_columns(
        as_column(points_x),
        as_column(points_y),
        as_column(poly_points_x),
        as_column(poly_points_y),
    )
    points_x, points_y, poly_points_x, poly_points_y = normalized

    # Index/offset inputs are passed on as uint32 columns.
    join_table = spatial_join.quadtree_point_in_polygon(
        poly_quad_pairs,
        quadtree,
        as_column(point_indices, dtype="uint32"),
        points_x,
        points_y,
        as_column(poly_offsets, dtype="uint32"),
        as_column(ring_offsets, dtype="uint32"),
        poly_points_x,
        poly_points_y,
    )
    return DataFrame._from_table(join_table)
コード例 #9
0
def quadtree_point_to_nearest_polyline(
    poly_quad_pairs,
    quadtree,
    point_indices,
    points_x,
    points_y,
    poly_offsets,
    poly_points_x,
    poly_points_y,
):
    """ Find the nearest polyline to each point in a quadrant, along with
    the distance between the point and that polyline.

    The (polyline, quadrant) pairs produced by
    ``cuspatial.join_quadtree_and_bounding_boxes`` restrict the work so that
    distances are only computed for points in the same quadrant as each
    polyline.

    Parameters
    ----------
    poly_quad_pairs: cudf.DataFrame
        Table of (polyline, quadrant) index pairs returned by
        ``cuspatial.join_quadtree_and_bounding_boxes``.
    quadtree : cudf.DataFrame
        A complete quadtree for a given area-of-interest bounding box.
    point_indices : cudf.Series
        Sorted point indices returned by ``cuspatial.quadtree_on_points``
    points_x : cudf.Series
        x-coordinates of points used to construct the quadtree.
    points_y : cudf.Series
        y-coordinates of points used to construct the quadtree.
    poly_offsets : cudf.Series
        Begin index of the first point in each polyline.
    poly_points_x : cudf.Series
        Polyline point x-coordinates.
    poly_points_y : cudf.Series
        Polyline point y-coordinates.

    Returns
    -------
    result : cudf.DataFrame
        Indices for each point and its nearest polyline, and the distance
        between the two.

        point_offset : cudf.Series
            Indices of each point that intersects with a polyline.
        polyline_offset : cudf.Series
            Indices of each polyline with which a point intersected.
        distance : cudf.Series
            Distances between each point and its nearest polyline.
    """
    # Normalize all four coordinate columns together in one call.
    normalized = normalize_point_columns(
        as_column(points_x),
        as_column(points_y),
        as_column(poly_points_x),
        as_column(poly_points_y),
    )
    points_x, points_y, poly_points_x, poly_points_y = normalized

    # Index/offset inputs are passed on as uint32 columns.
    nearest_table = spatial_join.quadtree_point_to_nearest_polyline(
        poly_quad_pairs,
        quadtree,
        as_column(point_indices, dtype="uint32"),
        points_x,
        points_y,
        as_column(poly_offsets, dtype="uint32"),
        poly_points_x,
        poly_points_y,
    )
    return DataFrame._from_table(nearest_table)
コード例 #10
0
def join_quadtree_and_bounding_boxes(quadtree, poly_bounding_boxes, x_min,
                                     x_max, y_min, y_max, scale, max_depth):
    """ Search a quadtree for polygon or polyline bounding box intersections.

    Parameters
    ----------
    quadtree : cudf.DataFrame
        A complete quadtree for a given area-of-interest bounding box.
    poly_bounding_boxes : cudf.DataFrame
        Minimum bounding boxes for a set of polygons or polylines
    x_min
        The lower-left x-coordinate of the area of interest bounding box
    x_max
        The upper-right x-coordinate of the area of interest bounding box
    y_min
        The lower-left y-coordinate of the area of interest bounding box
    y_max
        The upper-right y-coordinate of the area of interest bounding box
    scale
        Scale to apply to each point's distance from ``(x_min, y_min)``
    max_depth
        Maximum quadtree depth at which to stop testing for intersections

    Returns
    -------
    result : cudf.DataFrame
        Indices for each intersecting bounding box and leaf quadrant.

        poly_offset : cudf.Series
            Indices for each poly bbox that intersects with the quadtree.
        quad_offset : cudf.Series
            Indices for each leaf quadrant intersecting with a poly bbox.

    Notes
    -----
    * Swaps ``x_min`` and ``x_max`` if ``x_min > x_max``
    * Swaps ``y_min`` and ``y_max`` if ``y_min > y_max``
    * A ``scale`` below the computed minimum is replaced by that minimum,
      and a warning is issued.
    """
    # Order each pair of bounds so the lower value always comes first.
    x_lo, x_hi = min(x_min, x_max), max(x_min, x_max)
    y_lo, y_hi = min(y_min, y_max), max(y_min, y_max)

    # Smallest scale accepted for this extent and depth.
    extent = max(x_hi - x_lo, y_hi - y_lo)
    min_scale = extent / ((1 << max_depth) + 2)
    if scale < min_scale:
        message = (
            "scale {} is less than required minimum "
            "scale {}. Clamping to minimum scale"
        ).format(scale, min_scale)
        warnings.warn(message)

    effective_scale = max(scale, min_scale)
    join_table = spatial_join.join_quadtree_and_bounding_boxes(
        quadtree,
        poly_bounding_boxes,
        x_lo,
        x_hi,
        y_lo,
        y_hi,
        effective_scale,
        max_depth,
    )
    return DataFrame._from_table(join_table)