def read_avro(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    skiprows=None,
    num_rows=None,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the "{docstring}" placeholder above is presumably filled
    # in by a decorator defined outside this block -- it is left untouched.
    from cudf import DataFrame

    # Reject anything that does not resolve to exactly one path or buffer.
    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_avro` does not yet support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        # The exception used to be constructed but never raised, so the
        # unsupported case fell through to the reader silently.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        return DataFrame._from_table(
            libcudf.avro.read_avro(
                filepath_or_buffer, columns, skiprows, num_rows
            )
        )
    else:
        raise NotImplementedError("read_avro currently only supports cudf")
def polyline_bounding_boxes(poly_offsets, xs, ys, expansion_radius):
    """Compute the minimum bounding-boxes for a set of polylines.

    Parameters
    ----------
    poly_offsets
        Begin indices of the first ring in each polyline (i.e. prefix-sum)
    xs
        Polyline point x-coordinates
    ys
        Polyline point y-coordinates
    expansion_radius
        radius of each polyline point

    Returns
    -------
    result : cudf.DataFrame
        minimum bounding boxes for each polyline

        x_min : cudf.Series
            the minimum x-coordinate of each bounding box
        y_min : cudf.Series
            the minimum y-coordinate of each bounding box
        x_max : cudf.Series
            the maximum x-coordinate of each bounding box
        y_max : cudf.Series
            the maximum y-coordinate of each bounding box
    """
    # Offsets are converted to an int32 column before being handed over.
    offsets_col = as_column(poly_offsets, dtype="int32")
    x_col, y_col = normalize_point_columns(as_column(xs), as_column(ys))
    bbox_table = cpp_polyline_bounding_boxes(
        offsets_col, x_col, y_col, expansion_radius
    )
    return DataFrame._from_table(bbox_table)
def _cubic_spline_coefficients(x, y, ids, prefix_sums):
    """Call ``cubicspline_coefficients`` and wrap its result in a DataFrame.

    Each argument must expose a ``_column`` attribute; those columns are
    passed, in order, to ``cubicspline_coefficients`` and the resulting
    table is converted with ``DataFrame._from_table``.
    """
    columns = (x._column, y._column, ids._column, prefix_sums._column)
    coefficient_table = cubicspline_coefficients(*columns)
    return DataFrame._from_table(coefficient_table)
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    timestamp_type=None,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the "{docstring}" placeholder above is presumably filled
    # in by a decorator defined outside this block -- it is left untouched.
    from cudf import DataFrame

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        # The exception used to be constructed but never raised, so the
        # unsupported case fell through to the reader silently.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        df = DataFrame._from_table(
            libcudf.orc.read_orc(
                filepath_or_buffer,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimals_as_float,
                force_decimal_scale,
                timestamp_type,
            )
        )
    else:
        # Any other engine value takes the PyArrow (CPU) path.

        def read_orc_stripe(orc_file, stripe, columns):
            # Normalise the stripe to a ``pa.Table`` so the per-stripe
            # results can be concatenated below.
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripes is not None and len(stripes) > 0:
            # Read only the requested stripes and stitch them together.
            pa_tables = [
                read_orc_stripe(orc_file, i, columns) for i in stripes
            ]
            pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
def points_in_spatial_window(min_x, max_x, min_y, max_y, xs, ys):
    """
    Return only the subset of coordinates that fall within a rectangular
    window.

    A point `(x, y)` is inside the query window if and only if
    ``min_x < x < max_x AND min_y < y < max_y``

    The window is specified by minimum and maximum x and y
    coordinates.

    Parameters
    ----------
    min_x
        lower x-coordinate of the query window
    max_x
        upper x-coordinate of the query window
    min_y
        lower y-coordinate of the query window
    max_y
        upper y-coordinate of the query window
    xs
        column of x-coordinates that may fall within the window
    ys
        column of y-coordinates that may fall within the window

    Returns
    -------
    result : cudf.DataFrame
        subset of `(x, y)` pairs above that fall within the window

    Notes
    -----
    * Swaps ``min_x`` and ``max_x`` if ``min_x > max_x``
    * Swaps ``min_y`` and ``max_y`` if ``min_y > max_y``
    """
    x_col, y_col = normalize_point_columns(as_column(xs), as_column(ys))
    return DataFrame._from_table(
        spatial_window.points_in_spatial_window(
            min_x, max_x, min_y, max_y, x_col, y_col
        )
    )
def read_avro(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    skip_rows=None,
    num_rows=None,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the "{docstring}" placeholder above is presumably filled
    # in by a decorator defined outside this block -- it is left untouched.
    from cudf import DataFrame

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs
    )
    if compression is not None:
        # The exception used to be constructed but never raised, so the
        # unsupported case fell through to the reader silently.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        return DataFrame._from_table(
            libcudfxx.avro.read_avro(
                filepath_or_buffer, columns, skip_rows, num_rows
            )
        )
    else:
        raise NotImplementedError("read_avro currently only supports cudf")
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    timestamp_type=None,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the "{docstring}" placeholder above is presumably filled
    # in by a decorator defined outside this block -- it is left untouched.
    from cudf import DataFrame

    # Reject anything that does not resolve to exactly one path or buffer.
    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_orc` does not yet support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        # The exception used to be constructed but never raised, so the
        # unsupported case fell through to the reader silently.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if filters is not None:
        # Narrow the stripe selection using the supplied filters before
        # any data is read.
        selected_stripes = _filter_stripes(
            filters, filepath_or_buffer, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepath_or_buffer, columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        df = DataFrame._from_table(
            libcudf.orc.read_orc(
                filepath_or_buffer,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimals_as_float,
                force_decimal_scale,
                timestamp_type,
            )
        )
    else:
        # Any other engine value takes the PyArrow (CPU) path.

        def read_orc_stripe(orc_file, stripe, columns):
            # Normalise the stripe to a ``pa.Table`` so the per-stripe
            # results can be concatenated below.
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripes is not None and len(stripes) > 0:
            # Read only the requested stripes and stitch them together.
            pa_tables = [
                read_orc_stripe(orc_file, i, columns) for i in stripes
            ]
            pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
def quadtree_point_in_polygon(
    poly_quad_pairs,
    quadtree,
    point_indices,
    points_x,
    points_y,
    poly_offsets,
    ring_offsets,
    poly_points_x,
    poly_points_y,
):
    """
    Test whether the specified points are inside any of the specified
    polygons.

    Uses the table of (polygon, quadrant) pairs returned by
    ``cuspatial.join_quadtree_and_bounding_boxes`` to ensure only the points
    in the same quadrant as each polygon are tested for intersection.

    This pre-filtering can dramatically reduce number of points tested per
    polygon, enabling faster intersection-testing at the expense of extra
    memory allocated to store the quadtree and sorted point_indices.

    Parameters
    ----------
    poly_quad_pairs: cudf.DataFrame
        Table of (polygon, quadrant) index pairs returned by
        ``cuspatial.join_quadtree_and_bounding_boxes``.
    quadtree : cudf.DataFrame
        A complete quadtree for a given area-of-interest bounding box.
    point_indices : cudf.Series
        Sorted point indices returned by ``cuspatial.quadtree_on_points``
    points_x : cudf.Series
        x-coordinates of points used to construct the quadtree.
    points_y : cudf.Series
        y-coordinates of points used to construct the quadtree.
    poly_offsets : cudf.Series
        Begin index of the first ring in each polygon.
    ring_offsets : cudf.Series
        Begin index of the first point in each ring.
    poly_points_x : cudf.Series
        Polygon point x-coordinates.
    poly_points_y : cudf.Series
        Polygon point y-coordinates.

    Returns
    -------
    result : cudf.DataFrame
        Indices for each intersecting point and polygon pair.

        point_offset : cudf.Series
            Indices of each point that intersects with a polygon.
        polygon_offset : cudf.Series
            Indices of each polygon with which a point intersected.
    """
    # All four coordinate columns are normalised together in one call.
    coordinate_columns = normalize_point_columns(
        as_column(points_x),
        as_column(points_y),
        as_column(poly_points_x),
        as_column(poly_points_y),
    )
    points_x, points_y, poly_points_x, poly_points_y = coordinate_columns

    # Index and offset inputs are passed on as uint32 columns.
    join_table = spatial_join.quadtree_point_in_polygon(
        poly_quad_pairs,
        quadtree,
        as_column(point_indices, dtype="uint32"),
        points_x,
        points_y,
        as_column(poly_offsets, dtype="uint32"),
        as_column(ring_offsets, dtype="uint32"),
        poly_points_x,
        poly_points_y,
    )
    return DataFrame._from_table(join_table)
def quadtree_point_to_nearest_polyline(
    poly_quad_pairs,
    quadtree,
    point_indices,
    points_x,
    points_y,
    poly_offsets,
    poly_points_x,
    poly_points_y,
):
    """
    Finds the nearest polyline to each point in a quadrant, and computes the
    distances between each point and polyline.

    Uses the table of (polyline, quadrant) pairs returned by
    ``cuspatial.join_quadtree_and_bounding_boxes`` to ensure distances are
    computed only for the points in the same quadrant as each polyline.

    Parameters
    ----------
    poly_quad_pairs: cudf.DataFrame
        Table of (polyline, quadrant) index pairs returned by
        ``cuspatial.join_quadtree_and_bounding_boxes``.
    quadtree : cudf.DataFrame
        A complete quadtree for a given area-of-interest bounding box.
    point_indices : cudf.Series
        Sorted point indices returned by ``cuspatial.quadtree_on_points``
    points_x : cudf.Series
        x-coordinates of points used to construct the quadtree.
    points_y : cudf.Series
        y-coordinates of points used to construct the quadtree.
    poly_offsets : cudf.Series
        Begin index of the first point in each polyline.
    poly_points_x : cudf.Series
        Polyline point x-coordinates.
    poly_points_y : cudf.Series
        Polyline point y-coordinates.

    Returns
    -------
    result : cudf.DataFrame
        Indices for each point and its nearest polyline, and the distance
        between the two.

        point_offset : cudf.Series
            Indices of each point that intersects with a polyline.
        polyline_offset : cudf.Series
            Indices of each polyline with which a point intersected.
        distance : cudf.Series
            Distances between each point and its nearest polyline.
    """
    # All four coordinate columns are normalised together in one call.
    coordinate_columns = normalize_point_columns(
        as_column(points_x),
        as_column(points_y),
        as_column(poly_points_x),
        as_column(poly_points_y),
    )
    points_x, points_y, poly_points_x, poly_points_y = coordinate_columns

    # Index and offset inputs are passed on as uint32 columns.
    nearest_table = spatial_join.quadtree_point_to_nearest_polyline(
        poly_quad_pairs,
        quadtree,
        as_column(point_indices, dtype="uint32"),
        points_x,
        points_y,
        as_column(poly_offsets, dtype="uint32"),
        poly_points_x,
        poly_points_y,
    )
    return DataFrame._from_table(nearest_table)
def join_quadtree_and_bounding_boxes(quadtree, poly_bounding_boxes,
                                     x_min, x_max, y_min, y_max,
                                     scale, max_depth):
    """
    Search a quadtree for polygon or polyline bounding box intersections.

    Parameters
    ----------
    quadtree : cudf.DataFrame
        A complete quadtree for a given area-of-interest bounding box.
    poly_bounding_boxes : cudf.DataFrame
        Minimum bounding boxes for a set of polygons or polylines
    x_min
        The lower-left x-coordinate of the area of interest bounding box
    x_max
        The upper-right x-coordinate of the area of interest bounding box
    y_min
        The lower-left y-coordinate of the area of interest bounding box
    y_max
        The upper-right y-coordinate of the area of interest bounding box
    scale
        Scale to apply to each point's distance from ``(x_min, y_min)``
    max_depth
        Maximum quadtree depth at which to stop testing for intersections

    Returns
    -------
    result : cudf.DataFrame
        Indices for each intersecting bounding box and leaf quadrant.

        poly_offset : cudf.Series
            Indices for each poly bbox that intersects with the quadtree.
        quad_offset : cudf.Series
            Indices for each leaf quadrant intersecting with a poly bbox.

    Notes
    -----
    * Swaps ``x_min`` and ``x_max`` if ``x_min > x_max``
    * Swaps ``y_min`` and ``y_max`` if ``y_min > y_max``
    * Warns and clamps ``scale`` up to a minimum derived from the window
      extent and ``max_depth`` when ``scale`` is smaller than that minimum
    """
    # Order each pair of bounds so the lower value comes first.
    lo_x = min(x_min, x_max)
    hi_x = max(x_min, x_max)
    lo_y = min(y_min, y_max)
    hi_y = max(y_min, y_max)

    # Smallest scale accepted for this window extent and depth.
    largest_extent = max(hi_x - lo_x, hi_y - lo_y)
    min_scale = largest_extent / ((1 << max_depth) + 2)
    if scale < min_scale:
        message = (
            "scale {} is less than required minimum scale {}. "
            "Clamping to minimum scale"
        ).format(scale, min_scale)
        warnings.warn(message)
    clamped_scale = max(scale, min_scale)

    join_table = spatial_join.join_quadtree_and_bounding_boxes(
        quadtree,
        poly_bounding_boxes,
        lo_x,
        hi_x,
        lo_y,
        hi_y,
        clamped_scale,
        max_depth,
    )
    return DataFrame._from_table(join_table)