def polyline_bounding_boxes(poly_offsets, xs, ys, expansion_radius):
    """Compute the minimum bounding box of every polyline in a set.

    Parameters
    ----------
    poly_offsets
        Begin indices of each polyline (i.e. prefix-sum); converted to an
        ``int32`` column before use
    xs
        Polyline point x-coordinates
    ys
        Polyline point y-coordinates
    expansion_radius
        Radius of each polyline point; forwarded unchanged to the native
        routine

    Returns
    -------
    result : cudf.DataFrame
        Minimum bounding boxes for each polyline

        x_min : cudf.Series
            the minimum x-coordinate of each bounding box
        y_min : cudf.Series
            the minimum y-coordinate of each bounding box
        x_max : cudf.Series
            the maximum x-coordinate of each bounding box
        y_max : cudf.Series
            the maximum y-coordinate of each bounding box
    """
    offsets_col = as_column(poly_offsets, dtype="int32")
    x_col, y_col = normalize_point_columns(as_column(xs), as_column(ys))
    bbox_parts = cpp_polyline_bounding_boxes(
        offsets_col, x_col, y_col, expansion_radius
    )
    return DataFrame._from_data(*bbox_parts)
def _cubic_spline_coefficients(x, y, ids, prefix_sums):
    """Build a DataFrame of cubic-spline coefficients.

    Each argument must expose a ``_column`` attribute. Those columns are
    passed, in the order ``x, y, ids, prefix_sums``, to
    ``cubicspline_coefficients`` and its result is unpacked into
    ``DataFrame._from_data``.
    """
    columns = [series._column for series in (x, y, ids, prefix_sums)]
    return DataFrame._from_data(*cubicspline_coefficients(*columns))
def join_quadtree_and_bounding_boxes(
    quadtree,
    poly_bounding_boxes,
    x_min,
    x_max,
    y_min,
    y_max,
    scale,
    max_depth,
):
    """Search a quadtree for polygon or polyline bounding box intersections.

    Parameters
    ----------
    quadtree : cudf.DataFrame
        A complete quadtree for a given area-of-interest bounding box.
    poly_bounding_boxes : cudf.DataFrame
        Minimum bounding boxes for a set of polygons or polylines
    x_min
        The lower-left x-coordinate of the area of interest bounding box
    x_max
        The upper-right x-coordinate of the area of interest bounding box
    y_min
        The lower-left y-coordinate of the area of interest bounding box
    y_max
        The upper-right y-coordinate of the area of interest bounding box
    scale
        Scale to apply to each point's distance from ``(x_min, y_min)``
    max_depth
        Maximum quadtree depth at which to stop testing for intersections

    Returns
    -------
    result : cudf.DataFrame
        Indices for each intersecting bounding box and leaf quadrant.

        poly_offset : cudf.Series
            Indices for each poly bbox that intersects with the quadtree.
        quad_offset : cudf.Series
            Indices for each leaf quadrant intersecting with a poly bbox.

    Notes
    -----
    * Swaps ``x_min`` and ``x_max`` if ``x_min > x_max``
    * Swaps ``y_min`` and ``y_max`` if ``y_min > y_max``
    * Emits a warning and clamps ``scale`` up to
      ``max(x_max - x_min, y_max - y_min) / ((1 << max_depth) + 2)`` when the
      supplied value is smaller than that.
    """
    # Order each pair of bounds so inverted input is tolerated.
    lo_x, hi_x = min(x_min, x_max), max(x_min, x_max)
    lo_y, hi_y = min(y_min, y_max), max(y_min, y_max)

    # Smallest scale the quadtree can represent for this extent and depth.
    widest_extent = max(hi_x - lo_x, hi_y - lo_y)
    min_scale = widest_extent / ((1 << max_depth) + 2)

    if scale < min_scale:
        message = (
            "scale {} is less than required minimum "
            "scale {}. Clamping to minimum scale"
        ).format(scale, min_scale)
        warnings.warn(message)

    effective_scale = max(scale, min_scale)
    joined = spatial_join.join_quadtree_and_bounding_boxes(
        quadtree,
        poly_bounding_boxes,
        lo_x,
        hi_x,
        lo_y,
        hi_y,
        effective_scale,
        max_depth,
    )
    return DataFrame._from_data(*joined)
def assert_packed_frame_equality(df):
    """Assert that a frame is unchanged by a pack/unpack round trip.

    The expected value is captured as a pandas copy up front so the
    comparison does not depend on the input object, whose local reference is
    dropped before unpacking.
    """
    expected = df.to_pandas()
    packed_frame = pack(df)
    # Release this function's reference to the input before unpacking.
    del df

    restored = unpack(packed_frame)
    assert_eq(DataFrame._from_data(restored._data, restored._index), expected)
def trajectory_bounding_boxes(num_trajectories, object_ids, xs, ys):
    """Compute the bounding boxes of sets of trajectories.

    Parameters
    ----------
    num_trajectories
        number of trajectories (unique object ids)
    object_ids
        column of object (e.g., vehicle) ids; converted to ``int32``
    xs
        column of x-coordinates (in kilometers)
    ys
        column of y-coordinates (in kilometers)

    Returns
    -------
    result : cudf.DataFrame
        minimum bounding boxes (in kilometers) for each trajectory

        x_min : cudf.Series
            the minimum x-coordinate of each bounding box
        y_min : cudf.Series
            the minimum y-coordinate of each bounding box
        x_max : cudf.Series
            the maximum x-coordinate of each bounding box
        y_max : cudf.Series
            the maximum y-coordinate of each bounding box

    Examples
    --------
    Compute the minimum bounding boxes of derived trajectories

    >>> objects, traj_offsets = cuspatial.derive_trajectories(
            [0, 0, 1, 1],  # object_id
            [0, 1, 2, 3],  # x
            [0, 0, 1, 1],  # y
            [0, 10, 0, 10]  # timestamp
        )
    >>> traj_bounding_boxes = cuspatial.trajectory_bounding_boxes(
            len(traj_offsets),
            objects['object_id'],
            objects['x'],
            objects['y']
        )
    """
    id_col = as_column(object_ids, dtype=np.int32)
    x_col, y_col = normalize_point_columns(as_column(xs), as_column(ys))
    bbox_parts = cpp_trajectory_bounding_boxes(
        num_trajectories, id_col, x_col, y_col
    )
    return DataFrame._from_data(*bbox_parts)
def derive_trajectories(object_ids, xs, ys, timestamps):
    """Derive trajectories from object ids, points, and timestamps.

    Parameters
    ----------
    object_ids
        column of object (e.g., vehicle) ids; converted to ``int32``
    xs
        column of x-coordinates (in kilometers)
    ys
        column of y-coordinates (in kilometers)
    timestamps
        column of timestamps in any resolution

    Returns
    -------
    result : tuple (objects, traj_offsets)
        objects : cudf.DataFrame
            object_ids, xs, ys, and timestamps sorted by
            ``(object_id, timestamp)``, used by ``trajectory_bounding_boxes``
            and ``trajectory_distances_and_speeds``
        traj_offsets : cudf.Series
            offsets of discovered trajectories

    Examples
    --------
    Compute sorted objects and discovered trajectories

    >>> objects, traj_offsets = cuspatial.derive_trajectories(
            [0, 1, 2, 3],  # object_id
            [0, 0, 1, 1],  # x
            [0, 0, 1, 1],  # y
            [0, 10, 0, 10]  # timestamp
        )
    """
    id_col = as_column(object_ids, dtype=np.int32)
    x_col, y_col = normalize_point_columns(as_column(xs), as_column(ys))
    time_col = normalize_timestamp_column(as_column(timestamps))

    sorted_objects, offsets = cpp_derive_trajectories(
        id_col, x_col, y_col, time_col
    )
    objects_frame = DataFrame._from_data(*sorted_objects)
    return objects_frame, Series(data=offsets)
def trajectory_distances_and_speeds(
    num_trajectories, object_ids, xs, ys, timestamps
):
    """Compute the distance traveled and speed of sets of trajectories.

    Parameters
    ----------
    num_trajectories
        number of trajectories (unique object ids)
    object_ids
        column of object (e.g., vehicle) ids; converted to ``int32``
    xs
        column of x-coordinates (in kilometers)
    ys
        column of y-coordinates (in kilometers)
    timestamps
        column of timestamps in any resolution

    Returns
    -------
    result : cudf.DataFrame
        Per-trajectory distance and speed, with the index named
        ``trajectory_id``.

        NOTE(review): earlier documentation listed the distance column as
        ``meters`` "in kilometers" while its example printed a ``distance``
        column, and gave speed in meters/second. Confirm the column names and
        units against the native implementation.

    Examples
    --------
    Compute the distances and speeds of derived trajectories

    >>> objects, traj_offsets = cuspatial.derive_trajectories(...)
    >>> dists_and_speeds = cuspatial.trajectory_distances_and_speeds(
            len(traj_offsets),
            objects['object_id'],
            objects['x'],
            objects['y'],
            objects['timestamp']
        )
    """
    id_col = as_column(object_ids, dtype=np.int32)
    x_col, y_col = normalize_point_columns(as_column(xs), as_column(ys))
    time_col = normalize_timestamp_column(as_column(timestamps))

    native_result = cpp_trajectory_distances_and_speeds(
        num_trajectories, id_col, x_col, y_col, time_col
    )
    frame = DataFrame._from_data(*native_result)
    # Label the row index so each row is identifiable as a trajectory.
    frame.index.name = "trajectory_id"
    return frame
def points_in_spatial_window(min_x, max_x, min_y, max_y, xs, ys):
    """Return only the coordinates that fall within a rectangular window.

    A point ``(x, y)`` is inside the query window if and only if
    ``min_x < x < max_x AND min_y < y < max_y``

    The window is specified by minimum and maximum x and y coordinates.

    Parameters
    ----------
    min_x
        lower x-coordinate of the query window
    max_x
        upper x-coordinate of the query window
    min_y
        lower y-coordinate of the query window
    max_y
        upper y-coordinate of the query window
    xs
        column of x-coordinates that may fall within the window
    ys
        column of y-coordinates that may fall within the window

    Returns
    -------
    result : cudf.DataFrame
        subset of ``(x, y)`` pairs above that fall within the window

    Notes
    -----
    * Swaps ``min_x`` and ``max_x`` if ``min_x > max_x``
    * Swaps ``min_y`` and ``max_y`` if ``min_y > max_y``
    """
    # Enforce the documented swap here instead of relying on the native
    # layer; for bounds that are already ordered this is a no-op.
    min_x, max_x = min(min_x, max_x), max(min_x, max_x)
    min_y, max_y = min(min_y, max_y), max(min_y, max_y)

    xs, ys = normalize_point_columns(as_column(xs), as_column(ys))
    return DataFrame._from_data(
        *spatial_window.points_in_spatial_window(
            min_x, max_x, min_y, max_y, xs, ys
        )
    )
def check_packed_pickled_equality(df):
    """Check that packed frames survive pickling in several shapes.

    Exercises the frame as given, three slices of it, a copy sorted by the
    ``"vals"`` column, and, when pickle protocol 5 is available, a round trip
    that ships the buffers out-of-band.
    """
    # basic
    assert_packed_frame_picklable(df)

    # sliced
    for window in (slice(None, -1), slice(1, None), slice(2, -2)):
        assert_packed_frame_picklable(df[window])

    # sorted
    by_vals = df.sort_values("vals")
    assert isinstance(by_vals.index, GenericIndex)
    assert_packed_frame_picklable(by_vals)

    # out-of-band
    if pickle.HIGHEST_PROTOCOL < 5:
        return
    oob_buffers = []
    payload = pickle.dumps(
        pack(df), protocol=5, buffer_callback=oob_buffers.append
    )
    assert all(isinstance(buf, pickle.PickleBuffer) for buf in oob_buffers)
    restored = unpack(pickle.loads(payload, buffers=oob_buffers))
    assert_eq(DataFrame._from_data(restored._data, restored._index), df)
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimal_cols_as_float=None,
    timestamp_type=None,
    use_python_file_object=True,
    **kwargs,
):
    """{docstring}"""
    # NOTE: the docstring above is a template placeholder and must stay as-is.
    if decimal_cols_as_float is not None:
        warnings.warn(
            "`decimal_cols_as_float` is deprecated and will be removed in "
            "the future",
            FutureWarning,
        )

    from cudf import DataFrame

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Each source must have a correlating stripe list. If a single stripe list
    # is provided rather than a list of list of stripes then extrapolate that
    # stripe list across all input sources
    if stripes is not None:
        if any(not isinstance(stripe, list) for stripe in stripes):
            stripes = [stripes]

        # Must ensure a stripe for each source is specified, unless None
        if len(stripes) != len(filepath_or_buffer):
            raise ValueError(
                "A list of stripes must be provided for each input source"
            )

    # Resolve every source into concrete paths/buffers; a directory source is
    # turned into a "*.orc" glob pattern before resolution.
    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(
                passed_filesystem=None, path=source, **kwargs,
            )
            source = stringify_path(source)
            source = fs.sep.join([source, "*.orc"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            use_python_file_object=use_python_file_object,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        selected_stripes = _filter_stripes(
            filters, filepaths_or_buffers, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepaths_or_buffers[0], columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        return DataFrame._from_data(
            *liborc.read_orc(
                filepaths_or_buffers,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimal_cols_as_float,
                timestamp_type,
            )
        )
    else:
        # Any engine other than "cudf" falls back to reading through PyArrow.

        def read_orc_stripe(orc_file, stripe, columns):
            # A stripe may come back as a RecordBatch; normalise to a Table.
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        if len(filepath_or_buffer) > 1:
            # Message fixed: it previously read "a single a single input
            # source" because of a duplicated phrase.
            raise NotImplementedError(
                "Using CPU via PyArrow only supports a single input source"
            )

        orc_file = orc.ORCFile(filepath_or_buffer[0])
        if stripes is not None and len(stripes) > 0:
            for stripe_source_file in stripes:
                pa_tables = [
                    read_orc_stripe(orc_file, i, columns)
                    for i in stripe_source_file
                ]
                pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
def assert_packed_frame_serializable(df):
    """Assert that a packed frame survives ``serialize``/``deserialize``.

    The frame is packed, serialized to a ``(header, frames)`` pair,
    deserialized, unpacked, rebuilt into a DataFrame, and compared with the
    input.
    """
    packed_frame = pack(df)
    serialized = packed_frame.serialize()
    restored = unpack(packed_frame.deserialize(*serialized))
    assert_eq(DataFrame._from_data(restored._data, restored._index), df)
def assert_packed_frame_picklable(df):
    """Assert that a packed frame survives a pickle round trip.

    The frame is packed, pickled with the default protocol, unpickled,
    unpacked, rebuilt into a DataFrame, and compared with the input.
    """
    payload = pickle.dumps(pack(df))
    restored = unpack(pickle.loads(payload))
    assert_eq(DataFrame._from_data(restored._data, restored._index), df)
def point_in_polygon(
    test_points_x,
    test_points_y,
    poly_offsets,
    poly_ring_offsets,
    poly_points_x,
    poly_points_y,
):
    """Compute which of a set of points fall within which polygons.

    Note that ``poly_points_(x,y)`` must be specified as closed polygons: the
    first and last coordinate of each polygon must be the same.

    Parameters
    ----------
    test_points_x
        x-coordinate of test points
    test_points_y
        y-coordinate of test points
    poly_offsets
        beginning index of the first ring in each polygon. If this object
        carries a label index (e.g. a ``cudf.Series``), those labels become
        the result's column names; otherwise the columns are numbered
        ``0 .. len(poly_offsets) - 1``.
    poly_ring_offsets
        beginning index of the first point in each ring
    poly_points_x
        x closed-coordinate of polygon points
    poly_points_y
        y closed-coordinate of polygon points

    Returns
    -------
    result : cudf.DataFrame
        A DataFrame of boolean values indicating whether each point (rows)
        falls within each polygon (columns). An empty DataFrame is returned
        when ``poly_offsets`` is empty.

    Notes
    -----
    Input Series x and y will not be index aligned, but computed as
    sequential arrays.

    Examples
    --------
    Test whether 3 points fall within either of two polygons

    >>> result = cuspatial.point_in_polygon(
            [0, -8, 6.0],                             # test_points_x
            [0, -8, 6.0],                             # test_points_y
            cudf.Series([0, 1], index=['nyc', 'hudson river']),  # poly_offsets
            [0, 3],                                   # ring_offsets
            [-10, 5, 5, -10, 0, 10, 10, 0],           # poly_points_x
            [-10, -10, 5, 5, 0, 0, 10, 10],           # poly_points_y
        )
    >>> print(result)
         nyc  hudson river
    0   True          True
    1   True         False
    2  False          True
    """
    if len(poly_offsets) == 0:
        return DataFrame()

    (
        test_points_x,
        test_points_y,
        poly_points_x,
        poly_points_y,
    ) = normalize_point_columns(
        as_column(test_points_x),
        as_column(test_points_y),
        as_column(poly_points_x),
        as_column(poly_points_y),
    )

    result = cpp_point_in_polygon(
        test_points_x,
        test_points_y,
        as_column(poly_offsets, dtype="int32"),
        as_column(poly_ring_offsets, dtype="int32"),
        poly_points_x,
        poly_points_y,
    )

    # Expand the per-point polygon bitmap into one column per polygon.
    result = gis_utils.pip_bitmap_column_to_binary_array(
        polygon_bitmap_column=result, width=len(poly_offsets)
    )
    result = DataFrame(result)
    result = DataFrame._from_data(
        {name: col.astype("bool") for name, col in result._data.items()}
    )

    # Column labels come from the offsets' index when it has one. A plain
    # list exposes ``index`` as a bound method and an ndarray has no such
    # attribute, so fall back to positional labels for those inputs instead
    # of failing after the native computation has already run.
    labels = getattr(poly_offsets, "index", None)
    if labels is None or callable(labels):
        labels = range(len(poly_offsets))

    # The expanded columns are labelled in reverse polygon order and then
    # re-selected in reverse so the result reads in the caller's order.
    result.columns = list(reversed(labels))
    result = result[list(reversed(result.columns))]
    return result
def quadtree_on_points(
    xs, ys, x_min, x_max, y_min, y_max, scale, max_depth, min_size
):
    """Construct a quadtree from a set of points for an area of interest.

    Parameters
    ----------
    xs
        Column of x-coordinates for each point
    ys
        Column of y-coordinates for each point
    x_min
        The lower-left x-coordinate of the area of interest bounding box
    x_max
        The upper-right x-coordinate of the area of interest bounding box
    y_min
        The lower-left y-coordinate of the area of interest bounding box
    y_max
        The upper-right y-coordinate of the area of interest bounding box
    scale
        Scale to apply to each point's distance from ``(x_min, y_min)``
    max_depth
        Maximum quadtree depth
    min_size
        Minimum number of points for a non-leaf quadtree node

    Returns
    -------
    result : tuple (cudf.Series, cudf.DataFrame)
        keys_to_points : cudf.Series(dtype=np.int32)
            A column of sorted keys to original point indices
        quadtree : cudf.DataFrame
            A complete quadtree for the set of input points

            key : cudf.Series(dtype=np.int32)
                An int32 column of quadrant keys
            level : cudf.Series(dtype=np.int8)
                An int8 column of quadtree levels
            is_quad : cudf.Series(dtype=np.bool_)
                A boolean column indicating whether the node is a quad or
                leaf
            length : cudf.Series(dtype=np.int32)
                For a non-leaf quadrant (``is_quad`` is ``True``), the number
                of children in the quadrant; otherwise the number of points
                contained in the leaf quadrant.
            offset : cudf.Series(dtype=np.int32)
                For a non-leaf quadrant (``is_quad`` is ``True``), the
                position of the quadrant's first child; otherwise the
                position of the leaf quadrant's first point.

    Notes
    -----
    * Swaps ``x_min`` and ``x_max`` if ``x_min > x_max``
    * Swaps ``y_min`` and ``y_max`` if ``y_min > y_max``
    * Emits a warning and clamps ``scale`` up to
      ``max(x_max - x_min, y_max - y_min) / ((1 << max_depth) + 2)`` when the
      supplied value is smaller than that.

    Examples
    --------
    An example of selecting the ``min_size`` and ``scale`` based on input::

        >>> np.random.seed(0)
        >>> points = cudf.DataFrame({
                "x": cudf.Series(np.random.normal(size=120)) * 500,
                "y": cudf.Series(np.random.normal(size=120)) * 500,
            })
        >>> max_depth = 3
        >>> min_size = 50
        >>> min_x, min_y, max_x, max_y = (points["x"].min(),
                                          points["y"].min(),
                                          points["x"].max(),
                                          points["y"].max())
        >>> scale = max(max_x - min_x, max_y - min_y) // (1 << max_depth)
        >>> key_to_point, quadtree = cuspatial.quadtree_on_points(
                points["x"],
                points["y"],
                min_x,
                max_x,
                min_y,
                max_y,
                scale, max_depth, min_size
            )
    """
    x_col, y_col = normalize_point_columns(as_column(xs), as_column(ys))

    # Order each pair of bounds so inverted input is tolerated.
    lo_x, hi_x = min(x_min, x_max), max(x_min, x_max)
    lo_y, hi_y = min(y_min, y_max), max(y_min, y_max)

    # Smallest scale the quadtree can represent for this extent and depth.
    widest_extent = max(hi_x - lo_x, hi_y - lo_y)
    min_scale = widest_extent / ((1 << max_depth) + 2)

    if scale < min_scale:
        message = (
            "scale {} is less than required minimum "
            "scale {}. Clamping to minimum scale"
        ).format(scale, min_scale)
        warnings.warn(message)

    effective_scale = max(scale, min_scale)
    point_keys, tree_parts = cpp_quadtree_on_points(
        x_col,
        y_col,
        lo_x,
        hi_x,
        lo_y,
        hi_y,
        effective_scale,
        max_depth,
        min_size,
    )
    return Series(point_keys), DataFrame._from_data(*tree_parts)
def quadtree_point_in_polygon(
    poly_quad_pairs,
    quadtree,
    point_indices,
    points_x,
    points_y,
    poly_offsets,
    ring_offsets,
    poly_points_x,
    poly_points_y,
):
    """Test whether the specified points are inside any of the polygons.

    Uses the table of (polygon, quadrant) pairs returned by
    ``cuspatial.join_quadtree_and_bounding_boxes`` to ensure only the points
    in the same quadrant as each polygon are tested for intersection.

    This pre-filtering can dramatically reduce the number of points tested
    per polygon, enabling faster intersection-testing at the expense of extra
    memory allocated to store the quadtree and sorted point_indices.

    Parameters
    ----------
    poly_quad_pairs : cudf.DataFrame
        Table of (polygon, quadrant) index pairs returned by
        ``cuspatial.join_quadtree_and_bounding_boxes``.
    quadtree : cudf.DataFrame
        A complete quadtree for a given area-of-interest bounding box.
    point_indices : cudf.Series
        Sorted point indices returned by ``cuspatial.quadtree_on_points``;
        converted to ``uint32``
    points_x : cudf.Series
        x-coordinates of points used to construct the quadtree.
    points_y : cudf.Series
        y-coordinates of points used to construct the quadtree.
    poly_offsets : cudf.Series
        Begin index of the first ring in each polygon; converted to
        ``uint32``
    ring_offsets : cudf.Series
        Begin index of the first point in each ring; converted to ``uint32``
    poly_points_x : cudf.Series
        Polygon point x-coordinates.
    poly_points_y : cudf.Series
        Polygon point y-coordinates.

    Returns
    -------
    result : cudf.DataFrame
        Indices for each intersecting point and polygon pair.

        polygon_index : cudf.Series
            Indices of each polygon with which a point intersected.
        point_index : cudf.Series
            Indices of each point that intersects with a polygon.
    """
    raw_columns = (
        as_column(points_x),
        as_column(points_y),
        as_column(poly_points_x),
        as_column(poly_points_y),
    )
    pts_x, pts_y, poly_x, poly_y = normalize_point_columns(*raw_columns)

    index_col = as_column(point_indices, dtype="uint32")
    poly_offsets_col = as_column(poly_offsets, dtype="uint32")
    ring_offsets_col = as_column(ring_offsets, dtype="uint32")

    matches = spatial_join.quadtree_point_in_polygon(
        poly_quad_pairs,
        quadtree,
        index_col,
        pts_x,
        pts_y,
        poly_offsets_col,
        ring_offsets_col,
        poly_x,
        poly_y,
    )
    return DataFrame._from_data(*matches)
def quadtree_point_to_nearest_polyline(
    poly_quad_pairs,
    quadtree,
    point_indices,
    points_x,
    points_y,
    poly_offsets,
    poly_points_x,
    poly_points_y,
):
    """Find the nearest polyline to each point in a quadrant.

    Also computes the distance between each point and that polyline.

    Uses the table of (polyline, quadrant) pairs returned by
    ``cuspatial.join_quadtree_and_bounding_boxes`` to ensure distances are
    computed only for the points in the same quadrant as each polyline.

    Parameters
    ----------
    poly_quad_pairs : cudf.DataFrame
        Table of (polyline, quadrant) index pairs returned by
        ``cuspatial.join_quadtree_and_bounding_boxes``.
    quadtree : cudf.DataFrame
        A complete quadtree for a given area-of-interest bounding box.
    point_indices : cudf.Series
        Sorted point indices returned by ``cuspatial.quadtree_on_points``;
        converted to ``uint32``
    points_x : cudf.Series
        x-coordinates of points used to construct the quadtree.
    points_y : cudf.Series
        y-coordinates of points used to construct the quadtree.
    poly_offsets : cudf.Series
        Begin index of the first point in each polyline; converted to
        ``uint32``
    poly_points_x : cudf.Series
        Polyline point x-coordinates.
    poly_points_y : cudf.Series
        Polyline point y-coordinates.

    Returns
    -------
    result : cudf.DataFrame
        Indices for each point and its nearest polyline, and the distance
        between the two.

        point_index : cudf.Series
            Indices of each point that intersects with a polyline.
        polyline_index : cudf.Series
            Indices of each polyline with which a point intersected.
        distance : cudf.Series
            Distances between each point and its nearest polyline.
    """
    raw_columns = (
        as_column(points_x),
        as_column(points_y),
        as_column(poly_points_x),
        as_column(poly_points_y),
    )
    pts_x, pts_y, line_x, line_y = normalize_point_columns(*raw_columns)

    index_col = as_column(point_indices, dtype="uint32")
    offsets_col = as_column(poly_offsets, dtype="uint32")

    nearest = spatial_join.quadtree_point_to_nearest_polyline(
        poly_quad_pairs,
        quadtree,
        index_col,
        pts_x,
        pts_y,
        offsets_col,
        line_x,
        line_y,
    )
    return DataFrame._from_data(*nearest)