def rotate_non_max_suppression_cpu(boxes: np.ndarray, order: np.ndarray,
                                   standup_iou: np.ndarray, thresh: float):
    """Forward a rotated-box NMS request to the native ``BoxOps`` CPU kernel.

    Args:
        boxes: array handed to the kernel as the box description.
        order: array handed to the kernel as the processing order.
        standup_iou: array handed to the kernel as the precomputed
            "standup" IoU input.
        thresh: threshold forwarded unchanged to the kernel.

    Returns:
        Whatever ``BoxOps.rotate_non_max_suppression_cpu`` returns.

    Raises:
        NotImplementedError: if ``BoxOps.has_boost()`` reports that the
            extension was built without boost.
    """
    if BoxOps.has_boost():
        # Wrap every numpy input as a tv tensor before entering the kernel.
        boxes_tv = tv.from_numpy(boxes)
        order_tv = tv.from_numpy(order)
        standup_iou_tv = tv.from_numpy(standup_iou)
        return BoxOps.rotate_non_max_suppression_cpu(boxes_tv, order_tv,
                                                     standup_iou_tv, thresh)
    raise NotImplementedError(
        "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
    )
def rbbox_iou_loss(box_corners: np.ndarray, qbox_corners: np.ndarray):
    """Run the aligned rotated-box IoU kernel and return its output buffer.

    A zero-initialised 1-D buffer with one entry per row of ``box_corners``
    (same dtype as ``box_corners``) is allocated, handed to
    ``BoxOps.rbbox_iou_aligned`` together with both corner arrays and a
    trailing ``False`` flag, and then returned.

    Args:
        box_corners: corner array; its first dimension sizes the output.
        qbox_corners: second corner array passed to the kernel.

    Returns:
        The 1-D numpy buffer passed to the kernel (presumably filled in
        place by it -- the kernel's return value is not used here).

    Raises:
        NotImplementedError: if the extension was built without boost.
    """
    if BoxOps.has_boost():
        num_boxes = box_corners.shape[0]
        result = np.zeros((num_boxes, ), dtype=box_corners.dtype)
        corners_tv = tv.from_numpy(box_corners)
        query_corners_tv = tv.from_numpy(qbox_corners)
        result_tv = tv.from_numpy(result)
        BoxOps.rbbox_iou_aligned(corners_tv, query_corners_tv, result_tv,
                                 False)
        return result
    raise NotImplementedError(
        "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
    )
def rbbox_intersection(box_corners: np.ndarray, qbox_corners: np.ndarray,
                       standup_iou: np.ndarray, standup_thresh: float):
    """Run the pairwise rotated-box kernel and return its output matrix.

    A zero-initialised ``(N, K)`` buffer is allocated, where ``N`` and ``K``
    are the first dimensions of ``box_corners`` and ``qbox_corners``. It is
    passed to ``BoxOps.rbbox_iou`` with the standup IoU array, the standup
    threshold and a trailing ``True`` flag, and then returned.

    Args:
        box_corners: corner array for the first box set.
        qbox_corners: corner array for the query box set.
        standup_iou: precomputed "standup" IoU array forwarded to the kernel.
        standup_thresh: threshold forwarded unchanged to the kernel.

    Returns:
        The ``(N, K)`` numpy buffer passed to the kernel (presumably filled
        in place by it -- the kernel's return value is not used here).

    Raises:
        NotImplementedError: if the extension was built without boost.
    """
    if BoxOps.has_boost():
        out_shape = (box_corners.shape[0], qbox_corners.shape[0])
        result = np.zeros(out_shape, dtype=box_corners.dtype)
        corners_tv = tv.from_numpy(box_corners)
        query_corners_tv = tv.from_numpy(qbox_corners)
        standup_iou_tv = tv.from_numpy(standup_iou)
        result_tv = tv.from_numpy(result)
        BoxOps.rbbox_iou(corners_tv, query_corners_tv, standup_iou_tv,
                         result_tv, standup_thresh, True)
        return result
    raise NotImplementedError(
        "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
    )
def main_point_with_features():
    """Demo: voxelize random points that carry one extra feature column.

    Builds a ``Point2VoxelCPU3d`` generator for 4-feature points, voxelizes
    1000 random points twice (plain, then with ``point_to_voxel_empty_mean``)
    and prints the first voxel of each result.
    """
    np.random.seed(50051)
    # Voxel generator source: spconv/csrc/sparse/pointops.py
    voxel_gen = Point2VoxelCPU3d(
        vsize_xyz=[0.1, 0.1, 0.1],
        coors_range_xyz=[-80, -80, -2, 80, 80, 6],
        # num_point_features has to match the column count of the points.
        num_point_features=4,
        max_num_voxels=5000,
        max_num_points_per_voxel=5)

    xyz = np.random.uniform(-10, 10, size=[1000, 3])
    extra_feature = np.random.uniform(-1, 1, size=[1000, 1])
    points = np.concatenate([xyz, extra_feature], axis=1)
    points_tv = tv.from_numpy(points)

    # The returned tensors refer to a persistent buffer owned by the
    # generator, so one generator must not be used from several threads.
    vox_tv, idx_tv, cnt_tv = voxel_gen.point_to_voxel(points_tv)
    voxels_np = vox_tv.numpy_view()
    coords_view = idx_tv.numpy_view()
    counts_view = cnt_tv.numpy_view()
    print(f"------Raw Voxels {voxels_np.shape[0]}-------")
    print(voxels_np[0])

    # Second pass: unused point slots of a voxel get filled with the mean.
    vox_tv, idx_tv, cnt_tv = voxel_gen.point_to_voxel_empty_mean(points_tv)
    voxels_np = vox_tv.numpy_view()
    coords_view = idx_tv.numpy_view()
    counts_view = cnt_tv.numpy_view()
    print("------Voxels with mean filled-------")
    print(voxels_np[0])
def reduce_mask_count(mask: np.ndarray, width: int):
    """Count set bits after OR-merging every ``width`` consecutive mask words.

    The mask is zero-padded up to a multiple of ``width``, reshaped to rows
    of ``width`` entries, and each row is collapsed with a bitwise OR. The
    set bits of the merged words are counted through
    ``SpconvOps.count_bits``, summed, and multiplied by ``width``.

    Args:
        mask: 1-D array of mask words (indexed and padded along axis 0).
        width: number of consecutive mask words merged into one.

    Returns:
        ``sum(bit counts of the merged words) * width``.
    """
    word_count = mask.shape[0]
    padded_count = div_up(word_count, width) * width
    if word_count < padded_count:
        # Zero padding leaves the OR of the final, partial group unchanged.
        padded = np.zeros((padded_count, ), dtype=mask.dtype)
        padded[:word_count] = mask
        mask = padded
    grouped = mask.reshape(-1, width)
    merged = np.bitwise_or.reduce(grouped, axis=1)
    bit_counts = SpconvOps.count_bits(tv.from_numpy(merged)).numpy()
    return bit_counts.sum() * width
def waymo_data(batch_size=1):
    """Load the bundled benchmark point cloud and voxelize it on the CPU.

    Reads ``data/benchmark-pc.npz`` next to this file, voxelizes the ``"pc"``
    array with one point per voxel, and prepends a batch-index column of
    zeros to the voxel coordinates.

    Args:
        batch_size: currently unused; kept so existing callers keep working.

    Returns:
        Tuple ``(voxels, coors, grid_size)``: the voxel features reshaped to
        ``(-1, 3)``, the coordinates with a leading all-zero column, and
        ``gen.grid_size`` of the voxel generator.
    """
    gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
                           150000, 1)
    # np.load on an .npz returns an NpzFile that holds the archive open;
    # the context manager closes it once the array has been read out.
    with np.load(Path(__file__).parent / "data" / "benchmark-pc.npz") as data:
        pc = np.ascontiguousarray(data["pc"])
    print(pc.shape)
    voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc))
    # numpy() (rather than numpy_view()) is used for the returned arrays.
    voxels = voxels_tv.numpy().reshape(-1, 3)
    coors = indices_tv.numpy()
    N = coors.shape[0]
    # Prepend a constant-zero column (single sample, so batch index 0).
    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
    return voxels, coors, gen.grid_size
def generate(self, points):
    """Voxelize ``points`` with whichever spconv generation is wrapped.

    Args:
        points: point array handed to the underlying voxel generator.

    Returns:
        Tuple ``(voxels, coordinates, num_points)``.
    """
    if self.spconv_ver != 1:
        # spconv v2 path: the generator works on cumm ``tv`` tensors.
        assert tv is not None, f"Unexpected error, library: 'cumm' wasn't imported properly."
        tv_voxels, tv_coordinates, tv_num_points = \
            self._voxel_generator.point_to_voxel(tv.from_numpy(points))
        # numpy() copies the data; a numpy_view() would stop being valid as
        # soon as the generator is deleted.
        return (tv_voxels.numpy(), tv_coordinates.numpy(),
                tv_num_points.numpy())

    # spconv v1 path: the result is either a dict or a 3-item sequence.
    result = self._voxel_generator.generate(points)
    if isinstance(result, dict):
        return (result['voxels'], result['coordinates'],
                result['num_points_per_voxel'])
    voxels, coordinates, num_points = result
    return voxels, coordinates, num_points
def main_cuda():
    """Demo: voxelize 100000 random float32 points with the GPU generator.

    Uses ``Point2VoxelGPU3d.point_to_voxel_hash`` on a CUDA tensor, copies
    the three results back to host numpy arrays and prints the first voxel.
    """
    np.random.seed(50051)
    from spconv.utils import Point2VoxelGPU3d

    # Voxel generator source: spconv/csrc/sparse/pointops.py
    voxel_gen = Point2VoxelGPU3d(vsize_xyz=[0.1, 0.1, 0.1],
                                 coors_range_xyz=[-80, -80, -2, 80, 80, 6],
                                 num_point_features=3,
                                 max_num_voxels=5000,
                                 max_num_points_per_voxel=5)

    points = np.random.uniform(-10, 10, size=[100000, 3]).astype(np.float32)
    points_gpu = tv.from_numpy(points).cuda()

    # The returned tensors refer to a persistent buffer owned by the
    # generator, so one generator must not be used from several threads.
    vox_gpu, idx_gpu, cnt_gpu = voxel_gen.point_to_voxel_hash(points_gpu)
    voxels_np = vox_gpu.cpu().numpy()
    coords_host = idx_gpu.cpu().numpy()
    counts_host = cnt_gpu.cpu().numpy()
    print(f"------CUDA Raw Voxels {voxels_np.shape[0]}-------")
    print(voxels_np[0])
def non_max_suppression_cpu(boxes: np.ndarray,
                            order: np.ndarray,
                            thresh: float,
                            eps: float = 0.0):
    """Forward an NMS request to the native ``BoxOps`` CPU kernel.

    Args:
        boxes: array handed to the kernel as the box description.
        order: array handed to the kernel as the processing order.
        thresh: threshold forwarded unchanged to the kernel.
        eps: extra scalar forwarded unchanged to the kernel (default 0.0).

    Returns:
        Whatever ``BoxOps.non_max_suppression_cpu`` returns.
    """
    boxes_tv = tv.from_numpy(boxes)
    order_tv = tv.from_numpy(order)
    return BoxOps.non_max_suppression_cpu(boxes_tv, order_tv, thresh, eps)
def dev_subm_inds_v2(subm: bool = False, run_conv: bool = True):
    """Development driver: build indice pairs and check conv kernels.

    The routine
      1. generates a synthetic sparse tensor with ``generate_sparse_data``,
      2. builds reference indice pairs with ``ConvAlgo.Native`` and the
         masked pairs with ``ConvAlgo.MaskSplitImplicitGemm``,
      3. prints bit-count statistics of the forward pair masks, and
      4. if ``run_conv`` is true, runs every ``GemmAlgo.Simt`` descriptor in
         ``CONV.desps`` through ``CONV.run_with_tuned_result`` and prints the
         L2 norm of the difference to a NumPy reference ("ERROR ...").

    Args:
        subm: selects the submanifold setup (3x3x3 kernel, padding 1)
            instead of the regular one (2x2x2 kernel, padding 0); it is also
            forwarded to the indice-pair builders.
        run_conv: when false, only the indice-pair statistics are printed.

    Nothing is returned; all results are printed. Requires CUDA (tensors are
    moved with ``.cuda()``).
    """
    limit_input_n = 16384
    # Overrides the line above: with None the [:limit_input_n] slices below
    # keep everything.
    limit_input_n = None
    np.random.seed(484)
    with (PACKAGE_ROOT.parent / "test/data/test_spconv.pkl").open("rb") as f:
        # NOTE(review): pickle.load must only be used on trusted files.
        voxels_np, indices_np, spatial_shape = pickle.load(f)
        from spconv.test_utils import generate_sparse_data
        voxels_np = voxels_np[:limit_input_n]
        indices_np = indices_np[:limit_input_n]
        # Everything loaded from the pickle is replaced by synthetic data
        # from here on (spatial_shape, voxels_np and indices_np are all
        # reassigned below).
        spatial_shape = [19, 18, 17]
        sparse_dict = generate_sparse_data(spatial_shape, [1024], 128)

        voxels_np = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        # Moves column 3 of the generated indices to the front; presumably
        # that column is the batch index -- TODO confirm against
        # generate_sparse_data.
        indices_np = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)

        voxels = tv.from_numpy(voxels_np).cuda()
        indices = tv.from_numpy(indices_np).cuda()
        indices_th = torch.from_numpy(indices_np).cuda()
    print(spatial_shape, indices_np.shape)
    ndim = 3
    # Convolution geometry; kv is the number of kernel offsets.
    if subm:
        ksize = [3, 3, 3]
        kv = np.prod(ksize)
        padding = [1] * ndim
        stride = [1] * ndim
        dilation = [1] * ndim
        out_padding = [0] * ndim
    else:
        ksize = [2, 2, 2]
        kv = np.prod(ksize)
        padding = [0] * ndim
        stride = [1] * ndim
        dilation = [1] * ndim
        out_padding = [0] * ndim
    # Reference pairs from the native algorithm; they drive the NumPy
    # reference computations further down.
    out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
        indices_th, 1, spatial_shape, ConvAlgo.Native, ksize, stride, padding,
        dilation, out_padding, subm)
    indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
    indice_pairs_np = pair_ref.cpu().numpy()
    algo = ConvAlgo.MaskSplitImplicitGemm
    # algo is fixed to the split variant just above, so num_split is 2 here.
    if algo == ConvAlgo.MaskImplicitGemm:
        num_split = 1
    else:
        num_split = 2
    # Same call repeated five times; only the last result is used
    # (presumably a warm-up / timing leftover).
    for i in range(5):
        res = ops.get_indice_pairs_implicit_gemm(indices_th, 1, spatial_shape,
                                                 algo, ksize, stride, padding,
                                                 dilation, out_padding, subm)
    # Unpack the result tuple by position.
    out_inds = res[0]
    num_inds_per_loc = res[1]
    pair_fwd = res[2]
    pair_fwd_x = pair_fwd.cpu().numpy().reshape(-1)
    pair_fwd_x[pair_fwd_x == -1] = 0
    # Per-offset count of strictly positive pair entries; printed next to
    # the native per-offset counts for comparison by eye.
    loc_num_np = (pair_fwd_x > 0).reshape(kv, -1).sum(1)
    print(loc_num_np)
    print(indice_num_per_loc_np)

    pair_bwd = res[3]
    pair_mask_fwd_splits = res[4]
    pair_mask_bwd_splits = res[5]
    mask_argsort_fwd_splits = res[6]
    mask_argsort_bwd_splits = res[7]
    masks = res[8]
    pair_mask_fwd_splits_tv = [
        ops.torch_tensor_to_tv(t, dtype=tv.uint32)
        for t in pair_mask_fwd_splits
    ]
    # Total number of set bits over all forward split masks.
    valid_location_bitcount = [
        SpconvOps.count_bits(t) for t in pair_mask_fwd_splits_tv
    ]
    valid_location_count = sum(
        [t.cpu().numpy().sum() for t in valid_location_bitcount])
    # Same statistic after OR-merging every 32 consecutive mask words.
    reduce_length = 32
    split_mask_valid_count = sum([
        reduce_mask_count(t.cpu().numpy(), reduce_length)
        for t in pair_mask_fwd_splits_tv
    ])
    if subm:
        print("SUBM", valid_location_count, split_mask_valid_count,
              pair_fwd.numel())
    else:
        print("REGULAR", valid_location_count, split_mask_valid_count,
              pair_fwd.numel())
    # return
    if run_conv:
        C = 64  # input channels used for the test tensors
        K = 64  # output channels used for the test tensors
        desps = CONV.desps
        # One int32 mask row per split, div_up(n, 32) words each.
        mask_output_fwd = torch.zeros([2, div_up(out_inds.shape[0], 32)],
                                      dtype=torch.int32,
                                      device=indices_th.device)
        mask_output_bwd = torch.zeros([2, div_up(indices.dim(0), 32)],
                                      dtype=torch.int32,
                                      device=indices_th.device)

        for desp in desps:
            # Only SIMT kernels are exercised.
            if desp.algo != GemmAlgo.Simt.value:
                continue
            # if desp.op_type == ConvOpType.kBackwardWeight.value:
            #     continue
            # if desp.tile_shape !
            # (the commented-out filter above is truncated in the source)
            # Random host tensors in the dtypes the descriptor expects.
            if desp.dtype_a == dtypes.int8.tv_dtype:
                inp = np.random.randint(-1, 1, size=[voxels_np.shape[0],
                                                     C]).astype(np.int8)
                weight = np.random.randint(-1, 1, size=[K, *ksize,
                                                        C]).astype(np.int8)
                output = np.random.randint(-1, 1, size=[
                    out_inds.shape[0], K
                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
            else:
                inp = np.random.uniform(-1, 1, size=[
                    voxels_np.shape[0], C
                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_input))
                weight = np.random.uniform(-1, 1, size=[K, *ksize, C]).astype(
                    dtypes.get_npdtype_from_tvdtype(desp.dtype_weight))
                output = np.random.uniform(-1, 1, size=[
                    out_inds.shape[0], K
                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
            # [K, kd, kh, kw, C] -> [kv, K, C] so the reference loops can
            # index the weight by kernel offset.
            weight_ref = weight.transpose(1, 2, 3, 0, 4)
            weight_ref = np.ascontiguousarray(weight_ref).reshape(-1, K, C)
            # The tensor an op type produces starts as zeros on the device;
            # the other two are uploaded from the random host data.
            if desp.op_type == ConvOpType.kBackwardInput.value:
                inp_tv = tv.zeros(inp.shape, desp.dtype_input, 0)
            else:
                inp_tv = tv.from_numpy(inp).cuda()
            if desp.op_type == ConvOpType.kBackwardWeight.value:
                weight_tv = tv.zeros(weight.shape, desp.dtype_weight, 0)
            else:
                weight_tv = tv.from_numpy(weight).cuda()
            # _ = tv.zeros([5000, 10], tv.float32, 0)
            if desp.op_type == ConvOpType.kForward.value:
                output_tv = tv.zeros(output.shape, desp.dtype_output, 0)
            else:
                output_tv = tv.from_numpy(output).cuda()
            torch.cuda.synchronize()
            t = time.time()
            spk = 1
            if desp.op_type == ConvOpType.kBackwardWeight.value:
                # TODO support splitk parallel
                spk = 32
            if subm:
                if desp.op_type == ConvOpType.kForward.value:
                    indice_pairs = pair_fwd
                elif desp.op_type == ConvOpType.kBackwardInput.value:
                    indice_pairs = pair_bwd
                else:
                    indice_pairs = pair_fwd
                mask_output = mask_output_fwd
                # print([bin(x.item()) for x in masks])
                for j in range(num_split):
                    # Second split runs with beta=1, presumably so that it
                    # accumulates onto the first split's result -- confirm
                    # against CONV.run_with_tuned_result.
                    beta = 1 if j == 1 else 0
                    # The all-ones default is immediately replaced by the
                    # per-split filter.
                    mask_filter = 0xffffffff
                    mask_filter = masks[j].item()

                    reverse_mask = False
                    if desp.op_type == ConvOpType.kBackwardWeight.value:
                        mask_op = mask_output[j]
                    else:
                        mask_op = pair_mask_fwd_splits[j]
                    if desp.op_type == ConvOpType.kBackwardInput.value:
                        reverse_mask = True
                    CONV.run_with_tuned_result(
                        BestConvAlgoByProfile(desp, spk),
                        desp.op_type,
                        inp_tv,
                        weight_tv,
                        output_tv,
                        torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                        torch_tensor_to_tv(mask_argsort_fwd_splits[j]),
                        torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
                        torch_tensor_to_tv(indice_pairs),
                        reverse_mask,
                        mask_filter=mask_filter,
                        mask_width=32,
                        beta=beta,
                        verbose=True,
                    )
            else:
                # Regular conv: backward-input uses the bwd pairs/masks,
                # the other two op types use the fwd ones.
                if desp.op_type == ConvOpType.kForward.value:
                    indice_pairs = pair_fwd  # inp -> out
                    mask_ops = pair_mask_fwd_splits
                    mask_argsorts = mask_argsort_fwd_splits
                    mask_output = mask_output_fwd
                elif desp.op_type == ConvOpType.kBackwardInput.value:
                    indice_pairs = pair_bwd  # out -> inp
                    mask_ops = pair_mask_bwd_splits
                    mask_argsorts = mask_argsort_bwd_splits
                    mask_output = mask_output_bwd
                    print([bin(x.item()) for x in masks])
                else:
                    indice_pairs = pair_fwd  # inp -> out
                    mask_ops = pair_mask_fwd_splits
                    mask_argsorts = mask_argsort_fwd_splits
                    mask_output = mask_output_fwd
                # NOTE(review): hard-coded 2 here versus num_split in the
                # subm branch, and reverse_mask is never set to True in this
                # branch -- confirm both are intended.
                for j in range(2):
                    beta = 1 if j == 1 else 0
                    mask_filter = masks[j].item()
                    reverse_mask = False
                    if desp.op_type == ConvOpType.kBackwardWeight.value:
                        mask_op = mask_output[j]
                    else:
                        mask_op = mask_ops[j]

                    CONV.run_with_tuned_result(
                        BestConvAlgoByProfile(desp, spk),
                        desp.op_type,
                        inp_tv,
                        weight_tv,
                        output_tv,
                        torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                        torch_tensor_to_tv(mask_argsorts[j]),
                        torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
                        torch_tensor_to_tv(indice_pairs),
                        reverse_mask,
                        mask_filter=mask_filter,
                        mask_width=32,
                        beta=beta,
                        verbose=True,
                    )
            torch.cuda.synchronize()
            # duration is measured but never printed or returned.
            duration = time.time() - t
            if desp.op_type == ConvOpType.kForward.value:
                output_ref = np.zeros_like(output, dtype=np.float32)
                # ref algorithm: gather inputs per kernel offset, multiply
                # by that offset's weight, scatter-add into the output rows.
                for filter_offset in range(kv):
                    # For subm, offsets past the centre take their pair
                    # count from the mirrored offset, and the centre offset
                    # uses every voxel.
                    if subm and filter_offset > kv // 2:
                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
                    elif subm and filter_offset == kv // 2:
                        nhot = voxels.shape[0]
                    else:
                        nhot = indice_num_per_loc_np[filter_offset]
                    a_inds = indice_pairs_np[0][filter_offset][:nhot]
                    c_inds = indice_pairs_np[1][filter_offset][:nhot]
                    # print(a_inds_cpu[:10])
                    a = inp[a_inds]
                    # NC @ CK
                    cc = a.astype(
                        np.float32) @ weight_ref[filter_offset].T.astype(
                            np.float32)
                    output_ref[c_inds] += cc

                output_cpu = output_tv.cpu().numpy().astype(np.float32)
                duration = time.time() - t
                my = output_cpu.reshape(-1)
                print("ERROR", np.linalg.norm(output_ref.reshape(-1) - my))
            elif desp.op_type == ConvOpType.kBackwardInput.value:
                dinput_ref = np.zeros_like(inp, dtype=np.float32)
                # ref algorithm: same loop as forward with the roles of the
                # two pair rows swapped and the weight not transposed.
                for filter_offset in range(kv):
                    if subm and filter_offset > kv // 2:
                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
                    elif subm and filter_offset == kv // 2:
                        nhot = voxels.shape[0]
                    else:
                        nhot = indice_num_per_loc_np[filter_offset]
                    a_inds = indice_pairs_np[1][filter_offset][:nhot]
                    c_inds = indice_pairs_np[0][filter_offset][:nhot]
                    # print(a_inds_cpu[:10])
                    a = output[a_inds]
                    # NK @ KC
                    cc = a.astype(
                        np.float32) @ weight_ref[filter_offset].astype(
                            np.float32)
                    dinput_ref[c_inds] += cc
                din_cpu = inp_tv.cpu().numpy()
                print(
                    "ERROR",
                    np.linalg.norm(
                        din_cpu.reshape(-1) - dinput_ref.reshape(-1)))
            else:
                # Backward-weight reference.
                dw_ref = np.zeros_like(weight_ref,
                                       dtype=np.float32)  # KV, K, C
                for filter_offset in range(kv):
                    if subm and filter_offset > kv // 2:
                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
                    elif subm and filter_offset == kv // 2:
                        nhot = voxels.shape[0]
                    else:
                        nhot = indice_num_per_loc_np[filter_offset]
                    o_inds = indice_pairs_np[1][filter_offset][:nhot]
                    i_inds = indice_pairs_np[0][filter_offset][:nhot]
                    # print(a_inds_cpu[:10])
                    out_gather = output[o_inds]  # [N, K]
                    inp_gather = inp[i_inds]  # [N, C]
                    # KN @ NC
                    dw_res = out_gather.astype(
                        np.float32).T @ inp_gather.astype(np.float32)
                    dw_ref[filter_offset] = dw_res
                # print(indice_pairs_np_test[0])
                # [kv, K, C] -> [K, kv, C] to match the reshaped device
                # weight below.
                dw_ref_kcrs = dw_ref.transpose(1, 0, 2)
                dw_cpu = weight_tv.cpu().numpy().reshape(K, np.prod(ksize), C)

                print(
                    "ERROR",
                    np.linalg.norm(
                        dw_cpu.reshape(-1) - dw_ref_kcrs.reshape(-1)))