Example #1
def rotate_non_max_suppression_cpu(boxes: np.ndarray, order: np.ndarray,
                                   standup_iou: np.ndarray, thresh: float):
    if not BoxOps.has_boost():
        raise NotImplementedError(
            "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
        )
    return BoxOps.rotate_non_max_suppression_cpu(tv.from_numpy(boxes),
                                                 tv.from_numpy(order),
                                                 tv.from_numpy(standup_iou),
                                                 thresh)
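
The snippet above only wraps the C++ op; it expects its inputs to be precomputed. As a hedged illustration (the [N, 4, 2] corner layout, the int32 descending-score order, and the [N, N] standup-IoU prefilter are assumptions from typical SECOND-style pipelines, not shown in this snippet), here is a plain-numpy conversion from (cx, cy, w, h, angle) boxes to the corner form such ops usually consume:

import numpy as np

def rbbox_to_corners(rbboxes: np.ndarray) -> np.ndarray:
    # (cx, cy, w, h, angle) -> [N, 4, 2] corner array (assumed layout)
    cx, cy, w, h, ang = (rbboxes[:, i] for i in range(5))
    base = np.array([[-0.5, -0.5], [-0.5, 0.5], [0.5, 0.5], [0.5, -0.5]])  # unit square
    corners = base[None] * np.stack([w, h], axis=1)[:, None]               # [N, 4, 2]
    rot = np.stack([np.stack([np.cos(ang), -np.sin(ang)], -1),
                    np.stack([np.sin(ang), np.cos(ang)], -1)], -2)         # [N, 2, 2]
    corners = np.einsum("nij,nkj->nki", rot, corners)
    return corners + np.stack([cx, cy], axis=1)[:, None]

# hypothetical call (order = descending score order, standup_iou = axis-aligned prefilter):
# keep = rotate_non_max_suppression_cpu(rbbox_to_corners(rbboxes),
#                                       scores.argsort()[::-1].astype(np.int32),
#                                       standup_iou, thresh=0.1)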
Example #2
def rbbox_iou_loss(box_corners: np.ndarray, qbox_corners: np.ndarray):
    if not BoxOps.has_boost():
        raise NotImplementedError(
            "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
        )
    N = box_corners.shape[0]
    overlap = np.zeros((N, ), dtype=box_corners.dtype)

    BoxOps.rbbox_iou_aligned(tv.from_numpy(box_corners),
                             tv.from_numpy(qbox_corners),
                             tv.from_numpy(overlap), False)
    return overlap
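
rbbox_iou_aligned computes one IoU per row, box i against qbox i, rather than an N x K matrix. A minimal sketch of what "aligned" means, written for axis-aligned boxes in plain numpy (the real op handles rotated corner polygons, which is why it needs boost):

import numpy as np

def aligned_iou_axis_aligned(boxes_a: np.ndarray, boxes_b: np.ndarray) -> np.ndarray:
    # boxes: [N, 4] as (x1, y1, x2, y2); returns IoU of box i vs box i
    lt = np.maximum(boxes_a[:, :2], boxes_b[:, :2])
    rb = np.minimum(boxes_a[:, 2:], boxes_b[:, 2:])
    wh = np.clip(rb - lt, 0, None)
    inter = wh[:, 0] * wh[:, 1]
    area_a = (boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1])
    area_b = (boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1])
    return inter / np.clip(area_a + area_b - inter, 1e-8, None)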
Example #3
def rbbox_intersection(box_corners: np.ndarray, qbox_corners: np.ndarray,
                       standup_iou: np.ndarray, standup_thresh: float):
    if not BoxOps.has_boost():
        raise NotImplementedError(
            "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
        )
    N = box_corners.shape[0]
    K = qbox_corners.shape[0]
    overlap = np.zeros((N, K), dtype=box_corners.dtype)

    BoxOps.rbbox_iou(tv.from_numpy(box_corners), tv.from_numpy(qbox_corners),
                     tv.from_numpy(standup_iou), tv.from_numpy(overlap),
                     standup_thresh, True)
    return overlap
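
Here standup_iou and standup_thresh act as a cheap prefilter: pairs whose axis-aligned ("standup") bounding boxes barely overlap are skipped before the expensive rotated-polygon intersection. A plain-numpy sketch of how such a prefilter matrix can be built (the helper names are mine, not spconv API):

import numpy as np

def corners_to_standup(corners: np.ndarray) -> np.ndarray:
    # [N, 4, 2] rotated-box corners -> [N, 4] axis-aligned (x1, y1, x2, y2) hulls
    return np.concatenate([corners.min(axis=1), corners.max(axis=1)], axis=1)

def standup_iou_matrix(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # [N, 4] x [K, 4] -> [N, K] IoU of the axis-aligned hulls
    lt = np.maximum(a[:, None, :2], b[None, :, :2])
    rb = np.minimum(a[:, None, 2:], b[None, :, 2:])
    wh = np.clip(rb - lt, 0, None)
    inter = wh[..., 0] * wh[..., 1]
    area_a = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
    area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
    return inter / np.clip(area_a[:, None] + area_b[None, :] - inter, 1e-8, None)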
Example #4
def main_point_with_features():
    np.random.seed(50051)
    # voxel gen source code: spconv/csrc/sparse/pointops.py
    gen = Point2VoxelCPU3d(
        vsize_xyz=[0.1, 0.1, 0.1],
        coors_range_xyz=[-80, -80, -2, 80, 80, 6],
        num_point_features=4,  # num_point_features must equal pc.shape[1]
        max_num_voxels=5000,
        max_num_points_per_voxel=5)

    pc = np.random.uniform(-10, 10, size=[1000, 3])
    other_pc_feature = np.random.uniform(-1, 1, size=[1000, 1])
    pc_with_feature = np.concatenate([pc, other_pc_feature], axis=1)
    pc_tv = tv.from_numpy(pc_with_feature)
    # generate voxels; note that voxels_tv references a persistent buffer in the generator,
    # so we can't run it in multiple threads.
    voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel(pc_tv)
    voxels_np = voxels_tv.numpy_view()
    indices_np = indices_tv.numpy_view()
    num_p_in_vx_np = num_p_in_vx_tv.numpy_view()
    print(f"------Raw Voxels {voxels_np.shape[0]}-------")
    print(voxels_np[0])
    # run voxel gen again and FILL the remaining empty slots of each voxel with the MEAN of its points
    voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel_empty_mean(
        pc_tv)
    voxels_np = voxels_tv.numpy_view()
    indices_np = indices_tv.numpy_view()
    num_p_in_vx_np = num_p_in_vx_tv.numpy_view()
    print("------Voxels with mean filled-------")
    print(voxels_np[0])
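
Both calls above return numpy_view() arrays that alias the same internal buffer, so the mean-filled result overwrites the raw one. If you need to keep the raw voxels, copy them out before the second call, e.g. with the copying .numpy() accessor also used in Example #7 below:

    # place this right after the first point_to_voxel call:
    voxels_raw = voxels_tv.numpy()        # .numpy() copies; numpy_view() only aliases
    # equivalently: voxels_raw = np.array(voxels_np)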
Example #5
def reduce_mask_count(mask: np.ndarray, width: int):
    mask_length_32 = (div_up(mask.shape[0], width)) * width
    if mask.shape[0] < mask_length_32:
        mask_pad = np.zeros((mask_length_32, ), dtype=mask.dtype)
        mask_pad[:mask.shape[0]] = mask
        mask = mask_pad
    mask = mask.reshape(-1, width)
    maskr = np.bitwise_or.reduce(mask, axis=1)
    maskr_tv = tv.from_numpy(maskr)
    return SpconvOps.count_bits(maskr_tv).numpy().sum() * width
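
The function above pads the bit mask to a multiple of `width`, ORs each group of `width` mask words, and multiplies the popcount of the reduced words by `width`. A pure-numpy restatement for reference (div_up is assumed to be the usual ceiling-division helper, matching how it is used above):

import numpy as np

def div_up(a: int, b: int) -> int:
    # assumed helper: ceiling division
    return (a + b - 1) // b

def reduce_mask_count_np(mask: np.ndarray, width: int) -> int:
    n = div_up(mask.shape[0], width) * width
    padded = np.zeros((n,), dtype=mask.dtype)
    padded[:mask.shape[0]] = mask
    reduced = np.bitwise_or.reduce(padded.reshape(-1, width), axis=1)
    popcount = int(np.unpackbits(reduced.view(np.uint8)).sum())  # count set bits
    return popcount * width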
Example #6
def waymo_data(batch_size=1):
    gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
                           150000, 1)
    # gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
    #                        150000)
    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    pc = np.ascontiguousarray(data["pc"])
    print(pc.shape)
    voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc))
    voxels = voxels_tv.numpy().reshape(-1, 3)
    coors = indices_tv.numpy()
    N = coors.shape[0]
    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
    return voxels, coors, gen.grid_size
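
A hedged sketch of feeding the returned voxels/coors into a spconv SparseConvTensor; the constructor signature is the standard spconv v2 one, and the zyx ordering of coordinates and spatial shape is assumed here rather than shown in the snippet:

import torch
import spconv.pytorch as spconv

voxels, coors, grid_size = waymo_data()
features = torch.from_numpy(voxels).float().cuda()   # [N, 3] per-voxel xyz (one point per voxel)
indices = torch.from_numpy(coors).int().cuda()       # [N, 4] as (batch, z, y, x), assumed order
spatial_shape = list(grid_size[::-1])                 # grid_size is xyz; zyx order assumed here
x = spconv.SparseConvTensor(features, indices, spatial_shape, batch_size=1)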
Example #7
def generate(self, points):
    if self.spconv_ver == 1:
        voxel_output = self._voxel_generator.generate(points)
        if isinstance(voxel_output, dict):
            voxels, coordinates, num_points = \
                voxel_output['voxels'], voxel_output['coordinates'], voxel_output['num_points_per_voxel']
        else:
            voxels, coordinates, num_points = voxel_output
    else:
        assert tv is not None, "Unexpected error: the 'cumm' library wasn't imported properly."
        voxel_output = self._voxel_generator.point_to_voxel(tv.from_numpy(points))
        tv_voxels, tv_coordinates, tv_num_points = voxel_output
        # make a copy with numpy(), since a numpy_view() becomes invalid as soon as the generator is deleted
        voxels = tv_voxels.numpy()
        coordinates = tv_coordinates.numpy()
        num_points = tv_num_points.numpy()
    return voxels, coordinates, num_points
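
For the else branch above (spconv >= 2), self._voxel_generator is presumably a Point2Voxel generator like the one in Example #4. A hypothetical construction with placeholder values:

from spconv.utils import Point2VoxelCPU3d

voxel_generator = Point2VoxelCPU3d(
    vsize_xyz=[0.05, 0.05, 0.1],               # placeholder voxel size
    coors_range_xyz=[-80, -80, -2, 80, 80, 6],
    num_point_features=4,                      # must match points.shape[1]
    max_num_voxels=150000,
    max_num_points_per_voxel=5)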
Example #8
def main_cuda():
    np.random.seed(50051)
    from spconv.utils import Point2VoxelGPU3d

    # voxel gen source code: spconv/csrc/sparse/pointops.py
    gen = Point2VoxelGPU3d(vsize_xyz=[0.1, 0.1, 0.1],
                           coors_range_xyz=[-80, -80, -2, 80, 80, 6],
                           num_point_features=3,
                           max_num_voxels=5000,
                           max_num_points_per_voxel=5)

    pc = np.random.uniform(-10, 10, size=[100000, 3]).astype(np.float32)
    pc_tv = tv.from_numpy(pc).cuda()
    # generate voxels; note that voxels_tv references a persistent buffer in the generator,
    # so we can't run it in multiple threads.
    voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel_hash(pc_tv)
    voxels_np = voxels_tv.cpu().numpy()
    indices_np = indices_tv.cpu().numpy()
    num_p_in_vx_np = num_p_in_vx_tv.cpu().numpy()
    print(f"------CUDA Raw Voxels {voxels_np.shape[0]}-------")
    print(voxels_np[0])
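
num_p_in_vx_np[i] tells how many of the max_num_points_per_voxel slots in voxels_np[i] hold real points; the remaining slots are padding. A small numpy follow-up that ignores the padded slots when averaging each voxel's points (variables taken from the example above):

valid = np.arange(voxels_np.shape[1])[None, :] < num_p_in_vx_np[:, None]      # [V, 5] slot mask
points_mean = (voxels_np * valid[..., None]).sum(1) / np.clip(
    num_p_in_vx_np[:, None], 1, None)                                          # [V, 3]
print(points_mean[0])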
Example #9
def non_max_suppression_cpu(boxes: np.ndarray,
                            order: np.ndarray,
                            thresh: float,
                            eps: float = 0.0):
    return BoxOps.non_max_suppression_cpu(tv.from_numpy(boxes),
                                          tv.from_numpy(order), thresh, eps)
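
The snippet above is again a thin wrapper. As a reference for what the op computes, here is a pure-numpy greedy NMS over (x1, y1, x2, y2) boxes; the exact box layout, eps convention, and return type of the C++ op are not shown here, so treat this as an assumed-equivalent sketch rather than its specification:

import numpy as np

def nms_reference(boxes: np.ndarray, order: np.ndarray, thresh: float, eps: float = 0.0):
    # greedy NMS: visit boxes in `order` (typically scores.argsort()[::-1]),
    # keep a box unless it overlaps an already-kept box with IoU > thresh
    keep = []
    suppressed = np.zeros(boxes.shape[0], dtype=bool)
    areas = (boxes[:, 2] - boxes[:, 0] + eps) * (boxes[:, 3] - boxes[:, 1] + eps)
    for i in order:
        if suppressed[i]:
            continue
        keep.append(int(i))
        lt = np.maximum(boxes[i, :2], boxes[:, :2])
        rb = np.minimum(boxes[i, 2:], boxes[:, 2:])
        wh = np.clip(rb - lt + eps, 0, None)
        inter = wh[:, 0] * wh[:, 1]
        suppressed |= inter / (areas[i] + areas - inter) > thresh
    return keep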
Example #10
def dev_subm_inds_v2(subm: bool = False, run_conv: bool = True):
    limit_input_n = 16384
    limit_input_n = None  # override the limit above: use the full input
    np.random.seed(484)

    with (PACKAGE_ROOT.parent / "test/data/test_spconv.pkl").open("rb") as f:
        voxels_np, indices_np, spatial_shape = pickle.load(f)
        from spconv.test_utils import generate_sparse_data
        voxels_np = voxels_np[:limit_input_n]
        indices_np = indices_np[:limit_input_n]

        spatial_shape = [19, 18, 17]
        sparse_dict = generate_sparse_data(spatial_shape, [1024], 128)

        voxels_np = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        indices_np = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)

        voxels = tv.from_numpy(voxels_np).cuda()
        indices = tv.from_numpy(indices_np).cuda()
        indices_th = torch.from_numpy(indices_np).cuda()
    print(spatial_shape, indices_np.shape)
    ndim = 3
    if subm:
        ksize = [3, 3, 3]
        kv = np.prod(ksize)
        padding = [1] * ndim
        stride = [1] * ndim
        dilation = [1] * ndim
        out_padding = [0] * ndim
    else:
        ksize = [2, 2, 2]
        kv = np.prod(ksize)
        padding = [0] * ndim
        stride = [1] * ndim
        dilation = [1] * ndim
        out_padding = [0] * ndim
    out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
        indices_th, 1, spatial_shape, ConvAlgo.Native, ksize, stride, padding,
        dilation, out_padding, subm)
    indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
    indice_pairs_np = pair_ref.cpu().numpy()
    algo = ConvAlgo.MaskSplitImplicitGemm
    if algo == ConvAlgo.MaskImplicitGemm:
        num_split = 1
    else:
        num_split = 2
    for i in range(5):
        res = ops.get_indice_pairs_implicit_gemm(indices_th, 1, spatial_shape,
                                                 algo, ksize, stride, padding,
                                                 dilation, out_padding, subm)
    out_inds = res[0]
    num_inds_per_loc = res[1]
    pair_fwd = res[2]
    pair_fwd_x = pair_fwd.cpu().numpy().reshape(-1)
    pair_fwd_x[pair_fwd_x == -1] = 0
    loc_num_np = (pair_fwd_x > 0).reshape(kv, -1).sum(1)
    print(loc_num_np)
    print(indice_num_per_loc_np)

    pair_bwd = res[3]
    pair_mask_fwd_splits = res[4]
    pair_mask_bwd_splits = res[5]
    mask_argsort_fwd_splits = res[6]
    mask_argsort_bwd_splits = res[7]
    masks = res[8]
    pair_mask_fwd_splits_tv = [
        ops.torch_tensor_to_tv(t, dtype=tv.uint32)
        for t in pair_mask_fwd_splits
    ]
    valid_location_bitcount = [
        SpconvOps.count_bits(t) for t in pair_mask_fwd_splits_tv
    ]
    valid_location_count = sum(
        [t.cpu().numpy().sum() for t in valid_location_bitcount])
    reduce_length = 32
    split_mask_valid_count = sum([
        reduce_mask_count(t.cpu().numpy(), reduce_length)
        for t in pair_mask_fwd_splits_tv
    ])
    if subm:
        print("SUBM", valid_location_count, split_mask_valid_count,
              pair_fwd.numel())
    else:
        print("REGULAR", valid_location_count, split_mask_valid_count,
              pair_fwd.numel())
    # return

    if run_conv:
        C = 64
        K = 64
        desps = CONV.desps
        mask_output_fwd = torch.zeros([2, div_up(out_inds.shape[0], 32)],
                                      dtype=torch.int32,
                                      device=indices_th.device)
        mask_output_bwd = torch.zeros([2, div_up(indices.dim(0), 32)],
                                      dtype=torch.int32,
                                      device=indices_th.device)

        for desp in desps:
            if desp.algo != GemmAlgo.Simt.value:
                continue
            # if desp.op_type == ConvOpType.kBackwardWeight.value:
            #     continue
            # if desp.tile_shape !
            if desp.dtype_a == dtypes.int8.tv_dtype:
                inp = np.random.randint(-1, 1, size=[voxels_np.shape[0],
                                                     C]).astype(np.int8)
                weight = np.random.randint(-1, 1, size=[K, *ksize,
                                                        C]).astype(np.int8)
                output = np.random.randint(-1, 1, size=[
                    out_inds.shape[0], K
                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
            else:
                inp = np.random.uniform(-1, 1, size=[
                    voxels_np.shape[0], C
                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_input))
                weight = np.random.uniform(-1, 1, size=[K, *ksize, C]).astype(
                    dtypes.get_npdtype_from_tvdtype(desp.dtype_weight))
                output = np.random.uniform(-1, 1, size=[
                    out_inds.shape[0], K
                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
            weight_ref = weight.transpose(1, 2, 3, 0, 4)
            weight_ref = np.ascontiguousarray(weight_ref).reshape(-1, K, C)
            if desp.op_type == ConvOpType.kBackwardInput.value:
                inp_tv = tv.zeros(inp.shape, desp.dtype_input, 0)
            else:
                inp_tv = tv.from_numpy(inp).cuda()
            if desp.op_type == ConvOpType.kBackwardWeight.value:
                weight_tv = tv.zeros(weight.shape, desp.dtype_weight, 0)
            else:
                weight_tv = tv.from_numpy(weight).cuda()
            # _ = tv.zeros([5000, 10], tv.float32, 0)
            if desp.op_type == ConvOpType.kForward.value:
                output_tv = tv.zeros(output.shape, desp.dtype_output, 0)
            else:
                output_tv = tv.from_numpy(output).cuda()
            torch.cuda.synchronize()
            t = time.time()
            spk = 1
            if desp.op_type == ConvOpType.kBackwardWeight.value:
                # TODO support splitk parallel
                spk = 32
            if subm:
                if desp.op_type == ConvOpType.kForward.value:
                    indice_pairs = pair_fwd
                elif desp.op_type == ConvOpType.kBackwardInput.value:
                    indice_pairs = pair_bwd
                else:
                    indice_pairs = pair_fwd
                mask_output = mask_output_fwd
                # print([bin(x.item()) for x in masks])
                for j in range(num_split):
                    beta = 1 if j == 1 else 0
                    mask_filter = 0xffffffff
                    mask_filter = masks[j].item()

                    reverse_mask = False
                    if desp.op_type == ConvOpType.kBackwardWeight.value:
                        mask_op = mask_output[j]
                    else:
                        mask_op = pair_mask_fwd_splits[j]
                    if desp.op_type == ConvOpType.kBackwardInput.value:
                        reverse_mask = True
                    CONV.run_with_tuned_result(
                        BestConvAlgoByProfile(desp, spk),
                        desp.op_type,
                        inp_tv,
                        weight_tv,
                        output_tv,
                        torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                        torch_tensor_to_tv(mask_argsort_fwd_splits[j]),
                        torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
                        torch_tensor_to_tv(indice_pairs),
                        reverse_mask,
                        mask_filter=mask_filter,
                        mask_width=32,
                        beta=beta,
                        verbose=True,
                    )
            else:
                if desp.op_type == ConvOpType.kForward.value:
                    indice_pairs = pair_fwd  # inp -> out
                    mask_ops = pair_mask_fwd_splits
                    mask_argsorts = mask_argsort_fwd_splits
                    mask_output = mask_output_fwd
                elif desp.op_type == ConvOpType.kBackwardInput.value:
                    indice_pairs = pair_bwd  # out -> inp
                    mask_ops = pair_mask_bwd_splits
                    mask_argsorts = mask_argsort_bwd_splits
                    mask_output = mask_output_bwd

                    print([bin(x.item()) for x in masks])
                else:
                    indice_pairs = pair_fwd  # inp -> out
                    mask_ops = pair_mask_fwd_splits
                    mask_argsorts = mask_argsort_fwd_splits
                    mask_output = mask_output_fwd

                for j in range(2):
                    beta = 1 if j == 1 else 0
                    mask_filter = masks[j].item()
                    reverse_mask = False
                    if desp.op_type == ConvOpType.kBackwardWeight.value:
                        mask_op = mask_output[j]
                    else:
                        mask_op = mask_ops[j]

                    CONV.run_with_tuned_result(
                        BestConvAlgoByProfile(desp, spk),
                        desp.op_type,
                        inp_tv,
                        weight_tv,
                        output_tv,
                        torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                        torch_tensor_to_tv(mask_argsorts[j]),
                        torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
                        torch_tensor_to_tv(indice_pairs),
                        reverse_mask,
                        mask_filter=mask_filter,
                        mask_width=32,
                        beta=beta,
                        verbose=True,
                    )

            torch.cuda.synchronize()
            duration = time.time() - t
            if desp.op_type == ConvOpType.kForward.value:
                output_ref = np.zeros_like(output, dtype=np.float32)
                # ref algorithm
                for filter_offset in range(kv):
                    if subm and filter_offset > kv // 2:
                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
                    elif subm and filter_offset == kv // 2:
                        nhot = voxels.shape[0]
                    else:
                        nhot = indice_num_per_loc_np[filter_offset]
                    a_inds = indice_pairs_np[0][filter_offset][:nhot]
                    c_inds = indice_pairs_np[1][filter_offset][:nhot]
                    # print(a_inds_cpu[:10])
                    a = inp[a_inds]
                    cc = a.astype(
                        np.float32) @ weight_ref[filter_offset].T.astype(
                            np.float32)
                    output_ref[c_inds] += cc

                output_cpu = output_tv.cpu().numpy().astype(np.float32)
                duration = time.time() - t
                my = output_cpu.reshape(-1)
                print("ERROR", np.linalg.norm(output_ref.reshape(-1) - my))

            elif desp.op_type == ConvOpType.kBackwardInput.value:
                dinput_ref = np.zeros_like(inp, dtype=np.float32)
                # ref algorithm
                for filter_offset in range(kv):
                    if subm and filter_offset > kv // 2:
                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
                    elif subm and filter_offset == kv // 2:
                        nhot = voxels.shape[0]
                    else:
                        nhot = indice_num_per_loc_np[filter_offset]
                    a_inds = indice_pairs_np[1][filter_offset][:nhot]
                    c_inds = indice_pairs_np[0][filter_offset][:nhot]

                    # print(a_inds_cpu[:10])
                    a = output[a_inds]
                    # NK @ KC
                    cc = a.astype(
                        np.float32) @ weight_ref[filter_offset].astype(
                            np.float32)
                    dinput_ref[c_inds] += cc
                din_cpu = inp_tv.cpu().numpy()
                print(
                    "ERROR",
                    np.linalg.norm(
                        din_cpu.reshape(-1) - dinput_ref.reshape(-1)))
            else:
                dw_ref = np.zeros_like(weight_ref,
                                       dtype=np.float32)  # KV, K, C
                for filter_offset in range(kv):
                    if subm and filter_offset > kv // 2:
                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
                    elif subm and filter_offset == kv // 2:
                        nhot = voxels.shape[0]
                    else:
                        nhot = indice_num_per_loc_np[filter_offset]
                    o_inds = indice_pairs_np[1][filter_offset][:nhot]
                    i_inds = indice_pairs_np[0][filter_offset][:nhot]
                    # print(a_inds_cpu[:10])
                    out_gather = output[o_inds]  # [N, K]
                    inp_gather = inp[i_inds]  # [N, C]
                    # KN @ NC
                    dw_res = out_gather.astype(
                        np.float32).T @ inp_gather.astype(np.float32)
                    dw_ref[filter_offset] = dw_res
                # print(indice_pairs_np_test[0])
                dw_ref_kcrs = dw_ref.transpose(1, 0, 2)
                dw_cpu = weight_tv.cpu().numpy().reshape(K, np.prod(ksize), C)

                print(
                    "ERROR",
                    np.linalg.norm(
                        dw_cpu.reshape(-1) - dw_ref_kcrs.reshape(-1)))
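
All three verification loops above (forward, backward-input, backward-weight) follow the same gather-GEMM-scatter pattern over kernel offsets. A standalone numpy restatement of the forward case for reference; the function name and argument packing are mine, and, as in the loops above, each output index is assumed to appear at most once per kernel offset so the fancy-indexed += is a valid scatter-add:

import numpy as np

def gather_gemm_scatter_ref(inp, weight_kvkc, pairs_in, pairs_out, nhot_per_kv, num_out):
    # inp: [N_in, C]; weight_kvkc: [KV, K, C]; pairs_*: [KV, max_pairs] index tables;
    # nhot_per_kv[kv]: number of valid (input, output) pairs for kernel offset kv
    out = np.zeros((num_out, weight_kvkc.shape[1]), dtype=np.float32)
    for kv in range(weight_kvkc.shape[0]):
        n = int(nhot_per_kv[kv])
        if n == 0:
            continue
        a = inp[pairs_in[kv, :n]].astype(np.float32)       # gather inputs
        out[pairs_out[kv, :n]] += a @ weight_kvkc[kv].T    # GEMM, then scatter-add
    return out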