def to_depth_mode(points, bboxes):
    """Convert points and bboxes to Depth Coord and Depth Box mode."""
    if points is not None:
        points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR,
                                           Coord3DMode.DEPTH)
    if bboxes is not None:
        bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR,
                                   Box3DMode.DEPTH)
    return points, bboxes
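# Illustrative usage of the helper above (a sketch; the variable names are
# placeholders, not part of the tool): `lidar_points` is an (N, >=3) numpy
# array and `lidar_bboxes` an (M, 7) LiDAR-mode box tensor; both come back
# expressed in depth mode, which is what the visualizer below expects.
#   depth_points, depth_bboxes = to_depth_mode(lidar_points, lidar_bboxes)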


def main():
    args = parse_args()

    if args.output_dir is not None:
        mkdir_or_exist(args.output_dir)

    cfg = retrieve_data_cfg(args.config, args.skip_type, args.cfg_options)
    if cfg.data.train['type'] == 'RepeatDataset':
        cfg.data.train.dataset['pipeline'] = get_loading_pipeline(
            cfg.train_pipeline)
    else:
        cfg.data.train['pipeline'] = get_loading_pipeline(cfg.train_pipeline)
    dataset = build_dataset(cfg.data.train,
                            default_args=dict(filter_empty_gt=False))
    # For RepeatDataset type, the infos are stored in dataset.dataset
    if cfg.data.train['type'] == 'RepeatDataset':
        dataset = dataset.dataset
    data_infos = dataset.data_infos

    for idx, data_info in enumerate(track_iter_progress(data_infos)):
        if cfg.dataset_type in ['KittiDataset', 'WaymoDataset']:
            pts_path = data_info['point_cloud']['velodyne_path']
        elif cfg.dataset_type in ['ScanNetDataset', 'SUNRGBDDataset']:
            pts_path = data_info['pts_path']
        elif cfg.dataset_type in ['NuScenesDataset', 'LyftDataset']:
            pts_path = data_info['lidar_path']
        else:
            raise NotImplementedError(
                f'unsupported dataset type {cfg.dataset_type}')
        file_name = osp.splitext(osp.basename(pts_path))[0]
        save_path = osp.join(args.output_dir,
                             f'{file_name}.png') if args.output_dir else None

        example = dataset.prepare_train_data(idx)
        points = example['points']._data.numpy()
        points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
                                           Coord3DMode.DEPTH)
        gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor
        if gt_bboxes is not None:
            gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR,
                                          Box3DMode.DEPTH)

        vis = Visualizer(points, save_path='./show.png')
        vis.add_bboxes(bbox3d=gt_bboxes, bbox_color=(0, 0, 1))

        vis.show(save_path)
        del vis
    def convert_to(self, dst, rt_mat=None):
        """Convert self to ``dst`` mode.

        Args:
            dst (:obj:`Coord3DMode`): The target point mode.
            rt_mat (np.ndarray | torch.Tensor, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from `src` coordinates to
                `dst` coordinates usually comes with a change of sensors,
                e.g., from camera to LiDAR, which requires a transformation
                matrix.

        Returns:
            :obj:`BasePoints`: The converted point of the same type \
                in the `dst` mode.
        """
        from mmdet3d.core.bbox import Coord3DMode
        return Coord3DMode.convert_point(
            point=self, src=Coord3DMode.CAM, dst=dst, rt_mat=rt_mat)
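    # Illustrative usage (a sketch; ``cam_points`` stands for any
    # CameraPoints instance, as exercised in the tests further below):
    #   lidar_points = cam_points.convert_to(Coord3DMode.LIDAR)
    #   depth_points = cam_points.convert_to(Coord3DMode.DEPTH)
    # An explicit sensor transform can be passed via ``rt_mat`` when the
    # default axis permutation between the two frames is not sufficient.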
    def forward(self, imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas,
                calibs):
        """Forward function.

        Args:
            imgs (list[torch.Tensor]): Raw images of each sample.
            bboxes_2d_rescaled (list[torch.Tensor]): 2D bboxes in the
                rescaled image coordinates.
            seeds_3d_depth (torch.Tensor): 3D seed points in depth
                coordinates.
            img_metas (list[dict]): Meta information of images.
            calibs (dict): Camera calibration information of the images,
                with batched intrinsic ('K') and extrinsic ('Rt') matrices.

        Returns:
            torch.Tensor: Concatenated cues of each point.
            torch.Tensor: Validity mask of each feature.
        """
        img_features = []
        masks = []
        for i, data in enumerate(
                zip(imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas)):
            img, bbox_2d_rescaled, seed_3d_depth, img_meta = data
            bbox_num = bbox_2d_rescaled.shape[0]
            seed_num = seed_3d_depth.shape[0]

            img_shape = img_meta['img_shape']
            img_h, img_w, _ = img_shape

            # first reverse the data transformations
            xyz_depth = apply_3d_transformation(seed_3d_depth,
                                                'DEPTH',
                                                img_meta,
                                                reverse=True)

            # then convert from depth coords to camera coords
            xyz_cam = Coord3DMode.convert_point(xyz_depth,
                                                Coord3DMode.DEPTH,
                                                Coord3DMode.CAM,
                                                rt_mat=calibs['Rt'][i])

            # project to 2d to get image coords (uv)
            uv_origin = points_cam2img(xyz_cam, calibs['K'][i])
            uv_origin = (uv_origin - 1).round()

            # rescale 2d coordinates and bboxes
            uv_rescaled = coord_2d_transform(img_meta, uv_origin, True)
            bbox_2d_origin = bbox_2d_transform(img_meta, bbox_2d_rescaled,
                                               False)

            if bbox_num == 0:
                imvote_num = seed_num * self.max_imvote_per_pixel

                # no 2D detections for this sample: use all-zero cues and
                # mark only the first vote of each seed as valid
                two_cues = torch.zeros((15, imvote_num),
                                       device=seed_3d_depth.device)
                mask_zero = torch.zeros(imvote_num - seed_num,
                                        device=seed_3d_depth.device).bool()
                mask_one = torch.ones(seed_num,
                                      device=seed_3d_depth.device).bool()
                mask = torch.cat([mask_one, mask_zero], dim=0)
            else:
                # expand bboxes and seeds
                bbox_expanded = bbox_2d_origin.view(1, bbox_num, -1).expand(
                    seed_num, -1, -1)
                seed_2d_expanded = uv_origin.view(seed_num, 1,
                                                  -1).expand(-1, bbox_num, -1)
                seed_2d_expanded_x, seed_2d_expanded_y = \
                    seed_2d_expanded.split(1, dim=-1)

                bbox_expanded_l, bbox_expanded_t, bbox_expanded_r, \
                    bbox_expanded_b, bbox_expanded_conf, bbox_expanded_cls = \
                    bbox_expanded.split(1, dim=-1)
                bbox_expanded_midx = (bbox_expanded_l + bbox_expanded_r) / 2
                bbox_expanded_midy = (bbox_expanded_t + bbox_expanded_b) / 2

                # check which projected seeds fall inside which 2D bbox
                seed_2d_in_bbox_x = (seed_2d_expanded_x > bbox_expanded_l) * \
                    (seed_2d_expanded_x < bbox_expanded_r)
                seed_2d_in_bbox_y = (seed_2d_expanded_y > bbox_expanded_t) * \
                    (seed_2d_expanded_y < bbox_expanded_b)
                seed_2d_in_bbox = seed_2d_in_bbox_x * seed_2d_in_bbox_y

                # semantic cues, dim=class_num
                sem_cue = torch.zeros_like(bbox_expanded_conf).expand(
                    -1, -1, self.num_classes)
                sem_cue = sem_cue.scatter(-1, bbox_expanded_cls.long(),
                                          bbox_expanded_conf)

                # bbox center - uv
                delta_u = bbox_expanded_midx - seed_2d_expanded_x
                delta_v = bbox_expanded_midy - seed_2d_expanded_y

                seed_3d_expanded = seed_3d_depth.view(seed_num, 1, -1).expand(
                    -1, bbox_num, -1)

                z_cam = xyz_cam[..., 2:3].view(seed_num, 1,
                                               1).expand(-1, bbox_num, -1)

                # scale the pixel offsets by depth / focal length to get
                # metric offsets in the camera frame
                delta_u = delta_u * z_cam / calibs['K'][i, 0, 0]
                delta_v = delta_v * z_cam / calibs['K'][i, 0, 0]

                imvote = torch.cat(
                    [delta_u, delta_v,
                     torch.zeros_like(delta_v)], dim=-1).view(-1, 3)

                # convert from camera coords to depth coords
                imvote = Coord3DMode.convert_point(imvote.view((-1, 3)),
                                                   Coord3DMode.CAM,
                                                   Coord3DMode.DEPTH,
                                                   rt_mat=calibs['Rt'][i])

                # apply transformation to lifted imvotes
                imvote = apply_3d_transformation(imvote,
                                                 'DEPTH',
                                                 img_meta,
                                                 reverse=False)

                seed_3d_expanded = seed_3d_expanded.reshape(imvote.shape)

                # ray angle
                ray_angle = seed_3d_expanded + imvote
                ray_angle /= torch.sqrt(torch.sum(ray_angle**2, -1) +
                                        EPS).unsqueeze(-1)

                # imvote lifted to 3d: intersect the ray through
                # (seed + imvote) with the plane y = seed_y and take the
                # x-z offset from the seed
                xz = ray_angle[:, [0, 2]] / (ray_angle[:, [1]] + EPS) \
                    * seed_3d_expanded[:, [1]] - seed_3d_expanded[:, [0, 2]]

                # geometric cues, dim=5
                geo_cue = torch.cat([xz, ray_angle],
                                    dim=-1).view(seed_num, -1, 5)

                two_cues = torch.cat([geo_cue, sem_cue], dim=-1)
                # mask to 0 if seed not in bbox
                two_cues = two_cues * seed_2d_in_bbox.float()

                feature_size = two_cues.shape[-1]
                # if bbox number is too small, append zeros
                if bbox_num < self.max_imvote_per_pixel:
                    append_num = self.max_imvote_per_pixel - bbox_num
                    append_zeros = torch.zeros(
                        (seed_num, append_num, 1),
                        device=seed_2d_in_bbox.device).bool()
                    seed_2d_in_bbox = torch.cat(
                        [seed_2d_in_bbox, append_zeros], dim=1)
                    append_zeros = torch.zeros(
                        (seed_num, append_num, feature_size),
                        device=two_cues.device)
                    two_cues = torch.cat([two_cues, append_zeros], dim=1)
                    append_zeros = torch.zeros((seed_num, append_num, 1),
                                               device=two_cues.device)
                    bbox_expanded_conf = torch.cat(
                        [bbox_expanded_conf, append_zeros], dim=1)

                # score the seed-bbox pairs by validity plus confidence
                pair_score = seed_2d_in_bbox.float() + bbox_expanded_conf
                # and keep the top max_imvote_per_pixel pairs per seed
                mask, indices = pair_score.topk(self.max_imvote_per_pixel,
                                                dim=1,
                                                largest=True,
                                                sorted=True)

                indices_img = indices.expand(-1, -1, feature_size)
                two_cues = two_cues.gather(dim=1, index=indices_img)
                two_cues = two_cues.transpose(1, 0)
                two_cues = two_cues.reshape(-1, feature_size).transpose(
                    1, 0).contiguous()

                # since conf is ~ (0, 1), floor gives us validity
                mask = mask.floor().int()
                mask = mask.transpose(1, 0).reshape(-1).bool()

            # clear the padding
            img = img[:, :img_shape[0], :img_shape[1]]
            img_flatten = img.reshape(3, -1).float()
            img_flatten /= 255.

            # take the normalized pixel value as texture cue
            # flatten (u, v) into a row-major index over the (h, w) image
            uv_flatten = uv_rescaled[:, 1].round() * \
                img_shape[1] + uv_rescaled[:, 0].round()
            uv_expanded = uv_flatten.unsqueeze(0).expand(3, -1).long()
            txt_cue = torch.gather(img_flatten, dim=-1, index=uv_expanded)
            txt_cue = txt_cue.unsqueeze(1).expand(-1,
                                                  self.max_imvote_per_pixel,
                                                  -1).reshape(3, -1)

            # append texture cue
            img_feature = torch.cat([two_cues, txt_cue], dim=0)
            img_features.append(img_feature)
            masks.append(mask)

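        # stack the per-sample cues and validity masks along the batch
        # dimension: cues are (B, 8 + num_classes,
        # seed_num * max_imvote_per_pixel), masks are
        # (B, seed_num * max_imvote_per_pixel)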
        return torch.stack(img_features, 0), torch.stack(masks, 0)
def test_points_conversion():
    """Test the conversion of points between different modes."""
    points_np = np.array([[
        -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 0.6666, 0.1956,
        0.4974, 0.9409
    ],
                          [
                              -2.66751588e+01, 5.59499564e+00, -9.14345860e-01,
                              0.1502, 0.3707, 0.1086, 0.6297
                          ],
                          [
                              -5.80979675e+00, 3.54092357e+01, 2.00889888e-01,
                              0.6565, 0.6248, 0.6954, 0.2538
                          ],
                          [
                              -3.13086877e+01, 1.09007628e+00, -1.94612112e-01,
                              0.2803, 0.0258, 0.4896, 0.3269
                          ]],
                         dtype=np.float32)

    # test CAM to LIDAR and DEPTH
    cam_points = CameraPoints(points_np,
                              points_dim=7,
                              attribute_dims=dict(color=[3, 4, 5], height=6))

    convert_lidar_points = cam_points.convert_to(Coord3DMode.LIDAR)
    expected_tensor = torch.tensor([[
        2.9757e-01, 5.2422e+00, -4.0021e+01, 6.6660e-01, 1.9560e-01,
        4.9740e-01, 9.4090e-01
    ],
                                    [
                                        -9.1435e-01, 2.6675e+01, -5.5950e+00,
                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
                                        6.2970e-01
                                    ],
                                    [
                                        2.0089e-01, 5.8098e+00, -3.5409e+01,
                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
                                        2.5380e-01
                                    ],
                                    [
                                        -1.9461e-01, 3.1309e+01, -1.0901e+00,
                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
                                        3.2690e-01
                                    ]])

    lidar_point_tensor = Coord3DMode.convert_point(cam_points.tensor,
                                                   Coord3DMode.CAM,
                                                   Coord3DMode.LIDAR)
    assert torch.allclose(expected_tensor, convert_lidar_points.tensor, 1e-4)
    assert torch.allclose(lidar_point_tensor, convert_lidar_points.tensor,
                          1e-4)

    convert_depth_points = cam_points.convert_to(Coord3DMode.DEPTH)
    expected_tensor = torch.tensor([[
        -5.2422e+00, 2.9757e-01, -4.0021e+01, 6.6660e-01, 1.9560e-01,
        4.9740e-01, 9.4090e-01
    ],
                                    [
                                        -2.6675e+01, -9.1435e-01, -5.5950e+00,
                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
                                        6.2970e-01
                                    ],
                                    [
                                        -5.8098e+00, 2.0089e-01, -3.5409e+01,
                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
                                        2.5380e-01
                                    ],
                                    [
                                        -3.1309e+01, -1.9461e-01, -1.0901e+00,
                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
                                        3.2690e-01
                                    ]])

    depth_point_tensor = Coord3DMode.convert_point(cam_points.tensor,
                                                   Coord3DMode.CAM,
                                                   Coord3DMode.DEPTH)
    assert torch.allclose(expected_tensor, convert_depth_points.tensor, 1e-4)
    assert torch.allclose(depth_point_tensor, convert_depth_points.tensor,
                          1e-4)

    # test LIDAR to CAM and DEPTH
    lidar_points = LiDARPoints(points_np,
                               points_dim=7,
                               attribute_dims=dict(color=[3, 4, 5], height=6))

    convert_cam_points = lidar_points.convert_to(Coord3DMode.CAM)
    expected_tensor = torch.tensor([[
        -4.0021e+01, -2.9757e-01, -5.2422e+00, 6.6660e-01, 1.9560e-01,
        4.9740e-01, 9.4090e-01
    ],
                                    [
                                        -5.5950e+00, 9.1435e-01, -2.6675e+01,
                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
                                        6.2970e-01
                                    ],
                                    [
                                        -3.5409e+01, -2.0089e-01, -5.8098e+00,
                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
                                        2.5380e-01
                                    ],
                                    [
                                        -1.0901e+00, 1.9461e-01, -3.1309e+01,
                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
                                        3.2690e-01
                                    ]])

    cam_point_tensor = Coord3DMode.convert_point(lidar_points.tensor,
                                                 Coord3DMode.LIDAR,
                                                 Coord3DMode.CAM)
    assert torch.allclose(expected_tensor, convert_cam_points.tensor, 1e-4)
    assert torch.allclose(cam_point_tensor, convert_cam_points.tensor, 1e-4)

    convert_depth_points = lidar_points.convert_to(Coord3DMode.DEPTH)
    expected_tensor = torch.tensor([[
        -4.0021e+01, -5.2422e+00, 2.9757e-01, 6.6660e-01, 1.9560e-01,
        4.9740e-01, 9.4090e-01
    ],
                                    [
                                        -5.5950e+00, -2.6675e+01, -9.1435e-01,
                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
                                        6.2970e-01
                                    ],
                                    [
                                        -3.5409e+01, -5.8098e+00, 2.0089e-01,
                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
                                        2.5380e-01
                                    ],
                                    [
                                        -1.0901e+00, -3.1309e+01, -1.9461e-01,
                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
                                        3.2690e-01
                                    ]])

    depth_point_tensor = Coord3DMode.convert_point(lidar_points.tensor,
                                                   Coord3DMode.LIDAR,
                                                   Coord3DMode.DEPTH)
    assert torch.allclose(expected_tensor, convert_depth_points.tensor, 1e-4)
    assert torch.allclose(depth_point_tensor, convert_depth_points.tensor,
                          1e-4)

    # test DEPTH to CAM and LIDAR
    depth_points = DepthPoints(points_np,
                               points_dim=7,
                               attribute_dims=dict(color=[3, 4, 5], height=6))

    convert_cam_points = depth_points.convert_to(Coord3DMode.CAM)
    expected_tensor = torch.tensor([[
        -5.2422e+00, -2.9757e-01, 4.0021e+01, 6.6660e-01, 1.9560e-01,
        4.9740e-01, 9.4090e-01
    ],
                                    [
                                        -2.6675e+01, 9.1435e-01, 5.5950e+00,
                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
                                        6.2970e-01
                                    ],
                                    [
                                        -5.8098e+00, -2.0089e-01, 3.5409e+01,
                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
                                        2.5380e-01
                                    ],
                                    [
                                        -3.1309e+01, 1.9461e-01, 1.0901e+00,
                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
                                        3.2690e-01
                                    ]])

    cam_point_tensor = Coord3DMode.convert_point(depth_points.tensor,
                                                 Coord3DMode.DEPTH,
                                                 Coord3DMode.CAM)
    assert torch.allclose(expected_tensor, convert_cam_points.tensor, 1e-4)
    assert torch.allclose(cam_point_tensor, convert_cam_points.tensor, 1e-4)

    rt_mat_provided = torch.tensor([[0.99789, -0.012698, -0.063678],
                                    [-0.012698, 0.92359, -0.38316],
                                    [0.063678, 0.38316, 0.92148]])

    depth_points_new = torch.cat([
        depth_points.tensor[:, :3] @ rt_mat_provided.t(),
        depth_points.tensor[:, 3:]
    ],
                                 dim=1)
    mat = rt_mat_provided.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
    rt_mat_provided = mat @ rt_mat_provided.transpose(1, 0)
    cam_point_tensor_new = Coord3DMode.convert_point(depth_points_new,
                                                     Coord3DMode.DEPTH,
                                                     Coord3DMode.CAM,
                                                     rt_mat=rt_mat_provided)
    assert torch.allclose(expected_tensor, cam_point_tensor_new, 1e-4)

    convert_lidar_points = depth_points.convert_to(Coord3DMode.LIDAR)
    expected_tensor = torch.tensor([[
        4.0021e+01, 5.2422e+00, 2.9757e-01, 6.6660e-01, 1.9560e-01, 4.9740e-01,
        9.4090e-01
    ],
                                    [
                                        5.5950e+00, 2.6675e+01, -9.1435e-01,
                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
                                        6.2970e-01
                                    ],
                                    [
                                        3.5409e+01, 5.8098e+00, 2.0089e-01,
                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
                                        2.5380e-01
                                    ],
                                    [
                                        1.0901e+00, 3.1309e+01, -1.9461e-01,
                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
                                        3.2690e-01
                                    ]])

    lidar_point_tensor = Coord3DMode.convert_point(depth_points.tensor,
                                                   Coord3DMode.DEPTH,
                                                   Coord3DMode.LIDAR)
    assert torch.allclose(expected_tensor, convert_lidar_points.tensor, 1e-4)
    assert torch.allclose(lidar_point_tensor, convert_lidar_points.tensor,
                          1e-4)


def test_boxes_conversion():
    """Test the conversion of boxes between different modes."""
    # test CAM to LIDAR and DEPTH
    cam_boxes = CameraInstance3DBoxes(
        [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48],
         [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62],
         [28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, -1.57],
         [26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, -1.69],
         [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]])
    convert_lidar_boxes = Coord3DMode.convert(cam_boxes, Coord3DMode.CAM,
                                              Coord3DMode.LIDAR)

    expected_tensor = torch.tensor(
        [[-1.7501, -1.7802, -2.5162, 1.6500, 1.7500, 3.3900, 1.4800],
         [-1.6357, -8.9594, -2.4567, 1.5700, 1.5400, 4.0100, 1.6200],
         [-1.3033, -28.2967, 0.5558, 1.4800, 1.4700, 2.2300, -1.5700],
         [-1.7361, -26.6690, -21.8230, 1.4000, 1.5600, 3.4800, -1.6900],
         [-1.6218, -31.3198, -8.1621, 1.4800, 1.7400, 3.7700, 2.7900]])
    assert torch.allclose(expected_tensor, convert_lidar_boxes.tensor, 1e-3)

    convert_depth_boxes = Coord3DMode.convert(cam_boxes, Coord3DMode.CAM,
                                              Coord3DMode.DEPTH)
    expected_tensor = torch.tensor(
        [[1.7802, 1.7501, 2.5162, 1.7500, 1.6500, 3.3900, 1.4800],
         [8.9594, 1.6357, 2.4567, 1.5400, 1.5700, 4.0100, 1.6200],
         [28.2967, 1.3033, -0.5558, 1.4700, 1.4800, 2.2300, -1.5700],
         [26.6690, 1.7361, 21.8230, 1.5600, 1.4000, 3.4800, -1.6900],
         [31.3198, 1.6218, 8.1621, 1.7400, 1.4800, 3.7700, 2.7900]])
    assert torch.allclose(expected_tensor, convert_depth_boxes.tensor, 1e-3)

    # test LIDAR to CAM and DEPTH
    lidar_boxes = LiDARInstance3DBoxes(
        [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48],
         [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62],
         [28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, -1.57],
         [26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, -1.69],
         [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]])
    convert_cam_boxes = Coord3DMode.convert(lidar_boxes, Coord3DMode.LIDAR,
                                            Coord3DMode.CAM)
    expected_tensor = torch.tensor(
        [[-2.5162, 1.7501, 1.7802, 3.3900, 1.6500, 1.7500, 1.4800],
         [-2.4567, 1.6357, 8.9594, 4.0100, 1.5700, 1.5400, 1.6200],
         [0.5558, 1.3033, 28.2967, 2.2300, 1.4800, 1.4700, -1.5700],
         [-21.8230, 1.7361, 26.6690, 3.4800, 1.4000, 1.5600, -1.6900],
         [-8.1621, 1.6218, 31.3198, 3.7700, 1.4800, 1.7400, 2.7900]])
    assert torch.allclose(expected_tensor, convert_cam_boxes.tensor, 1e-3)

    convert_depth_boxes = Coord3DMode.convert(lidar_boxes, Coord3DMode.LIDAR,
                                              Coord3DMode.DEPTH)
    expected_tensor = torch.tensor(
        [[-2.5162, 1.7802, -1.7501, 3.3900, 1.7500, 1.6500, 1.4800],
         [-2.4567, 8.9594, -1.6357, 4.0100, 1.5400, 1.5700, 1.6200],
         [0.5558, 28.2967, -1.3033, 2.2300, 1.4700, 1.4800, -1.5700],
         [-21.8230, 26.6690, -1.7361, 3.4800, 1.5600, 1.4000, -1.6900],
         [-8.1621, 31.3198, -1.6218, 3.7700, 1.7400, 1.4800, 2.7900]])
    assert torch.allclose(expected_tensor, convert_depth_boxes.tensor, 1e-3)

    # test DEPTH to CAM and LIDAR
    depth_boxes = DepthInstance3DBoxes(
        [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48],
         [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62],
         [28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, -1.57],
         [26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, -1.69],
         [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]])
    convert_cam_boxes = Coord3DMode.convert(depth_boxes, Coord3DMode.DEPTH,
                                            Coord3DMode.CAM)
    expected_tensor = torch.tensor(
        [[1.7802, -1.7501, -2.5162, 1.7500, 1.6500, 3.3900, 1.4800],
         [8.9594, -1.6357, -2.4567, 1.5400, 1.5700, 4.0100, 1.6200],
         [28.2967, -1.3033, 0.5558, 1.4700, 1.4800, 2.2300, -1.5700],
         [26.6690, -1.7361, -21.8230, 1.5600, 1.4000, 3.4800, -1.6900],
         [31.3198, -1.6218, -8.1621, 1.7400, 1.4800, 3.7700, 2.7900]])
    assert torch.allclose(expected_tensor, convert_cam_boxes.tensor, 1e-3)

    convert_lidar_boxes = Coord3DMode.convert(depth_boxes, Coord3DMode.DEPTH,
                                              Coord3DMode.LIDAR)
    expected_tensor = torch.tensor(
        [[2.5162, -1.7802, -1.7501, 3.3900, 1.7500, 1.6500, 1.4800],
         [2.4567, -8.9594, -1.6357, 4.0100, 1.5400, 1.5700, 1.6200],
         [-0.5558, -28.2967, -1.3033, 2.2300, 1.4700, 1.4800, -1.5700],
         [21.8230, -26.6690, -1.7361, 3.4800, 1.5600, 1.4000, -1.6900],
         [8.1621, -31.3198, -1.6218, 3.7700, 1.7400, 1.4800, 2.7900]])
    assert torch.allclose(expected_tensor, convert_lidar_boxes.tensor, 1e-3)