Example 1
def test_vis():
    dset_name = sys.argv[1]
    assert dset_name in DatasetCatalog.list()

    meta = MetadataCatalog.get(dset_name)
    dprint("MetadataCatalog: ", meta)
    objs = meta.objs

    t_start = time.perf_counter()
    dicts = DatasetCatalog.get(dset_name)
    logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start))

    dirname = "output/{}-data-vis".format(dset_name)
    os.makedirs(dirname, exist_ok=True)
    for d in dicts:
        img = read_image_cv2(d["file_name"], format="BGR")
        depth = mmcv.imread(d["depth_file"], "unchanged") / 1000.0

        anno = d["annotations"][0]  # only one instance per image
        imH, imW = img.shape[:2]
        mask = cocosegm2mask(anno["segmentation"], imH, imW)
        bbox = anno["bbox"]
        bbox_mode = anno["bbox_mode"]
        bbox_xyxy = np.array(BoxMode.convert(bbox, bbox_mode, BoxMode.XYXY_ABS))
        kpt3d = anno["bbox3d_and_center"]
        quat = anno["quat"]
        trans = anno["trans"]
        R = quat2mat(quat)
        # 0-based label
        cat_id = anno["category_id"]
        K = d["cam"]
        kpt_2d = misc.project_pts(kpt3d, K, R, trans)
        # # TODO: visualize pose and keypoints
        label = objs[cat_id]
        # img_vis = vis_image_bboxes_cv2(img, bboxes=bboxes_xyxy, labels=labels)
        img_vis = vis_image_mask_bbox_cv2(img, [mask], bboxes=[bbox_xyxy], labels=[label])
        img_vis_kpt2d = img.copy()
        img_vis_kpt2d = misc.draw_projected_box3d(
            img_vis_kpt2d, kpt_2d, middle_color=None, bottom_color=(128, 128, 128)
        )

        xyz_info = mmcv.load(anno["xyz_path"])
        xyz = np.zeros((imH, imW, 3), dtype=np.float32)
        xyz_crop = xyz_info["xyz_crop"].astype(np.float32)
        x1, y1, x2, y2 = xyz_info["xyxy"]
        xyz[y1 : y2 + 1, x1 : x2 + 1, :] = xyz_crop
        xyz_show = get_emb_show(xyz)

        grid_show(
            [img[:, :, [2, 1, 0]], img_vis[:, :, [2, 1, 0]], img_vis_kpt2d[:, :, [2, 1, 0]], depth, xyz_show],
            ["img", "vis_img", "img_vis_kpts2d", "depth", "emb_show"],
            row=2,
            col=3,
        )
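
The helpers above (read_image_cv2, grid_show, cocosegm2mask, etc.) are project-local utilities. For context, here is a minimal sketch of what cocosegm2mask plausibly does, assuming pycocotools handles the COCO polygon/RLE formats; the name cocosegm2mask_sketch and the exact branching are assumptions, not the repo's code:

import numpy as np
from pycocotools import mask as cocomask

def cocosegm2mask_sketch(segm, im_h, im_w):
    """Convert a COCO segmentation (polygon list or RLE) to an HxW uint8 mask."""
    if isinstance(segm, list):
        # polygons: rasterize each part, then merge into a single mask
        rles = cocomask.frPyObjects(segm, im_h, im_w)
        rle = cocomask.merge(rles)
    elif isinstance(segm["counts"], list):
        # uncompressed RLE
        rle = cocomask.frPyObjects(segm, im_h, im_w)
    else:
        # already-compressed RLE
        rle = segm
    return cocomask.decode(rle).astype(np.uint8)
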
Example 2
        closest_rot = get_closest_rot(est_rot, gt_rot, sym_info)
    print(("calculate closest rot {}s".format((time.perf_counter() - t) / 3000)))
    closest_pose = np.copy(gt_pose)
    closest_pose[:, :3] = closest_rot

    rd_closest = re(est_rot, closest_pose[:, :3])
    print(("rot_est: {}, rot_gt: {}, closest rot_gt: {}".format(
        mat2axangle(est_rot), mat2axangle(gt_rot), mat2axangle(closest_rot))))
    print(("original rot dist: {}, closest rot dist: {}".format(rd_ori, rd_closest)))

    est_img, _ = renderer.render(obj_id, est_rot, trans)
    gt_img, _ = renderer.render(obj_id, gt_rot, trans)
    closest_img, _ = renderer.render(obj_id, closest_rot, trans)
    show_imgs = [est_img[:, :, [2, 1, 0]], gt_img[:, :, [2, 1, 0]], closest_img[:, :, [2, 1, 0]]]
    show_titles = ["est", "gt_ori", "gt_closest"]
    grid_show(show_imgs, show_titles, row=1, col=3)

    # import cv2
    # while(1):
    #     est_img = render(renderer, est_rot, trans)
    #     cv2.imshow('test', cv2.cvtColor(est_img, cv2.COLOR_RGB2BGR))
    #     q = cv2.waitKey(16)
    #     if q == ord('w'):
    #         trans[1] += 0.05
    #     elif q == ord('s'):
    #         trans[1] -= 0.05
    #     elif q == ord('a'):
    #         trans[0] -= 0.1
    #     elif q == ord('d'):
    #         trans[0] += 0.1
    #     elif q == ord('q'):
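
For context, a hedged sketch of the two symmetry helpers this snippet relies on: re is the standard geodesic rotation distance, and get_closest_rot searches the symmetry-equivalent ground-truth rotations for the one nearest the estimate. sym_info is assumed here to be an (N, 3, 3) array of symmetry rotations; the repo's implementation may differ in details:

import numpy as np

def re_sketch(R_est, R_gt):
    """Geodesic rotation distance in degrees: arccos((tr(R_est R_gt^T) - 1) / 2)."""
    cos = np.clip(0.5 * (np.trace(R_est @ R_gt.T) - 1.0), -1.0, 1.0)
    return np.rad2deg(np.arccos(cos))

def get_closest_rot_sketch(est_rot, gt_rot, sym_info):
    """Among all symmetry-equivalent gt rotations, return the one closest to est_rot."""
    if sym_info is None:
        return gt_rot
    closest_rot, closest_err = gt_rot, re_sketch(est_rot, gt_rot)
    for rot_sym in sym_info:  # each rot_sym maps the model onto itself
        rot_gt_sym = gt_rot @ rot_sym
        err = re_sketch(est_rot, rot_gt_sym)
        if err < closest_err:
            closest_rot, closest_err = rot_gt_sym, err
    return closest_rot
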
Example 3
def test_vis():
    dset_name = sys.argv[1]
    assert dset_name in DatasetCatalog.list()

    meta = MetadataCatalog.get(dset_name)
    dprint("MetadataCatalog: ", meta)
    objs = meta.objs

    t_start = time.perf_counter()
    dicts = DatasetCatalog.get(dset_name)
    logger.info("Done loading {} samples with {:.3f}s.".format(
        len(dicts),
        time.perf_counter() - t_start))

    dirname = "output/{}-data-vis".format(dset_name)
    os.makedirs(dirname, exist_ok=True)
    for d in dicts:
        img = read_image_cv2(d["file_name"], format="BGR")
        depth = mmcv.imread(d["depth_file"], "unchanged") / 1000.0

        imH, imW = img.shape[:2]
        annos = d["annotations"]
        masks = [
            cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos
        ]
        bboxes = [anno["bbox"] for anno in annos]
        bbox_modes = [anno["bbox_mode"] for anno in annos]
        bboxes_xyxy = np.array([
            BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS)
            for box, box_mode in zip(bboxes, bbox_modes)
        ])
        kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos]
        quats = [anno["quat"] for anno in annos]
        transes = [anno["trans"] for anno in annos]
        Rs = [quat2mat(quat) for quat in quats]
        # 0-based label
        cat_ids = [anno["category_id"] for anno in annos]
        K = d["cam"]
        kpts_2d = [
            misc.project_pts(kpt3d, K, R, t)
            for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)
        ]
        # # TODO: visualize pose and keypoints
        labels = [objs[cat_id] for cat_id in cat_ids]
        for _i in range(len(annos)):
            img_vis = vis_image_mask_bbox_cv2(img,
                                              masks[_i:_i + 1],
                                              bboxes=bboxes_xyxy[_i:_i + 1],
                                              labels=labels[_i:_i + 1])
            img_vis_kpts2d = misc.draw_projected_box3d(img_vis.copy(),
                                                       kpts_2d[_i])
            if "test" not in dset_name:
                xyz_path = annos[_i]["xyz_path"]
                xyz_info = mmcv.load(xyz_path)
                x1, y1, x2, y2 = xyz_info["xyxy"]
                xyz_crop = xyz_info["xyz_crop"].astype(np.float32)
                xyz = np.zeros((imH, imW, 3), dtype=np.float32)
                xyz[y1:y2 + 1, x1:x2 + 1, :] = xyz_crop
                xyz_show = get_emb_show(xyz)
                xyz_crop_show = get_emb_show(xyz_crop)
                img_xyz = img.copy() / 255.0
                mask_xyz = ((xyz[:, :, 0] != 0) | (xyz[:, :, 1] != 0) |
                            (xyz[:, :, 2] != 0)).astype("uint8")
                fg_idx = np.where(mask_xyz != 0)
                img_xyz[fg_idx[0], fg_idx[1], :] = xyz_show[fg_idx[0],
                                                            fg_idx[1], :3]
                img_xyz_crop = img_xyz[y1:y2 + 1, x1:x2 + 1, :]
                img_vis_crop = img_vis[y1:y2 + 1, x1:x2 + 1, :]
                # diff mask
                # cast before subtracting: uint8 arithmetic would wrap around
                diff_mask_xyz = np.abs(masks[_i].astype(np.int32) -
                                       mask_xyz.astype(np.int32))[y1:y2 + 1,
                                                                  x1:x2 + 1]

                grid_show(
                    [
                        img[:, :, [2, 1, 0]],
                        img_vis[:, :, [2, 1, 0]],
                        img_vis_kpts2d[:, :, [2, 1, 0]],
                        depth,
                        # xyz_show,
                        diff_mask_xyz,
                        xyz_crop_show,
                        img_xyz[:, :, [2, 1, 0]],
                        img_xyz_crop[:, :, [2, 1, 0]],
                        img_vis_crop,
                    ],
                    [
                        "img",
                        "vis_img",
                        "img_vis_kpts2d",
                        "depth",
                        "diff_mask_xyz",
                        "xyz_crop_show",
                        "img_xyz",
                        "img_xyz_crop",
                        "img_vis_crop",
                    ],
                    row=3,
                    col=3,
                )
            else:
                grid_show(
                    [
                        img[:, :, [2, 1, 0]], img_vis[:, :, [2, 1, 0]],
                        img_vis_kpts2d[:, :, [2, 1, 0]], depth
                    ],
                    ["img", "vis_img", "img_vis_kpts2d", "depth"],
                    row=2,
                    col=2,
                )
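
get_emb_show only rescales the metric XYZ maps into [0, 1] so they can be displayed. A plausible stand-in, assuming a simple global min-max normalization (the repo's version may normalize differently, e.g. by model extents):

import numpy as np

def get_emb_show_sketch(emb, eps=1e-6):
    """Min-max normalize an embedding/XYZ map to [0, 1] for display."""
    emb = emb.astype(np.float32)
    lo, hi = emb.min(), emb.max()
    return (emb - lo) / (hi - lo + eps)
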
Example 4
    def process(self, inputs, outputs, out_dict):
        """
        Args:
            inputs: the inputs to a model.
                It is a list of dicts. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name", "image_id", "scene_id".
            outputs: stores time
        """
        cfg = self.cfg
        if cfg.TEST.USE_PNP:
            if cfg.TEST.PNP_TYPE.lower() == "ransac_pnp":
                return self.process_pnp_ransac(inputs, outputs, out_dict)
            elif cfg.TEST.PNP_TYPE.lower() == "net_iter_pnp":
                return self.process_net_and_pnp(inputs,
                                                outputs,
                                                out_dict,
                                                pnp_type="iter")
            elif cfg.TEST.PNP_TYPE.lower() == "net_ransac_pnp":
                return self.process_net_and_pnp(inputs,
                                                outputs,
                                                out_dict,
                                                pnp_type="ransac")
            elif cfg.TEST.PNP_TYPE.lower() == "net_ransac_pnp_rot":
                # use rot from PnP/RANSAC and translation from Net
                return self.process_net_and_pnp(inputs,
                                                outputs,
                                                out_dict,
                                                pnp_type="ransac_rot")
            else:
                raise NotImplementedError

        out_rots = out_dict["rot"].detach().to(self._cpu_device).numpy()
        out_transes = out_dict["trans"].detach().to(self._cpu_device).numpy()

        out_i = -1  # index into the flattened, per-instance network outputs
        for i, (_input, output) in enumerate(zip(inputs, outputs)):
            start_process_time = time.perf_counter()
            for inst_i in range(len(_input["roi_img"])):
                out_i += 1
                file_name = _input["file_name"][inst_i]

                scene_im_id_split = _input["scene_im_id"][inst_i].split("/")
                K = _input["cam"][inst_i].cpu().numpy().copy()

                roi_label = _input["roi_cls"][inst_i]  # 0-based label
                score = _input["score"][inst_i]
                roi_label, cls_name = self._maybe_adapt_label_cls_name(
                    roi_label)
                if cls_name is None:
                    continue

                scene_id = scene_im_id_split[0]
                im_id = int(scene_im_id_split[1])

                # get pose
                # index with out_i (the flattened instance index), not inst_i,
                # since out_rots/out_transes stack instances across all images
                rot_est = out_rots[out_i]
                trans_est = out_transes[out_i]

                if cfg.DEBUG:  # visualize pose
                    pose_est = np.hstack([rot_est, trans_est.reshape(3, 1)])
                    file_name = _input["file_name"][inst_i]

                    if f"{int(scene_id)}/{im_id}" != "9/499":
                        continue

                    im_ori = mmcv.imread(file_name, "color")

                    bbox = _input["bbox_est"][inst_i].cpu().numpy().copy()
                    x1, y1, x2, y2 = bbox
                    # center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
                    # scale = max(x2 - x1, y2 - y1) * 1.5

                    test_label = _input["roi_cls"][inst_i]
                    kpt_3d = self.kpts_3d[test_label]
                    # kpt_3d = self.kpts_axis_3d[test_label]
                    kpt_2d = misc.project_pts(kpt_3d, K, rot_est, trans_est)

                    gt_dict = self.gts[cls_name][file_name]
                    gt_rot = gt_dict["R"]
                    gt_trans = gt_dict["t"]
                    kpt_2d_gt = misc.project_pts(kpt_3d, K, gt_rot, gt_trans)

                    maxx, maxy, minx, miny = 0, 0, 1000, 1000
                    # fresh loop variable: `i` already indexes the outer
                    # (inputs, outputs) loop
                    for k in range(len(kpt_2d)):
                        maxx, maxy, minx, miny = (
                            max(maxx, kpt_2d[k][0]),
                            max(maxy, kpt_2d[k][1]),
                            min(minx, kpt_2d[k][0]),
                            min(miny, kpt_2d[k][1]),
                        )
                        maxx, maxy, minx, miny = (
                            max(maxx, kpt_2d_gt[k][0]),
                            max(maxy, kpt_2d_gt[k][1]),
                            min(minx, kpt_2d_gt[k][0]),
                            min(miny, kpt_2d_gt[k][1]),
                        )
                    center = np.array([(minx + maxx) / 2, (miny + maxy) / 2])
                    scale = max(maxx - minx, maxy - miny) + 5

                    out_size = 256
                    zoomed_im = crop_resize_by_warp_affine(
                        im_ori, center, scale, out_size)
                    save_path = osp.join(
                        cfg.OUTPUT_DIR, "vis",
                        "{}_{}_{:06d}_no_bbox.png".format(
                            cls_name, scene_id, im_id))
                    mmcv.mkdir_or_exist(osp.dirname(save_path))
                    mmcv.imwrite(zoomed_im, save_path)
                    # yapf: disable
                    kpt_2d = np.array(
                        [
                            [(x - (center[0] - scale / 2)) * out_size / scale,
                             (y - (center[1] - scale / 2)) * out_size / scale]
                            for [x, y] in kpt_2d
                        ]
                    )

                    kpt_2d_gt = np.array(
                        [
                            [(x - (center[0] - scale / 2)) * out_size / scale,
                             (y - (center[1] - scale / 2)) * out_size / scale]
                            for [x, y] in kpt_2d_gt
                        ]
                    )
                    # yapf: enable
                    # draw est bbox
                    linewidth = 3
                    visualizer = MyVisualizer(zoomed_im[:, :, ::-1],
                                              self._metadata)
                    # zoomed_im_vis = visualizer.draw_axis3d_and_center(
                    #     kpt_2d, linewidth=linewidth, draw_center=True
                    # )
                    # visualizer.draw_bbox3d_and_center(
                    #     kpt_2d_gt, top_color=_BLUE, bottom_color=_GREY, linewidth=linewidth, draw_center=True
                    # )
                    zoomed_im_vis = visualizer.draw_bbox3d_and_center(
                        kpt_2d,
                        top_color=_GREEN,
                        bottom_color=_GREY,
                        linewidth=linewidth,
                        draw_center=True)
                    save_path = osp.join(
                        cfg.OUTPUT_DIR, "vis",
                        "{}_{}_{:06d}_gt_est.png".format(
                            cls_name, scene_id, im_id))
                    mmcv.mkdir_or_exist(osp.dirname(save_path))
                    zoomed_im_vis.save(save_path)
                    print("zoomed_in_vis saved to:", save_path)

                    im_vis = vis_image_bboxes_cv2(im_ori, [bbox],
                                                  [f"{cls_name}_{score}"])

                    self.ren.clear()
                    self.ren.draw_background(
                        mmcv.bgr2gray(im_ori, keepdim=True))
                    self.ren.draw_model(
                        self.ren_models[self.data_ref.objects.index(cls_name)],
                        pose_est)
                    ren_im, _ = self.ren.finish()
                    grid_show(
                        [ren_im[:, :, ::-1], im_vis[:, :, ::-1]],
                        [f"ren_im_{cls_name}", f"{scene_id}/{im_id}_{score}"],
                        row=1,
                        col=2,
                    )

                output["time"] += time.perf_counter() - start_process_time

                if cls_name not in self._predictions:
                    self._predictions[cls_name] = OrderedDict()

                result = {
                    "score": score,
                    "R": rot_est,
                    "t": trans_est,
                    "time": output["time"]
                }
                self._predictions[cls_name][file_name] = result
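
The projection used throughout these snippets (misc.project_pts) is the standard pinhole model: x = K (R X + t) followed by the perspective divide. A self-contained sketch, not necessarily the repo's exact vectorization:

import numpy as np

def project_pts_sketch(pts_3d, K, R, t):
    """Project (N, 3) object-space points to (N, 2) pixel coordinates."""
    pts_cam = pts_3d @ R.T + t.reshape(1, 3)  # object frame -> camera frame
    pts_img = pts_cam @ K.T                   # apply the pinhole intrinsics
    return pts_img[:, :2] / pts_img[:, 2:3]   # perspective divide by depth
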
Example 5
    def main(self):
        # NOTE: xyz_root, idx2class, cls_indexes, IM_H, IM_W, K, near, far,
        # VIS and args are module-level names in the original script
        split = self.split
        scene = self.scene  # "all" or a single scene
        sel_scene_ids = self.sel_scene_ids
        data_root = self.data_root

        for scene_id in tqdm(sel_scene_ids, postfix=f"{split}_{scene}"):
            print("split: {} scene: {}".format(split, scene_id))
            scene_root = osp.join(data_root, f"{scene_id:06d}")

            gt_dict = mmcv.load(osp.join(scene_root, "scene_gt.json"))
            # gt_info_dict = mmcv.load(osp.join(scene_root, "scene_gt_info.json"))
            # cam_dict = mmcv.load(osp.join(scene_root, "scene_camera.json"))

            for str_im_id in tqdm(gt_dict, postfix=f"{scene_id}"):
                int_im_id = int(str_im_id)

                for anno_i, anno in enumerate(gt_dict[str_im_id]):
                    obj_id = anno["obj_id"]
                    if obj_id not in idx2class:
                        continue

                    R = np.array(anno["cam_R_m2c"],
                                 dtype="float32").reshape(3, 3)
                    t = np.array(anno["cam_t_m2c"], dtype="float32") / 1000.0
                    # pose = np.hstack([R, t.reshape(3, 1)])

                    save_path = osp.join(
                        xyz_root,
                        f"{scene_id:06d}/{int_im_id:06d}_{anno_i:06d}-xyz.pkl",
                    )
                    # if osp.exists(save_path) and osp.getsize(save_path) > 0:
                    #     continue

                    render_obj_id = cls_indexes.index(obj_id)  # 0-based
                    bgr_gl, depth_gl = self.get_renderer().render(
                        render_obj_id, IM_W, IM_H, K, R, t, near, far)
                    mask = (depth_gl > 0).astype("uint8")

                    if mask.sum() == 0:  # NOTE: this should be ignored at training phase
                        print(
                            f"not visible, split {split} scene {scene_id}, im {int_im_id} obj {idx2class[obj_id]} {obj_id}"
                        )
                        print(f"{save_path}")
                        xyz_info = {
                            "xyz_crop": np.zeros((IM_H, IM_W, 3),
                                                 dtype=np.float16),
                            "xyxy": [0, 0, IM_W - 1, IM_H - 1],
                        }
                        if VIS:
                            im_path = osp.join(
                                data_root,
                                f"{scene_id:06d}/rgb/{int_im_id:06d}.jpg",
                            )
                            im = mmcv.imread(im_path)

                            mask_path = osp.join(
                                data_root,
                                f"{scene_id:06d}/mask/{int_im_id:06d}_{anno_i:06d}.png",
                            )
                            mask_visib_path = osp.join(
                                data_root,
                                f"{scene_id:06d}/mask_visib/{int_im_id:06d}_{anno_i:06d}.png",
                            )
                            mask_gt = mmcv.imread(mask_path, "unchanged")
                            mask_visib_gt = mmcv.imread(
                                mask_visib_path, "unchanged")

                            show_ims = [
                                bgr_gl[:, :, [2, 1, 0]],
                                im[:, :, [2, 1, 0]],
                                mask_gt,
                                mask_visib_gt,
                            ]
                            show_titles = [
                                "bgr_gl",
                                "im",
                                "mask_gt",
                                "mask_visib_gt",
                            ]
                            grid_show(show_ims, show_titles, row=2, col=2)
                            raise RuntimeError(
                                f"split {split} scene {scene_id}, im {int_im_id}"
                            )
                    else:
                        x1, y1, x2, y2 = mask2bbox_xyxy(mask)
                        xyz_np = misc.calc_xyz_bp_fast(depth_gl, R, t, K)
                        xyz_crop = xyz_np[y1:y2 + 1, x1:x2 + 1]
                        xyz_info = {
                            # float16 saves disk space w/o performance drop
                            "xyz_crop": xyz_crop.astype("float16"),
                            "xyxy": [x1, y1, x2, y2],
                        }

                        if VIS:
                            print(
                                f"xyz_crop min {xyz_crop.min()} max {xyz_crop.max()}"
                            )
                            show_ims = [
                                bgr_gl[:, :, [2, 1, 0]],
                                get_emb_show(xyz_np),
                                get_emb_show(xyz_crop),
                            ]
                            show_titles = ["bgr_gl", "xyz", "xyz_crop"]
                            grid_show(show_ims, show_titles, row=1, col=3)

                    if not args.no_save:
                        mmcv.mkdir_or_exist(osp.dirname(save_path))
                        mmcv.dump(xyz_info, save_path)
        if self.renderer is not None:
            self.renderer.close()
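
misc.calc_xyz_bp_fast back-projects the rendered depth into object-space coordinates: lift each pixel to the camera frame with K^-1, then map it into the object frame with R^T (x_cam - t). A sketch under those standard assumptions (the repo's vectorization may differ):

import numpy as np

def calc_xyz_bp_sketch(depth, R, t, K):
    """Back-project a depth map into object-space XYZ (zeros where depth == 0)."""
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    pix = np.stack([u, v, np.ones_like(u)], axis=-1).astype(np.float32)  # (h, w, 3)
    cam = (pix @ np.linalg.inv(K).T) * depth[..., None]  # camera-frame points
    obj = (cam - t.reshape(1, 1, 3)) @ R                 # right-multiply == R^T (x - t)
    return np.where(depth[..., None] > 0, obj, 0.0).astype(np.float32)
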
Example 6
def test_vis():
    dset_name = sys.argv[1]
    assert dset_name in DatasetCatalog.list()

    meta = MetadataCatalog.get(dset_name)
    dprint("MetadataCatalog: ", meta)
    objs = meta.objs

    t_start = time.perf_counter()
    dicts = DatasetCatalog.get(dset_name)
    logger.info("Done loading {} samples with {:.3f}s.".format(
        len(dicts),
        time.perf_counter() - t_start))

    dirname = "output/{}-data-vis".format(dset_name)
    os.makedirs(dirname, exist_ok=True)
    for d in dicts:
        img = read_image_cv2(d["file_name"], format="BGR")
        depth = mmcv.imread(d["depth_file"], "unchanged") / 1000.0

        imH, imW = img.shape[:2]
        annos = d["annotations"]
        masks = [
            cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos
        ]
        bboxes = [anno["bbox"] for anno in annos]
        bbox_modes = [anno["bbox_mode"] for anno in annos]
        bboxes_xyxy = np.array([
            BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS)
            for box, box_mode in zip(bboxes, bbox_modes)
        ])
        kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos]
        quats = [anno["quat"] for anno in annos]
        transes = [anno["trans"] for anno in annos]
        Rs = [quat2mat(quat) for quat in quats]
        # 0-based label
        cat_ids = [anno["category_id"] for anno in annos]
        K = d["cam"]
        kpts_2d = [
            misc.project_pts(kpt3d, K, R, t)
            for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)
        ]
        # # TODO: visualize pose and keypoints
        labels = [objs[cat_id] for cat_id in cat_ids]
        # img_vis = vis_image_bboxes_cv2(img, bboxes=bboxes_xyxy, labels=labels)
        img_vis = vis_image_mask_bbox_cv2(img,
                                          masks,
                                          bboxes=bboxes_xyxy,
                                          labels=labels)
        img_vis_kpts2d = img.copy()
        for anno_i in range(len(annos)):
            img_vis_kpts2d = misc.draw_projected_box3d(img_vis_kpts2d,
                                                       kpts_2d[anno_i])
        grid_show(
            [
                img[:, :, [2, 1, 0]], img_vis[:, :, [2, 1, 0]],
                img_vis_kpts2d[:, :, [2, 1, 0]], depth
            ],
            [f"img:{d['file_name']}", "vis_img", "img_vis_kpts2d", "depth"],
            row=2,
            col=2,
        )
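
misc.draw_projected_box3d connects the projected box corners edge by edge; the 9th row of kpt_2d here is the box center. A minimal sketch with an assumed corner ordering and a single color (the repo's version colors the top/middle/bottom edge groups separately, per the kwargs in Example 1):

import cv2
import numpy as np

def draw_projected_box3d_sketch(img, pts_2d, color=(0, 255, 0), thickness=2):
    """Draw the 12 edges of a 3D box from its 8 projected corners."""
    pts = np.round(pts_2d[:8]).astype(int)  # row 9, if present, is the center
    edges = (
        [(0, 1), (1, 2), (2, 3), (3, 0)]    # top face
        + [(4, 5), (5, 6), (6, 7), (7, 4)]  # bottom face
        + [(0, 4), (1, 5), (2, 6), (3, 7)]  # vertical edges
    )
    for a, b in edges:
        cv2.line(img, tuple(map(int, pts[a])), tuple(map(int, pts[b])), color, thickness)
    return img
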
Example 7
    def process(self, inputs, outputs, out_dict):
        """
        Args:
            inputs: the inputs to a model.
                It is a list of dicts. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name", "image_id", "scene_id".
            outputs: stores time
        """
        cfg = self.cfg
        if cfg.TEST.USE_PNP:
            if cfg.TEST.PNP_TYPE.lower() == "ransac_pnp":
                return self.process_pnp_ransac(inputs, outputs, out_dict)
            elif cfg.TEST.PNP_TYPE.lower() == "net_iter_pnp":
                return self.process_net_and_pnp(inputs,
                                                outputs,
                                                out_dict,
                                                pnp_type="iter")
            elif cfg.TEST.PNP_TYPE.lower() == "net_ransac_pnp":
                return self.process_net_and_pnp(inputs,
                                                outputs,
                                                out_dict,
                                                pnp_type="ransac")
            else:
                raise NotImplementedError

        out_rots = out_dict["rot"].detach().to(self._cpu_device).numpy()
        out_transes = out_dict["trans"].detach().to(self._cpu_device).numpy()

        out_i = -1
        for i, (_input, output) in enumerate(zip(inputs, outputs)):
            json_results = []
            start_process_time = time.perf_counter()
            for inst_i in range(len(_input["roi_img"])):
                out_i += 1  # the index in the flattened output
                scene_im_id_split = _input["scene_im_id"][inst_i].split("/")
                K = _input["cam"][inst_i].cpu().numpy().copy()

                roi_label = _input["roi_cls"][inst_i]  # 0-based label
                score = _input["score"][inst_i]
                roi_label, cls_name = self._maybe_adapt_label_cls_name(
                    roi_label)
                if cls_name is None:
                    continue

                # scene_id = int(scene_im_id_split[0])
                scene_id = scene_im_id_split[0]
                im_id = int(scene_im_id_split[1])
                obj_id = self.data_ref.obj2id[cls_name]

                # get pose
                rot_est = out_rots[out_i]
                trans_est = out_transes[out_i]
                pose_est = np.hstack([rot_est, trans_est.reshape(3, 1)])

                if cfg.DEBUG:  # visualize pose
                    file_name = _input["file_name"][inst_i]

                    if f"{int(scene_id)}/{im_id}" != "9/47":
                        continue

                    im_ori = mmcv.imread(file_name, "color")
                    bbox = _input["bbox_est"][inst_i].cpu().numpy().copy()
                    im_vis = vis_image_bboxes_cv2(im_ori, [bbox],
                                                  [f"{cls_name}_{score}"])

                    self.ren.clear()
                    self.ren.draw_background(
                        mmcv.bgr2gray(im_ori, keepdim=True))
                    self.ren.draw_model(
                        self.ren_models[self.data_ref.objects.index(cls_name)],
                        pose_est)
                    ren_im, _ = self.ren.finish()
                    grid_show(
                        [ren_im[:, :, ::-1], im_vis[:, :, ::-1]],
                        [f"ren_im_{cls_name}", f"{scene_id}/{im_id}_{score}"],
                        row=1,
                        col=2,
                    )

                json_results.extend(
                    self.pose_prediction_to_json(pose_est,
                                                 scene_id,
                                                 im_id,
                                                 obj_id=obj_id,
                                                 score=score,
                                                 pose_time=output["time"],
                                                 K=K))

            output["time"] += time.perf_counter() - start_process_time
            # process time for this image
            for item in json_results:
                item["time"] = output["time"]
            self._predictions.extend(json_results)
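
pose_prediction_to_json is not shown here; judging by the call site it emits one record per estimate. A hedged sketch following the BOP results convention (the field names and the millimeter conversion are assumptions, and K is accepted only to mirror the call signature):

def pose_prediction_to_json_sketch(pose_est, scene_id, im_id, obj_id, score, pose_time, K=None):
    """Return a list with one BOP-style result record (K is unused here)."""
    R = pose_est[:3, :3]
    t = pose_est[:3, 3] * 1000.0  # meters -> millimeters, per the BOP convention
    return [{
        "scene_id": int(scene_id),
        "im_id": int(im_id),
        "obj_id": int(obj_id),
        "score": float(score),
        "R": R.flatten().tolist(),
        "t": t.tolist(),
        "time": pose_time,
    }]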