Example #1
    def __call__(self):
        """
        Load light-weight instance annotations of all images into a list of dicts in Detectron2 format.
        Do not load heavy data into memory in this file,
        since we will load the annotations of all images into memory.
        """
        # cache the dataset_dicts to avoid loading masks from files
        hashed_file_name = hashlib.md5(
            ("".join([str(fn) for fn in self.objs]) +
             "dataset_dicts_{}_{}_{}_{}_{}_{}".format(
                 self.name, self.dataset_root,
                 self.with_masks, self.with_depth, self.with_xyz,
                 osp.abspath(__file__))).encode("utf-8")).hexdigest()
        cache_path = osp.join(
            self.dataset_root,
            "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name))

        if osp.exists(cache_path) and self.use_cache:
            logger.info("load cached dataset dicts from {}".format(cache_path))
            return mmcv.load(cache_path)

        t_start = time.perf_counter()
        dataset_dicts = []
        self.num_instances_without_valid_segmentation = 0
        self.num_instances_without_valid_box = 0
        logger.info("loading dataset dicts: {}".format(self.name))
        # this step is slow: masks are loaded from disk and converted to RLE

        for scene in self.scenes:
            scene_id = int(scene)
            scene_root = osp.join(self.dataset_root, scene)

            gt_dict = mmcv.load(osp.join(scene_root, 'scene_gt.json'))
            gt_info_dict = mmcv.load(osp.join(scene_root,
                                              'scene_gt_info.json'))
            cam_dict = mmcv.load(osp.join(scene_root, 'scene_camera.json'))

            for str_im_id in tqdm(gt_dict, postfix=f"{scene_id}"):
                int_im_id = int(str_im_id)
                rgb_path = osp.join(scene_root,
                                    "rgb/{:06d}.jpg".format(int_im_id))
                assert osp.exists(rgb_path), rgb_path

                depth_path = osp.join(scene_root,
                                      "depth/{:06d}.png".format(int_im_id))
                K = np.array(cam_dict[str_im_id]['cam_K'],
                             dtype=np.float32).reshape(3, 3)
                depth_factor = 1000.0 / cam_dict[str_im_id][
                    'depth_scale']  # e.g. 10000 when depth_scale == 0.1

                record = {
                    "dataset_name": self.name,
                    'file_name': osp.relpath(rgb_path, PROJ_ROOT),
                    'depth_file': osp.relpath(depth_path, PROJ_ROOT),
                    'height': self.height,
                    'width': self.width,
                    'image_id': int_im_id,
                    "scene_im_id": "{}/{}".format(scene_id,
                                                  int_im_id),  # for evaluation
                    "cam": K,
                    "depth_factor": depth_factor,
                    "img_type": 'syn_pbr'  # NOTE: has background
                }
                insts = []
                for anno_i, anno in enumerate(gt_dict[str_im_id]):
                    obj_id = anno['obj_id']
                    if obj_id not in self.cat_ids:
                        continue
                    cur_label = self.cat2label[obj_id]  # 0-based label
                    R = np.array(anno['cam_R_m2c'],
                                 dtype='float32').reshape(3, 3)
                    t = np.array(anno['cam_t_m2c'], dtype='float32') / 1000.0
                    pose = np.hstack([R, t.reshape(3, 1)])
                    quat = mat2quat(R).astype('float32')
                    allo_q = mat2quat(egocentric_to_allocentric(pose)
                                      [:3, :3]).astype('float32')

                    proj = (record["cam"] @ t.T).T
                    proj = proj[:2] / proj[2]

                    bbox_visib = gt_info_dict[str_im_id][anno_i]['bbox_visib']
                    bbox_obj = gt_info_dict[str_im_id][anno_i]['bbox_obj']
                    x1, y1, w, h = bbox_visib
                    if self.filter_invalid:
                        if h <= 1 or w <= 1:
                            self.num_instances_without_valid_box += 1
                            continue

                    mask_file = osp.join(
                        scene_root,
                        "mask/{:06d}_{:06d}.png".format(int_im_id, anno_i))
                    mask_visib_file = osp.join(
                        scene_root, "mask_visib/{:06d}_{:06d}.png".format(
                            int_im_id, anno_i))
                    assert osp.exists(mask_file), mask_file
                    assert osp.exists(mask_visib_file), mask_visib_file
                    # load mask visib  TODO: load both mask_visib and mask_full
                    mask_single = mmcv.imread(mask_visib_file, "unchanged")
                    area = mask_single.sum()
                    if area < 3:  # filter out too small or nearly invisible instances
                        self.num_instances_without_valid_segmentation += 1
                        continue
                    mask_rle = binary_mask_to_rle(mask_single, compressed=True)

                    inst = {
                        'category_id': cur_label,  # 0-based label
                        'bbox':
                        bbox_visib,  # TODO: load both bbox_obj and bbox_visib
                        'bbox_mode': BoxMode.XYWH_ABS,
                        'pose': pose,
                        "quat": quat,
                        "trans": t,
                        "allo_quat": allo_q,
                        "centroid_2d": proj,  # absolute (cx, cy)
                        "segmentation": mask_rle,
                        "mask_full_file":
                        mask_file,  # TODO: load as mask_full, rle
                    }
                    if self.with_xyz:
                        xyz_crop_path = mask_file.replace(
                            "/mask/", "/xyz_crop/").replace(".png", ".pkl")
                        assert osp.exists(xyz_crop_path), xyz_crop_path
                        inst["xyz_crop_path"] = xyz_crop_path

                    insts.append(inst)
                if len(insts) == 0:  # filter im without anno
                    continue
                record['annotations'] = insts
                dataset_dicts.append(record)

        if self.num_instances_without_valid_segmentation > 0:
            logger.warning(
                "Filtered out {} instances without valid segmentation. "
                "There might be issues in your dataset generation process.".
                format(self.num_instances_without_valid_segmentation))
        if self.num_instances_without_valid_box > 0:
            logger.warning(
                "Filtered out {} instances without valid box. "
                "There might be issues in your dataset generation process.".
                format(self.num_instances_without_valid_box))
        ##########################
        if self.num_to_load > 0:
            self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
            dataset_dicts = dataset_dicts[:self.num_to_load]
        logger.info("loaded {} dataset dicts, using {}s".format(
            len(dataset_dicts),
            time.perf_counter() - t_start))

        mkdir_p(osp.dirname(cache_path))
        mmcv.dump(dataset_dicts, cache_path, protocol=4)
        logger.info("Dumped dataset_dicts to {}".format(cache_path))
        return dataset_dicts
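The caching idiom above hashes every input that affects the result (the object list, dataset root, loader flags, and the loader file itself) into the cache filename, so a stale cache is invalidated automatically whenever any of them changes. A minimal standalone sketch of the same pattern, with illustrative function and path names that are not from the source:

import hashlib
import os.path as osp
import pickle


def cached_build(cache_dir, name, build_fn, *key_parts, use_cache=True):
    # Hash everything that influences the output into the cache file name.
    key = hashlib.md5("".join(str(p) for p in key_parts).encode("utf-8")).hexdigest()
    cache_path = osp.join(cache_dir, "dataset_dicts_{}_{}.pkl".format(name, key))
    if use_cache and osp.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    result = build_fn()
    with open(cache_path, "wb") as f:
        pickle.dump(result, f, protocol=4)
    return result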
Example #2
    def __call__(self):
        """
        Load light-weight instance annotations of all images into a list of dicts in Detectron2 format.
        Do not load heavy data into memory in this file,
        since we will load the annotations of all images into memory.
        """
        # cache the dataset_dicts to avoid loading masks from files
        hashed_file_name = hashlib.md5((
            "".join([str(fn) for fn in self.objs]) +
            "dataset_dicts_{}_{}_{}_{}_{}".format(
                self.name, self.dataset_root, self.with_masks, self.with_depth,
                osp.abspath(__file__))).encode("utf-8")).hexdigest()
        cache_path = osp.join(
            self.dataset_root,
            "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name))

        if osp.exists(cache_path) and self.use_cache:
            logger.info("load cached dataset dicts from {}".format(cache_path))
            return mmcv.load(cache_path)

        t_start = time.perf_counter()

        logger.info("loading dataset dicts: {}".format(self.name))
        self.num_instances_without_valid_segmentation = 0
        self.num_instances_without_valid_box = 0

        dataset_dicts = []
        im_id_global = 0

        if True:
            targets = mmcv.load(self.ann_file)
            scene_im_ids = [(item["scene_id"], item["im_id"])
                            for item in targets]
            scene_im_ids = sorted(list(set(scene_im_ids)))

            # load infos for each scene
            gt_dicts = {}
            gt_info_dicts = {}
            cam_dicts = {}
            for scene_id, im_id in scene_im_ids:
                scene_root = osp.join(self.dataset_root, f"{scene_id:06d}")
                if scene_id not in gt_dicts:
                    gt_dicts[scene_id] = mmcv.load(
                        osp.join(scene_root, 'scene_gt.json'))
                if scene_id not in gt_info_dicts:
                    gt_info_dicts[scene_id] = mmcv.load(
                        osp.join(scene_root,
                                 'scene_gt_info.json'))  # bbox_obj, bbox_visib
                if scene_id not in cam_dicts:
                    cam_dicts[scene_id] = mmcv.load(
                        osp.join(scene_root, "scene_camera.json"))

            for scene_id, im_id in tqdm(scene_im_ids):
                str_im_id = str(im_id)
                scene_root = osp.join(self.dataset_root, f"{scene_id:06d}")
                rgb_path = osp.join(scene_root, "rgb/{:06d}.png".format(im_id))
                assert osp.exists(rgb_path), rgb_path

                depth_path = osp.join(scene_root,
                                      "depth/{:06d}.png".format(im_id))

                scene_id = int(rgb_path.split('/')[-3])

                cam = np.array(cam_dicts[scene_id][str_im_id]['cam_K'],
                               dtype=np.float32).reshape(3, 3)
                depth_factor = 1000. / cam_dicts[scene_id][str_im_id][
                    'depth_scale']
                record = {
                    "dataset_name": self.name,
                    'file_name': osp.relpath(rgb_path, PROJ_ROOT),
                    'depth_file': osp.relpath(depth_path, PROJ_ROOT),
                    "depth_factor": depth_factor,
                    'height': self.height,
                    'width': self.width,
                    'image_id':
                    im_id_global,  # unique image_id in the dataset, for coco evaluation
                    "scene_im_id": "{}/{}".format(scene_id,
                                                  im_id),  # for evaluation
                    "cam": cam,
                    "img_type": 'real'
                }
                im_id_global += 1
                insts = []
                for anno_i, anno in enumerate(gt_dicts[scene_id][str_im_id]):
                    obj_id = anno['obj_id']
                    if ref.tudl.id2obj[obj_id] not in self.select_objs:
                        continue
                    cur_label = self.cat2label[obj_id]  # 0-based label
                    R = np.array(anno['cam_R_m2c'],
                                 dtype='float32').reshape(3, 3)
                    t = np.array(anno['cam_t_m2c'], dtype='float32') / 1000.0
                    pose = np.hstack([R, t.reshape(3, 1)])
                    quat = mat2quat(R).astype('float32')
                    allo_q = mat2quat(egocentric_to_allocentric(pose)
                                      [:3, :3]).astype('float32')

                    proj = (record["cam"] @ t.T).T
                    proj = proj[:2] / proj[2]

                    bbox_visib = gt_info_dicts[scene_id][str_im_id][anno_i][
                        'bbox_visib']
                    bbox_obj = gt_info_dicts[scene_id][str_im_id][anno_i][
                        'bbox_obj']
                    x1, y1, w, h = bbox_visib
                    if self.filter_invalid:
                        if h <= 1 or w <= 1:
                            self.num_instances_without_valid_box += 1
                            continue

                    mask_file = osp.join(
                        scene_root,
                        "mask/{:06d}_{:06d}.png".format(im_id, anno_i))
                    mask_visib_file = osp.join(
                        scene_root,
                        "mask_visib/{:06d}_{:06d}.png".format(im_id, anno_i))
                    assert osp.exists(mask_file), mask_file
                    assert osp.exists(mask_visib_file), mask_visib_file
                    # load mask visib  TODO: load both mask_visib and mask_full
                    mask_single = mmcv.imread(mask_visib_file, "unchanged")
                    area = mask_single.sum()
                    if area < 3:  # filter out too small or nearly invisible instances
                        self.num_instances_without_valid_segmentation += 1
                        continue
                    mask_rle = binary_mask_to_rle(mask_single, compressed=True)
                    inst = {
                        'category_id': cur_label,  # 0-based label
                        'bbox':
                        bbox_visib,  # TODO: load both bbox_obj and bbox_visib
                        'bbox_mode': BoxMode.XYWH_ABS,
                        'pose': pose,
                        "quat": quat,
                        "trans": t,
                        "allo_quat": allo_q,
                        "centroid_2d": proj,  # absolute (cx, cy)
                        "segmentation": mask_rle,
                        "mask_full_file":
                        mask_file,  # TODO: load as mask_full, rle
                    }

                    insts.append(inst)
                if len(insts) == 0:  # filter im without anno
                    continue
                record['annotations'] = insts
                dataset_dicts.append(record)

        if self.num_instances_without_valid_segmentation > 0:
            logger.warning(
                "Filtered out {} instances without valid segmentation. "
                "There might be issues in your dataset generation process.".
                format(self.num_instances_without_valid_segmentation))
        if self.num_instances_without_valid_box > 0:
            logger.warning(
                "Filtered out {} instances without valid box. "
                "There might be issues in your dataset generation process.".
                format(self.num_instances_without_valid_box))
        ##########################################################################
        if self.num_to_load > 0:
            self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
            dataset_dicts = dataset_dicts[:self.num_to_load]
        logger.info("loaded dataset dicts, num_images: {}, using {}s".format(
            len(dataset_dicts),
            time.perf_counter() - t_start))

        mmcv.dump(dataset_dicts, cache_path, protocol=4)
        logger.info("Dumped dataset_dicts to {}".format(cache_path))
        return dataset_dicts
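`depth_factor` is derived as `1000 / depth_scale`, following the BOP convention that a raw depth pixel times `depth_scale` gives millimetres; dividing the raw image by `depth_factor` therefore yields metres. A usage sketch, with a hypothetical file path:

import mmcv

depth_scale = 0.1                      # from scene_camera.json
depth_factor = 1000.0 / depth_scale    # 10000
raw = mmcv.imread("datasets/tudl/test/000001/depth/000000.png", "unchanged")  # path is illustrative
depth_m = raw.astype("float32") / depth_factor  # metres: raw * depth_scale / 1000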
Example #3
    def __call__(self):  # LM_Dataset
        """Load light-weight instance annotations of all images into a list of
        dicts in Detectron2 format.

        Do not load heavy data into memory in this file, since we will
        load the annotations of all images into memory.
        """
        # cache the dataset_dicts to avoid loading masks from files
        hashed_file_name = hashlib.md5(
            ("".join([str(fn) for fn in self.objs]) +
             "dataset_dicts_{}_{}_{}_{}_{}".format(
                 self.name, self.dataset_root, self.with_masks,
                 self.with_depth, __name__)).encode("utf-8")).hexdigest()
        cache_path = osp.join(
            self.cache_dir,
            "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name))

        if osp.exists(cache_path) and self.use_cache:
            logger.info("load cached dataset dicts from {}".format(cache_path))
            return mmcv.load(cache_path)

        t_start = time.perf_counter()

        logger.info("loading dataset dicts: {}".format(self.name))
        self.num_instances_without_valid_segmentation = 0
        self.num_instances_without_valid_box = 0
        dataset_dicts = []
        assert len(self.ann_files) == len(
            self.image_prefixes
        ), f"{len(self.ann_files)} != {len(self.image_prefixes)}"
        assert len(self.ann_files) == len(
            self.xyz_prefixes
        ), f"{len(self.ann_files)} != {len(self.xyz_prefixes)}"
        for ann_file, scene_root, xyz_root in zip(tqdm(self.ann_files),
                                                  self.image_prefixes,
                                                  self.xyz_prefixes):
            # linemod each scene is an object
            with open(ann_file, "r") as f_ann:
                indices = [line.strip("\r\n")
                           for line in f_ann.readlines()]  # string ids
            gt_dict = mmcv.load(osp.join(scene_root, "scene_gt.json"))
            gt_info_dict = mmcv.load(osp.join(
                scene_root, "scene_gt_info.json"))  # bbox_obj, bbox_visib
            cam_dict = mmcv.load(osp.join(scene_root, "scene_camera.json"))
            for im_id in tqdm(indices):
                int_im_id = int(im_id)
                str_im_id = str(int_im_id)
                rgb_path = osp.join(scene_root,
                                    "rgb/{:06d}.png".format(int_im_id))
                assert osp.exists(rgb_path), rgb_path

                depth_path = osp.join(scene_root,
                                      "depth/{:06d}.png".format(int_im_id))

                scene_id = int(rgb_path.split("/")[-3])
                scene_im_id = f"{scene_id}/{int_im_id}"

                if self.debug_im_id is not None:
                    if self.debug_im_id != scene_im_id:
                        continue

                K = np.array(cam_dict[str_im_id]["cam_K"],
                             dtype=np.float32).reshape(3, 3)
                depth_factor = 1000.0 / cam_dict[str_im_id]["depth_scale"]
                if self.filter_scene:
                    if scene_id not in self.cat_ids:
                        continue
                record = {
                    "dataset_name": self.name,
                    "file_name": osp.relpath(rgb_path, PROJ_ROOT),
                    "depth_file": osp.relpath(depth_path, PROJ_ROOT),
                    "height": self.height,
                    "width": self.width,
                    "image_id": int_im_id,
                    "scene_im_id": scene_im_id,  # for evaluation
                    "cam": K,
                    "depth_factor": depth_factor,
                    "img_type": "real",
                }
                insts = []
                for anno_i, anno in enumerate(gt_dict[str_im_id]):
                    obj_id = anno["obj_id"]
                    if obj_id not in self.cat_ids:
                        continue
                    cur_label = self.cat2label[obj_id]  # 0-based label
                    R = np.array(anno["cam_R_m2c"],
                                 dtype="float32").reshape(3, 3)
                    t = np.array(anno["cam_t_m2c"], dtype="float32") / 1000.0
                    pose = np.hstack([R, t.reshape(3, 1)])
                    quat = mat2quat(R).astype("float32")

                    proj = (record["cam"] @ t.T).T
                    proj = proj[:2] / proj[2]

                    bbox_visib = gt_info_dict[str_im_id][anno_i]["bbox_visib"]
                    bbox_obj = gt_info_dict[str_im_id][anno_i]["bbox_obj"]
                    x1, y1, w, h = bbox_visib
                    if self.filter_invalid:
                        if h <= 1 or w <= 1:
                            self.num_instances_without_valid_box += 1
                            continue

                    mask_file = osp.join(
                        scene_root,
                        "mask/{:06d}_{:06d}.png".format(int_im_id, anno_i))
                    mask_visib_file = osp.join(
                        scene_root, "mask_visib/{:06d}_{:06d}.png".format(
                            int_im_id, anno_i))
                    assert osp.exists(mask_file), mask_file
                    assert osp.exists(mask_visib_file), mask_visib_file
                    # load mask visib  TODO: load both mask_visib and mask_full
                    mask_single = mmcv.imread(mask_visib_file, "unchanged")
                    area = mask_single.sum()
                    if area < 3:  # filter out too small or nearly invisible instances
                        self.num_instances_without_valid_segmentation += 1
                        continue
                    mask_rle = binary_mask_to_rle(mask_single, compressed=True)

                    inst = {
                        "category_id": cur_label,  # 0-based label
                        "bbox":
                        bbox_visib,  # TODO: load both bbox_obj and bbox_visib
                        "bbox_mode": BoxMode.XYWH_ABS,
                        "pose": pose,
                        "quat": quat,
                        "trans": t,
                        "centroid_2d": proj,  # absolute (cx, cy)
                        "segmentation": mask_rle,
                        "mask_full_file":
                        mask_file,  # TODO: load as mask_full, rle
                    }
                    if "test" not in self.name:
                        xyz_path = osp.join(
                            xyz_root, f"{int_im_id:06d}_{anno_i:06d}.pkl")
                        assert osp.exists(xyz_path), xyz_path
                        inst["xyz_path"] = xyz_path

                    model_info = self.models_info[str(obj_id)]
                    inst["model_info"] = model_info
                    # TODO: using full mask and full xyz
                    for key in ["bbox3d_and_center"]:
                        inst[key] = self.models[cur_label][key]
                    insts.append(inst)
                if len(insts) == 0:  # filter im without anno
                    continue
                record["annotations"] = insts
                dataset_dicts.append(record)

        if self.num_instances_without_valid_segmentation > 0:
            logger.warning(
                "Filtered out {} instances without valid segmentation. "
                "There might be issues in your dataset generation process.".
                format(self.num_instances_without_valid_segmentation))
        if self.num_instances_without_valid_box > 0:
            logger.warning(
                "Filtered out {} instances without valid box. "
                "There might be issues in your dataset generation process.".
                format(self.num_instances_without_valid_box))
        ##########################################################################
        if self.num_to_load > 0:
            self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
            dataset_dicts = dataset_dicts[:self.num_to_load]
        logger.info("loaded {} dataset dicts, using {}s".format(
            len(dataset_dicts),
            time.perf_counter() - t_start))

        mmcv.mkdir_or_exist(osp.dirname(cache_path))
        mmcv.dump(dataset_dicts, cache_path, protocol=4)
        logger.info("Dumped dataset_dicts to {}".format(cache_path))
        return dataset_dicts
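`binary_mask_to_rle` is assumed here to produce COCO-style RLE from a binary mask. A plausible implementation of such a helper, using pycocotools for the compressed variant (a sketch, not the repository's actual code):

import numpy as np
from pycocotools import mask as mask_util


def binary_mask_to_rle_sketch(mask, compressed=True):
    mask = (mask > 0).astype(np.uint8)
    if compressed:
        rle = mask_util.encode(np.asfortranarray(mask))  # COCO compressed RLE
        rle["counts"] = rle["counts"].decode("ascii")    # make it JSON-serializable
        return rle
    # uncompressed: column-major run lengths, starting with a run of zeros
    pixels = mask.flatten(order="F")
    runs, prev, count = [], 0, 0
    for p in pixels:
        if p != prev:
            runs.append(count)
            prev, count = p, 1
        else:
            count += 1
    runs.append(count)
    return {"counts": runs, "size": list(mask.shape)}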
Example #4
    def _load_from_idx_file(self, idx_file, image_root):
        """
        idx_file: the scene/image ids
        image_root/scene contains:
            scene_gt.json
            scene_gt_info.json
            scene_camera.json
        """
        xyz_root = osp.join(image_root, "xyz_crop")
        scene_gt_dicts = {}
        scene_gt_info_dicts = {}
        scene_cam_dicts = {}
        scene_im_ids = []  # store tuples of (scene_id, im_id)
        with open(idx_file, "r") as f:
            for line in f:
                line_split = line.strip("\r\n").split("/")
                scene_id = int(line_split[0])
                im_id = int(line_split[1])
                scene_im_ids.append((scene_id, im_id))
                if scene_id not in scene_gt_dicts:
                    scene_gt_file = osp.join(image_root, f"{scene_id:06d}/scene_gt.json")
                    assert osp.exists(scene_gt_file), scene_gt_file
                    scene_gt_dicts[scene_id] = mmcv.load(scene_gt_file)

                if scene_id not in scene_gt_info_dicts:
                    scene_gt_info_file = osp.join(image_root, f"{scene_id:06d}/scene_gt_info.json")
                    assert osp.exists(scene_gt_info_file), scene_gt_info_file
                    scene_gt_info_dicts[scene_id] = mmcv.load(scene_gt_info_file)

                if scene_id not in scene_cam_dicts:
                    scene_cam_file = osp.join(image_root, f"{scene_id:06d}/scene_camera.json")
                    assert osp.exists(scene_cam_file), scene_cam_file
                    scene_cam_dicts[scene_id] = mmcv.load(scene_cam_file)
        ######################################################
        scene_im_ids = sorted(scene_im_ids)  # sort to make it reproducible
        dataset_dicts = []

        num_instances_without_valid_segmentation = 0
        num_instances_without_valid_box = 0

        for (scene_id, im_id) in tqdm(scene_im_ids):
            rgb_path = osp.join(image_root, f"{scene_id:06d}/rgb/{im_id:06d}.png")
            assert osp.exists(rgb_path), rgb_path
            str_im_id = str(im_id)

            scene_im_id = f"{scene_id}/{im_id}"

            # for ycbv/tless, load cam K from image infos
            cam_anno = np.array(scene_cam_dicts[scene_id][str_im_id]["cam_K"], dtype=np.float32).reshape(3, 3)
            depth_factor = 1000.0 / scene_cam_dicts[scene_id][str_im_id]["depth_scale"]
            # dprint(record['cam'])
            if "/train_synt/" in rgb_path:
                img_type = "syn"
            else:
                img_type = "real"
            record = {
                "dataset_name": self.name,
                "file_name": osp.relpath(rgb_path, PROJ_ROOT),
                "height": self.height,
                "width": self.width,
                "image_id": self._unique_im_id,
                "scene_im_id": scene_im_id,  # for evaluation
                "cam": cam_anno,  # self.cam,
                "depth_factor": depth_factor,
                "img_type": img_type,
            }

            if self.with_depth:
                depth_file = osp.join(image_root, f"{scene_id:06d}/depth/{im_id:06d}.png")
                assert osp.exists(depth_file), depth_file
                record["depth_file"] = osp.relpath(depth_file, PROJ_ROOT)

            insts = []
            anno_dict_list = scene_gt_dicts[scene_id][str(im_id)]
            info_dict_list = scene_gt_info_dicts[scene_id][str(im_id)]
            for anno_i, anno in enumerate(anno_dict_list):
                info = info_dict_list[anno_i]
                obj_id = anno["obj_id"]
                if obj_id not in self.cat_ids:
                    continue
                # 0-based label now
                cur_label = self.cat2label[obj_id]
                ################ pose ###########################
                R = np.array(anno["cam_R_m2c"], dtype="float32").reshape(3, 3)
                trans = np.array(anno["cam_t_m2c"], dtype="float32") / 1000.0  # mm->m
                pose = np.hstack([R, trans.reshape(3, 1)])
                quat = mat2quat(pose[:3, :3])

                ############# bbox ############################
                if "cam_old" in record:
                    # NOTE: calculate bbox from pose/points
                    bbox = misc.compute_2d_bbox_xyxy_from_pose_v2(
                        self.models[cur_label]["pts"],
                        pose,
                        self.cam,  # NOTE: use self.cam here
                        width=self.width,
                        height=self.height,
                        clip=True,
                    )
                else:
                    bbox = info["bbox_obj"]
                    x1, y1, w, h = bbox
                    x2 = x1 + w
                    y2 = y1 + h
                    x1 = max(min(x1, self.width), 0)
                    y1 = max(min(y1, self.height), 0)
                    x2 = max(min(x2, self.width), 0)
                    y2 = max(min(y2, self.height), 0)
                    bbox = [x1, y1, x2, y2]
                if self.filter_invalid:
                    bw = bbox[2] - bbox[0]
                    bh = bbox[3] - bbox[1]
                    if bh <= 1 or bw <= 1:
                        num_instances_without_valid_box += 1
                        continue

                ############## mask #######################
                mask_rle = None  # defaults, so `inst` below is defined even when masks are disabled
                mask_full_file = None
                if self.with_masks:  # either list[list[float]] or dict(RLE)
                    mask_visib_file = osp.join(image_root, f"{scene_id:06d}/mask_visib/{im_id:06d}_{anno_i:06d}.png")
                    assert osp.exists(mask_visib_file), mask_visib_file
                    mask = mmcv.imread(mask_visib_file, "unchanged")
                    area = mask.sum()
                    if area < 3 and self.filter_invalid:
                        num_instances_without_valid_segmentation += 1
                        continue
                    mask_rle = binary_mask_to_rle(mask)

                    mask_full_file = osp.join(image_root, f"{scene_id:06d}/mask/{im_id:06d}_{anno_i:06d}.png")
                    assert osp.exists(mask_full_file), mask_full_file

                proj = (self.cam @ trans.T).T  # NOTE: use self.cam here
                proj = proj[:2] / proj[2]

                inst = {
                    "category_id": cur_label,  # 0-based label
                    "bbox": bbox,  # TODO: load both bbox_obj and bbox_visib
                    "bbox_mode": BoxMode.XYXY_ABS,
                    "pose": pose,
                    "quat": quat,
                    "trans": trans,
                    "centroid_2d": proj,  # absolute (cx, cy)
                    "segmentation": mask_rle,
                    "mask_full_file": mask_full_file,  # TODO: load as mask_full, rle
                }

                if self.with_xyz:
                    xyz_path = osp.join(xyz_root, f"{scene_id:06d}/{im_id:06d}_{anno_i:06d}-xyz.pkl")
                    assert osp.exists(xyz_path), xyz_path
                    inst["xyz_path"] = xyz_path

                model_info = self.models_info[str(obj_id)]
                inst["model_info"] = model_info
                # TODO: using full mask and full xyz
                for key in ["bbox3d_and_center"]:
                    inst[key] = self.models[cur_label][key]
                insts.append(inst)
            if len(insts) == 0:  # and self.filter_invalid:
                continue
            record["annotations"] = insts
            dataset_dicts.append(record)
            self._unique_im_id += 1

        if num_instances_without_valid_segmentation > 0:
            logger.warning(
                "Filtered out {} instances without valid segmentation. "
                "There might be issues in your dataset generation process.".format(
                    num_instances_without_valid_segmentation
                )
            )
        if num_instances_without_valid_box > 0:
            logger.warning(
                "Filtered out {} instances without valid box. "
                "There might be issues in your dataset generation process.".format(num_instances_without_valid_box)
            )
        return dataset_dicts
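The box handling above clips a BOP-style `[x, y, w, h]` box into an absolute `XYXY` box. Since Detectron2 ships a converter, the same step can be written with `BoxMode.convert`; a small sketch (the helper name is illustrative):

from detectron2.structures import BoxMode


def xywh_to_clipped_xyxy(bbox_xywh, width, height):
    # [x, y, w, h] -> [x1, y1, x2, y2], clipped to the image bounds
    x1, y1, x2, y2 = BoxMode.convert(list(bbox_xywh), BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
    clip = lambda v, hi: max(min(v, hi), 0)
    return [clip(x1, width), clip(y1, height), clip(x2, width), clip(y2, height)]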
Example #5
    def __call__(self):  # LM_SYN_IMGN_Dataset
        """Load light-weight instance annotations of all images into a list of
        dicts in Detectron2 format.

        Do not load heavy data into memory in this file, since we will
        load the annotations of all images into memory.
        """
        # cache the dataset_dicts to avoid loading masks from files
        hashed_file_name = hashlib.md5(
            (
                "".join([str(fn) for fn in self.objs])
                + "dataset_dicts_{}_{}_{}_{}_{}_{}".format(
                    self.name, self.dataset_root, self.with_masks, self.with_depth, self.n_per_obj, __name__
                )
            ).encode("utf-8")
        ).hexdigest()
        cache_path = osp.join(self.dataset_root, "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name))

        if osp.exists(cache_path) and self.use_cache:
            logger.info("load cached dataset dicts from {}".format(cache_path))
            return mmcv.load(cache_path)

        t_start = time.perf_counter()

        logger.info("loading dataset dicts: {}".format(self.name))
        self.num_instances_without_valid_segmentation = 0
        self.num_instances_without_valid_box = 0
        dataset_dicts = []  #######################################################
        assert len(self.ann_files) == len(self.image_prefixes), f"{len(self.ann_files)} != {len(self.image_prefixes)}"
        assert len(self.ann_files) == len(self.xyz_prefixes), f"{len(self.ann_files)} != {len(self.xyz_prefixes)}"
        for ann_file, scene_root, xyz_root in zip(self.ann_files, self.image_prefixes, self.xyz_prefixes):
            # linemod each scene is an object
            with open(ann_file, "r") as f_ann:
                indices = [line.strip("\r\n").split()[-1] for line in f_ann.readlines()]  # string ids
            # sample uniformly (equal space)
            if self.n_per_obj > 0:
                sample_num = min(self.n_per_obj, len(indices))
                sel_indices_idx = np.linspace(0, len(indices) - 1, sample_num, dtype=np.int32)
                sel_indices = [indices[int(_i)] for _i in sel_indices_idx]
            else:
                sel_indices = indices

            for im_id in tqdm(sel_indices):
                rgb_path = osp.join(scene_root, "{}-color.png".format(im_id))
                assert osp.exists(rgb_path), rgb_path

                depth_path = osp.join(scene_root, "{}-depth.png".format(im_id))

                obj_name = im_id.split("/")[0]
                if obj_name == "benchviseblue":
                    obj_name = "benchvise"
                obj_id = ref.lm_full.obj2id[obj_name]
                if self.filter_scene:
                    if obj_name not in self.objs:
                        continue
                record = {
                    "dataset_name": self.name,
                    "file_name": osp.relpath(rgb_path, PROJ_ROOT),
                    "depth_file": osp.relpath(depth_path, PROJ_ROOT),
                    "height": self.height,
                    "width": self.width,
                    "image_id": im_id.split("/")[-1],
                    "scene_im_id": im_id,
                    "cam": self.cam,
                    "img_type": "syn",
                }

                cur_label = self.obj2label[obj_name]  # 0-based label
                pose_path = osp.join(scene_root, "{}-pose.txt".format(im_id))
                pose = np.loadtxt(pose_path, skiprows=1)
                R = pose[:3, :3]
                t = pose[:3, 3]
                quat = mat2quat(R).astype("float32")
                proj = (record["cam"] @ t.T).T
                proj = proj[:2] / proj[2]

                depth = mmcv.imread(depth_path, "unchanged") / 1000.0
                mask = (depth > 0).astype(np.uint8)

                bbox_obj = mask2bbox_xywh(mask)
                x1, y1, w, h = bbox_obj
                if self.filter_invalid:
                    if h <= 1 or w <= 1:
                        self.num_instances_without_valid_box += 1
                        continue
                area = mask.sum()
                if area < 3:  # filter out too small or nearly invisible instances
                    self.num_instances_without_valid_segmentation += 1
                    continue
                mask_rle = binary_mask_to_rle(mask, compressed=True)

                xyz_path = osp.join(xyz_root, f"{im_id}-xyz.pkl")
                assert osp.exists(xyz_path), xyz_path
                inst = {
                    "category_id": cur_label,  # 0-based label
                    "bbox": bbox_obj,  # TODO: load both bbox_obj and bbox_visib
                    "bbox_mode": BoxMode.XYWH_ABS,
                    "pose": pose,
                    "quat": quat,
                    "trans": t,
                    "centroid_2d": proj,  # absolute (cx, cy)
                    "segmentation": mask_rle,
                    "xyz_path": xyz_path,
                }

                model_info = self.models_info[str(obj_id)]
                inst["model_info"] = model_info
                # TODO: using full mask and full xyz
                for key in ["bbox3d_and_center"]:
                    inst[key] = self.models[cur_label][key]
                record["annotations"] = [inst]
                dataset_dicts.append(record)

        if self.num_instances_without_valid_segmentation > 0:
            logger.warning(
                "Filtered out {} instances without valid segmentation. "
                "There might be issues in your dataset generation process.".format(
                    self.num_instances_without_valid_segmentation
                )
            )
        if self.num_instances_without_valid_box > 0:
            logger.warning(
                "Filtered out {} instances without valid box. "
                "There might be issues in your dataset generation process.".format(self.num_instances_without_valid_box)
            )
        ##########################################################################
        # if self.num_to_load > 0:
        #     self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
        #     random.shuffle(dataset_dicts)
        #     dataset_dicts = dataset_dicts[: self.num_to_load]
        logger.info(
            "loaded dataset dicts, num_images: {}, using {}s".format(len(dataset_dicts), time.perf_counter() - t_start)
        )

        mmcv.dump(dataset_dicts, cache_path, protocol=4)
        logger.info("Dumped dataset_dicts to {}".format(cache_path))
        return dataset_dicts
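`mask2bbox_xywh` above derives the object box from the rendered depth mask. A plausible numpy implementation of such a helper, under the assumption that it returns a tight `[x, y, w, h]` box:

import numpy as np


def mask2bbox_xywh_sketch(mask):
    # Tight [x, y, w, h] box around the nonzero pixels of a binary mask.
    ys, xs = np.nonzero(mask)
    if xs.size == 0:
        return [0, 0, 0, 0]  # empty mask
    x1, y1 = xs.min(), ys.min()
    x2, y2 = xs.max(), ys.max()
    return [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)]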
Example #6
def save_result_of_dataset(cfg, model, data_loader, output_dir, dataset_name):
    """
    Run model (in eval mode) on the data_loader and save predictions
    Args:
        cfg: config
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
    Returns:
        The return value of `evaluator.evaluate()`
    """
    cpu_device = torch.device("cpu")
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    # NOTE: dataset name should be the same as TRAIN to get the correct meta
    _metadata = MetadataCatalog.get(dataset_name)
    data_ref = ref.__dict__[_metadata.ref_key]
    obj_names = _metadata.objs
    obj_ids = [data_ref.obj2id[obj_name] for obj_name in obj_names]

    result_name = "results.pkl"
    mmcv.mkdir_or_exist(output_dir)
    result_path = osp.join(output_dir, result_name)

    total = len(data_loader)  # inference data loader must have a fixed length
    results = OrderedDict()
    VIS = False

    logging_interval = 50
    num_warmup = min(5, logging_interval - 1, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0
            if VIS:
                images_ori = [_input["image"].clone() for _input in inputs]
            start_compute_time = time.perf_counter()
            outputs = model(inputs)  # NOTE: do model inference
            torch.cuda.synchronize()
            cur_compute_time = time.perf_counter() - start_compute_time
            total_compute_time += cur_compute_time

            # NOTE: process results
            for i in range(len(inputs)):
                _input = inputs[i]
                output = outputs[i]
                cur_results = {}
                instances = output["instances"]
                HAS_MASK = False
                if instances.has("pred_masks"):
                    HAS_MASK = True
                    pred_masks = instances.pred_masks  # (#objs, imH, imW)
                    pred_masks = pred_masks.detach().cpu().numpy()
                    # NOTE: time consuming step
                    rles = [
                        binary_mask_to_rle(pred_masks[_k])
                        for _k in range(len(pred_masks))
                    ]

                instances = instances.to(cpu_device)
                boxes = instances.pred_boxes.tensor.clone().detach().cpu().numpy()  # xyxy

                scores = instances.scores.tolist()
                labels = instances.pred_classes.detach().cpu().numpy()

                obj_ids = [
                    data_ref.obj2id[obj_names[int(label)]] for label in labels
                ]
                ego_quats = instances.pred_ego_quats.detach().cpu().numpy()
                ego_rots = [
                    quat2mat(ego_quats[k]) for k in range(len(ego_quats))
                ]
                transes = instances.pred_transes.detach().cpu().numpy()

                cur_results = {
                    "time": cur_compute_time / len(inputs),
                    "obj_ids": obj_ids,
                    "scores": scores,
                    "boxes": boxes,  # xyxy
                    "Rs": ego_rots,
                    "ts": transes,  # m
                }
                if HAS_MASK:
                    cur_results["masks"] = rles

                if VIS:
                    import cv2
                    from lib.vis_utils.image import vis_image_mask_bbox_cv2

                    image = (images_ori[i].detach().cpu().numpy().transpose(
                        1, 2, 0) + 0.5).astype("uint8")
                    img_vis = vis_image_mask_bbox_cv2(
                        image,
                        pred_masks,
                        boxes,
                        labels=[obj_names[int(label)] for label in labels])
                    cv2.imshow("img", img_vis.astype("uint8"))
                    cv2.waitKey()
                results[_input["scene_im_id"]] = cur_results

            if (idx + 1) % logging_interval == 0:
                duration = time.perf_counter() - start_time
                seconds_per_img = duration / (idx + 1 - num_warmup)
                eta = datetime.timedelta(seconds=int(seconds_per_img *
                                                     (total - num_warmup) -
                                                     duration))
                logger.info(
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)))

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = int(time.perf_counter() - start_time)
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".
        format(total_time_str, total_time / (total - num_warmup), num_devices))
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str,
                total_compute_time / (total - num_warmup), num_devices))

    mmcv.dump(results, result_path)
    logger.info("Results saved to {}".format(result_path))
Example #7
    def __call__(self):  # LM_BLENDER
        """Load light-weight instance annotations of all images into a list of
        dicts in Detectron2 format.

        Do not load heavy data into memory in this file, since we will
        load the annotations of all images into memory.
        """
        # cache the dataset_dicts to avoid loading masks from files
        hashed_file_name = hashlib.md5(
            (
                "".join([str(fn) for fn in self.objs])
                + "dataset_dicts_{}_{}_{}_{}_{}_{}_{}".format(
                    self.name,
                    self.dataset_root,
                    self.with_masks,
                    self.with_depth,
                    self.with_xyz,
                    self.n_per_obj,
                    __name__,
                )
            ).encode("utf-8")
        ).hexdigest()
        cache_path = osp.join(self.cache_dir, "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name))

        if osp.exists(cache_path) and self.use_cache:
            logger.info("load cached dataset dicts from {}".format(cache_path))
            return mmcv.load(cache_path)

        t_start = time.perf_counter()

        logger.info("loading dataset dicts: {}".format(self.name))
        self.num_instances_without_valid_segmentation = 0
        self.num_instances_without_valid_box = 0
        dataset_dicts = []  #######################################################
        assert len(self.ann_files) == len(self.image_prefixes), f"{len(self.ann_files)} != {len(self.image_prefixes)}"

        for ann_file, scene_root in zip(tqdm(self.ann_files), self.image_prefixes):
            # each scene is an object
            assert osp.exists(ann_file), ann_file
            scene_gt_dict = mmcv.load(ann_file)
            # sample uniformly (equal space)
            indices = list(scene_gt_dict.keys())
            if self.n_per_obj > 0:
                sample_num = min(self.n_per_obj, len(scene_gt_dict))
                sel_indices_idx = np.linspace(0, len(scene_gt_dict) - 1, sample_num, dtype=np.int32)
                sel_indices = [indices[int(_i)] for _i in sel_indices_idx]
            else:
                sel_indices = indices

            for str_im_id in tqdm(sel_indices):
                int_im_id = int(str_im_id)
                rgb_path = osp.join(scene_root, "{}.jpg".format(str_im_id))
                assert osp.exists(rgb_path), rgb_path

                depth_path = osp.join(scene_root, "{}_depth_opengl.png".format(str_im_id))

                obj_name = osp.basename(ann_file).split("_")[0]  # ann_file is named like "<obj_name>_gt.json"
                obj_id = ref.lm_full.obj2id[obj_name]
                if obj_name not in self.objs:
                    continue

                record = {
                    "dataset_name": self.name,
                    "file_name": osp.relpath(rgb_path, PROJ_ROOT),
                    "depth_file": osp.relpath(depth_path, PROJ_ROOT),
                    "height": self.height,
                    "width": self.width,
                    "image_id": int_im_id,
                    "scene_im_id": f"{obj_id}/{int_im_id}",
                    "cam": self.cam,
                    "img_type": "syn_blender",  # has bg
                }

                cur_label = self.obj2label[obj_name]  # 0-based label
                anno = scene_gt_dict[str_im_id][0]  # only one object
                R = np.array(anno["cam_R_m2c"]).reshape(3, 3)
                t = np.array(anno["cam_t_m2c"]).reshape(-1) / 1000
                pose = np.hstack([R, t.reshape(3, 1)])
                quat = mat2quat(R).astype("float32")
                proj = (record["cam"] @ t.T).T
                proj = proj[:2] / proj[2]

                bbox_visib = anno["bbox_visib"]
                x1, y1, w, h = bbox_visib
                if self.filter_invalid:
                    if h <= 1 or w <= 1:
                        self.num_instances_without_valid_box += 1
                        continue

                mask_path = osp.join(scene_root, "{}_mask_opengl.png".format(str_im_id))
                mask = mmcv.imread(mask_path, "unchanged")
                mask = (mask > 0).astype(np.uint8)

                area = mask.sum()
                if area < 3:  # filter out too small or nearly invisible instances
                    self.num_instances_without_valid_segmentation += 1
                    continue
                mask_rle = binary_mask_to_rle(mask, compressed=True)

                xyz_path = osp.join(scene_root, "{}_xyz_bop.pkl".format(str_im_id))
                assert osp.exists(xyz_path), xyz_path

                visib_fract = anno.get("visib_fract", 1.0)
                inst = {
                    "category_id": cur_label,  # 0-based label
                    "bbox": bbox_visib,  # TODO: load both bbox_obj and bbox_visib
                    "bbox_mode": BoxMode.XYWH_ABS,
                    "pose": pose,
                    "quat": quat,
                    "trans": t,
                    "centroid_2d": proj,  # absolute (cx, cy)
                    "segmentation": mask_rle,
                    "xyz_path": xyz_path,
                    "visib_fract": visib_fract,
                }

                model_info = self.models_info[str(obj_id)]
                inst["model_info"] = model_info
                for key in ["bbox3d_and_center"]:
                    inst[key] = self.models[cur_label][key]
                record["annotations"] = [inst]
                dataset_dicts.append(record)

        if self.num_instances_without_valid_segmentation > 0:
            logger.warning(
                "Filtered out {} instances without valid segmentation. "
                "There might be issues in your dataset generation process.".format(
                    self.num_instances_without_valid_segmentation
                )
            )
        if self.num_instances_without_valid_box > 0:
            logger.warning(
                "Filtered out {} instances without valid box. "
                "There might be issues in your dataset generation process.".format(self.num_instances_without_valid_box)
            )
        ##########################################################################
        # if self.num_to_load > 0:
        #     self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
        #     random.shuffle(dataset_dicts)
        #     dataset_dicts = dataset_dicts[: self.num_to_load]
        logger.info(
            "loaded dataset dicts, num_images: {}, using {}s".format(len(dataset_dicts), time.perf_counter() - t_start)
        )

        mmcv.dump(dataset_dicts, cache_path, protocol=4)
        logger.info("Dumped dataset_dicts to {}".format(cache_path))
        return dataset_dicts
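All of these loaders compute `centroid_2d` by projecting the translation through the camera intrinsics: with uvw = K @ t, the pixel centroid is (u/w, v/w). A tiny worked sketch (the intrinsics values are only illustrative):

import numpy as np

K = np.array([[572.4, 0.0, 325.3],
              [0.0, 573.6, 242.0],
              [0.0, 0.0, 1.0]], dtype=np.float32)  # illustrative pinhole intrinsics
t = np.array([0.1, -0.05, 0.8], dtype=np.float32)  # object centre in metres, camera frame

uvw = K @ t
centroid_2d = uvw[:2] / uvw[2]  # absolute (cx, cy) in pixels
print(centroid_2d)              # approx [396.85, 206.15]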
Example #8
    def _load_from_idx_file(self, idx_file, image_root):
        """
        idx_file: the scene/image ids
        image_root/scene contains:
            scene_gt.json
            scene_gt_info.json
            scene_camera.json
        """
        scene_gt_dicts = {}
        scene_gt_info_dicts = {}
        scene_cam_dicts = {}
        scene_im_ids = []  # store tuples of (scene_id, im_id)
        with open(idx_file, 'r') as f:
            for line in f:
                line_split = line.strip('\r\n').split('/')
                scene_id = int(line_split[0])
                im_id = int(line_split[1])
                scene_im_ids.append((scene_id, im_id))
                if scene_id not in scene_gt_dicts:
                    scene_gt_file = osp.join(image_root, f'{scene_id:06d}/scene_gt.json')
                    assert osp.exists(scene_gt_file), scene_gt_file
                    scene_gt_dicts[scene_id] = mmcv.load(scene_gt_file)

                if scene_id not in scene_gt_info_dicts:
                    scene_gt_info_file = osp.join(image_root, f'{scene_id:06d}/scene_gt_info.json')
                    assert osp.exists(scene_gt_info_file), scene_gt_info_file
                    scene_gt_info_dicts[scene_id] = mmcv.load(scene_gt_info_file)

                if scene_id not in scene_cam_dicts:
                    scene_cam_file = osp.join(image_root, f'{scene_id:06d}/scene_camera.json')
                    assert osp.exists(scene_cam_file), scene_cam_file
                    scene_cam_dicts[scene_id] = mmcv.load(scene_cam_file)
        ######################################################
        scene_im_ids = sorted(scene_im_ids)  # sort to make it reproducible
        dataset_dicts = []

        num_instances_without_valid_segmentation = 0
        num_instances_without_valid_box = 0

        for (scene_id, im_id) in tqdm(scene_im_ids):
            rgb_path = osp.join(image_root, f'{scene_id:06d}/rgb/{im_id:06d}.png')
            assert osp.exists(rgb_path), rgb_path
            # for ycbv/tless, load cam K from image infos
            cam_anno = np.array(scene_cam_dicts[scene_id][str(im_id)]["cam_K"], dtype="float32").reshape(3, 3)
            # dprint(record['cam'])
            if '/train_synt/' in rgb_path:
                img_type = 'syn'
            else:
                img_type = 'real'
            record = {
                "dataset_name": self.name,
                'file_name': osp.relpath(rgb_path, PROJ_ROOT),
                'height': self.height,
                'width': self.width,
                'image_id': self._unique_im_id,
                "scene_im_id": "{}/{}".format(scene_id, im_id),  # for evaluation
                "cam": cam_anno,  # self.cam,
                "img_type": img_type
            }

            if self.with_depth:
                depth_file = osp.join(image_root, f'{scene_id:06d}/depth/{im_id:06d}.png')
                assert osp.exists(depth_file), depth_file
                record["depth_file"] = osp.relpath(depth_file, PROJ_ROOT)

            insts = []
            anno_dict_list = scene_gt_dicts[scene_id][str(im_id)]
            info_dict_list = scene_gt_info_dicts[scene_id][str(im_id)]
            for anno_i, anno in enumerate(anno_dict_list):
                info = info_dict_list[anno_i]
                obj_id = anno['obj_id']
                if obj_id not in self.cat_ids:
                    continue
                # 0-based label now
                cur_label = self.cat2label[obj_id]
                ################ pose ###########################
                R = np.array(anno['cam_R_m2c'], dtype='float32').reshape(3, 3)
                trans = np.array(anno['cam_t_m2c'], dtype='float32') / 1000.0  # mm->m
                pose = np.hstack([R, trans.reshape(3, 1)])
                quat = mat2quat(pose[:3, :3])
                allo_q = mat2quat(egocentric_to_allocentric(pose)[:3, :3])

                ############# bbox ############################
                bbox = info['bbox_obj']
                x1, y1, w, h = bbox
                x2 = x1 + w
                y2 = y1 + h
                x1 = max(min(x1, self.width), 0)
                y1 = max(min(y1, self.height), 0)
                x2 = max(min(x2, self.width), 0)
                y2 = max(min(y2, self.height), 0)
                bbox = [x1, y1, x2, y2]
                if self.filter_invalid:
                    bw = bbox[2] - bbox[0]
                    bh = bbox[3] - bbox[1]
                    if bh <= 1 or bw <= 1:
                        num_instances_without_valid_box += 1
                        continue

                ############## mask #######################
                mask_rle = None  # defaults, so `inst` below is defined even when masks are disabled
                mask_full_file = None
                if self.with_masks:  # either list[list[float]] or dict(RLE)
                    mask_visib_file = osp.join(image_root, f'{scene_id:06d}/mask_visib/{im_id:06d}_{anno_i:06d}.png')
                    assert osp.exists(mask_visib_file), mask_visib_file
                    mask = mmcv.imread(mask_visib_file, 'unchanged')
                    if mask.sum() < 1 and self.filter_invalid:
                        num_instances_without_valid_segmentation += 1
                        continue
                    mask_rle = binary_mask_to_rle(mask)

                    mask_full_file = osp.join(image_root, f'{scene_id:06d}/mask/{im_id:06d}_{anno_i:06d}.png')
                    assert osp.exists(mask_full_file), mask_full_file

                proj = (self.cam @ trans.T).T  # NOTE: use self.cam here
                proj = proj[:2] / proj[2]

                inst = {
                    'category_id': cur_label,  # 0-based label
                    'bbox': bbox,  # TODO: load both bbox_obj and bbox_visib
                    'bbox_mode': BoxMode.XYXY_ABS,
                    "quat": quat,
                    "trans": trans,
                    "allo_quat": allo_q,
                    "centroid_2d": proj,  # absolute (cx, cy)
                    "segmentation": mask_rle,
                    "mask_full_file": mask_full_file,  # TODO: load as mask_full, rle
                }

                insts.append(inst)

            if len(insts) == 0:  # and self.filter_invalid:
                continue
            record["annotations"] = insts
            dataset_dicts.append(record)
            self._unique_im_id += 1

        if num_instances_without_valid_segmentation > 0:
            logger.warn("Filtered out {} instances without valid segmentation. "
                        "There might be issues in your dataset generation process.".format(
                            num_instances_without_valid_segmentation))
        if num_instances_without_valid_box > 0:
            logger.warning(
                "Filtered out {} instances without valid box. "
                "There might be issues in your dataset generation process.".format(num_instances_without_valid_box))
        return dataset_dicts
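`egocentric_to_allocentric` above converts the camera-frame (egocentric) rotation into an allocentric one that is invariant to where the object sits in the image: the rotation induced by the viewing ray towards the object is factored out. A sketch of one common formulation (an assumption, not necessarily the repository's exact helper):

import numpy as np


def egocentric_to_allocentric_sketch(pose, cam_ray=(0.0, 0.0, 1.0)):
    # pose: 3x4 [R|t]; remove the rotation induced by the viewing ray.
    cam_ray = np.asarray(cam_ray, dtype=np.float64)
    trans = pose[:3, 3]
    obj_ray = trans / np.linalg.norm(trans)
    angle = np.arccos(np.clip(cam_ray.dot(obj_ray), -1.0, 1.0))
    allo_pose = pose.copy()
    if angle > 1e-6:
        axis = np.cross(cam_ray, obj_ray)
        axis /= np.linalg.norm(axis)
        # Rodrigues' formula for a rotation of -angle about `axis`
        K = np.array([[0.0, -axis[2], axis[1]],
                      [axis[2], 0.0, -axis[0]],
                      [-axis[1], axis[0], 0.0]])
        R_corr = np.eye(3) + np.sin(-angle) * K + (1.0 - np.cos(-angle)) * (K @ K)
        allo_pose[:3, :3] = R_corr @ pose[:3, :3]
    return allo_pose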