コード例 #1
0
class YTVOSDataset:
    """Frame-indexed YouTube-VOS dataset that yields short clips.

    Every (video, frame) pair listed in the annotation file is one sample.
    A sample is made of ``num_frames`` RGB frames of that video together with
    the target produced by ``ConvertCocoPolysToMask`` from the video's
    annotations.
    """

    def __init__(self, img_folder, ann_file, transforms, return_masks,
                 num_frames):
        """Load the annotation index and enumerate all (video, frame) pairs.

        Args:
            img_folder: Root directory the per-video file names are joined to.
            ann_file: Annotation file handed to ``YTVOS``.
            transforms: Optional callable ``(imgs, target) -> (imgs, target)``.
            return_masks: Forwarded to ``ConvertCocoPolysToMask``.
            num_frames: Number of frames loaded for every sample.
        """
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks
        self.num_frames = num_frames
        self.prepare = ConvertCocoPolysToMask(return_masks)
        self.ytvos = YTVOS(ann_file)
        self.cat_ids = self.ytvos.getCatIds()
        self.vid_ids = self.ytvos.getVidIds()

        # One info dict per video; 'filenames' is an alias of 'file_names'.
        self.vid_infos = []
        for video_id in self.vid_ids:
            record = self.ytvos.loadVids([video_id])[0]
            record['filenames'] = record['file_names']
            self.vid_infos.append(record)

        # Flat sample index: (position in vid_infos, frame position).
        self.img_ids = [
            (video_pos, frame_pos)
            for video_pos, record in enumerate(self.vid_infos)
            for frame_pos in range(len(record['filenames']))
        ]

    def __len__(self):
        """Return the number of (video, frame) samples."""
        sample_count = len(self.img_ids)
        return sample_count

    def __getitem__(self, idx):
        """Load the clip and target belonging to sample ``idx``.

        Returns:
            tuple: ``(torch.cat(imgs, dim=0), target)``.
        """
        vid, frame_id = self.img_ids[idx]
        video = self.vid_infos[vid]
        vid_id = video['id']
        file_names = video['file_names']
        vid_len = len(file_names)

        # Frame offsets counted backwards from ``frame_id``, largest first.
        # The modulo keeps every offset below the video length.
        inds = [step % vid_len for step in range(self.num_frames)][::-1]
        # if random
        # random.shuffle(inds)

        # ``frame_id - offset`` can be negative; Python's negative indexing
        # then picks the frame counted from the end of the video.
        img = [
            Image.open(
                os.path.join(str(self.img_folder),
                             file_names[frame_id - offset])).convert('RGB')
            for offset in inds
        ]

        annotations = self.ytvos.loadAnns(
            self.ytvos.getAnnIds(vidIds=[vid_id]))
        target = {
            'image_id': idx,
            'video_id': vid,
            'frame_id': frame_id,
            'annotations': annotations
        }
        target = self.prepare(img[0], target, inds, self.num_frames)
        if self._transforms is not None:
            img, target = self._transforms(img, target)
        # NOTE(review): torch.cat needs tensors, so the transforms are
        # presumably expected to convert the PIL images - confirm.
        return torch.cat(img, dim=0), target
コード例 #2
0
ファイル: ytvos_tasuf.py プロジェクト: janghyuk-choi/TASUF
class YTVOSDatasetTASUF(CustomDataset):
    CLASSES = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard',
               'sedan', 'ape', 'dog', 'snake', 'monkey', 'hand', 'rabbit',
               'duck', 'cat', 'cow', 'fish', 'train', 'horse', 'turtle',
               'bear', 'motorbike', 'giraffe', 'leopard', 'fox', 'deer', 'owl',
               'surfboard', 'airplane', 'truck', 'zebra', 'tiger', 'elephant',
               'snowboard', 'boat', 'shark', 'mouse', 'frog', 'eagle',
               'earless_seal', 'tennis_racket')

    def __init__(self,
                 ann_file,
                 img_prefix,
                 img_scale,
                 img_norm_cfg,
                 size_divisor=None,
                 proposal_file=None,
                 num_max_proposals=1000,
                 flip_ratio=0,
                 with_mask=True,
                 with_crowd=True,
                 with_label=True,
                 with_track=False,
                 extra_aug=None,
                 aug_ref_bbox_param=None,
                 resize_keep_ratio=True,
                 test_mode=False):
        """Index the annotated frames and set up the transform pipeline.

        Args:
            ann_file: Annotation file passed to ``load_annotations``.
            img_prefix: Prefix joined to every frame file name.
            img_scale: A scale tuple or a list of scale tuples.
            img_norm_cfg: Keyword arguments for ``ImageTransform``.
            size_divisor: Padding divisor passed to ``ImageTransform``.
            proposal_file: Optional file passed to ``load_proposals``.
            num_max_proposals: Maximum number of proposals kept per image.
            flip_ratio: Flip probability, must lie in [0, 1].
            with_mask: Whether masks are added to training samples.
            with_crowd: Whether ignore/crowd boxes are added to samples.
            with_label: Whether labels are added to training samples.
            with_track: Whether tracking ids are added to training samples.
            extra_aug: Optional keyword arguments for ``ExtraAugmentation``.
            aug_ref_bbox_param: Optional parameters used by ``bbox_aug``.
            resize_keep_ratio: Passed to the image transform as ``keep_ratio``.
            test_mode: Skips frame filtering and group flags when true.
        """
        # upper bound on the distance between neighbouring sampled
        # reference frames (see sample_ref_range)
        self.max_gap = 3

        # images are read from img_prefix/<file name>
        self.img_prefix = img_prefix

        # annotations first: everything below relies on self.vid_infos
        self.vid_infos = self.load_annotations(ann_file)
        self.img_ids = [
            (video_pos, frame_pos)
            for video_pos, video in enumerate(self.vid_infos)
            for frame_pos in range(len(video['filenames']))
        ]
        self.proposals = (self.load_proposals(proposal_file)
                          if proposal_file is not None else None)

        # training only uses frames that carry at least one box
        if not test_mode:
            self.img_ids = [
                sample for sample in self.img_ids
                if len(self.get_ann_info(*sample)['bboxes'])
            ]

        # scales are always stored as a list of tuples:
        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
        if isinstance(img_scale, list):
            self.img_scales = img_scale
        else:
            self.img_scales = [img_scale]
        assert mmcv.is_list_of(self.img_scales, tuple)

        # normalisation settings forwarded to ImageTransform
        self.img_norm_cfg = img_norm_cfg
        # cap on the number of proposals used per image
        self.num_max_proposals = num_max_proposals
        # probability of flipping a training sample
        self.flip_ratio = flip_ratio
        assert 0 <= flip_ratio <= 1
        # images are padded so their size is divisible by size_divisor
        # (used for FPN)
        self.size_divisor = size_divisor

        # with mask or not (reserved field, takes no effect)
        self.with_mask = with_mask
        # when True, boxes marked ignore/crowd/difficult are returned as well
        self.with_crowd = with_crowd
        # labels are switched off for RPN training
        self.with_label = with_label
        self.with_track = with_track
        # settings for jittering the reference-frame boxes
        self.aug_ref_bbox_param = aug_ref_bbox_param
        self.test_mode = test_mode

        # the sampler groups samples by this flag
        if not self.test_mode:
            self._set_group_flag()

        # transform helpers
        self.img_transform = ImageTransform(size_divisor=self.size_divisor,
                                            **self.img_norm_cfg)
        self.bbox_transform = BboxTransform()
        self.mask_transform = MaskTransform()
        self.numpy2tensor = Numpy2Tensor()

        # optional extra augmentation
        self.extra_aug = (ExtraAugmentation(**extra_aug)
                          if extra_aug is not None else None)

        # keep the aspect ratio when rescaling images
        self.resize_keep_ratio = resize_keep_ratio

        # [JH]
        self.max_bboxes_per_frame = 0

    def __len__(self):
        return len(self.img_ids)

    def __getitem__(self, idx):
        if self.test_mode:
            return self.prepare_test_img(self.img_ids[idx])
        data = self.prepare_train_img(self.img_ids[idx])
        while data == None:
            try:
                data = self.prepare_train_img(self.img_ids[idx + 1])
            except:
                data = self.prepare_train_img(self.img_ids[idx - 1])
        return data

    def load_annotations(self, ann_file):
        """Build the ``YTVOS`` index and return one info dict per video.

        Besides the return value this sets ``self.ytvos``, ``self.cat_ids``,
        ``self.cat2label`` (category id -> 1-based label) and
        ``self.vid_ids``.

        Args:
            ann_file: Annotation file handed to ``YTVOS``.

        Returns:
            list[dict]: Video info dicts, each with a ``'filenames'`` alias of
            its ``'file_names'`` entry.
        """
        self.ytvos = YTVOS(ann_file)
        self.cat_ids = self.ytvos.getCatIds()
        # labels start at 1
        self.cat2label = {
            cat_id: label
            for label, cat_id in enumerate(self.cat_ids, start=1)
        }
        self.vid_ids = self.ytvos.getVidIds()

        videos = []
        for video_id in self.vid_ids:
            record = self.ytvos.loadVids([video_id])[0]
            record['filenames'] = record['file_names']
            videos.append(record)
        return videos

    def get_ann_info(self, idx, frame_id):
        vid_id = self.vid_infos[idx]['id']
        ann_ids = self.ytvos.getAnnIds(vidIds=[vid_id])
        ann_info = self.ytvos.loadAnns(ann_ids)
        return self._parse_ann_info(ann_info, frame_id)

    def _set_group_flag(self):
        """Set flag according to image aspect ratio.

        Images with aspect ratio greater than 1 will be set as group 1,
        otherwise group 0.
        """
        self.flag = np.zeros(len(self), dtype=np.uint8)
        for i in range(len(self)):
            vid_id, _ = self.img_ids[i]
            vid_info = self.vid_infos[vid_id]
            if vid_info['width'] / vid_info['height'] > 1:
                self.flag[i] = 1

    def bbox_aug(self, bbox, img_size):
        assert self.aug_ref_bbox_param is not None
        center_off = self.aug_ref_bbox_param[0]
        size_perturb = self.aug_ref_bbox_param[1]

        n_bb = bbox.shape[0]
        # bbox center offset
        center_offs = (2 * np.random.rand(n_bb, 2) - 1) * center_off
        # bbox resize ratios
        resize_ratios = (2 * np.random.rand(n_bb, 2) - 1) * size_perturb + 1
        # bbox: x1, y1, x2, y2
        centers = (bbox[:, :2] + bbox[:, 2:]) / 2.
        sizes = bbox[:, 2:] - bbox[:, :2]
        new_centers = centers + center_offs * sizes
        new_sizes = sizes * resize_ratios
        new_x1y1 = new_centers - new_sizes / 2.
        new_x2y2 = new_centers + new_sizes / 2.
        c_min = [0, 0]
        c_max = [img_size[1], img_size[0]]
        new_x1y1 = np.clip(new_x1y1, c_min, c_max)
        new_x2y2 = np.clip(new_x2y2, c_min, c_max)
        bbox = np.hstack((new_x1y1, new_x2y2)).astype(np.float32)
        return bbox

    def sample_ref(self, idx):
        # sample another frame in the same sequence as reference
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        sample_range = range(len(vid_info['filenames']))
        valid_samples = []
        for i in sample_range:
            # check if the frame id is valid
            ref_idx = (vid, i)
            if i != frame_id and ref_idx in self.img_ids:
                valid_samples.append(ref_idx)
        assert len(valid_samples) > 0
        return random.choice(valid_samples)

    # Sample a reference-frame sequence for TASUF:
    #   - the sequence length is drawn from 1 to 8
    #   - the sequence direction is either backward or forward
    def sample_ref_seq(self, idx):
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        vid_len = len(vid_info['filenames'])
        seq_len = random.randint(1, 8)
        if frame_id < seq_len:
            valid_samples = self.sample_ref_range(frame_id,
                                                  vid_len,
                                                  seq_len,
                                                  backward=True)
        elif frame_id + seq_len > vid_len:
            valid_samples = self.sample_ref_range(0,
                                                  frame_id,
                                                  seq_len,
                                                  backward=False)
        else:
            if random.random() < 0.5:
                valid_samples = self.sample_ref_range(frame_id,
                                                      vid_len,
                                                      seq_len,
                                                      backward=True)
            else:
                valid_samples = self.sample_ref_range(0,
                                                      frame_id,
                                                      seq_len,
                                                      backward=False)
        return valid_samples

    def sample_ref_range(self, start, end, seq_len, backward=False):
        sample_range = list(range(start, end))
        while len(sample_range) < seq_len:
            sample_range *= 2
        valid_samples = random.sample(sample_range, seq_len)
        valid_samples.sort()
        # [JW]
        for i, v in enumerate(valid_samples[:-1]):
            if valid_samples[i + 1] - v > self.max_gap:
                gap_modulation = valid_samples[i + 1] - v - self.max_gap
                for j in range(i + 1, len(valid_samples)):
                    valid_samples[j] -= gap_modulation
        if backward:
            valid_samples.reverse()
        return valid_samples

    def prepare_train_img(self, idx):
        """Assemble one training sample: a frame plus a reference sequence.

        Args:
            idx (tuple): ``(video index, frame index)`` as stored in
                ``self.img_ids``.

        Returns:
            dict | None: Dict with the keys ``img``, ``ref_img``,
            ``img_meta``, ``gt_bboxes`` and ``ref_bboxes`` plus, depending on
            the proposals and the ``with_*`` flags, ``proposals``,
            ``gt_labels``, ``gt_pids``, ``gt_bboxes_ignore`` and
            ``gt_masks``. ``None`` is returned when the proposals are empty,
            when a sampled reference frame has no box, or when the frame
            itself has no ground-truth box.

        Raises:
            AssertionError: If the proposals are not shaped (n, 4) or (n, 5).
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        # load image
        img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][frame_id]))

        # sample the reference frames and load their images
        ref_frame_id_seq = self.sample_ref_seq(idx)
        ref_img_seq = [
            mmcv.imread(
                osp.join(self.img_prefix,
                         vid_info['filenames'][ref_frame_id]))
            for ref_frame_id in ref_frame_id_seq
        ]

        # load proposals if necessary
        if self.proposals is not None:
            # NOTE(review): ``idx`` is a (video, frame) tuple here - confirm
            # that ``self.proposals`` is keyed accordingly.
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
                raise AssertionError(
                    'proposals should have shapes (n, 4) or (n, 5), '
                    'but found {}'.format(proposals.shape))
            if proposals.shape[1] == 5:
                scores = proposals[:, 4, None]
                proposals = proposals[:, :4]
            else:
                scores = None

        ann = self.get_ann_info(vid, frame_id)
        ref_ann_seq = [
            self.get_ann_info(vid, ref_frame_id)
            for ref_frame_id in ref_frame_id_seq
        ]

        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']

        # every reference frame must contain at least one box
        ref_bboxes_seq = []
        for ref_ann in ref_ann_seq:
            ref_bboxes = ref_ann['bboxes']
            if len(ref_bboxes) == 0:
                return None
            ref_bboxes_seq.append(ref_bboxes)

        ref_ids_seq = [ref_ann['obj_ids'] for ref_ann in ref_ann_seq]
        gt_ids = ann['obj_ids']

        # Matching of the objects across the sequence: every object id seen
        # in a reference frame gets a 1-based index; 0 marks an object of the
        # current frame that occurs in no reference frame.
        id_set = set()
        for ref_ids in ref_ids_seq:
            id_set.update(ref_ids)
        id_set = sorted(id_set)
        gt_pids_seq = [[id_set.index(i) + 1 for i in ref_ids]
                       for ref_ids in ref_ids_seq]
        gt_pids_seq.append(
            [id_set.index(i) + 1 if i in id_set else 0 for i in gt_ids])

        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation
        if self.extra_aug is not None:
            img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
                                                       gt_labels)

        # apply transforms; the same flip and scale are used for all frames
        flip = bool(np.random.rand() < self.flip_ratio)
        img_scale = random_scale(self.img_scales)  # sample a scale
        img, img_shape, pad_shape, scale_factor = self.img_transform(
            img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        img = img.copy()

        # ref_img_shape / ref_scale_factor keep the values of the last
        # reference frame and are reused for all reference boxes below
        for i, ref_img in enumerate(ref_img_seq):
            ref_img, ref_img_shape, _, ref_scale_factor = self.img_transform(
                ref_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
            ref_img_seq[i] = ref_img.copy()

        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
                                            flip)
            if scores is not None:
                proposals = np.hstack([proposals, scores])
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)
        ref_bboxes_seq = [
            self.bbox_transform(ref_bboxes, ref_img_shape, ref_scale_factor,
                                flip) for ref_bboxes in ref_bboxes_seq
        ]
        if self.aug_ref_bbox_param is not None:
            # store the augmented boxes (the former code referenced a
            # misspelled, undefined name here)
            ref_bboxes_seq = [
                self.bbox_aug(ref_bboxes, ref_img_shape)
                for ref_bboxes in ref_bboxes_seq
            ]

        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
                                                   scale_factor, flip)
        if self.with_mask:
            gt_masks = self.mask_transform(ann['masks'], pad_shape,
                                           scale_factor, flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        flip=flip)

        ref_img_DC_seq = [
            DC(to_tensor(ref_img), stack=True) for ref_img in ref_img_seq
        ]
        ref_bboxes_DC_seq = [
            DC(to_tensor(ref_bboxes)) for ref_bboxes in ref_bboxes_seq
        ]
        data = dict(img=DC(to_tensor(img), stack=True),
                    ref_img=ref_img_DC_seq,
                    img_meta=DC(img_meta, cpu_only=True),
                    gt_bboxes=DC(to_tensor(gt_bboxes)),
                    ref_bboxes=ref_bboxes_DC_seq)
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        if self.with_track:
            data['gt_pids'] = [
                DC(to_tensor(gt_pids)) for gt_pids in gt_pids_seq
            ]
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)

        return data

    def prepare_test_img(self, idx):
        """Prepare an image for testing (multi-scale and flipping).

        Args:
            idx (tuple): ``(video index, frame index)`` of the frame.

        Returns:
            dict: ``img`` (list of image tensors) and ``img_meta`` (list of
            meta containers), one entry per scale and, when
            ``flip_ratio > 0``, one additional flipped entry per scale.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        # no proposals are loaded for test frames
        proposal = None

        def prepare_single(img, frame_id, scale, flip, proposal=None):
            """Transform the image for one (scale, flip) combination."""
            _img, img_shape, pad_shape, scale_factor = self.img_transform(
                img, scale, flip, keep_ratio=self.resize_keep_ratio)
            _img = to_tensor(_img)
            _img_meta = dict(ori_shape=(vid_info['height'], vid_info['width'],
                                        3),
                             img_shape=img_shape,
                             pad_shape=pad_shape,
                             is_first=(frame_id == 0),
                             video_id=vid,
                             frame_id=frame_id,
                             scale_factor=scale_factor,
                             flip=flip)
            if proposal is not None:
                if proposal.shape[1] == 5:
                    score = proposal[:, 4, None]
                    proposal = proposal[:, :4]
                else:
                    score = None
                _proposal = self.bbox_transform(proposal, img_shape,
                                                scale_factor, flip)
                _proposal = np.hstack([_proposal, score
                                       ]) if score is not None else _proposal
                _proposal = to_tensor(_proposal)
            else:
                _proposal = None
            return _img, _img_meta, _proposal

        # per scale: the plain image first, then the flipped one if enabled
        flips = [False, True] if self.flip_ratio > 0 else [False]
        imgs = []
        img_metas = []
        proposals = []  # collected for symmetry; not part of the result
        for scale in self.img_scales:
            for flip in flips:
                # frame_id has to be passed for the flipped variant as well,
                # otherwise all following arguments shift by one position
                _img, _img_meta, _proposal = prepare_single(
                    img, frame_id, scale, flip, proposal)
                imgs.append(_img)
                img_metas.append(DC(_img_meta, cpu_only=True))
                proposals.append(_proposal)
        data = dict(img=imgs, img_meta=img_metas)
        return data

    def _parse_ann_info(self, ann_info, frame_id, with_mask=True):
        """Parse bbox and mask annotation.

        Args:
            ann_info (list[dict]): Annotation info of an image.
            with_mask (bool): Whether to parse mask annotations.

        Returns:
            dict: A dict containing the following keys: bboxes, bboxes_ignore,
                labels, masks, mask_polys, poly_lens.
        """
        gt_bboxes = []
        gt_labels = []
        gt_ids = []
        gt_bboxes_ignore = []
        # Two formats are provided.
        # 1. mask: a binary map of the same size of the image.
        # 2. polys: each mask consists of one or several polys, each poly is a
        # list of float.
        if with_mask:
            gt_masks = []
            gt_mask_polys = []
            gt_poly_lens = []
        for i, ann in enumerate(ann_info):
            # each ann is a list of masks
            # ann:
            # bbox: list of bboxes
            # segmentation: list of segmentation
            # category_id
            # area: list of area
            bbox = ann['bboxes'][frame_id]
            area = ann['areas'][frame_id]
            segm = ann['segmentations'][frame_id]
            if bbox is None: continue
            x1, y1, w, h = bbox
            if area <= 0 or w < 1 or h < 1:
                continue
            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
            if ann['iscrowd']:
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_ids.append(ann['id'])
                gt_labels.append(self.cat2label[ann['category_id']])
            if with_mask:
                gt_masks.append(self.ytvos.annToMask(ann, frame_id))
                mask_polys = [
                    p for p in segm if len(p) >= 6
                ]  # valid polygons have >= 3 points (6 coordinates)
                poly_lens = [len(p) for p in mask_polys]
                gt_mask_polys.append(mask_polys)
                gt_poly_lens.extend(poly_lens)
        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(bboxes=gt_bboxes,
                   labels=gt_labels,
                   obj_ids=gt_ids,
                   bboxes_ignore=gt_bboxes_ignore)

        if with_mask:
            ann['masks'] = gt_masks
            # poly format is not used in the current implementation
            ann['mask_polys'] = gt_mask_polys
            ann['poly_lens'] = gt_poly_lens
        return ann
コード例 #3
0
class YTVOSDataset(CustomDataset):
    CLASSES = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard',
               'sedan', 'ape', 'dog', 'snake', 'monkey', 'hand', 'rabbit',
               'duck', 'cat', 'cow', 'fish', 'train', 'horse', 'turtle',
               'bear', 'motorbike', 'giraffe', 'leopard', 'fox', 'deer', 'owl',
               'surfboard', 'airplane', 'truck', 'zebra', 'tiger', 'elephant',
               'snowboard', 'boat', 'shark', 'mouse', 'frog', 'eagle',
               'earless_seal', 'tennis_racket')

    def __init__(self,
                 ann_file,
                 img_prefix,
                 img_scale,
                 img_norm_cfg,
                 size_divisor=None,
                 proposal_file=None,
                 num_max_proposals=1000,
                 flip_ratio=0,
                 with_mask=True,
                 with_crowd=True,
                 with_label=True,
                 with_track=False,
                 extra_aug=None,
                 aug_ref_bbox_param=None,
                 resize_keep_ratio=True,
                 test_mode=False,
                 every_frame=False,
                 is_flow=False,
                 flow_test=False):
        """Index the video frames and set up the transform pipeline.

        Three sample indexes are built: ``img_ids`` (annotated frames),
        ``img_ids_all`` (every file found in the video folders) and
        ``img_ids_pairs`` ((key frame, current frame) pairs for flow
        training).

        Args:
            ann_file: Annotation file passed to ``load_annotations``.
            img_prefix: Directory that contains one folder per video.
            img_scale: A scale tuple or a list of scale tuples.
            img_norm_cfg: Keyword arguments for ``ImageTransform``.
            size_divisor: Padding divisor passed to ``ImageTransform``.
            proposal_file: Optional file passed to ``load_proposals``.
            num_max_proposals: Maximum number of proposals kept per image.
            flip_ratio: Flip probability, must lie in [0, 1].
            with_mask: With mask or not (reserved field).
            with_crowd: Whether ignore/crowd boxes are returned.
            with_label: Whether labels are returned (False for RPN).
            with_track: Whether tracking ids are returned.
            extra_aug: Optional keyword arguments for ``ExtraAugmentation``.
            aug_ref_bbox_param: Optional parameters used by ``bbox_aug``.
            resize_keep_ratio: Passed to the image transform as ``keep_ratio``.
            test_mode: Skips frame filtering and group flags when true.
            every_frame: Index samples through ``img_ids_all``.
            is_flow: Index training samples through ``img_ids_pairs``.
            flow_test: With ``is_flow``, use the flow-test preparation.
        """
        # prefix of images path
        self.img_prefix = img_prefix

        # load annotations (and proposals)
        self.vid_infos = self.load_annotations(ann_file)

        self.every_frame = every_frame
        self.is_flow = is_flow
        self.flow_test = flow_test
        if self.flow_test or self.is_flow:
            self.cuda = True
        # NOTE(review): this unconditional reset keeps the detector branch
        # below switched off in every configuration. It looks deliberate (the
        # branch depends on hard-coded local paths), so it is left as found -
        # confirm before re-enabling.
        self.cuda = False
        if self.cuda:
            from mmcv import Config
            from mmdet.models import build_detector
            from mmcv.runner import load_checkpoint
            cfg = Config.fromfile(
                "../configs/masktrack_rcnn_r50_fpn_1x_flow_youtubevos.py")
            self.det_model = build_detector(cfg.model,
                                            train_cfg=cfg.train_cfg,
                                            test_cfg=cfg.test_cfg)
            load_checkpoint(self.det_model,
                            "../results/20200312-180434/epoch_9.pth")
            self.det_model = self.det_model.cuda()
            self.det_model.eval()
            for param in self.det_model.parameters():
                param.requires_grad = False

        # Set indexes for data loading
        img_ids = []  # training frames which have annotations
        img_ids_all = []  # all training frames
        img_ids_pairs = []  # flow data pairs
        for idx, vid_info in enumerate(self.vid_infos):
            # the video folder is the first path component of a file name
            vid_name = vid_info['filenames'][0].split('/')[0]
            folder_path = osp.join(self.img_prefix, vid_name)
            files = os.listdir(folder_path)
            files.sort()
            vid_info['filenames_all'] = [
                osp.join(vid_name, file) for file in files
            ]
            for _id in range(len(files)):
                img_ids_all.append((idx, _id))
                is_anno = vid_info['filenames_all'][_id] in vid_info[
                    'filenames']
                if is_anno and _id > 0:  # having annotation and is not the first frame.
                    ann_idx = vid_info['filenames'].index(
                        vid_info['filenames_all'][_id])
                    ann = self.get_ann_info(idx, ann_idx)
                    gt_bboxes = ann['bboxes']
                    # skip the image if there is no valid gt bbox
                    if len(gt_bboxes) == 0:
                        continue
                    # Random key frame 1..9 frames before the current one,
                    # never before frame 0. The exclusive upper bound is
                    # ``_id + 1`` so that ``_id == 1`` is a valid draw instead
                    # of the empty interval randint(1, 1).
                    key_id = _id - np.random.randint(1, min(10, _id + 1))
                    img_ids_pairs.append(((idx, key_id), (idx, _id)))
            for frame_id in range(len(vid_info['filenames'])):
                img_ids.append((idx, frame_id))

        self.img_ids = img_ids
        self.img_ids_all = img_ids_all
        self.img_ids_pairs = img_ids_pairs

        if proposal_file is not None:
            self.proposals = self.load_proposals(proposal_file)
        else:
            self.proposals = None
        # filter images with no annotation during training
        if not test_mode:
            valid_inds = [
                i for i, (v, f) in enumerate(self.img_ids)
                if len(self.get_ann_info(v, f)['bboxes'])
            ]
            self.img_ids = [self.img_ids[i] for i in valid_inds]

        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
        self.img_scales = img_scale if isinstance(img_scale,
                                                  list) else [img_scale]
        assert mmcv.is_list_of(self.img_scales, tuple)
        # normalization configs
        self.img_norm_cfg = img_norm_cfg

        # max proposals per image
        self.num_max_proposals = num_max_proposals
        # flip ratio
        self.flip_ratio = flip_ratio
        assert flip_ratio >= 0 and flip_ratio <= 1
        # padding border to ensure the image size can be divided by
        # size_divisor (used for FPN)
        self.size_divisor = size_divisor

        # with mask or not (reserved field, takes no effect)
        self.with_mask = with_mask
        # some datasets provide bbox annotations as ignore/crowd/difficult,
        # if `with_crowd` is True, then these info is returned.
        self.with_crowd = with_crowd
        # with label is False for RPN
        self.with_label = with_label
        self.with_track = with_track
        # params for augmenting bbox in the reference frame
        self.aug_ref_bbox_param = aug_ref_bbox_param
        # in test mode or not
        self.test_mode = test_mode

        # set group flag for the sampler
        if not self.test_mode:
            self._set_group_flag()
        # transforms
        self.img_transform = ImageTransform(size_divisor=self.size_divisor,
                                            **self.img_norm_cfg)
        self.bbox_transform = BboxTransform()
        self.mask_transform = MaskTransform()
        self.numpy2tensor = Numpy2Tensor()

        # if use extra augmentation
        if extra_aug is not None:
            self.extra_aug = ExtraAugmentation(**extra_aug)
        else:
            self.extra_aug = None

        # image rescale if keep ratio
        self.resize_keep_ratio = resize_keep_ratio

    def __len__(self):
        if self.every_frame:
            return len(self.img_ids_all)
        elif self.is_flow:
            return len(self.img_ids_pairs)
        else:
            return len(self.img_ids)

    def __getitem__(self, idx):
        if self.test_mode:
            if self.every_frame:
                return self.prepare_test_img(self.img_ids_all[idx])
            else:
                return self.prepare_test_img(self.img_ids[idx])
        if self.is_flow:
            if self.flow_test:
                data = self.prepare_train_flow_test_img(
                    self.img_ids_pairs[idx])
            else:
                data = self.prepare_train_flow_img(self.img_ids_pairs[idx])
        else:
            data = self.prepare_train_img(self.img_ids[idx])
        return data

    def load_annotations(self, ann_file):
        """Create the ``YTVOS`` index and collect the video info dicts.

        Side effects: sets ``self.ytvos``, ``self.cat_ids``,
        ``self.cat2label`` (category id -> label, counted from 1) and
        ``self.vid_ids``.

        Args:
            ann_file: Annotation file handed to ``YTVOS``.

        Returns:
            list[dict]: One info dict per video id; ``'filenames'`` is added
            to each as an alias of ``'file_names'``.
        """
        self.ytvos = YTVOS(ann_file)
        self.cat_ids = self.ytvos.getCatIds()
        # label 0 is left unused
        self.cat2label = {
            cat_id: label
            for label, cat_id in enumerate(self.cat_ids, start=1)
        }
        self.vid_ids = self.ytvos.getVidIds()

        collected = []
        for video_id in self.vid_ids:
            video = self.ytvos.loadVids([video_id])[0]
            video['filenames'] = video['file_names']
            collected.append(video)
        return collected

    def get_ann_info(self, idx, frame_id):
        """Look up a video's annotations and parse them for one frame.

        Args:
            idx: Position of the video in ``self.vid_infos``.
            frame_id: Frame position handed to ``_parse_ann_info``.

        Returns:
            The value returned by ``_parse_ann_info``.
        """
        video_key = self.vid_infos[idx]['id']
        annotation_ids = self.ytvos.getAnnIds(vidIds=[video_key])
        return self._parse_ann_info(self.ytvos.loadAnns(annotation_ids),
                                    frame_id)

    def _set_group_flag(self):
        """Set flag according to image aspect ratio.

        Images with aspect ratio greater than 1 will be set as group 1,
        otherwise group 0.
        """
        self.flag = np.zeros(len(self), dtype=np.uint8)
        for i in range(len(self)):
            vid_id, _ = self.img_ids[i]
            vid_info = self.vid_infos[vid_id]
            if vid_info['width'] / vid_info['height'] > 1:
                self.flag[i] = 1

    def bbox_aug(self, bbox, img_size):
        """Randomly shift and rescale boxes, then clip them to the image.

        ``self.aug_ref_bbox_param[0]`` bounds the centre shift (as a fraction
        of the box size) and ``self.aug_ref_bbox_param[1]`` bounds the
        relative size change.

        Args:
            bbox (ndarray): Boxes as rows of ``x1, y1, x2, y2``.
            img_size (sequence): Index 1 is used as the x limit and index 0
                as the y limit when clipping.

        Returns:
            ndarray: ``float32`` array with the perturbed, clipped boxes.
        """
        assert self.aug_ref_bbox_param is not None
        max_shift = self.aug_ref_bbox_param[0]
        max_size_delta = self.aug_ref_bbox_param[1]

        num_boxes = bbox.shape[0]
        # Draw order is kept (shifts first, then scales) so that seeded runs
        # produce the same boxes.
        shift = (2 * np.random.rand(num_boxes, 2) - 1) * max_shift
        scale = (2 * np.random.rand(num_boxes, 2) - 1) * max_size_delta + 1

        top_left = bbox[:, :2]
        bottom_right = bbox[:, 2:]
        extent = bottom_right - top_left
        moved_center = (top_left + bottom_right) / 2. + shift * extent
        half_extent = extent * scale / 2.

        lower = [0, 0]
        upper = [img_size[1], img_size[0]]
        clipped_tl = np.clip(moved_center - half_extent, lower, upper)
        clipped_br = np.clip(moved_center + half_extent, lower, upper)
        return np.hstack((clipped_tl, clipped_br)).astype(np.float32)

    def sample_ref(self, idx):
        """Pick a random other frame of the same video as reference.

        Args:
            idx (tuple): ``(vid, frame_id)`` of the current frame.

        Returns:
            tuple: ``(vid, ref_frame_id)`` with ``ref_frame_id != frame_id``
                and the pair present in ``self.img_ids``.

        Raises:
            AssertionError: If the video has no other usable frame.
        """
        vid, frame_id = idx
        num_frames = len(self.vid_infos[vid]['filenames'])
        # Only frames that are registered as dataset entries may serve as
        # reference.
        candidates = [(vid, other) for other in range(num_frames)
                      if other != frame_id and (vid, other) in self.img_ids]
        assert len(candidates) > 0
        return random.choice(candidates)

    def prepare_train_flow_test_img(self, idx):
        """Load a key/current frame pair and extract their feature maps.

        Both frames are read from ``vid_info['filenames_all']``, transformed
        with the same sampled scale and flip, moved to CUDA, resized to
        (384, 640) when needed and passed through
        ``self.det_model.extract_feat``.

        Args:
            idx (sequence): Two ``(vid, frame_id)`` pairs; ``idx[0]`` is the
                key frame, ``idx[1]`` the current frame. The video is taken
                from ``idx[0]`` only.

        Returns:
            dict: ``key_img`` / ``cur_img`` (CUDA tensors) and
                ``key_img_feats`` / ``cur_img_feats`` (lists of feature maps
                with the leading batch axis squeezed out).
        """

        # prepare a pair of image in a sequence
        vid, key_frame_id = idx[0]
        _, cur_frame_id = idx[1]
        vid_info = self.vid_infos[vid]

        # load image
        key_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][key_frame_id]))
        cur_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][cur_frame_id]))
        # NOTE: h_orig, w_orig and basename are not used later in this method.
        h_orig, w_orig, _ = key_img.shape
        basename = osp.basename(vid_info['filenames_all'][key_frame_id])

        # apply transforms: one flip decision and one scale shared by both
        # frames
        flip = True if np.random.rand() < self.flip_ratio else False
        img_scale = random_scale(self.img_scales)  # sample a scale
        cur_img, img_shape, pad_shape, scale_factor = self.img_transform(
            cur_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        # a non-float scale factor is converted to a tuple
        if (type(scale_factor)) != float:
            scale_factor = tuple(scale_factor)
        cur_img = cur_img.copy()
        key_img, key_img_shape, _, ref_scale_factor = self.img_transform(
            key_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        key_img = key_img.copy()

        # trans = torchvision.transforms.ToTensor()
        # move both frames to the GPU (requires CUDA)
        key_img = torch.from_numpy(key_img).cuda()
        cur_img = torch.from_numpy(cur_img).cuda()

        def resize(feat_map, size=(48, 64)):
            """Resize feature map to certain size."""
            key_feature = torch.nn.functional.interpolate(feat_map,
                                                          size,
                                                          mode='bilinear',
                                                          align_corners=True)
            return key_feature

        # Bring both frames to a fixed spatial size; the check looks at the
        # key frame only and resizes both frames together.
        # NOTE(review): assumes img_transform returns channel-first arrays so
        # that the last two axes are (H, W) - confirm.
        img_size = (384, 640)
        if key_img.shape[-2:] != img_size:
            key_img = resize(key_img.unsqueeze(0), img_size).squeeze(0)
            cur_img = resize(cur_img.unsqueeze(0), img_size).squeeze(0)

        # extract_feat returns a pair; only its first element is kept
        key_feature_maps, _ = self.det_model.extract_feat(key_img.unsqueeze(0))
        cur_feature_maps, _ = self.det_model.extract_feat(cur_img.unsqueeze(0))

        # drop the batch axis that was added for the model call
        key_feature_maps = [
            feat_map.squeeze(0) for feat_map in key_feature_maps
        ]
        cur_feature_maps = [
            feat_map.squeeze(0) for feat_map in cur_feature_maps
        ]

        data = dict(key_img=key_img,
                    cur_img=cur_img,
                    key_img_feats=key_feature_maps,
                    cur_img_feats=cur_feature_maps)
        return data

    def prepare_train_flow_img(self, idx):
        """Build a flow-training sample from a key frame and a current frame.

        Ground truth comes from the current frame. The key frame is resized
        with the (1280, 720) scale and stored as ``img``; the current frame
        is resized with the (640, 360) scale and stored as ``ref_img``. When
        ``self.cuda`` is set, feature maps of both frames are additionally
        extracted with ``self.det_model``.

        Args:
            idx (sequence): Two ``(vid, frame_id)`` pairs indexing
                ``vid_info['filenames_all']``; ``idx[0]`` is the key frame,
                ``idx[1]`` the current frame.

        Returns:
            dict | None: The training sample, or ``None`` when the proposals
                for the sample are empty or the current frame has no valid
                ground-truth box.

        Raises:
            AssertionError: If proposals are neither (n, 4) nor (n, 5).
        """

        # prepare a pair of image in a sequence
        vid, key_frame_id = idx[0]
        _, cur_frame_id = idx[1]
        vid_info = self.vid_infos[vid]

        # load image
        key_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][key_frame_id]))
        cur_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames_all'][cur_frame_id]))
        h_orig, w_orig, _ = cur_img.shape
        # NOTE: basename is not used later in this method.
        basename = osp.basename(vid_info['filenames_all'][key_frame_id])

        # load proposals if necessary
        # NOTE(review): idx is a pair of tuples here; confirm that
        # self.proposals can be indexed with it.
        if self.proposals is not None:
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
                raise AssertionError(
                    'proposals should have shapes (n, 4) or (n, 5), '
                    'but found {}'.format(proposals.shape))
            # a fifth column is split off as scores and re-attached after
            # the box transform
            if proposals.shape[1] == 5:
                scores = proposals[:, 4, None]
                proposals = proposals[:, :4]
            else:
                scores = None
        # Annotations are indexed by position in 'filenames', so the current
        # frame's file name is looked up there (raises ValueError if absent).
        ann_idx = vid_info['filenames'].index(
            vid_info['filenames_all'][cur_frame_id])
        ann = self.get_ann_info(vid, ann_idx)
        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']

        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation (applied to the current frame only)
        if self.extra_aug is not None:
            cur_img, gt_bboxes, gt_labels = self.extra_aug(
                cur_img, gt_bboxes, gt_labels)

        # apply transforms
        flip = True if np.random.rand() < self.flip_ratio else False

        # fixed scales: index 0 for the key frame, index 1 for the current
        # frame (self.img_scales is not used here)
        img_scales = [(1280, 720), (640, 360)]
        # img_scale = random_scale(self.img_scales)  # sample a scale
        cur_img, img_shape, pad_shape, scale_factor = self.img_transform(
            cur_img, img_scales[1], flip, keep_ratio=self.resize_keep_ratio)
        if (type(scale_factor)) != float:
            scale_factor = tuple(scale_factor)
        cur_img = cur_img.copy()
        key_img, key_img_shape, _, key_scale_factor = self.img_transform(
            key_img, img_scales[0], flip, keep_ratio=self.resize_keep_ratio)
        key_img = key_img.copy()
        # boxes follow the current frame's shape and scale factor
        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
                                            flip)
            proposals = np.hstack([proposals, scores
                                   ]) if scores is not None else proposals
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)

        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
                                                   scale_factor, flip)
        if self.with_mask:
            # NOTE(review): for landscape frames the mask "scale factor" is
            # built from the resized image size (w, h, w, h) instead of the
            # ratio returned by img_transform - confirm that mask_transform
            # expects this.
            if w_orig > h_orig:
                h, w = img_shape[0], img_shape[1]
                _scale_factor = tuple([w, h, w, h])
            else:
                _scale_factor = scale_factor
            gt_masks = self.mask_transform(ann['masks'], pad_shape,
                                           _scale_factor, flip)

        # meta information describes the current frame
        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        is_first=(cur_frame_id == 0),
                        flip=flip)

        # 'img' carries the key frame, 'ref_img' the current frame
        data = dict(
            img=DC(to_tensor(key_img), stack=True),
            ref_img=DC(to_tensor(cur_img), stack=True),
            img_meta=DC(img_meta, cpu_only=True),
            gt_bboxes=DC(to_tensor(gt_bboxes)),
            # ref_bboxes=DC(to_tensor(ref_bboxes))
        )
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        # if self.with_track:
        #     data['gt_pids'] = DC(to_tensor(gt_pids))
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)
        # marker key set on every sample produced by this method
        data['train_flow'] = True

        # optional feature extraction on the GPU
        if self.cuda:
            key_img_cuda = torch.from_numpy(key_img).cuda()
            cur_img_cuda = torch.from_numpy(cur_img).cuda()

            def resize(feat_map, size=(48, 64)):
                """Resize feature map to certain size."""
                key_feature = torch.nn.functional.interpolate(
                    feat_map, size, mode='bilinear', align_corners=True)
                return key_feature

            # both frames are resized when the key frame's last two axes
            # differ from (384, 640)
            img_size = (384, 640)
            if key_img_cuda.shape[-2:] != img_size:
                key_img_cuda = resize(key_img_cuda.unsqueeze(0),
                                      img_size).squeeze(0)
                cur_img_cuda = resize(cur_img_cuda.unsqueeze(0),
                                      img_size).squeeze(0)

            # extract_feat returns a pair; only its first element is kept
            key_feature_maps, _ = self.det_model.extract_feat(
                key_img_cuda.unsqueeze(0))
            cur_feature_maps, _ = self.det_model.extract_feat(
                cur_img_cuda.unsqueeze(0))

            # drop the batch axis that was added for the model call
            key_feature_maps = [
                feat_map.squeeze(0) for feat_map in key_feature_maps
            ]
            cur_feature_maps = [
                feat_map.squeeze(0) for feat_map in cur_feature_maps
            ]

            data['key_feature_maps'] = key_feature_maps
            data['cur_feature_maps'] = cur_feature_maps

        return data

    def prepare_train_img(self, idx):
        """Build a training sample from a frame and a random reference frame.

        The reference frame is drawn from the same video with ``sample_ref``.
        Both frames share one sampled scale and flip decision. ``gt_pids``
        links every ground-truth object of the current frame to its position
        (1-based) among the reference frame's objects, 0 meaning "not present
        in the reference frame".

        Args:
            idx (tuple): ``(vid, frame_id)`` of the current frame.

        Returns:
            dict | None: The training sample, or ``None`` when the proposals
                for the sample are empty or the frame has no valid
                ground-truth box.

        Raises:
            AssertionError: If proposals are neither (n, 4) nor (n, 5).
        """
        # prepare a pair of image in a sequence
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        # load image
        if self.is_flow or self.every_frame:
            img = mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames_all'][frame_id]))
        else:
            img = mmcv.imread(
                osp.join(self.img_prefix, vid_info['filenames'][frame_id]))
        h_orig, w_orig, _ = img.shape
        # NOTE(review): basename is unused, and this lookup always goes
        # through 'filenames' even when the image above was read via
        # 'filenames_all' - confirm frame_id is valid for both lists.
        basename = osp.basename(vid_info['filenames'][frame_id])
        _, ref_frame_id = self.sample_ref(idx)
        ref_img = mmcv.imread(
            osp.join(self.img_prefix, vid_info['filenames'][ref_frame_id]))
        # load proposals if necessary
        if self.proposals is not None:
            proposals = self.proposals[idx][:self.num_max_proposals]
            # TODO: Handle empty proposals properly. Currently images with
            # no proposals are just ignored, but they can be used for
            # training in concept.
            if len(proposals) == 0:
                return None
            if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
                raise AssertionError(
                    'proposals should have shapes (n, 4) or (n, 5), '
                    'but found {}'.format(proposals.shape))
            # a fifth column is split off as scores and re-attached after
            # the box transform
            if proposals.shape[1] == 5:
                scores = proposals[:, 4, None]
                proposals = proposals[:, :4]
            else:
                scores = None

        ann = self.get_ann_info(vid, frame_id)
        ref_ann = self.get_ann_info(vid, ref_frame_id)
        gt_bboxes = ann['bboxes']
        gt_labels = ann['labels']
        ref_bboxes = ref_ann['bboxes']
        # obj ids attribute does not exist in current annotation
        # need to add it
        ref_ids = ref_ann['obj_ids']
        gt_ids = ann['obj_ids']
        # compute matching of reference frame with current frame
        # 0 denote there is no matching
        gt_pids = [ref_ids.index(i) + 1 if i in ref_ids else 0 for i in gt_ids]
        if self.with_crowd:
            gt_bboxes_ignore = ann['bboxes_ignore']

        # skip the image if there is no valid gt bbox
        if len(gt_bboxes) == 0:
            return None

        # extra augmentation (applied to the current frame only)
        # NOTE(review): gt_pids is computed before this step; confirm that
        # extra_aug never drops or reorders boxes.
        if self.extra_aug is not None:
            img, gt_bboxes, gt_labels = self.extra_aug(img, gt_bboxes,
                                                       gt_labels)

        # apply transforms: one flip decision and one scale shared by both
        # frames
        flip = True if np.random.rand() < self.flip_ratio else False
        img_scale = random_scale(self.img_scales)  # sample a scale
        img, img_shape, pad_shape, scale_factor = self.img_transform(
            img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        if (type(scale_factor)) != float:
            scale_factor = tuple(scale_factor)
        img = img.copy()
        ref_img, ref_img_shape, _, ref_scale_factor = self.img_transform(
            ref_img, img_scale, flip, keep_ratio=self.resize_keep_ratio)
        ref_img = ref_img.copy()
        if self.proposals is not None:
            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
                                            flip)
            proposals = np.hstack([proposals, scores
                                   ]) if scores is not None else proposals
        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
                                        flip)
        # reference boxes use the reference frame's own shape and scale
        ref_bboxes = self.bbox_transform(ref_bboxes, ref_img_shape,
                                         ref_scale_factor, flip)
        # optional random perturbation of the reference boxes
        if self.aug_ref_bbox_param is not None:
            ref_bboxes = self.bbox_aug(ref_bboxes, ref_img_shape)
        if self.with_crowd:
            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
                                                   scale_factor, flip)
        if self.with_mask:
            # NOTE(review): for landscape frames the mask "scale factor" is
            # built from the resized image size (w, h, w, h) instead of the
            # ratio returned by img_transform - confirm that mask_transform
            # expects this.
            if w_orig > h_orig:
                h, w = img_shape[0], img_shape[1]
                _scale_factor = tuple([w, h, w, h])
            else:
                _scale_factor = scale_factor
            gt_masks = self.mask_transform(ann['masks'], pad_shape,
                                           _scale_factor, flip)

        ori_shape = (vid_info['height'], vid_info['width'], 3)
        img_meta = dict(ori_shape=ori_shape,
                        img_shape=img_shape,
                        pad_shape=pad_shape,
                        scale_factor=scale_factor,
                        is_first=(frame_id == 0),
                        flip=flip)

        data = dict(img=DC(to_tensor(img), stack=True),
                    ref_img=DC(to_tensor(ref_img), stack=True),
                    img_meta=DC(img_meta, cpu_only=True),
                    gt_bboxes=DC(to_tensor(gt_bboxes)),
                    ref_bboxes=DC(to_tensor(ref_bboxes)))
        # optional entries, controlled by the dataset flags
        if self.proposals is not None:
            data['proposals'] = DC(to_tensor(proposals))
        if self.with_label:
            data['gt_labels'] = DC(to_tensor(gt_labels))
        if self.with_track:
            data['gt_pids'] = DC(to_tensor(gt_pids))
        if self.with_crowd:
            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
        if self.with_mask:
            data['gt_masks'] = DC(gt_masks, cpu_only=True)
        return data

    def prepare_test_img(self, idx):
        """Prepare one frame for testing (multi-scale and flipping).

        For every scale in ``self.img_scales`` the frame is transformed once
        without flipping and, when ``self.flip_ratio > 0``, once more with
        flipping.

        Args:
            idx (tuple): ``(vid, frame_id)``. ``frame_id`` indexes
                ``filenames_all`` when ``self.every_frame`` is set and
                ``filenames`` otherwise.

        Returns:
            dict: ``img`` (list of image tensors) and ``img_meta`` (list of
                ``DC``-wrapped meta dicts), ordered per scale as
                unflipped then flipped.
        """
        vid, frame_id = idx
        vid_info = self.vid_infos[vid]
        if self.every_frame:
            file_name = vid_info['filenames_all'][frame_id]
            # frames missing from 'filenames' are flagged as not annotated
            is_anno = file_name in vid_info['filenames']
        else:
            file_name = vid_info['filenames'][frame_id]
            is_anno = True
        img = mmcv.imread(osp.join(self.img_prefix, file_name))
        # no proposals are loaded at test time in this method
        proposal = None

        def prepare_single(img,
                           frame_id,
                           scale,
                           flip,
                           file_name,
                           proposal=None,
                           is_anno=True):
            """Transform one (scale, flip) variant of the frame."""
            _img, img_shape, pad_shape, scale_factor = self.img_transform(
                img, scale, flip, keep_ratio=self.resize_keep_ratio)
            _img = to_tensor(_img)
            _img_meta = dict(ori_shape=(vid_info['height'], vid_info['width'],
                                        3),
                             img_shape=img_shape,
                             pad_shape=pad_shape,
                             is_first=(frame_id == 0),
                             video_id=vid,
                             file_name=file_name,
                             frame_id=frame_id,
                             scale_factor=scale_factor,
                             flip=flip,
                             is_anno=is_anno)
            if proposal is not None:
                if proposal.shape[1] == 5:
                    score = proposal[:, 4, None]
                    proposal = proposal[:, :4]
                else:
                    score = None
                _proposal = self.bbox_transform(proposal, img_shape,
                                                scale_factor, flip)
                _proposal = np.hstack([_proposal, score
                                       ]) if score is not None else _proposal
                _proposal = to_tensor(_proposal)
            else:
                _proposal = None
            return _img, _img_meta, _proposal

        flip_variants = (False, True) if self.flip_ratio > 0 else (False, )
        imgs = []
        img_metas = []
        for scale in self.img_scales:
            for flip in flip_variants:
                # frame_id must be passed for the flipped variant as well;
                # omitting it shifts every following positional argument.
                _img, _img_meta, _ = prepare_single(img, frame_id, scale,
                                                    flip, file_name, proposal,
                                                    is_anno)
                imgs.append(_img)
                img_metas.append(DC(_img_meta, cpu_only=True))
        data = dict(img=imgs, img_meta=img_metas)
        return data

    def _parse_ann_info(self, ann_info, frame_id, with_mask=True):
        """Parse bbox and mask annotation.

        Args:
            ann_info (list[dict]): Annotation info of an image.
            with_mask (bool): Whether to parse mask annotations.

        Returns:
            dict: A dict containing the following keys: bboxes, bboxes_ignore,
                labels, masks, mask_polys, poly_lens.
        """
        gt_bboxes = []
        gt_labels = []
        gt_ids = []
        gt_bboxes_ignore = []
        # Two formats are provided.
        # 1. mask: a binary map of the same size of the image.
        # 2. polys: each mask consists of one or several polys, each poly is a
        # list of float.
        if with_mask:
            gt_masks = []
            gt_mask_polys = []
            gt_poly_lens = []
        for i, ann in enumerate(ann_info):
            # each ann is a list of masks
            # ann:
            # bbox: list of bboxes
            # segmentation: list of segmentation
            # category_id
            # area: list of area
            bbox = ann['bboxes'][frame_id]
            area = ann['areas'][frame_id]
            segm = ann['segmentations'][frame_id]
            if bbox is None: continue
            x1, y1, w, h = bbox
            if area <= 0 or w < 1 or h < 1:
                continue
            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
            if ann['iscrowd']:
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_ids.append(ann['id'])
                gt_labels.append(self.cat2label[ann['category_id']])
            if with_mask:
                gt_masks.append(self.ytvos.annToMask(ann, frame_id))
                mask_polys = [
                    p for p in segm if len(p) >= 6
                ]  # valid polygons have >= 3 points (6 coordinates)
                poly_lens = [len(p) for p in mask_polys]
                gt_mask_polys.append(mask_polys)
                gt_poly_lens.extend(poly_lens)
        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(bboxes=gt_bboxes,
                   labels=gt_labels,
                   obj_ids=gt_ids,
                   bboxes_ignore=gt_bboxes_ignore)

        if with_mask:
            ann['masks'] = gt_masks
            # poly format is not used in the current implementation
            ann['mask_polys'] = gt_mask_polys
            ann['poly_lens'] = gt_poly_lens
        return ann
コード例 #4
0
class YTVOSDataset(CustomDataset):
    """Video dataset read through the ``YTVOS`` annotation API.

    Samples are addressed by ``(video_id, frame_id)`` pairs and are returned
    as short frame sequences processed by the configured pipeline.
    """

    # Names of the 40 object categories.
    CLASSES = ('person', 'giant_panda', 'lizard', 'parrot', 'skateboard',
               'sedan', 'ape', 'dog', 'snake', 'monkey', 'hand', 'rabbit',
               'duck', 'cat', 'cow', 'fish', 'train', 'horse', 'turtle',
               'bear', 'motorbike', 'giraffe', 'leopard', 'fox', 'deer', 'owl',
               'surfboard', 'airplane', 'truck', 'zebra', 'tiger', 'elephant',
               'snowboard', 'boat', 'shark', 'mouse', 'frog', 'eagle',
               'earless_seal', 'tennis_racket')

    def __init__(self,
                 ann_file,
                 pipeline,
                 data_root=None,
                 img_prefix='',
                 of_prefix=None,
                 seg_prefix=None,
                 proposal_file=None,
                 test_mode=False,
                 filter_empty_gt=True,
                 seq_len=0,
                 step=1):
        """Load the annotations and build the list of sample ids.

        Args:
            ann_file (str): Annotation file, relative to ``data_root`` when
                that is given and the path is not absolute.
            pipeline (list): Processing steps handed to ``Compose``.
            data_root (str | None): Common root for relative paths.
            img_prefix (str | None): Image directory.
            of_prefix (str | None): Stored and forwarded to the pipeline as
                ``of_prefix``.
            seg_prefix (str | None): Stored and forwarded to the pipeline as
                ``seg_prefix``.
            proposal_file (str | None): Optional proposal file.
            test_mode (bool): When set, frames are kept regardless of their
                annotations and no group flag is computed.
            filter_empty_gt (bool): Stored on the instance; not used here.
            seq_len (int): Minimum number of usable frames a video needs.
                With ``seq_len > 0`` every ``step``-th usable frame becomes a
                sample; otherwise only the last usable frame of each video.
            step (int): Stride between samples when ``seq_len > 0``.
        """
        # prefix of images path
        self.ann_file = ann_file
        self.data_root = data_root
        self.img_prefix = img_prefix
        self.of_prefix = of_prefix
        self.seg_prefix = seg_prefix
        self.proposal_file = proposal_file
        self.test_mode = test_mode
        self.filter_empty_gt = filter_empty_gt

        self.seq_len = seq_len

        # join paths if data_root is specified
        if self.data_root is not None:
            if not osp.isabs(self.ann_file):
                self.ann_file = osp.join(self.data_root, self.ann_file)
            if not (self.img_prefix is None or osp.isabs(self.img_prefix)):
                self.img_prefix = osp.join(self.data_root, self.img_prefix)
            if not (self.of_prefix is None or osp.isabs(self.of_prefix)):
                self.of_prefix = osp.join(self.data_root, self.of_prefix)
            if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)):
                self.seg_prefix = osp.join(self.data_root, self.seg_prefix)
            if not (self.proposal_file is None
                    or osp.isabs(self.proposal_file)):
                self.proposal_file = osp.join(self.data_root,
                                              self.proposal_file)

        # load annotations (and proposals) from the resolved path, so that
        # a relative ann_file honours data_root
        self.vid_infos = self.load_annotations(self.ann_file)

        self.sample_ids = []
        self._frame_ids = {}
        for vid_id, vid_info in self.vid_infos.items():
            # in training only frames with at least one mask are usable
            video_sample_ids = [
                (vid_id, frame_id)
                for frame_id in range(len(vid_info['filenames']))
                if test_mode
                or len(self.get_ann_info((vid_id, frame_id))['masks'])
            ]
            # skip videos without usable frames (avoids indexing an empty
            # list below) and videos shorter than the requested sequence
            if not video_sample_ids or len(video_sample_ids) < seq_len:
                continue
            self._frame_ids[vid_id] = video_sample_ids
            if seq_len > 0:
                self.sample_ids.extend(video_sample_ids[::step])
            else:
                self.sample_ids.append(video_sample_ids[-1])

        if self.proposal_file is not None:
            self.proposals = self.load_proposals(self.proposal_file)
        else:
            self.proposals = None

        # set group flag for the sampler
        if not self.test_mode:
            self._set_group_flag()
        # processing pipeline
        self.pipeline = Compose(pipeline)

    def __len__(self):
        return len(self.sample_ids)

    def _rand_another(self, idx):
        pool = np.where(self.flag == self.flag[idx])[0]
        return np.random.choice(pool)

    def __getitem__(self, idx):
        if self.test_mode:
            return self.prepare_test_img(self.sample_ids[idx])
        while True:
            data = self.prepare_train_img(self.sample_ids[idx])
            if data is None:
                idx = self._rand_another(idx)
                continue
            return data

    def load_annotations(self, ann_file):
        """Open the annotation file and collect the per-video records.

        Sets ``self.ytvos``, ``self.cat_ids``, ``self.cat2label`` (category
        id mapped to a 1-based label in ``cat_ids`` order) and
        ``self.vid_ids``.

        Args:
            ann_file (str): Path handed to ``YTVOS``.

        Returns:
            dict: Video id mapped to its video record; every record gets a
                ``filenames`` entry mirroring ``file_names``.
        """
        api = YTVOS(ann_file)
        self.ytvos = api
        self.cat_ids = api.getCatIds()
        self.cat2label = {
            cat_id: label
            for label, cat_id in enumerate(self.cat_ids, start=1)
        }
        self.vid_ids = api.getVidIds()

        records = {}
        for vid_id in self.vid_ids:
            record = api.loadVids([vid_id])[0]
            # alias used by the rest of the class
            record['filenames'] = record['file_names']
            records[vid_id] = record
        return records

    def get_ann_info(self, idx):
        """Return the parsed annotations for one ``(vid_id, frame_id)``.

        All annotations of the video are loaded and handed, together with
        the frame's image info, to ``_parse_ann_info``.
        """
        vid_id, frame_id = idx
        track_ids = self.ytvos.getAnnIds(vidIds=[vid_id])
        tracks = self.ytvos.loadAnns(track_ids)
        frame_info = self.get_image_info(idx)
        return self._parse_ann_info(frame_info, tracks, frame_id)

    def get_image_info(self, idx):
        vid_id, frame_id = idx
        vid_info = self.vid_infos[vid_id]
        return dict(filename=vid_info['filenames'][frame_id],
                    height=vid_info['height'],
                    width=vid_info['width'],
                    video_id=vid_id,
                    frame_id=frame_id)

    def pre_pipeline(self, results, idx, prev_results=None):
        results['img_prefix'] = self.img_prefix
        results['of_prefix'] = self.of_prefix
        results['seg_prefix'] = self.seg_prefix
        results['proposal_file'] = self.proposal_file
        results['bbox_fields'] = []
        results['mask_fields'] = []
        results['seg_fields'] = []
        video_id, frame_id = idx
        results['video_id'] = video_id
        results['frame_id'] = frame_id
        if prev_results is not None:
            results.update(prev_results)

    def _set_group_flag(self):
        """Set flag according to image aspect ratio.
        Images with aspect ratio greater than 1 will be set as group 1,
        otherwise group 0.
        """
        self.flag = np.zeros(len(self.sample_ids), dtype=np.uint8)
        for vid_id, _ in self.sample_ids:
            vid_info = self.vid_infos[vid_id]
            if vid_info['width'] / vid_info['height'] > 1:
                self.flag[vid_id] = 1

    def sample_ref_seq(self, idx):
        vid, fid = idx
        ref_frame_ids = self._frame_ids[vid]
        frame_index = ref_frame_ids.index(idx)
        ref_frame_ids_1 = ref_frame_ids[max(frame_index -
                                            (self.seq_len), 0):frame_index]
        ref_frame_ids_2 = ref_frame_ids[frame_index + 1:frame_index +
                                        (self.seq_len)]
        ref_frame_ids = ref_frame_ids_1 + ref_frame_ids_2
        if len(ref_frame_ids) < self.seq_len - 1:
            print('frame_index', frame_index, frame_index - self.seq_len,
                  frame_index + self.seq_len, ref_frame_ids)
            return None, 0
        ref_frame_ids = random.sample(ref_frame_ids, self.seq_len - 1)
        ref_frame_ids.append((vid, fid))
        ref_frame_ids.sort()
        frame_index = ref_frame_ids.index(idx)
        return ref_frame_ids, frame_index

    def test_ref_seq(self, idx):
        vid, fid = idx
        ref_frame_ids = self._frame_ids[vid]
        frame_index = ref_frame_ids.index(idx)
        ref_frame_ids = ref_frame_ids[frame_index:frame_index + self.seq_len]
        if len(ref_frame_ids) < self.seq_len:
            print('frame_index', frame_index, frame_index - self.seq_len,
                  frame_index + self.seq_len, ref_frame_ids)
            ref_frame_ids = None
        else:
            ref_frame_ids.sort()
        return ref_frame_ids

    def prepare_train_img(self, idx):
        # prepare a pair of image in a sequence

        seq = []
        flip_keys = ['flip', 'flip_direction']
        prev_results = None
        samples, ref_frame_index = self.sample_ref_seq(idx)
        if samples is None:
            return None

        for sample_idx in samples:

            img_info = self.get_image_info(sample_idx)
            ann_info = self.get_ann_info(sample_idx)
            results = dict(img_info=img_info, ann_info=ann_info)
            if self.proposals is not None:
                results['proposals'] = self.proposals[idx]
            self.pre_pipeline(results, sample_idx, prev_results=prev_results)
            seq.append(self.pipeline(results))
            if prev_results is None:
                prev_results = {
                    flip_key: results[flip_key]
                    for flip_key in flip_keys
                }
        return dict(img=seq[0]['img'],
                    img_meta=seq[0]['img_meta'],
                    inp_seq=seq,
                    ref_frame_index=ref_frame_index)

    def prepare_test_img(self, idx):

        vid, fid = idx
        ref_frame_ids = self._frame_ids[vid]
        ref_frame_ids.sort()
        #frame_idx = ref_frame_ids.index((str(vid), fid))
        frame_idx = ref_frame_ids.index((vid, fid))

        ref_frame_ids = ref_frame_ids[frame_idx:frame_idx + self.seq_len]
        seq = []
        for sample_idx in ref_frame_ids:

            img_info = self.get_image_info(sample_idx)
            results = dict(img_info=img_info)
            if self.proposals is not None:
                results['proposals'] = self.proposals[idx]
            self.pre_pipeline(results, sample_idx)
            seq.append(self.pipeline(results))

        return dict(img=seq[0]['img'],
                    img_meta=seq[0]['img_meta'],
                    inp_seq=seq)

    def _parse_ann_info(self, img_info, ann_info, frame_id):
        """Parse the bbox and mask annotations of a single video frame.

        Args:
            img_info (dict): Frame info; only ``img_info['filename']`` is
                read, to derive the ``seg_map`` path.
            ann_info (list[dict]): Per-instance video annotations. Each entry
                holds per-frame lists under ``'bboxes'``, ``'areas'`` and
                ``'segmentations'`` plus ``'id'``, ``'category_id'`` and an
                optional ``'iscrowd'`` flag.
            frame_id (int): Index into the per-frame lists.

        Returns:
            dict: A dict containing the following keys: bboxes, labels,
                inst_ids, bboxes_ignore, masks, seg_map. "masks" are the raw
                per-frame segmentation entries and are not decoded into
                binary masks.
        """
        gt_bboxes = []
        gt_labels = []
        gt_ids = []
        gt_bboxes_ignore = []
        gt_masks_ann = []
        for ann in ann_info:
            # Every per-instance field is a list with one entry per frame;
            # the bbox entry is None when the instance is absent from it.
            bbox = ann['bboxes'][frame_id]
            area = ann['areas'][frame_id]
            segm = ann['segmentations'][frame_id]
            if bbox is None:
                continue
            x1, y1, w, h = bbox
            # Drop degenerate boxes.
            if area <= 0 or w < 1 or h < 1:
                continue
            # (x, y, w, h) -> inclusive (x1, y1, x2, y2).
            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
            if ann.get('iscrowd', False):
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_ids.append(ann['id'])
                gt_labels.append(self.cat2label[ann['category_id']])
                gt_masks_ann.append(segm)

        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
            gt_ids = np.array(gt_ids, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)
            gt_ids = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        # Swap only the file extension: a plain str.replace('jpg', 'png')
        # would also rewrite "jpg" inside directory or base names.
        seg_map = img_info['filename']
        if seg_map.endswith('.jpg'):
            seg_map = seg_map[:-len('jpg')] + 'png'

        return dict(bboxes=gt_bboxes,
                    labels=gt_labels,
                    inst_ids=gt_ids,
                    bboxes_ignore=gt_bboxes_ignore,
                    masks=gt_masks_ann,
                    seg_map=seg_map)

    def _segm2json(self, results_list):
        """Convert per-frame instance results into per-video JSON records.

        Args:
            results_list (list[tuple]): Each element is a pair
                ``(dets, segs)`` of equally long per-frame sequences.
                ``dets[k]`` maps an object id to a dict with ``'bbox'`` (whose
                element 4 is used as the score) and ``'label'``; ``segs[k]``
                maps an object id to an RLE dict with a ``'counts'`` entry.
                The flattened frames must follow the order in which this
                dataset enumerates ``self.sample_ids``.

        Returns:
            tuple[list, list[dict]]: ``([], json_results)``. The first item is
                always empty (no bbox records are produced). Each record in
                ``json_results`` has ``video_id``, ``score`` (mean over the
                frames where the object was segmented), ``category_id``
                (majority label + 1) and ``segmentations`` (one entry per
                frame of the video, ``None`` where the object is missing).
        """
        # Flatten the per-sample sequences into one list of per-frame pairs.
        results = []
        for rl_det, rl_seg in results_list:
            results.extend(zip(rl_det, rl_seg))

        json_results = []
        vid_objs = {}
        res_idx = 0
        for idx in range(len(self)):
            # assume results is ordered
            vid_id, fr_id = self.sample_ids[idx]
            vid_len = len(self._frame_ids[vid_id])
            for frame_id in range(fr_id, min(fr_id + self.seq_len, vid_len)):
                det, seg = results[res_idx]
                res_idx += 1
                for obj_id in det:
                    bbox = det[obj_id]['bbox']
                    if obj_id not in seg:
                        continue

                    # Work on a copy so the caller's results are not mutated;
                    # decoding in place made a second call on the same
                    # results fail with "'str' has no attribute 'decode'".
                    segm = dict(seg[obj_id])
                    counts = segm['counts']
                    if isinstance(counts, bytes):
                        segm['counts'] = counts.decode()

                    track = vid_objs.setdefault(obj_id, {
                        'scores': [],
                        'cats': [],
                        'segms': {}
                    })
                    track['scores'].append(bbox[4])
                    track['cats'].append(det[obj_id]['label'])
                    track['segms'][frame_id] = segm

                if frame_id == vid_len - 1:
                    # Last frame of the video: emit one record per object
                    # seen so far and start collecting the next video.
                    for obj in vid_objs.values():
                        data = dict()
                        data['video_id'] = vid_id
                        data['score'] = np.array(obj['scores']).mean().item()
                        # majority voting for sequence category
                        data['category_id'] = np.bincount(np.array(
                            obj['cats'])).argmax().item() + 1
                        data['segmentations'] = [
                            obj['segms'].get(fid) for fid in range(vid_len)
                        ]
                        json_results.append(data)
                    vid_objs = {}

        return [], json_results

    def results2json(self, results, outfile_prefix):
        """Serialize test results to json files chosen by the result type.

        The type of ``results[0]`` selects the converter: ``list`` uses
        ``self._det2json``, ``tuple`` uses ``self._segm2json`` and
        ``np.ndarray`` uses ``self._proposal2json``.

        Args:
            results (list[list | tuple | ndarray]): Testing results of the
                dataset.
            outfile_prefix (str): The filename prefix of the json files. If
                the prefix is "somepath/xxx", the json files will be named
                "somepath/xxx.bbox.json", "somepath/xxx.segm.json",
                "somepath/xxx.proposal.json".

        Returns:
            dict[str: str]: Possible keys are "bbox", "segm", "proposal", and
                values are corresponding filenames.

        Raises:
            TypeError: If ``results[0]`` is none of the supported types.
        """

        def json_path(tag):
            return '{}.{}.json'.format(outfile_prefix, tag)

        head = results[0]
        result_files = dict()

        if isinstance(head, list):
            converted = self._det2json(results)
            # The "proposal" entry reuses the bbox file name in this branch.
            result_files['bbox'] = json_path('bbox')
            result_files['proposal'] = json_path('bbox')
            mmcv.dump(converted, result_files['bbox'])
            return result_files

        if isinstance(head, tuple):
            converted = self._segm2json(results)
            result_files['bbox'] = json_path('bbox')
            result_files['proposal'] = json_path('bbox')
            result_files['segm'] = json_path('segm')
            # converted is indexed as (bbox records, segm records).
            mmcv.dump(converted[0], result_files['bbox'])
            mmcv.dump(converted[1], result_files['segm'])
            return result_files

        if isinstance(head, np.ndarray):
            converted = self._proposal2json(results)
            result_files['proposal'] = json_path('proposal')
            mmcv.dump(converted, result_files['proposal'])
            return result_files

        raise TypeError('invalid type of results')

    def format_results(self, results, jsonfile_prefix=None, **kwargs):
        """Dump the results to json files via ``self.results2json``.

        Args:
            results (list): Testing results of the dataset.
            jsonfile_prefix (str | None): The prefix of json files. It includes
                the file path and the prefix of filename, e.g., "a/b/prefix".
                If not specified, a temporary directory is created and the
                prefix "results" inside it is used. Default: None.

        Returns:
            tuple: (result_files, tmp_dir), result_files is a dict containing
                the json filepaths, tmp_dir is the temporary directory created
                for saving json files when jsonfile_prefix is not specified
                (None otherwise). The caller is responsible for cleaning it
                up.
        """
        assert isinstance(results, list), 'results must be a list'
        # The length check against len(self) is intentionally disabled:
        #   assert len(results) == len(self), (
        #       'The length of results is not equal to the dataset len: '
        #       '{} != {}'.format(len(results), len(self)))

        tmp_dir = None
        if jsonfile_prefix is None:
            tmp_dir = tempfile.TemporaryDirectory()
            jsonfile_prefix = osp.join(tmp_dir.name, 'results')

        return self.results2json(results, jsonfile_prefix), tmp_dir

    def evaluate(self,
                 results,
                 metric='bbox',
                 logger=None,
                 jsonfile_prefix=None,
                 classwise=False,
                 proposal_nums=(100, 300, 1000),
                 iou_thrs=np.arange(0.5, 0.96, 0.05)):
        """Evaluation in COCO protocol, run through ``YTVOSeval``.

        Args:
            results (list): Testing results of the dataset.
            metric (str | list[str]): Metrics to be evaluated. Supported:
                'bbox', 'segm', 'proposal', 'proposal_fast'.
            logger (logging.Logger | str | None): Logger used for printing
                related information during evaluation. Default: None.
            jsonfile_prefix (str | None): The prefix of json files. It includes
                the file path and the prefix of filename, e.g., "a/b/prefix".
                If not specified, a temp file will be created. Default: None.
            classwise (bool): Whether to evaluate the AP for each class.
                Accepted but currently has no effect (not implemented).
            proposal_nums (Sequence[int]): Proposal number used for evaluating
                recalls, such as recall@100, recall@1000.
                Default: (100, 300, 1000).
            iou_thrs (Sequence[float]): IoU thresholds used for evaluating
                recalls with 'proposal_fast'.
                Default: np.arange(0.5, 0.96, 0.05).

        Returns:
            dict[str: float]: Metric name to value. Non-proposal metrics also
                add a ``'<metric>_mAP_copypaste'`` string entry.

        Raises:
            KeyError: If a metric is not supported, or no result file was
                produced for a requested metric.
        """

        metrics = metric if isinstance(metric, list) else [metric]
        allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast']
        for name in metrics:
            if name not in allowed_metrics:
                raise KeyError('metric {} is not supported'.format(name))

        result_files, tmp_dir = self.format_results(results, jsonfile_prefix)

        eval_results = {}
        ytvosGt = self.ytvos
        # try/finally so the temporary directory is removed even when the
        # evaluation below raises.
        try:
            for name in metrics:
                msg = 'Evaluating {}...'.format(name)
                if logger is None:
                    msg = '\n' + msg
                print_log(msg, logger=logger)

                if name == 'proposal_fast':
                    ar = self.fast_eval_recall(results,
                                               proposal_nums,
                                               iou_thrs,
                                               logger='silent')
                    log_msg = []
                    for i, num in enumerate(proposal_nums):
                        eval_results['AR@{}'.format(num)] = ar[i]
                        log_msg.append('\nAR@{}\t{:.4f}'.format(num, ar[i]))
                    log_msg = ''.join(log_msg)
                    print_log(log_msg, logger=logger)
                    continue

                if name not in result_files:
                    raise KeyError('{} is not in results'.format(name))
                try:
                    ytvosDt = ytvosGt.loadRes(result_files[name])
                except IndexError:
                    # loadRes raising IndexError is treated as "no results
                    # at all"; the remaining metrics are skipped as well.
                    print_log(
                        'The testing results of the whole dataset is empty.',
                        logger=logger,
                        level=logging.ERROR)
                    break

                # Proposals are scored with the bbox IoU type.
                iou_type = 'bbox' if name == 'proposal' else name
                ytvosEval = YTVOSeval(ytvosGt, ytvosDt, iou_type)
                ytvosEval.params.vidIds = self.ytvos.getVidIds()
                if name == 'proposal':
                    ytvosEval.params.useCats = 0
                    ytvosEval.params.maxDets = list(proposal_nums)
                    ytvosEval.evaluate()
                    ytvosEval.accumulate()
                    ytvosEval.summarize()
                    metric_items = [
                        'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
                        'AR_m@1000', 'AR_l@1000'
                    ]
                    # The recall entries are read from stats[6:12].
                    for i, item in enumerate(metric_items):
                        val = float('{:.3f}'.format(ytvosEval.stats[i + 6]))
                        eval_results[item] = val
                else:
                    ytvosEval.evaluate()
                    ytvosEval.accumulate()
                    ytvosEval.summarize()
                    # TODO: per-category AP (classwise) is not implemented.
                    metric_items = [
                        'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
                    ]
                    for i, item in enumerate(metric_items):
                        key = '{}_{}'.format(name, item)
                        val = float('{:.3f}'.format(ytvosEval.stats[i]))
                        eval_results[key] = val
                    eval_results['{}_mAP_copypaste'.format(name)] = (
                        '{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
                        '{ap[4]:.3f} {ap[5]:.3f}').format(
                            ap=ytvosEval.stats[:6])
        finally:
            if tmp_dir is not None:
                tmp_dir.cleanup()
        return eval_results
コード例 #5
0
class YoutubeVIS(data.Dataset):
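    """`YoutubeVIS <https://youtube-vos.org/dataset/vis/>`_ Dataset.

    Args:
        image_path (string): Root directory that the frame paths from the
            annotation file are joined to.
        info_file (string): Annotation file passed to ``YTVOS``.
        configs: Sampling options; the attributes read by this class are
            ``images_per_video``, ``use_all_frames``, ``all_frame_direction``,
            ``frame_offset_lb``, ``frame_offset_ub`` and
            ``frame_offset_multiplier``.
        transform (callable, optional): A function/transform that augments the
            raw images together with their masks, boxes and labels.
        target_transform (callable, optional): A function/transform that takes
            in the target (bbox) and transforms it.
        dataset_name (string): Display name stored as ``self.name``.
        has_gt (bool): Whether annotations are loaded for the videos.
    """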
    def __init__(self,
                 image_path,
                 info_file,
                 configs,
                 transform=None,
                 target_transform=YoutubeVISAnnotationTransform(),
                 dataset_name='YouTube VIS',
                 has_gt=True):
        """Load the annotation file and collect the video ids to serve.

        Args:
            image_path: Stored as ``self.root``.
            info_file: Annotation file handed to ``YTVOS``.
            configs: Stored as ``self.configs``.
            transform: Stored as ``self.transform``.
            target_transform: Stored as ``self.target_transform``.
            dataset_name: Stored as ``self.name``.
            has_gt: Stored as ``self.has_gt``; when False, every video known
                to the annotation file is listed instead of only those with
                annotations.
        """
        # Imported here rather than at module level because too many things
        # in this code base are named COCO.
        from pycocotools.ytvos import YTVOS

        self.root = image_path
        self.configs = configs
        self.transform = transform
        self.target_transform = target_transform
        self.name = dataset_name
        self.has_gt = has_gt

        log = logging.getLogger("yolact.dataset")
        log.info('Loading annotations into memory...')
        started = time.time()
        # Silence whatever the loader prints to stdout.
        with contextlib.redirect_stdout(io.StringIO()):
            self.coco = YTVOS(info_file)

        # Prefer videos that have annotations; fall back to all videos when
        # there are none or when no ground truth is expected.
        video_ids = list(self.coco.vidToAnns.keys())
        if not video_ids or not has_gt:
            video_ids = list(self.coco.vids.keys())
        self.ids = video_ids

        elapsed = time.time() - started
        log.info('{} videos loaded in {:0.2f}s.'.format(len(self.ids),
                                                        elapsed))

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: Tuple (image, (target, masks, num_crowds)).
                   target is the object returned by ``coco.loadAnns``.
        """
        video_frames, extra_data = self.pull_video(index)
        video_frames = [(
            im,
            (gt, masks, num_crowds),
        ) for im, gt, masks, h, w, num_crowds in video_frames]
        return video_frames, extra_data

    def pull_video(self,
                   index,
                   return_on_failure=False,
                   full_video=False,
                   max_images=-1):
        """
        Args:
            index (int): Index
        Returns:
            tuple: Tuple (image, target, masks, height, width, crowd).
                   target is the object returned by ``coco.loadAnns``.
            Note that if no crowd annotations exist, crowd will be None
        """
        vid_id = self.ids[index]

        seq_len = self.configs.images_per_video

        # sample vid_id with enough length
        while True:
            vid = self.coco.loadVids(vid_id)[0]
            annot_length = len(vid['file_names'])
            if not full_video and annot_length < seq_len:
                continue  # FIXME: need to set new vid_id right?
            vid_name = vid['file_names'][0].split('/')[0]

            # Generate target starts.
            if self.has_gt:
                target = self.coco.vidToAnns[vid_id]
                ann_ids = self.coco.getAnnIds(vidIds=vid_id)

                # Target has {'segmentation', 'area', iscrowd', 'image_id', 'bboxes', 'category_id'}
                target = self.coco.loadAnns(ann_ids)
            else:
                target = []

            # Separate out crowd annotations. These are annotations that signify a large crowd of
            # objects of said class, where there is no annotation for each individual object. Both
            # during testing and training, consider these crowds as neutral.
            crowd = [x for x in target if ('iscrowd' in x and x['iscrowd'])]
            target = [
                x for x in target if not ('iscrowd' in x and x['iscrowd'])
            ]
            num_crowds = len(crowd)

            for x in crowd:
                x['category_id'] = -1

            # This is so we ensure that all crowd annotations are at the end of the array
            target += crowd
            # Generate target ends.

            # shuffling and sample a small range of video here
            if full_video:
                annot_idx = np.arange(0, annot_length, 1)
                frame_idx = np.asarray([
                    int(vid['file_names'][idx][-9:-4])
                    for idx in range(annot_length)
                ])
                if self.configs.use_all_frames:
                    key_frame_idx = frame_idx
                    frame_idx = np.arange(frame_idx[0], frame_idx[-1] + 1, 1)
                    have_annot = np.asarray(
                        [int(idx in key_frame_idx) for idx in frame_idx])
                    annot_idx = np.add.accumulate(have_annot) * have_annot - 1

                if max_images != -1:
                    eval_frames = min(max_images, len(frame_idx))
                    # start_idx = np.random.randint(0, len(frame_idx) - eval_frames + 1)
                    start_idx = 0
                    frame_idx = frame_idx[start_idx:start_idx + eval_frames]
                    annot_idx = annot_idx[start_idx:start_idx + eval_frames]
            elif self.configs.use_all_frames:
                rand_idx = np.arange(0, annot_length - seq_len)
                np.random.shuffle(rand_idx)

                direction = 1
                if self.configs.all_frame_direction == 'allway':
                    if np.random.rand() > 0.5: direction *= -1
                elif self.configs.all_frame_direction == 'forward':
                    # Note: forward warping needs to sample a 'previous frame'
                    direction *= -1
                elif self.configs.all_frame_direction == 'backward':
                    pass
                else:
                    raise ValueError("Unexpected frame direction: %s" %
                                     self.configs.all_frame_direction)

                start_idx = rand_idx[0]
                if direction < 0:
                    start_idx += self.configs.images_per_video
                start_frame_idx = int(vid['file_names'][start_idx][-9:-4])
                annot_idx = [start_idx]
                frame_idx = [start_frame_idx]

                # if self.configs.images_per_video > 1:
                #     num_extra_frames = self.configs.images_per_video - 1
                #     extra_annot_idx = [start_idx + direction * offset_idx
                #                        for offset_idx in range(1, num_extra_frames + 1)]
                #     extra_frame_idx = [int(vid['file_names'][extra_idx][-9:-4])
                #                        for extra_idx in extra_annot_idx]
                #
                #     annot_idx += extra_annot_idx
                #     frame_idx += extra_frame_idx

                extra_frame_idx = []
                extra_annot_idx = []
                if self.configs.images_per_video > 0:
                    offset_lb, offset_ub = self.configs.frame_offset_lb, self.configs.frame_offset_ub
                    lb, ub = int(vid['file_names'][0][-9:-4]), int(
                        vid['file_names'][-1][-9:-4])
                    fidx = frame_idx[-1]
                    lb, ub = lb - fidx, ub - fidx
                    if direction == -1:
                        ub = -offset_lb
                        lb = max(lb, -offset_ub)
                    else:
                        lb = offset_lb
                        ub = min(ub, offset_ub)
                    assert lb <= ub + 1, "{}, {}".format(lb, ub)
                    assert self.configs.frame_offset_multiplier == 1, "frame_offset_multiplier deprecated."
                    for _ in range(self.configs.images_per_video):
                        frame_diff = np.random.randint(lb, ub + 1)
                        ref_idx = fidx + frame_diff
                        assert int(
                            vid['file_names'][0][-9:-4]) <= ref_idx <= int(
                                vid['file_names'][-1]
                                [-9:-4]), "{} <= {} <= {}".format(
                                    int(vid['file_names'][0][-9:-4]), ref_idx,
                                    int(vid['file_names'][-1][-9:-4]))
                        # frame_diff = self.configs.frame_offset_multiplier * np.random.randint(self.configs.frame_offset_lb, self.configs.frame_offset_ub + 1)
                        # ref_idx = np.clip(frame_idx[-1] + frame_diff * direction,
                        #                   int(vid['file_names'][0][-9:-4]), int(vid['file_names'][-1][-9:-4]))
                        extra_frame_idx += [ref_idx]
                        extra_annot_idx += [-1]

                extra_frame_idx = list(sorted(extra_frame_idx, reverse=True))

                annot_idx += extra_annot_idx
                frame_idx += extra_frame_idx
                annot_idx = np.asarray(annot_idx)
                frame_idx = np.asarray(frame_idx)
            else:
                rand_idx = np.arange(0, annot_length - seq_len + 1)
                np.random.shuffle(rand_idx)
                start_idx = rand_idx[0]

                annot_idx = np.arange(start_idx, start_idx + seq_len, 1)
                frame_idx = np.asarray(
                    [int(vid['file_names'][idx][-9:-4]) for idx in annot_idx])

            has_targets = all([
                self.target_in_frame(target, annot_id, true_on_reference=True)
                for annot_id in annot_idx
            ])
            if has_targets: break
            if return_on_failure: return None
            # print("Not all frame of video %s[%d-%d] has targets, re-selecting video." %
            #       (vid['file_names'][0].split('/')[0], start_idx, start_idx + frm_len))
            index = np.random.randint(len(self))
            vid_id = self.ids[index]

        frame_results = []
        extra_data = []

        while True:
            try:
                for idx, (frame_id, annot_id) in enumerate(
                        zip(frame_idx.tolist(), annot_idx.tolist())):
                    extra = {}
                    # FIXME: little bit hacky for full frames, maybe fix this using annotation files
                    frame_id_str = "%05d" % frame_id
                    file_name = vid['file_names'][0]
                    file_name = file_name[:-9] + frame_id_str + file_name[-4:]
                    prev_frame_id = frame_idx[idx - 1] if idx > 0 else -1
                    prev_annot_id = annot_idx[idx - 1] if idx > 0 else -1
                    if idx == 0:
                        seeds, (im, gt, masks, h, w,
                                num_crowds) = self.pull_frame(
                                    vid_name, (frame_id, annot_id),
                                    (prev_frame_id, prev_annot_id),
                                    file_name,
                                    target,
                                    num_crowds,
                                    require_seeds=True)
                    else:
                        im, gt, masks, h, w, num_crowds = self.pull_frame(
                            vid_name, (frame_id, annot_id),
                            (prev_frame_id, prev_annot_id),
                            file_name,
                            target,
                            num_crowds,
                            seeds=seeds)

                    extra['idx'] = (
                        frame_id,
                        annot_id,
                    )
                    frame_results.append((
                        im,
                        gt,
                        masks,
                        h,
                        w,
                        num_crowds,
                    ))
                    extra_data.append(extra)
            except ValueError as e:
                logger = logging.getLogger("yolact.dataset")
                logger.warning('Resampling with reseed signal...')
                frame_results.clear()
                extra_data.clear()
                continue
            break

        return frame_results, extra_data

    def __len__(self):
        return len(self.ids)

    @staticmethod
    def target_in_frame(target, frame_id, true_on_reference=False):
        if frame_id < 0:
            return true_on_reference
        if len(target) > 0:
            for obj in target:
                if obj['segmentations'][frame_id] is not None:
                    return True
        return False

    def pull_frame(self,
                   vid_name,
                   frame_annot_id,
                   prev_frame_annot_id,
                   file_name,
                   target,
                   num_crowds,
                   require_seeds=False,
                   seeds=None):
        """Load one frame and apply the target and image transforms.

        Args:
            vid_name: Unused by this method.
            frame_annot_id (tuple): ``(frame_id, annot_id)``; only
                ``annot_id`` is used, as the index into the per-frame
                annotation lists.
            prev_frame_annot_id (tuple): Unused by this method.
            file_name (str): Image path relative to ``self.root``.
            target (list[dict]): Video annotations, each with a per-frame
                ``'segmentations'`` list.
            num_crowds (int): Number of crowd annotations in ``target``.
            require_seeds (bool): Forwarded to "Video" transforms; when True
                the result is ``(seeds, frame_tuple)``.
            seeds: Forwarded to "Video" transforms.

        Returns:
            tuple: ``(image, target, masks, height, width, num_crowds)`` where
                ``image`` is a tensor permuted to channel-first, and
                ``height``/``width`` are those of the image as read from
                disk. ``target`` and ``masks`` are None when no annotation
                has a segmentation in this frame. With ``require_seeds`` the
                tuple is wrapped as ``(seeds, tuple)``.

        Raises:
            AssertionError: If the image path does not exist, or a "Video"
                transform raises ValueError on a frame without targets.
            ValueError: ``ValueError("reseed")`` when a "Video" transform
                leaves no ground-truth box.
        """
        _, annot_id = frame_annot_id
        path = osp.join(self.root, file_name)
        assert osp.exists(path), 'Image path does not exist: {}'.format(path)

        img = cv2.imread(path)
        height, width, _ = img.shape

        target_is_in_frame = self.target_in_frame(target, annot_id)

        # Defined up front so the no-target / no-transform path does not hit
        # an unbound local at the return statement.
        masks = None
        if target_is_in_frame:
            # Pool all the masks for this image into one [num_objects,height,width] matrix
            masks = [
                self.coco.annToMask(obj, annot_id).reshape(-1)
                for obj in target if obj['segmentations'][annot_id] is not None
            ]
            masks = np.vstack(masks)
            masks = masks.reshape(-1, height, width)

            if self.target_transform is not None:
                target = self.target_transform(target, annot_id, width, height)

        if self.transform is None:
            if not target_is_in_frame:
                # Same convention as the transform branches below.
                target = None
        elif "Video" in type(self.transform).__name__:
            if target_is_in_frame:
                target = np.array(target)
                # Columns 0-3 are passed as boxes, column 4 as labels.
                return_transform = self.transform(
                    img,
                    masks,
                    target[:, :4], {
                        'num_crowds': num_crowds,
                        'labels': target[:, 4]
                    },
                    require_seeds=require_seeds,
                    seeds=seeds)

                if require_seeds:
                    seeds, (img, masks, boxes, labels) = return_transform
                else:
                    img, masks, boxes, labels = return_transform

                # I stored num_crowds in labels so I didn't have to modify the entirety of augmentations
                num_crowds = labels['num_crowds']
                labels = labels['labels']

                target = np.hstack((boxes, np.expand_dims(labels, axis=1)))

                if target.shape[0] == 0:
                    logger = logging.getLogger("yolact.dataset")
                    logger.warning(
                        'Augmentation output an example with no ground truth. Resampling...'
                    )
                    # Callers treat ValueError as the signal to reload the
                    # whole clip with fresh seeds.
                    raise ValueError("reseed")
            else:
                # No target in this frame: run the transform on a placeholder
                # mask/box/label and keep only the image.
                try:
                    return_transform = self.transform(
                        img,
                        # `np.float` was removed from NumPy; the builtin
                        # float is the same dtype.
                        np.zeros((1, height, width), dtype=float),
                        np.array([[0., 0., 1., 1.]]), {
                            'num_crowds': 0,
                            'labels': np.array([0])
                        },
                        require_seeds=require_seeds,
                        seeds=seeds)
                except ValueError as err:
                    # Raised explicitly so the check survives `python -O`.
                    raise AssertionError(
                        "Unexpected reseed captured with no-target instances."
                    ) from err

                if require_seeds:
                    seeds, (img, _, _, _) = return_transform
                else:
                    img, _, _, _ = return_transform

                masks = None
                target = None
        else:
            if target_is_in_frame:
                target = np.array(target)
                img, masks, boxes, labels = self.transform(
                    img, masks, target[:, :4], {
                        'num_crowds': num_crowds,
                        'labels': target[:, 4]
                    })

                # I stored num_crowds in labels so I didn't have to modify the entirety of augmentations
                num_crowds = labels['num_crowds']
                labels = labels['labels']

                target = np.hstack((boxes, np.expand_dims(labels, axis=1)))
            else:
                img, _, _, _ = self.transform(
                    img, np.zeros((1, height, width), dtype=float),
                    np.array([[0, 0, 1, 1]]), {
                        'num_crowds': 0,
                        'labels': np.array([0])
                    })
                masks = None
                target = None

        return_tuple = (torch.from_numpy(img).permute(2, 0, 1), target, masks,
                        height, width, num_crowds)
        if require_seeds:
            return seeds, return_tuple
        return return_tuple

    def pull_image(self, index):
        '''Read the untransformed image for the entry at ``index``.

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        NOTE(review): this calls ``self.coco.loadImgs`` and reads a
        ``'file_name'`` key, while the rest of this class uses ``loadVids``
        and ``'file_names'`` -- confirm the annotation object supports it.

        Argument:
            index (int): index into ``self.ids``
        Return:
            cv2 img
        '''
        entry_id = self.ids[index]
        info = self.coco.loadImgs(entry_id)[0]
        full_path = osp.join(self.root, info['file_name'])
        return cv2.imread(full_path, cv2.IMREAD_COLOR)

    def pull_anno(self, index):
        '''Returns the original annotation of image at index

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to get annotation of
        Return:
            list:  [img_id, [(label, bbox coords),...]]
                eg: ('001718', [('dog', (96, 13, 438, 332))])
        '''
        img_id = self.ids[index]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        return self.coco.loadAnns(ann_ids)

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(
            tmp,
            self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(
            tmp,
            self.target_transform.__repr__().replace('\n',
                                                     '\n' + ' ' * len(tmp)))
        return fmt_str