Example 1
    def forward_train(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None):
        x = self.extract_feat(img)

        losses = dict()

        # RPN forward and loss
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(
                *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_cfg = self.train_cfg.get('rpn_proposal',
                                              self.test_cfg.rpn)
            proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        # assign gts and sample proposals
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(
                self.train_cfg.rcnn.sampler, context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for i in range(num_imgs):
                assign_result = bbox_assigner.assign(proposal_list[i],
                                                     gt_bboxes[i],
                                                     gt_bboxes_ignore[i],
                                                     gt_labels[i])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes[i],
                    gt_labels[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        if self.with_bbox:
            rois = bbox2roi([res.bboxes for res in sampling_results])
            # TODO: a more flexible way to decide which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                x[:self.bbox_roi_extractor.num_inputs], rois)
            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)
            cls_score, bbox_pred = self.bbox_head(bbox_feats)

            bbox_targets = self.bbox_head.get_target(sampling_results,
                                                     gt_bboxes, gt_labels,
                                                     self.train_cfg.rcnn)
            bbox_targets = (bbox_targets[0],) + bbox_targets[3:]
            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                            *bbox_targets)
            losses.update(loss_bbox)

        # mask head forward and loss
        if self.with_mask:
            if not self.share_roi_extractor:
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = self.mask_roi_extractor(
                    x[:self.mask_roi_extractor.num_inputs], pos_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
            else:
                pos_inds = []
                device = bbox_feats.device
                for res in sampling_results:
                    pos_inds.append(
                        torch.ones(
                            res.pos_bboxes.shape[0],
                            device=device,
                            dtype=torch.uint8))
                    pos_inds.append(
                        torch.zeros(
                            res.neg_bboxes.shape[0],
                            device=device,
                            dtype=torch.uint8))
                pos_inds = torch.cat(pos_inds)
                mask_feats = bbox_feats[pos_inds]
            mask_pred = self.mask_head(mask_feats)

            mask_targets = self.mask_head.get_target(sampling_results,
                                                     gt_masks,
                                                     self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                            pos_labels)
            losses.update(loss_mask)

        return losses
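One detail worth calling out in the shared-RoI-extractor branch: `pos_inds` is built as a `torch.uint8` mask and used for boolean indexing, which newer PyTorch versions deprecate in favor of `torch.bool`. A minimal, self-contained sketch of the same selection pattern (the counts are made up for illustration):

import torch

# Hypothetical positive/negative counts for two sampled images.
pos_neg_counts = [(3, 5), (2, 6)]
bbox_feats = torch.randn(16, 256, 7, 7)  # 16 = 3 + 5 + 2 + 6 sampled RoIs

pos_inds = []
for n_pos, n_neg in pos_neg_counts:
    pos_inds.append(torch.ones(n_pos, dtype=torch.bool))   # positives come first
    pos_inds.append(torch.zeros(n_neg, dtype=torch.bool))  # then negatives
pos_inds = torch.cat(pos_inds)

mask_feats = bbox_feats[pos_inds]  # keep only the positive RoIs
print(mask_feats.shape)            # torch.Size([5, 256, 7, 7])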
Example 2
    def __init__(
            self,
            num_classes,
            in_channels,
            num_query=100,
            num_reg_fcs=2,
            transformer=None,
            sync_cls_avg_factor=False,
            positional_encoding=dict(type='SinePositionalEncoding',
                                     num_feats=128,
                                     normalize=True),
            loss_cls=dict(type='CrossEntropyLoss',
                          bg_cls_weight=0.1,
                          use_sigmoid=False,
                          loss_weight=1.0,
                          class_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=5.0),
            loss_iou=dict(type='GIoULoss', loss_weight=2.0),
            train_cfg=dict(assigner=dict(
                type='HungarianAssigner',
                cls_cost=dict(type='ClassificationCost', weight=1.),
                reg_cost=dict(type='BBoxL1Cost', weight=5.0),
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
            test_cfg=dict(max_per_img=100),
            init_cfg=None,
            **kwargs):
        # NOTE: calling `super(AnchorFreeHead, self)` deliberately skips
        # `AnchorFreeHead.__init__`, since running that initialization
        # here would be inconvenient.
        super(AnchorFreeHead, self).__init__(init_cfg)
        self.bg_cls_weight = 0
        self.sync_cls_avg_factor = sync_cls_avg_factor
        class_weight = loss_cls.get('class_weight', None)
        if class_weight is not None and (self.__class__ is DETRHead):
            assert isinstance(class_weight, float), 'Expected ' \
                'class_weight to have type float. Found ' \
                f'{type(class_weight)}.'
            # NOTE: following the official DETR repo, bg_cls_weight means the
            # relative classification weight of the no-object class.
            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
            assert isinstance(bg_cls_weight, float), 'Expected ' \
                'bg_cls_weight to have type float. Found ' \
                f'{type(bg_cls_weight)}.'
            class_weight = torch.ones(num_classes + 1) * class_weight
            # set the background class as the last index
            class_weight[num_classes] = bg_cls_weight
            loss_cls.update({'class_weight': class_weight})
            if 'bg_cls_weight' in loss_cls:
                loss_cls.pop('bg_cls_weight')
            self.bg_cls_weight = bg_cls_weight

        if train_cfg:
            assert 'assigner' in train_cfg, 'assigner should be provided '\
                'when train_cfg is set.'
            assigner = train_cfg['assigner']
            assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
                'The classification weight for loss and matcher should be ' \
                'exactly the same.'
            assert loss_bbox['loss_weight'] == assigner['reg_cost'][
                'weight'], 'The regression L1 weight for loss and matcher ' \
                'should be exactly the same.'
            assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \
                'The regression IoU weight for loss and matcher should be ' \
                'exactly the same.'
            self.assigner = build_assigner(assigner)
            # DETR does not sample proposals (sampling=False), so use PseudoSampler
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.num_query = num_query
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.num_reg_fcs = num_reg_fcs
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.fp16_enabled = False
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)
        self.loss_iou = build_loss(loss_iou)

        if self.loss_cls.use_sigmoid:
            self.cls_out_channels = num_classes
        else:
            self.cls_out_channels = num_classes + 1
        self.act_cfg = transformer.get('act_cfg',
                                       dict(type='ReLU', inplace=True))
        self.activate = build_activation_layer(self.act_cfg)
        self.positional_encoding = build_positional_encoding(
            positional_encoding)
        self.transformer = build_transformer(transformer)
        self.embed_dims = self.transformer.embed_dims
        assert 'num_feats' in positional_encoding
        num_feats = positional_encoding['num_feats']
        assert num_feats * 2 == self.embed_dims, 'embed_dims should' \
            f' be exactly 2 times of num_feats. Found {self.embed_dims}' \
            f' and {num_feats}.'
        self._init_layers()
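The background-class reweighting in this constructor reduces to a small tensor operation; a standalone sketch with illustrative values (80 classes, DETR-style no-object weight of 0.1):

import torch

num_classes = 80      # e.g. COCO
class_weight = 1.0
bg_cls_weight = 0.1   # relative weight of the no-object class

weights = torch.ones(num_classes + 1) * class_weight
weights[num_classes] = bg_cls_weight  # background sits at the last index
# `weights` is then handed to the classification loss as per-class weights.
print(weights[:3], weights[-1])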
Example 3
    def forward_train(self,
                      img,
                      img_t,
                      img_meta,
                      gt_bboxes,
                      gt_bboxes_ignore,
                      gt_labels,
                      gt_masks=None,
                      proposals=None):
        x = self.extract_feat(img, img_t)

        losses = dict()

        # RPN forward and loss
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
            losses.update(rpn_losses)
            """
            edited by Yuan,original code use same nms config of RPN during training and testing.
            they are different now, more proposals can be output to next step during training.
            """
            proposal_inputs = rpn_outs + (img_meta, self.train_cfg.rpn.nms)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        # assign gts and sample proposals
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                         context=self)
            num_imgs = img.size(0)
            sampling_results = []
            for i in range(num_imgs):
                assign_result = bbox_assigner.assign(proposal_list[i],
                                                     gt_bboxes[i],
                                                     gt_bboxes_ignore[i],
                                                     gt_labels[i])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes[i],
                    gt_labels[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        if self.with_bbox:
            rois = bbox2roi([res.bboxes for res in sampling_results])
            # TODO: a more flexible way to decide which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                x[:self.bbox_roi_extractor.num_inputs], rois)
            if self.with_upper_neck:
                bbox_feats = self.upper_neck(bbox_feats)
            cls_score, bbox_pred = self.bbox_head(bbox_feats)

            bbox_targets = self.bbox_head.get_target(sampling_results,
                                                     gt_bboxes, gt_labels,
                                                     self.train_cfg.rcnn)
            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                            *bbox_targets)
            losses.update(loss_bbox)

        # mask head forward and loss
        if self.with_mask:
            if not self.shared_roi_extractor:
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = self.mask_roi_extractor(
                    x[:self.mask_roi_extractor.num_inputs], pos_rois)
                if self.with_upper_neck:
                    mask_feats = self.upper_neck(mask_feats)
            else:
                pos_inds = []
                device = bbox_feats.device
                for res in sampling_results:
                    pos_inds.append(
                        torch.ones(res.pos_bboxes.shape[0],
                                   device=device,
                                   dtype=torch.uint8))
                    pos_inds.append(
                        torch.zeros(res.neg_bboxes.shape[0],
                                    device=device,
                                    dtype=torch.uint8))
                pos_inds = torch.cat(pos_inds)
                mask_feats = bbox_feats[pos_inds]
            mask_pred = self.mask_head(mask_feats)

            mask_targets = self.mask_head.get_target(sampling_results,
                                                     gt_masks,
                                                     self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                            pos_labels)
            losses.update(loss_mask)

        return losses
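The edit noted in the RPN branch above amounts to giving training its own NMS settings instead of reusing the test-time ones. A hypothetical mmdetection-style config fragment showing such a split (field names follow common mmdet conventions; all values are illustrative, not this repo's actual config):

# Looser train-time NMS feeds more proposals to the RCNN sampler;
# tighter test-time NMS keeps inference cheap.
train_cfg = dict(
    rpn=dict(
        nms=dict(
            nms_pre=4000,    # candidates kept before NMS
            nms_post=4000,   # candidates kept after NMS
            max_num=4000,    # proposals handed to the next stage
            nms_thr=0.7,
            min_bbox_size=0)))
test_cfg = dict(
    rpn=dict(
        nms_pre=1000,
        nms_post=1000,
        max_num=1000,
        nms_thr=0.7,
        min_bbox_size=0))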
Example 4
    def forward_debug(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None):
        """
        Args:
            img (Tensor): of shape (N, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.

            img_meta (list[dict]): list of image info dicts, where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                `mmdet/datasets/pipelines/formatting.py:Collect`.

            gt_bboxes (list[Tensor]): each item is the ground-truth boxes for
                each image in [tl_x, tl_y, br_x, br_y] format.

            gt_labels (list[Tensor]): class indices corresponding to each box

            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
                boxes can be ignored when computing the loss.

            gt_masks (None | Tensor) : true segmentation masks for each box
                used if the architecture supports a segmentation task.

            proposals : override rpn proposals with custom proposals. Use when
                `with_rpn` is False.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        x = self.extract_feat(img)
        losses = dict()

        # RPN forward and loss
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                            gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_cfg = self.train_cfg.get('rpn_proposal',
                                              self.test_cfg.rpn)
            proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        # assign gts and sample proposals
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                         context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for i in range(num_imgs):
                assign_result = bbox_assigner.assign(proposal_list[i],
                                                     gt_bboxes[i],
                                                     gt_bboxes_ignore[i],
                                                     gt_labels[i])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes[i],
                    gt_labels[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        if self.with_bbox:
            rois = bbox2roi([res.bboxes for res in sampling_results])
            # TODO: a more flexible way to decide which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                x[:self.bbox_roi_extractor.num_inputs], rois)
            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)
            cls_score, bbox_pred = self.bbox_head(bbox_feats)

            bbox_targets = self.bbox_head.get_target(sampling_results,
                                                     gt_bboxes, gt_labels,
                                                     self.train_cfg.rcnn)
            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                            *bbox_targets)
            losses.update(loss_bbox)

        # mask head forward and loss
        if self.with_mask:
            if not self.share_roi_extractor:
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = self.mask_roi_extractor(
                    x[:self.mask_roi_extractor.num_inputs], pos_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
            else:
                pos_inds = []
                device = bbox_feats.device
                for res in sampling_results:
                    pos_inds.append(
                        torch.ones(res.pos_bboxes.shape[0],
                                   device=device,
                                   dtype=torch.uint8))
                    pos_inds.append(
                        torch.zeros(res.neg_bboxes.shape[0],
                                    device=device,
                                    dtype=torch.uint8))
                pos_inds = torch.cat(pos_inds)
                mask_feats = bbox_feats[pos_inds]
            mask_pred = self.mask_head(mask_feats)

            mask_targets = self.mask_head.get_target(sampling_results,
                                                     gt_masks,
                                                     self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                            pos_labels)
            losses.update(loss_mask)

        return losses, proposal_list, sampling_results, rois
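The `rois` returned above come from `bbox2roi`, which prefixes each box with its image index inside the batch; a self-contained sketch of that conversion (a simplified reimplementation for illustration, not mmdet's actual function):

import torch

def bbox2roi_sketch(bbox_list):
    """Concatenate per-image (n_i, 4) boxes into (sum n_i, 5) RoIs,
    prepending the batch image index to each box."""
    rois = []
    for img_id, bboxes in enumerate(bbox_list):
        img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
        rois.append(torch.cat([img_inds, bboxes], dim=-1))
    return torch.cat(rois, dim=0)

boxes_img0 = torch.tensor([[0., 0., 10., 10.], [5., 5., 20., 20.]])
boxes_img1 = torch.tensor([[2., 2., 8., 8.]])
print(bbox2roi_sketch([boxes_img0, boxes_img1]).shape)  # torch.Size([3, 5])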
Example 5
    def forward_train(
        self,
        img,
        img_meta,
        gt_bboxes,
        gt_labels,
        gt_bboxes_ignore=None,
        gt_masks=None,
        proposals=None,
    ):
        mix_weight = []
        for i in range(len(img_meta)):
            if img_meta[i]['mix_weight'] is not None:
                mix_weight.append(
                    torch.tensor(img_meta[i]['mix_weight']).cuda())
            else:
                # if any image lacks mix weights, disable mixing entirely
                # (the break avoids appending to None on a later iteration)
                mix_weight = None
                break

        x = self.extract_feat(img)

        losses = dict()

        # RPN forward and loss
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                            gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        # assign gts and sample proposals
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                         context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for i in range(num_imgs):
                assign_result = bbox_assigner.assign(proposal_list[i],
                                                     gt_bboxes[i],
                                                     gt_bboxes_ignore[i],
                                                     gt_labels[i])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes[i],
                    gt_labels[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        if self.with_bbox:
            rois = bbox2roi([res.bboxes for res in sampling_results])
            # TODO: a more flexible way to decide which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                x[:self.bbox_roi_extractor.num_inputs], rois)
            cls_score, bbox_pred = self.bbox_head(bbox_feats)

            bbox_targets = self.bbox_head.get_target(sampling_results,
                                                     gt_bboxes, gt_labels,
                                                     self.train_cfg.rcnn)
            if mix_weight is not None:
                mix_inds = []
                for i in range(len(sampling_results)):
                    mix_ind = torch.ones(
                        sampling_results[i].bboxes.shape[0]).cuda()
                    for j in range(
                            len(sampling_results[i].pos_assigned_gt_inds)):
                        mix_ind[j] = mix_weight[i][
                            sampling_results[i].pos_assigned_gt_inds[j]]
                    mix_inds.append(mix_ind)
                mix_inds = torch.cat(mix_inds, 0)
            else:
                mix_inds = None
            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                            *bbox_targets, mix_inds)
            losses.update(loss_bbox)

        # mask head forward and loss
        if self.with_mask:
            pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            mask_pred = self.mask_head(mask_feats)

            mask_targets = self.mask_head.get_target(sampling_results,
                                                     gt_masks,
                                                     self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                            pos_labels)
            losses.update(loss_mask)

        return losses
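The mix-weight block above maps each sampled positive back to the mixup weight of its assigned ground-truth box, while negatives keep weight 1. A self-contained sketch of that gather (counts and weights are made up):

import torch

# One image: 4 GT boxes, each carrying a mixup weight.
mix_weight = torch.tensor([1.0, 0.7, 0.7, 1.0])
# Sampler output: positives first (here 3), then negatives (here 5).
pos_assigned_gt_inds = torch.tensor([2, 0, 3])
num_sampled = 8

mix_ind = torch.ones(num_sampled)  # negatives keep weight 1
mix_ind[:len(pos_assigned_gt_inds)] = mix_weight[pos_assigned_gt_inds]
print(mix_ind)  # tensor([0.7000, 1.0000, 1.0000, 1., 1., 1., 1., 1.])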
Example 6
    def __init__(self,
                 num_classes,
                 in_channels,
                 point_feat_channels=256,
                 num_points=9,
                 gradient_mul=0.1,
                 point_strides=[8, 16, 32, 64, 128],
                 point_base_scale=4,
                 loss_cls=dict(type='FocalLoss',
                               use_sigmoid=True,
                               gamma=2.0,
                               alpha=0.25,
                               loss_weight=1.0),
                 loss_bbox_init=dict(type='SmoothL1Loss',
                                     beta=1.0 / 9.0,
                                     loss_weight=0.5),
                 loss_bbox_refine=dict(type='SmoothL1Loss',
                                       beta=1.0 / 9.0,
                                       loss_weight=1.0),
                 use_grid_points=False,
                 center_init=True,
                 transform_method='moment',
                 moment_mul=0.01,
                 **kwargs):
        self.num_points = num_points
        self.point_feat_channels = point_feat_channels
        self.use_grid_points = use_grid_points
        self.center_init = center_init

        # we use deformable convolution to extract point features
        self.dcn_kernel = int(np.sqrt(num_points))
        self.dcn_pad = int((self.dcn_kernel - 1) / 2)
        assert self.dcn_kernel * self.dcn_kernel == num_points, \
            'The points number should be a square number.'
        assert self.dcn_kernel % 2 == 1, \
            'The points number should be an odd square number.'
        dcn_base = np.arange(-self.dcn_pad,
                             self.dcn_pad + 1).astype(np.float64)
        dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
        dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
        dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape(
            (-1))
        self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1)

        super().__init__(num_classes, in_channels, loss_cls=loss_cls, **kwargs)

        self.gradient_mul = gradient_mul
        self.point_base_scale = point_base_scale
        self.point_strides = point_strides
        self.point_generators = [PointGenerator() for _ in self.point_strides]

        self.sampling = loss_cls['type'] not in ['FocalLoss']
        if self.train_cfg:
            self.init_assigner = build_assigner(self.train_cfg.init.assigner)
            self.refine_assigner = build_assigner(
                self.train_cfg.refine.assigner)
            # use PseudoSampler when sampling is False
            if self.sampling and hasattr(self.train_cfg, 'sampler'):
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.transform_method = transform_method
        if self.transform_method == 'moment':
            self.moment_transfer = nn.Parameter(data=torch.zeros(2),
                                                requires_grad=True)
            self.moment_mul = moment_mul

        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        if self.use_sigmoid_cls:
            self.cls_out_channels = self.num_classes
        else:
            self.cls_out_channels = self.num_classes + 1
        self.loss_bbox_init = build_loss(loss_bbox_init)
        self.loss_bbox_refine = build_loss(loss_bbox_refine)
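The `dcn_base_offset` built above is just the integer (y, x) grid of a k×k deformable-convolution kernel, flattened; a standalone sketch for `num_points=9`:

import numpy as np

num_points = 9
dcn_kernel = int(np.sqrt(num_points))   # 3
dcn_pad = (dcn_kernel - 1) // 2         # 1

base = np.arange(-dcn_pad, dcn_pad + 1).astype(np.float64)  # [-1., 0., 1.]
base_y = np.repeat(base, dcn_kernel)    # [-1,-1,-1, 0,0,0, 1,1,1]
base_x = np.tile(base, dcn_kernel)      # [-1, 0, 1,-1,0,1,-1, 0, 1]
offsets = np.stack([base_y, base_x], axis=1).reshape(-1)  # interleaved y, x
print(offsets.reshape(num_points, 2))   # one (dy, dx) pair per sampling point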
Example 7
    def forward_train(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None):
        x = self.extract_feat(img)

        losses = dict()

        # trans gt_masks to gt_obbs
        gt_obbs = gt_mask_bp_obbs_list(gt_masks)

        # RPN forward and loss
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, gt_labels, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                            gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_cfg = self.train_cfg.get('rpn_proposal',
                                              self.test_cfg.rpn)

            proposal_inputs = rpn_outs + (img_meta, proposal_cfg)

            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        # assign gts and sample proposals (hbb assign)
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(self.train_cfg.rcnn[0].assigner)
            bbox_sampler = build_sampler(self.train_cfg.rcnn[0].sampler,
                                         context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for i in range(num_imgs):
                assign_result = bbox_assigner.assign(proposal_list[i],
                                                     gt_bboxes[i],
                                                     gt_bboxes_ignore[i],
                                                     gt_labels[i])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes[i],
                    gt_labels[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        if self.with_bbox:
            rois = bbox2roi([res.bboxes for res in sampling_results])
            # TODO: a more flexible way to decide which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                x[:self.bbox_roi_extractor.num_inputs], rois)

            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)
            cls_score, bbox_pred = self.bbox_head(bbox_feats)
            ## rbbox
            rbbox_targets = self.bbox_head.get_target(sampling_results,
                                                      gt_masks, gt_labels,
                                                      self.train_cfg.rcnn[0])

            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                            *rbbox_targets)
            for name, value in loss_bbox.items():
                losses['s{}.{}'.format(0, name)] = value

        pos_is_gts = [res.pos_is_gt for res in sampling_results]
        roi_labels = rbbox_targets[0]
        with torch.no_grad():
            rotated_proposal_list = self.bbox_head.refine_rbboxes(
                roi2droi(rois), roi_labels, bbox_pred, pos_is_gts, img_meta)
        # assign gts and sample proposals (rbb assign)
        if self.with_rbbox:
            bbox_assigner = build_assigner(self.train_cfg.rcnn[1].assigner)
            bbox_sampler = build_sampler(self.train_cfg.rcnn[1].sampler,
                                         context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for i in range(num_imgs):
                gt_obbs_best_roi = choose_best_Rroi_batch(gt_obbs[i])
                assign_result = bbox_assigner.assign(rotated_proposal_list[i],
                                                     gt_obbs_best_roi,
                                                     gt_bboxes_ignore[i],
                                                     gt_labels[i])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    rotated_proposal_list[i],
                    torch.from_numpy(gt_obbs_best_roi).float().to(
                        rotated_proposal_list[i].device),
                    gt_labels[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        if self.with_rbbox:
            # (batch_ind, x_ctr, y_ctr, w, h, angle)
            rrois = dbbox2roi([res.bboxes for res in sampling_results])
            # enlarge the rotated RoIs before feature extraction
            rrois[:, 3] = rrois[:, 3] * self.rbbox_roi_extractor.w_enlarge
            rrois[:, 4] = rrois[:, 4] * self.rbbox_roi_extractor.h_enlarge
            rbbox_feats = self.rbbox_roi_extractor(
                x[:self.rbbox_roi_extractor.num_inputs], rrois)
            if self.with_shared_head_rbbox:
                rbbox_feats = self.shared_head_rbbox(rbbox_feats)
            cls_score, rbbox_pred = self.rbbox_head(rbbox_feats)
            rbbox_targets = self.rbbox_head.get_target_rbbox(
                sampling_results, gt_obbs, gt_labels, self.train_cfg.rcnn[1])
            loss_rbbox = self.rbbox_head.loss(cls_score, rbbox_pred,
                                              *rbbox_targets)
            for name, value in loss_rbbox.items():
                losses['s{}.{}'.format(1, name)] = value

        return losses
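Both stages above prefix their loss keys so the entries stay distinguishable in the merged dict; a tiny sketch of the convention (loss values are made up):

loss_bbox = {'loss_cls': 0.40, 'loss_reg': 0.20}
loss_rbbox = {'loss_cls': 0.35, 'loss_reg': 0.15}

losses = {}
for stage, stage_losses in enumerate([loss_bbox, loss_rbbox]):
    for name, value in stage_losses.items():
        losses['s{}.{}'.format(stage, name)] = value
print(losses)  # {'s0.loss_cls': 0.4, ..., 's1.loss_reg': 0.15}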
Example 8
    def forward_train(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_bboxes_ignore,
                      gt_labels,
                      gt_masks=None,
                      proposals=None):
        x = self.extract_feat(img)

        losses = dict()

        # RPN forward and loss
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
            losses.update(rpn_losses)

            proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
            proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
        else:
            proposal_list = proposals

        # assign gts and sample proposals
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(
                self.train_cfg.rcnn.sampler, context=self)
            num_imgs = img.size(0)
            sampling_results = []
            for i in range(num_imgs):
                assign_result = bbox_assigner.assign(
                    proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i],
                    gt_labels[i])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes[i],
                    gt_labels[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        if self.with_bbox:
            rois = bbox2roi([res.bboxes for res in sampling_results])
            # TODO: a more flexible way to decide which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                x[:self.bbox_roi_extractor.num_inputs], rois)
            pred = self.action_head(bbox_feats,
                                    x[:self.bbox_roi_extractor.num_inputs])

            loss_bbox = self.action_head.loss(pred, gt_labels)
            losses.update(loss_bbox)

        # mask head forward and loss
        if self.with_mask:
            pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            mask_pred = self.mask_head(mask_feats)

            mask_targets = self.mask_head.get_target(
                sampling_results, gt_masks, self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                            pos_labels)
            losses.update(loss_mask)

        return losses
Example 9
    def forward_train_vis(self,
                          img,
                          img_meta,
                          gt_bboxes,
                          gt_labels,
                          gt_bboxes_ignore=None,
                          gt_masks=None,
                          proposals=None):
        losses = dict()

        # separate gt
        num_imgs = img.size(0)
        text_gt_bboxes = []
        text_gt_labels = []
        char_gt_bboxes = []
        char_gt_labels = []
        for img_i in range(num_imgs):
            text_num = gt_masks[img_i].shape[0]
            # text line gt
            text_gt_bboxes.append(gt_bboxes[img_i][:text_num])
            text_gt_labels.append(gt_labels[img_i][:text_num])
            # character gt
            char_gt_bboxes.append(gt_bboxes[img_i][text_num:])
            char_gt_labels.append(gt_labels[img_i][text_num:])

        x = self.extract_feat(img)

        # RPN forward and loss
        rpn_outs = self.rpn_head(x)
        stage_num = len(rpn_outs[0])
        # text line proposals
        text_rpn_outs = ([], [])
        for stage_i in range(stage_num):
            text_rpn_outs[0].append(rpn_outs[0][stage_i])
            text_rpn_outs[1].append(rpn_outs[1][stage_i])
        text_rpn_loss_inputs = text_rpn_outs + (text_gt_bboxes, img_meta,
                                                self.train_cfg.rpn)
        text_rpn_losses = self.rpn_head.loss(*text_rpn_loss_inputs,
                                             gt_bboxes_ignore=gt_bboxes_ignore,
                                             type='text')
        losses.update(text_rpn_losses)
        text_proposal_inputs = text_rpn_outs + (
            img_meta, self.train_cfg.text_rpn_proposal)
        text_proposal_list = self.rpn_head.get_bboxes(*text_proposal_inputs,
                                                      type='text')
        # character proposals
        char_rpn_outs = ([], [])
        for stage_i in range(stage_num):
            char_rpn_outs[0].append(rpn_outs[2][stage_i])
            char_rpn_outs[1].append(rpn_outs[3][stage_i])
        char_rpn_loss_inputs = char_rpn_outs + (char_gt_bboxes, img_meta,
                                                self.train_cfg.rpn)
        char_rpn_losses = self.rpn_head.loss(*char_rpn_loss_inputs,
                                             gt_bboxes_ignore=gt_bboxes_ignore,
                                             type='char')
        losses.update(char_rpn_losses)
        char_proposal_inputs = char_rpn_outs + (
            img_meta, self.train_cfg.char_rpn_proposal)
        char_proposal_list = self.rpn_head.get_bboxes(*char_proposal_inputs,
                                                      type='char')

        # assign gts and sample proposals
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        text_sampling_results, char_sampling_results = [], []
        for i in range(num_imgs):
            # sample text line proposals
            text_assign_result = bbox_assigner.assign(text_proposal_list[i],
                                                      text_gt_bboxes[i],
                                                      gt_bboxes_ignore[i],
                                                      text_gt_labels[i])
            text_sampling_result = bbox_sampler.sample(
                text_assign_result,
                text_proposal_list[i],
                text_gt_bboxes[i],
                text_gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            text_sampling_results.append(text_sampling_result)
            # sample character proposals
            char_assign_result = bbox_assigner.assign(char_proposal_list[i],
                                                      char_gt_bboxes[i],
                                                      gt_bboxes_ignore[i],
                                                      char_gt_labels[i])
            char_sampling_result = bbox_sampler.sample(
                char_assign_result,
                char_proposal_list[i],
                char_gt_bboxes[i],
                char_gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            char_sampling_results.append(char_sampling_result)

        # text detection module
        text_rois = bbox2roi([res.bboxes for res in text_sampling_results])
        text_bbox_feats = self.text_bbox_roi_extractor(
            x[:self.text_bbox_roi_extractor.num_inputs], text_rois)
        text_cls_score, text_bbox_pred = self.text_bbox_head(text_bbox_feats)
        text_bbox_targets = self.text_bbox_head.get_target(
            text_sampling_results, text_gt_bboxes, text_gt_labels,
            self.train_cfg.rcnn)
        text_loss_bbox = self.text_bbox_head.loss(text_cls_score,
                                                  text_bbox_pred,
                                                  *text_bbox_targets,
                                                  type='text')
        losses.update(text_loss_bbox)
        pos_rois = bbox2roi([res.pos_bboxes for res in text_sampling_results])
        text_mask_feats = self.text_mask_roi_extractor(
            x[:self.text_mask_roi_extractor.num_inputs], pos_rois)
        mask_pred = self.text_mask_head(text_mask_feats)
        mask_targets = self.text_mask_head.get_target(text_sampling_results,
                                                      gt_masks,
                                                      self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in text_sampling_results])
        loss_mask = self.text_mask_head.loss(mask_pred, mask_targets,
                                             pos_labels)
        losses.update(loss_mask)

        # character-based recognition module
        char_rois = bbox2roi([res.bboxes for res in char_sampling_results])
        char_bbox_feats = self.char_bbox_roi_extractor(
            x[:self.char_bbox_roi_extractor.num_inputs], char_rois)
        char_cls_score, char_bbox_pred = self.char_bbox_head(
            char_bbox_feats)  # the input may be a tuple
        char_bbox_targets = self.char_bbox_head.get_target(
            char_sampling_results, char_gt_bboxes, char_gt_labels,
            self.train_cfg.rcnn)
        char_loss_bbox = self.char_bbox_head.loss(char_cls_score,
                                                  char_bbox_pred,
                                                  *char_bbox_targets,
                                                  type='char')
        losses.update(char_loss_bbox)

        return losses
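The per-image GT split at the top of this example relies on the convention that the first `text_num` boxes are text lines and the remainder are characters; a self-contained sketch of the split (shapes are illustrative):

import torch

# One image: first `text_num` GT boxes are text lines, the rest characters.
gt_bboxes_img = torch.arange(24, dtype=torch.float32).reshape(6, 4)
gt_labels_img = torch.tensor([1, 1, 5, 7, 9, 9])
text_num = 2  # e.g. gt_masks[img_i].shape[0]

text_gt_bboxes = gt_bboxes_img[:text_num]
char_gt_bboxes = gt_bboxes_img[text_num:]
text_gt_labels = gt_labels_img[:text_num]
char_gt_labels = gt_labels_img[text_num:]
print(text_gt_bboxes.shape, char_gt_bboxes.shape)  # (2, 4) and (4, 4)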
Example 10
    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None):
        # change the neck
        segm_pred, proto, x = self.extract_feat(img)
        losses = dict()

        cls_scores, bbox_preds, bbox_embeds = self.bbox_head(x)
        outs = (cls_scores, bbox_preds)
        loss_inputs = outs + (gt_bboxes, gt_labels, img_metas,
                              self.train_cfg.bbox_head)
        loss_bbox = self.bbox_head.loss(
            *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(loss_bbox)

        # proposal
        proposal_cfg = self.train_cfg.get('proposals')
        proposal_inputs = outs + (bbox_embeds, img_metas, proposal_cfg)
        proposal_list = self.bbox_head.get_proposals(*proposal_inputs)
        proposal_bbox, proposal_labels, proposal_embeds = map(list, zip(*proposal_list))

        # assign gts and sample proposals
        assigner = build_assigner(self.train_cfg.mask_head.assigner)
        sampler = build_sampler(self.train_cfg.mask_head.sampler,
                                context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = assigner.assign(
                proposal_bbox[i],
                gt_bboxes[i],
                gt_bboxes_ignore[i],
                gt_labels[i])
            sampling_result = sampler.sample(
                assign_result,
                proposal_bbox[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

        # mask_rois
        pos_rois = bbox2roi(
            [res.pos_bboxes for res in sampling_results])
        pos_gt_rois = bbox2roi(
            [res.pos_gt_bboxes for res in sampling_results])
        pos_embed_list = [
            proposal_embeds[i][res.pos_inds] for i, res in enumerate(sampling_results)]
        pos_embeds = torch.cat(pos_embed_list)
        assert pos_embeds.size(0) == pos_gt_rois.size(0)

        # crop the RoIs from the proto masks, then compute the loss,
        # using pos_bboxes rather than pos_gt_rois
        mask_targets = self.get_target(sampling_results, gt_masks,
                                       stride=self.train_cfg.proto.final_stride)
        mask_preds = self.extract_proto(proto, pos_rois, pos_embeds,
                                        stride=self.train_cfg.proto.stride,
                                        final_stride=self.train_cfg.proto.final_stride)

        # for segm
        # segm_targets = self.segm_target(segm_pred, gt_semantic_seg)
        loss_mask = self.mask_loss(mask_preds, mask_targets,
                                   segm_pred, gt_semantic_seg)
        losses.update(loss_mask)
        return losses
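`extract_proto` above presumably combines the prototype maps with the per-proposal embeddings before cropping to each RoI; a minimal YOLACT-style sketch of that combination (the shapes, the `einsum`, and the sigmoid are assumptions for illustration, not this repo's exact code):

import torch

proto = torch.randn(1, 32, 68, 68)   # (N, k, H/stride, W/stride) prototypes
pos_embeds = torch.randn(10, 32)     # one k-dim coefficient vector per RoI

# Linear combination of the k prototypes per proposal, then a sigmoid.
masks = torch.einsum('nkhw,pk->phw', proto, pos_embeds).sigmoid()
print(masks.shape)  # torch.Size([10, 68, 68]); each map is then cropped to its RoI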
Example 11
    def forward_train_pair(self,
                           img,
                           img_meta,
                           gt_bboxes,
                           gt_labels,
                           gt_bboxes_ignore=None,
                           gt_masks=None,
                           proposals=None):
        # img: (b, 2*c, h, w), e.g. 2*c = 6
        b, c, h, w = img.shape
        img = img.reshape(-1, c // 2, h, w)
        # img: (2*b, c, h, w), pairs interleaved as [0, 1], [2, 3], ...
        x = self.extract_feat(img)
        # x: tuple of 5 feature levels
        losses = dict()

        # RPN forward and loss
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            # rpn_outs: [cls_score, bbox_pred, shape_pred, loc_pred], each a
            # list over levels of (b, c, h, w) tensors
            rpn_outs_half = []
            for outs_0 in rpn_outs:
                tmp = []
                for outs_1 in outs_0:
                    # keep only even batch indices: the train image of each pair
                    tmp.append(outs_1[::2, :, :, :])
                rpn_outs_half.append(tmp)

            rpn_loss_inputs = tuple(rpn_outs_half) + (gt_bboxes, img_meta,
                                                      self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(
                *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_cfg = self.train_cfg.get('rpn_proposal',
                                              self.test_cfg.rpn)

            # duplicate train img_meta for each pair: 1,2,3 -> 1,1,2,2,3,3
            img_meta = [img_meta[i // 2] for i in range(2 * len(img_meta))]

            proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        # proposal_list: one det_bboxes tensor per image, shape (n, 5),
        # columns (x1, y1, x2, y2, score)

        # generate a blank GT (dummy box with background label) for each
        # normal image
        gt_bboxes_ = []
        gt_labels_ = []
        for i in range(len(gt_bboxes)):
            gt_bboxes_.append(gt_bboxes[i])
            gt_bboxes_.append(
                torch.Tensor([[1, 1, 1, 1]]).to(gt_bboxes[i].device))
            gt_labels_.append(gt_labels[i])
            gt_labels_.append(torch.Tensor([[0]]).to(gt_labels[i].device))

        # assign gts and sample proposals
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(
                self.train_cfg.rcnn.sampler, context=self)
            assert img.size(0) % 2 == 0
            num_pairs = img.size(0) // 2
            num_imgs = img.size(0)

            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []

            for i in range(num_pairs):
                i_train = 2*i
                i_normal = i_train + 1

                assign_result_train = bbox_assigner.assign(
                    proposal_list[i_train],
                    gt_bboxes_[i_train],
                    gt_bboxes_ignore[i_train],
                    gt_labels_[i_train])
                assign_result_normal = bbox_assigner.assign(
                    proposal_list[i_normal],
                    gt_bboxes_[i_normal],
                    gt_bboxes_ignore[i_normal],
                    gt_labels_[i_normal])
                sampling_result_train, sampling_results_normal = bbox_sampler.pair_sample(
                    assign_result_train,
                    assign_result_normal,
                    proposal_list[i_train],
                    proposal_list[i_normal],
                    gt_bboxes_[i_train],
                    gt_labels_[i_train],
                    feats_train=[lvl_feat[i_train][None] for lvl_feat in x],
                    feats_normal=[lvl_feat[i_normal][None] for lvl_feat in x])
                sampling_results.append(sampling_result_train)
                sampling_results.append(sampling_results_normal)


        # bbox head forward and loss
        if self.with_bbox:
            rois = bbox2roi([res.bboxes for res in sampling_results])
            # TODO: a more flexible way to decide which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                x[:self.bbox_roi_extractor.num_inputs], rois)
            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)
            cls_score, bbox_pred = self.bbox_head(bbox_feats)

            bbox_targets = self.bbox_head.get_target(sampling_results,
                                                     gt_bboxes, gt_labels,
                                                     self.train_cfg.rcnn)
            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                            *bbox_targets)
            losses.update(loss_bbox)

        # mask head forward and loss
        if self.with_mask:
            if not self.share_roi_extractor:
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = self.mask_roi_extractor(
                    x[:self.mask_roi_extractor.num_inputs], pos_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
            else:
                pos_inds = []
                device = bbox_feats.device
                for res in sampling_results:
                    pos_inds.append(
                        torch.ones(
                            res.pos_bboxes.shape[0],
                            device=device,
                            dtype=torch.uint8))
                    pos_inds.append(
                        torch.zeros(
                            res.neg_bboxes.shape[0],
                            device=device,
                            dtype=torch.uint8))
                pos_inds = torch.cat(pos_inds)
                mask_feats = bbox_feats[pos_inds]
            mask_pred = self.mask_head(mask_feats)

            mask_targets = self.mask_head.get_target(sampling_results,
                                                     gt_masks,
                                                     self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                            pos_labels)
            losses.update(loss_mask)

        return losses
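The pairing mechanics above rest on two small manipulations: unstacking channel-concatenated pairs into an interleaved batch, and duplicating per-pair metadata. A standalone sketch:

import torch

# A batch of b image pairs stored channel-stacked: (b, 2*c, h, w).
b, c2, h, w = 2, 6, 32, 32
img = torch.randn(b, c2, h, w)
img = img.reshape(-1, c2 // 2, h, w)  # (2*b, c, h, w): train/normal interleaved
print(img.shape)                       # torch.Size([4, 3, 32, 32])

# Duplicate per-pair metadata so both images of a pair share it.
img_meta = [{'id': 0}, {'id': 1}]
img_meta = [img_meta[i // 2] for i in range(2 * len(img_meta))]
print([m['id'] for m in img_meta])     # [0, 0, 1, 1]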
Example 12
    def forward_train(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None):
        x = self.extract_feat(img)

        losses = dict()

        # RPN forward and loss
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                            gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_cfg = self.train_cfg.get('rpn_proposal',
                                              self.test_cfg.rpn)
            proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        if self.with_bbox:
            # assign gts and sample proposals
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                         context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for i in range(num_imgs):
                assign_result = bbox_assigner.assign(proposal_list[i],
                                                     gt_bboxes[i],
                                                     gt_bboxes_ignore[i],
                                                     gt_labels[i])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes[i],
                    gt_labels[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

            # bbox head forward and loss
            rois = bbox2roi([res.bboxes for res in sampling_results])
            # TODO: a more flexible way to decide which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                x[:self.bbox_roi_extractor.num_inputs], rois)
            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)
            cls_score, bbox_pred = self.bbox_head(bbox_feats)

            bbox_targets = self.bbox_head.get_target(sampling_results,
                                                     gt_bboxes, gt_labels,
                                                     self.train_cfg.rcnn)
            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                            *bbox_targets)
            losses.update(loss_bbox)

            #####dense local regression head ################################
            sampling_results = self._random_jitter(sampling_results, img_meta)
            pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])

            reg_feats = self.reg_roi_extractor(
                x[:self.reg_roi_extractor.num_inputs], pos_rois)

            if self.with_shared_head:
                reg_feats = self.shared_head(reg_feats)
            # Accelerate training
            max_sample_num_reg = self.train_cfg.rcnn.get('max_num_reg', 192)
            sample_idx = torch.randperm(
                reg_feats.shape[0])[:min(reg_feats.shape[0], max_sample_num_reg)]
            reg_feats = reg_feats[sample_idx]
            pos_gt_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            pos_gt_labels = pos_gt_labels[sample_idx]
            if not self.MASK_ON:
                # object detection branch (no mask annotations used)
                reg_pred, reg_masks_pred = self.D2Det_head(reg_feats)
                reg_points, reg_targets, reg_masks = self.D2Det_head.get_target(
                    sampling_results)
                reg_targets = reg_targets[sample_idx]
                reg_points = reg_points[sample_idx]
                reg_masks = reg_masks[sample_idx]
                x1 = reg_points[:, 0, :, :] - reg_pred[:, 0, :, :] * reg_points[:, 2, :, :]
                x2 = reg_points[:, 0, :, :] + reg_pred[:, 1, :, :] * reg_points[:, 2, :, :]
                y1 = reg_points[:, 1, :, :] - reg_pred[:, 2, :, :] * reg_points[:, 3, :, :]
                y2 = reg_points[:, 1, :, :] + reg_pred[:, 3, :, :] * reg_points[:, 3, :, :]

                pos_decoded_bbox_preds = torch.stack([x1, y1, x2, y2], dim=1)

                x1_1 = reg_points[:, 0, :, :] - reg_targets[:, 0, :, :]
                x2_1 = reg_points[:, 0, :, :] + reg_targets[:, 1, :, :]
                y1_1 = reg_points[:, 1, :, :] - reg_targets[:, 2, :, :]
                y2_1 = reg_points[:, 1, :, :] + reg_targets[:, 3, :, :]

                pos_decoded_target_preds = torch.stack(
                    [x1_1, y1_1, x2_1, y2_1], dim=1)
                loss_reg = self.loss_roi_reg(
                    pos_decoded_bbox_preds.permute(0, 2, 3, 1).reshape(-1, 4),
                    pos_decoded_target_preds.permute(0, 2, 3,
                                                     1).reshape(-1, 4),
                    weight=reg_masks.reshape(-1, 1))

                loss_mask = self.loss_roi_mask(
                    reg_masks_pred.reshape(
                        -1, reg_masks.shape[2] * reg_masks.shape[3]),
                    reg_masks.reshape(-1,
                                      reg_masks.shape[2] * reg_masks.shape[3]))
                losses.update(dict(loss_reg=loss_reg, loss_mask=loss_mask))
            else:
                # instance segmentation branch
                reg_pred, reg_masks_pred, reg_instances_pred, reg_iou = self.D2Det_head(
                    reg_feats, pos_gt_labels)
                reg_points, reg_targets, reg_masks, reg_instances = self.D2Det_head.get_target_mask(
                    sampling_results, gt_masks, self.train_cfg.rcnn)

                reg_targets = reg_targets[sample_idx]
                reg_points = reg_points[sample_idx]
                reg_masks = reg_masks[sample_idx]
                reg_instances = reg_instances[sample_idx]

                x1 = reg_points[:, 0, :, :] - reg_pred[:, 0, :, :] * reg_points[:, 2, :, :]
                x2 = reg_points[:, 0, :, :] + reg_pred[:, 1, :, :] * reg_points[:, 2, :, :]
                y1 = reg_points[:, 1, :, :] - reg_pred[:, 2, :, :] * reg_points[:, 3, :, :]
                y2 = reg_points[:, 1, :, :] + reg_pred[:, 3, :, :] * reg_points[:, 3, :, :]

                pos_decoded_bbox_preds = torch.stack([x1, y1, x2, y2], dim=1)

                x1_1 = reg_points[:, 0, :, :] - reg_targets[:, 0, :, :]
                x2_1 = reg_points[:, 0, :, :] + reg_targets[:, 1, :, :]
                y1_1 = reg_points[:, 1, :, :] - reg_targets[:, 2, :, :]
                y2_1 = reg_points[:, 1, :, :] + reg_targets[:, 3, :, :]

                pos_decoded_target_preds = torch.stack(
                    [x1_1, y1_1, x2_1, y2_1], dim=1)
                loss_reg = self.loss_roi_reg(
                    pos_decoded_bbox_preds.permute(0, 2, 3, 1).reshape(-1, 4),
                    pos_decoded_target_preds.permute(0, 2, 3,
                                                     1).reshape(-1, 4),
                    weight=reg_masks.reshape(-1, 1))
                loss_mask = self.loss_roi_mask(
                    reg_masks_pred.reshape(
                        -1, reg_masks.shape[1] * reg_masks.shape[2]),
                    reg_masks.reshape(-1,
                                      reg_masks.shape[1] * reg_masks.shape[2]))

                loss_instance = self.loss_roi_instance(reg_instances_pred,
                                                       reg_instances,
                                                       pos_gt_labels)
                reg_iou_targets = self.D2Det_head.get_target_maskiou(
                    sampling_results, gt_masks,
                    reg_instances_pred[pos_gt_labels > 0, pos_gt_labels],
                    reg_instances, sample_idx)
                reg_iou_weights = ((reg_iou_targets > 0.1) &
                                   (reg_iou_targets <= 1.0)).float()
                loss_iou = self.loss_iou(reg_iou[pos_gt_labels > 0,
                                                 pos_gt_labels],
                                         reg_iou_targets,
                                         weight=reg_iou_weights)

                losses.update(
                    dict(loss_reg=loss_reg,
                         loss_mask=loss_mask,
                         loss_instance=loss_instance,
                         loss_iou=loss_iou))

        return losses
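
The repeated x1/x2/y1/y2 arithmetic in Example n. 12 decodes dense per-location predictions into absolute box corners. Reading the indexing, reg_points appears to carry (cx, cy, w, h) per spatial location and reg_pred carries offsets normalized by that location's width/height; that interpretation is an assumption inferred from the code, and the sketch below restates the decoding under it:

import torch

def decode_dense_boxes(reg_points, reg_pred):
    # reg_points: (N, 4, H, W), channels assumed to be (cx, cy, w, h)
    # reg_pred:   (N, 4, H, W), normalized offsets (left, right, top, bottom)
    cx, cy = reg_points[:, 0], reg_points[:, 1]
    w, h = reg_points[:, 2], reg_points[:, 3]
    x1 = cx - reg_pred[:, 0] * w
    x2 = cx + reg_pred[:, 1] * w
    y1 = cy - reg_pred[:, 2] * h
    y2 = cy + reg_pred[:, 3] * h
    return torch.stack([x1, y1, x2, y2], dim=1)  # (N, 4, H, W)
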
Example n. 13
    def __init__(self,
                 num_classes=80,
                 in_channels=(512, 1024, 512, 256, 256, 256),
                 anchor_generator=dict(type='SSDAnchorGenerator',
                                       scale_major=False,
                                       input_size=300,
                                       strides=[8, 16, 32, 64, 100, 300],
                                       ratios=([2], [2, 3], [2, 3], [2, 3],
                                               [2], [2]),
                                       basesize_ratio_range=(0.1, 0.9)),
                 background_label=None,
                 bbox_coder=dict(
                     type='DeltaXYWHBBoxCoder',
                     target_means=[.0, .0, .0, .0],
                     target_stds=[1.0, 1.0, 1.0, 1.0],
                 ),
                 reg_decoded_bbox=False,
                 train_cfg=None,
                 test_cfg=None):
        super(AnchorHead, self).__init__()
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.cls_out_channels = num_classes + 1  # add background class
        self.anchor_generator = build_anchor_generator(anchor_generator)
        num_anchors = self.anchor_generator.num_base_anchors

        reg_convs = []
        cls_convs = []
        for i in range(len(in_channels)):
            reg_convs.append(
                nn.Conv2d(in_channels[i],
                          num_anchors[i] * 4,
                          kernel_size=3,
                          padding=1))
            cls_convs.append(
                nn.Conv2d(in_channels[i],
                          num_anchors[i] * (num_classes + 1),
                          kernel_size=3,
                          padding=1))
        self.reg_convs = nn.ModuleList(reg_convs)
        self.cls_convs = nn.ModuleList(cls_convs)

        self.background_label = (num_classes if background_label is None else
                                 background_label)
        # background_label should be either 0 or num_classes
        assert (self.background_label == 0
                or self.background_label == num_classes)

        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.reg_decoded_bbox = reg_decoded_bbox
        self.use_sigmoid_cls = False
        self.cls_focal_loss = False
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        # set sampling=False for anchor_target
        self.sampling = False
        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # SSD sampling=False so use PseudoSampler
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.fp16_enabled = False
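
Since every SSD level has its own channel count and anchor count, the constructor above keeps one 3x3 conv per level in an nn.ModuleList. The forward pass then just zips features with convs; a hedged sketch of that per-level application (function name and argument layout are illustrative):

def ssd_head_forward_sketch(feats, reg_convs, cls_convs):
    # feats: list of (N, C_i, H_i, W_i) tensors, one per level
    cls_scores, bbox_preds = [], []
    for feat, reg_conv, cls_conv in zip(feats, reg_convs, cls_convs):
        bbox_preds.append(reg_conv(feat))  # (N, num_anchors_i * 4, H_i, W_i)
        cls_scores.append(cls_conv(feat))  # (N, num_anchors_i * (num_classes + 1), H_i, W_i)
    return cls_scores, bbox_preds
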
Example n. 14
    def forward_train(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_labels,
                      gt_pids,
                      ref_img,
                      ref_bboxes,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None):
        x = self.extract_feat(img)

        losses = dict()

        # RPN forward and loss
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                            gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_cfg = self.train_cfg.get('rpn_proposal',
                                              self.test_cfg.rpn)
            proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        # assign gts and sample proposals
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                         context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for i in range(num_imgs):
                assign_result = bbox_assigner.assign(proposal_list[i],
                                                     gt_bboxes[i],
                                                     gt_bboxes_ignore[i],
                                                     gt_labels[i], gt_pids[i])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes[i],
                    gt_labels[i],
                    gt_pids[i],
                    ref_bboxes[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        if self.with_bbox:
            rois = bbox2roi([res.bboxes for res in sampling_results])
            # bbox_img_n = [res.bboxes.size(0) for res in sampling_results]
            # TODO: a more flexible way to decide which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                x[:self.bbox_roi_extractor.num_inputs], rois)
            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)
            cls_score, bbox_pred = self.bbox_head(bbox_feats)

            bbox_targets = self.bbox_head.get_target(sampling_results,
                                                     gt_bboxes, gt_labels,
                                                     self.train_cfg.rcnn)
            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                            *bbox_targets)
            losses.update(loss_bbox)

        ref_x = self.extract_feat(ref_img)
        if self.with_prop_track:
            """Difference between this implementation and the
                paper `Detect to Track and Track to Detect`:
                1. Based on FPN Faster R-CNN instead of R-FCN, that means:
                    a. Correlations are implemented on specific levels.
                    b. Contacted features for TrackHead is from backbone/neck.
                2. Training with positive proposals instead of only gts.
            """
            track_x = self.extract_corr_feat(x, ref_x)
            # TODO consider training use gt or (gt + props)[NOW]
            # TODO consider whether to include semantic consistence[NO]
            # TODO consider how to calculate the correlation features
            pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
            prop_track_feats = self.prop_track_roi_extractor(
                track_x[:self.prop_track_roi_extractor.num_inputs], pos_rois)
            prop_cls, prop_reg = self.prop_track_head(prop_track_feats)
            prop_targets = self.prop_track_head.get_target(
                sampling_results, self.train_cfg.rcnn)
            loss_prop_track = self.prop_track_head.loss(
                prop_cls, prop_reg, *prop_targets)
            losses.update(loss_prop_track)

        if self.with_asso_track:
            """Associate tracking, based on appearance features.
            """
            ref_rois = bbox2roi(ref_bboxes)
            num_bbox_x = [res.bboxes.size(0) for res in sampling_results]
            num_bbox_ref_x = [res.size(0) for res in ref_bboxes]
            bbox_feats = self.asso_track_roi_extractor(
                x[:self.asso_track_roi_extractor.num_inputs], rois)
            ref_bbox_feats = self.asso_track_roi_extractor(
                ref_x[:self.asso_track_roi_extractor.num_inputs], ref_rois)
            asso_probs = self.asso_track_head(bbox_feats, ref_bbox_feats,
                                              num_bbox_x, num_bbox_ref_x)
            asso_targets = self.asso_track_head.get_target(
                sampling_results, self.train_cfg.track)
            loss_asso_track = self.asso_track_head.loss(
                asso_probs, *asso_targets)
            losses.update(loss_asso_track)

        return losses
Example n. 15
    def __init__(
        self,
        num_classes,
        in_channels,
        feat_channels=256,
        approx_anchor_generator=dict(
            type='AnchorGenerator',
            octave_base_scale=8,
            scales_per_octave=3,
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        square_anchor_generator=dict(
            type='AnchorGenerator',
            ratios=[1.0],
            scales=[8],
            strides=[4, 8, 16, 32, 64]),
        anchor_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]
        ),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]
        ),
        reg_decoded_bbox=False,
        deform_groups=4,
        loc_filter_thr=0.01,
        train_cfg=None,
        test_cfg=None,
        loss_loc=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                       loss_weight=1.0)):  # yapf: disable
        super(AnchorHead, self).__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.feat_channels = feat_channels
        self.deform_groups = deform_groups
        self.loc_filter_thr = loc_filter_thr

        # build approx_anchor_generator and square_anchor_generator
        assert (approx_anchor_generator['octave_base_scale'] ==
                square_anchor_generator['scales'][0])
        assert (approx_anchor_generator['strides'] ==
                square_anchor_generator['strides'])
        self.approx_anchor_generator = build_anchor_generator(
            approx_anchor_generator)
        self.square_anchor_generator = build_anchor_generator(
            square_anchor_generator)
        self.approxs_per_octave = self.approx_anchor_generator \
            .num_base_anchors[0]

        self.reg_decoded_bbox = reg_decoded_bbox

        # one anchor per location
        self.num_anchors = 1
        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
        self.sampling = loss_cls['type'] not in ['FocalLoss']
        self.ga_sampling = train_cfg is not None and hasattr(
            train_cfg, 'ga_sampler')
        if self.use_sigmoid_cls:
            self.cls_out_channels = self.num_classes
        else:
            self.cls_out_channels = self.num_classes + 1

        # build bbox_coder
        self.anchor_coder = build_bbox_coder(anchor_coder)
        self.bbox_coder = build_bbox_coder(bbox_coder)

        # build losses
        self.loss_loc = build_loss(loss_loc)
        self.loss_shape = build_loss(loss_shape)
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # use PseudoSampler when sampling is False
            if self.sampling and hasattr(self.train_cfg, 'sampler'):
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)

            self.ga_assigner = build_assigner(self.train_cfg.ga_assigner)
            if self.ga_sampling:
                ga_sampler_cfg = self.train_cfg.ga_sampler
            else:
                ga_sampler_cfg = dict(type='PseudoSampler')
            self.ga_sampler = build_sampler(ga_sampler_cfg, context=self)

        self.fp16_enabled = False

        self._init_layers()
Example n. 16
    def forward_train(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      gt_semantic_seg_Nx=None,
                      proposals=None,
                      ref_img=None,  # images of the reference frame
                      ref_bboxes=None,  # gt bboxes of the reference frame
                      ref_labels=None,
                      ref_masks=None,
                      ref_semantic_seg=None,
                      ref_semantic_seg_Nx=None,
                      ref_obj_ids=None,
                      gt_pids=None,  # gt ids of target objs mapped to reference objs
                      gt_obj_ids=None,
                      # gt_flow=None,
                      ):

        losses = dict()

        # ********************************
        # Initial Flow and Feature Warping
        # ******************************** 
        flowR2T, _ = self.compute_flow(img.clone(), ref_img.clone(), scale_factor=0.25)
        x = self.extract_feat(img)
        ref_x = self.extract_feat(ref_img)
        x = self.extra_neck(x, ref_x, flowR2T)

        # **********************************
        # FCN Semantic Head forward and loss
        # **********************************
        if hasattr(self, 'panopticFPN') and self.panopticFPN is not None:
            #### semantic FCN GT
            gt_semantic_seg = gt_semantic_seg.long()
            gt_semantic_seg = gt_semantic_seg.squeeze(1)
            fcn_output, fcn_score = self.panopticFPN(
                    x[0:self.panopticFPN.num_levels])
            loss_fcn = F.cross_entropy(
                    fcn_output, gt_semantic_seg, ignore_index=255)
            loss_fcn = {'loss_segm': loss_fcn}
            losses.update(loss_fcn)

        # ***************************
        # RPN forward and loss
        # ***************************
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(
                    *rpn_loss_inputs, 
                    gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_cfg = self.train_cfg.get('rpn_proposal',
                                              self.test_cfg.rpn)
            proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        # *******************************
        # assign gts and sample proposals
        # *******************************
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(
                    self.train_cfg.rcnn.sampler, context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for i in range(num_imgs):
                assign_result = bbox_assigner.assign(
                        proposal_list[i], gt_bboxes[i], 
                        gt_bboxes_ignore[i], gt_labels[i])
                sampling_result = bbox_sampler.sample(
                        assign_result,
                        proposal_list[i],
                        gt_bboxes[i],
                        gt_labels[i],
                        feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # *******************************
        # bbox head forward and loss
        # *******************************
        if self.with_bbox:
            rois = bbox2roi([res.bboxes for res in sampling_results])
            # TODO: a more flexible decision which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                    x[:self.bbox_roi_extractor.num_inputs], rois)

            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)
            cls_score, bbox_pred = self.bbox_head(bbox_feats)

            bbox_targets = self.bbox_head.get_target(
                    sampling_results, gt_bboxes, 
                    gt_labels, self.train_cfg.rcnn)
            loss_bbox = self.bbox_head.loss(
                    cls_score, bbox_pred, *bbox_targets)
            losses.update(loss_bbox)

        # *******************************    
        # mask head forward and loss
        # *******************************
        if self.with_mask:
            if not self.share_roi_extractor:
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = self.mask_roi_extractor(
                    x[:self.mask_roi_extractor.num_inputs], pos_rois)

                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
            else:
                pos_inds = []
                device = bbox_feats.device
                for res in sampling_results:
                    pos_inds.append(
                        torch.ones(
                            res.pos_bboxes.shape[0],
                            device=device,
                            dtype=torch.uint8))
                    pos_inds.append(
                        torch.zeros(
                            res.neg_bboxes.shape[0],
                            device=device,
                            dtype=torch.uint8))
                pos_inds = torch.cat(pos_inds)
                mask_feats = bbox_feats[pos_inds]
            mask_pred = self.mask_head(mask_feats)
            mask_targets = self.mask_head.get_target(
                    sampling_results, gt_masks, self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(
                    mask_pred, mask_targets, pos_labels)
            losses.update(loss_mask)
        # ***************************************
        # PANOPTIC HEAD - Only for BATCH SIZE: 1
        # ***************************************
        if hasattr(self.train_cfg, 'loss_pano_weight'):
            # extract gt rois for the panoptic head
            gt_rois = bbox2roi(gt_bboxes) # [#bbox, 5]
            cls_idx = gt_labels[0] # [#bbox] / batch_size must be 1
            # fcn_score # [1,20,200,400]
            # compute mask logits with gt rois
            mask_feats = self.mask_roi_extractor(
                    x[:self.mask_roi_extractor.num_inputs], gt_rois)
                    # [#bbox,256,14,14]
            mask_score = self.mask_head(mask_feats) # [#bbox,#things+1,28,28], #things+1=9
            nobj, _, H, W = mask_score.shape
            # gather the mask logits of each box's gt class: [#bbox,1,28,28]
            mask_score = mask_score.gather(
                    1, cls_idx.view(-1, 1, 1, 1).expand(-1, -1, H, W))
            # compute panoptic logits
            seg_stuff_logits, seg_inst_logits = self.seg_term(
                    cls_idx, fcn_score, gt_rois)
            mask_logits = self.mask_term(
                    mask_score, gt_rois, cls_idx, fcn_score)
            # panoptic_logits: [1,#stuff+#bbox,200,400]
            panoptic_logits = torch.cat(
                    [seg_stuff_logits, (seg_inst_logits + mask_logits)],
                    dim=1)
            # generate gt for the panoptic head: gt masks downsampled 4x
            gt_masks_4x = gt_masks[0][:, ::4, ::4]
            with torch.no_grad():
                # gt_semantic_seg_Nx[0] [1,200,400], 
                # gt_masks_4x [#bbox,200,400]
                panoptic_gt = self.mask_matching(
                        gt_semantic_seg_Nx[0], gt_masks_4x)
                panoptic_gt = panoptic_gt.long()

            panoptic_loss = F.cross_entropy(
                    panoptic_logits, panoptic_gt, ignore_index=255)
            pano_loss = {'loss_pano': panoptic_loss * self.train_cfg.loss_pano_weight}
            losses.update(pano_loss)

        return losses
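
The panoptic block above concatenates the FCN's stuff logits with per-instance logits into a single map and supervises it with pixel-wise cross-entropy, ignoring pixels labeled 255. A reduced sketch of that fusion and loss (names are illustrative; shapes follow the inline comments in the code):

import torch
import torch.nn.functional as F

def panoptic_loss_sketch(seg_stuff_logits, seg_inst_logits, mask_logits,
                         panoptic_gt, loss_weight=1.0):
    # seg_stuff_logits: (1, #stuff, H, W); inst/mask logits: (1, #bbox, H, W)
    panoptic_logits = torch.cat(
        [seg_stuff_logits, seg_inst_logits + mask_logits], dim=1)
    # panoptic_gt: (1, H, W) long labels, with 255 marking ignored pixels
    return loss_weight * F.cross_entropy(
        panoptic_logits, panoptic_gt, ignore_index=255)
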
Example n. 17
    def forward_train(self,
                      imgs,
                      img_meta,
                      imgs_2,
                      img_meta_2,
                      gt_bboxes,
                      gt_bboxes_2,
                      gt_labels,
                      gt_labels_2,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_masks_2=None,
                      proposals=None):
        # self.print_iterations()
        # make sure the channel size is 3 for both inputs
        assert imgs.shape[1] == 3 and imgs_2.shape[1] == 3
        x = self.extract_feat(imgs)
        x_2 = self.extract_feat(imgs_2)

        losses = dict()

        # RPN forward and loss
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_outs_2 = self.rpn_head(x_2)

            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_loss_inputs_2 = rpn_outs_2 + (gt_bboxes_2, img_meta,
                                              self.train_cfg.rpn)

            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                            gt_bboxes_ignore=gt_bboxes_ignore,
                                            iteration=self.iteration)
            rpn_losses_2 = self.rpn_head.loss(
                *rpn_loss_inputs_2,
                gt_bboxes_ignore=gt_bboxes_ignore,
                iteration=self.iteration,
                img_meta_2=img_meta_2)

            losses.update(rpn_losses)
            losses.update(rpn_losses_2)

            proposal_inputs = rpn_outs + (img_meta,
                                          self.train_cfg.rpn_proposal)
            proposal_inputs_2 = rpn_outs_2 + (img_meta,
                                              self.train_cfg.rpn_proposal)

            proposal_list, anchors = self.rpn_head.get_bboxes(*proposal_inputs)
            proposal_list_2, anchors_2 = self.rpn_head.get_bboxes(
                *proposal_inputs_2, img_meta_2=img_meta_2)
            # self.rpn_head.visualize_anchor_boxes(imgs, rpn_outs[0], img_meta, slice_num=45, shuffle=True) # debug only
            # self.visualize_proposals(imgs, proposal_list, gt_bboxes, img_meta, slice_num=None, isProposal=True) #debug only
            # self.visualize_proposals(imgs, anchors, gt_bboxes, img_meta, slice_num=None, isProposal=False) #debug only
            # self.visualize_gt_bboxes(imgs, gt_bboxes, img_meta) #debug only
            # breakpoint()
            # self.visualize_gt_bboxes(imgs_2, gt_bboxes_2, img_meta_2) #debug only
            # breakpoint()
            # self.visualize_gt_bboxes_masks(imgs, gt_bboxes, img_meta, gt_masks) # debug only
        else:
            proposal_list = proposals

        # assign gts and sample proposals
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                         context=self)
            num_imgs = imgs.size(0)
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []

            for i in range(num_imgs):
                gt_bboxes_cur_pat = gt_bboxes[i]
                gt_bboxes_ignore_cur_pat = gt_bboxes_ignore[i]
                gt_labels_cur_pat = gt_labels[i]

                assign_result = bbox_assigner.assign(proposal_list[i],
                                                     gt_bboxes_cur_pat,
                                                     gt_bboxes_ignore_cur_pat,
                                                     gt_labels_cur_pat)
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes_cur_pat,
                    gt_labels_cur_pat,
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

            bbox_assigner_2 = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler_2 = build_sampler(self.train_cfg.rcnn.sampler,
                                           context=self)
            num_imgs_2 = imgs_2.size(0)
            gt_bboxes_ignore_2 = [None for _ in range(num_imgs_2)]
            sampling_results_2 = []

            for i in range(num_imgs_2):
                gt_bboxes_cur_pat_2 = gt_bboxes_2[i]
                gt_bboxes_ignore_cur_pat_2 = gt_bboxes_ignore_2[i]
                gt_labels_cur_pat_2 = gt_labels_2[i]

                assign_result_2 = bbox_assigner_2.assign(
                    proposal_list_2[i], gt_bboxes_cur_pat_2,
                    gt_bboxes_ignore_cur_pat_2, gt_labels_cur_pat_2)
                sampling_result_2 = bbox_sampler_2.sample(
                    assign_result_2,
                    proposal_list_2[i],
                    gt_bboxes_cur_pat_2,
                    gt_labels_cur_pat_2,
                    feats=[lvl_feat[i][None] for lvl_feat in x_2])
                sampling_results_2.append(sampling_result_2)

        # bbox head forward and loss
        if self.with_bbox:
            rois = bbox2roi3D([res.bboxes for res in sampling_results])
            rois_2 = bbox2roi3D([res.bboxes for res in sampling_results_2])

            # TODO: a more flexible way to decide which feature maps to use
            bbox_feats = self.bbox_roi_extractor(
                x[:self.bbox_roi_extractor.num_inputs], rois)
            bbox_feats_2 = self.bbox_roi_extractor(
                x_2[:self.bbox_roi_extractor.num_inputs], rois_2)

            cls_score, bbox_pred = self.bbox_head(bbox_feats)
            cls_score_2, bbox_pred_2 = self.bbox_head(bbox_feats_2)

            cls_score = torch.cat((cls_score, cls_score_2), 0)
            bbox_pred = torch.cat((bbox_pred, bbox_pred_2), 0)

            bbox_targets = self.bbox_head.get_target(sampling_results,
                                                     gt_bboxes, gt_labels,
                                                     self.train_cfg.rcnn)
            bbox_targets_2 = self.bbox_head.get_target(sampling_results_2,
                                                       gt_bboxes_2,
                                                       gt_labels_2,
                                                       self.train_cfg.rcnn)
            bbox_targets_combined = []
            for bbox_target, bbox_target_2 in zip(bbox_targets,
                                                  bbox_targets_2):
                bbox_targets_combined.append(
                    torch.cat((bbox_target, bbox_target_2), 0))
            bbox_targets_combined = tuple(bbox_targets_combined)

            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                            *bbox_targets_combined)

            losses.update(loss_bbox)

            # prepare upscaled data for refinement head
            upscaled_factor = (img_meta_2[0]['ori_shape'][0] /
                               img_meta[0]['ori_shape'][0])
            # convert parameterized adjustments to actual bounding box coordinates
            pred_bboxes_2 = self.bbox_head.convert_adjustments_to_bboxes(
                rois_2, bbox_pred_2,
                img_meta_2[0]['img_shape'])[:, 6:].cpu().detach().numpy()
            pred_bboxes_2 = pred_bboxes_2 / upscaled_factor

            pred_cls_score_2 = cls_score_2[:, 1, None].cpu().detach().numpy()
            pred_bboxes_2 = np.concatenate((pred_bboxes_2, pred_cls_score_2),
                                           axis=1)
            pred_bboxes_2 = [torch.from_numpy(pred_bboxes_2).cuda()]
            bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
            bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                         context=self)
            num_imgs = imgs.size(0)
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results_refinement = []
            for i in range(num_imgs):
                gt_bboxes_cur_pat = gt_bboxes[i]
                gt_bboxes_ignore_cur_pat = gt_bboxes_ignore[i]
                gt_labels_cur_pat = gt_labels[i]

                assign_result = bbox_assigner.assign(pred_bboxes_2[i],
                                                     gt_bboxes_cur_pat,
                                                     gt_bboxes_ignore_cur_pat,
                                                     gt_labels_cur_pat)
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    pred_bboxes_2[i],
                    gt_bboxes_cur_pat,
                    gt_labels_cur_pat,
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results_refinement.append(sampling_result)
            rois_refinement = bbox2roi3D(
                [res.bboxes for res in sampling_results_refinement])
            bbox_feats_refinement = self.bbox_roi_extractor_refinement(
                x[:self.bbox_roi_extractor_refinement.num_inputs],
                rois_refinement)
            # training refinement head
            refined_bbox_pred = self.refinement_head(bbox_feats_refinement)
            bbox_targets_refinement = self.refinement_head.get_target(
                sampling_results_refinement, gt_bboxes, gt_labels,
                self.train_cfg.rcnn)
            loss_refinement = self.refinement_head.loss(
                refined_bbox_pred, *bbox_targets_refinement)
            losses.update(loss_refinement)

        # mask head forward and loss
        if self.with_mask:
            pos_rois = bbox2roi3D([res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            mask_pred = self.mask_head(mask_feats)
            mask_targets = self.mask_head.get_target(sampling_results,
                                                     gt_masks,
                                                     self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])

            pos_rois_refined = bbox2roi3D(
                [res.pos_bboxes for res in sampling_results_refinement])
            mask_feats_refined = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois_refined)
            mask_pred_refined = self.mask_head(mask_feats_refined)
            mask_targets_refined = self.mask_head.get_target(
                sampling_results_refinement, gt_masks, self.train_cfg.rcnn)
            pos_labels_refined = torch.cat(
                [res.pos_gt_labels for res in sampling_results_refinement])

            mask_pred_combined = torch.cat((mask_pred, mask_pred_refined))
            mask_targets_combined = torch.cat(
                (mask_targets, mask_targets_refined))
            pos_labels_combined = torch.cat((pos_labels, pos_labels_refined))
            loss_mask = self.mask_head.loss(mask_pred_combined,
                                            mask_targets_combined,
                                            pos_labels_combined)
            losses.update(loss_mask)

        # self.save_losses_plt(losses) #debug only...
        self.iteration += 1
        return losses
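
The two-resolution scheme above runs the same bbox head on both inputs and simply concatenates predictions and per-element targets along the batch dimension before computing a single loss. Compressed into a standalone sketch (names are illustrative):

import torch

def merge_two_scale_outputs(cls_score, bbox_pred, cls_score_2, bbox_pred_2,
                            bbox_targets, bbox_targets_2):
    cls_score = torch.cat((cls_score, cls_score_2), dim=0)
    bbox_pred = torch.cat((bbox_pred, bbox_pred_2), dim=0)
    targets = tuple(
        torch.cat((t, t_2), dim=0)
        for t, t_2 in zip(bbox_targets, bbox_targets_2))
    return cls_score, bbox_pred, targets
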
Example n. 18
    def __init__(self,
                 in_channels,
                 feat_channels,
                 out_channels,
                 num_things_classes=80,
                 num_stuff_classes=53,
                 num_queries=100,
                 pixel_decoder=None,
                 enforce_decoder_input_project=False,
                 transformer_decoder=None,
                 positional_encoding=None,
                 loss_cls=dict(type='CrossEntropyLoss',
                               use_sigmoid=False,
                               loss_weight=1.0,
                               class_weight=[1.0] * 133 + [0.1]),
                 loss_mask=dict(type='FocalLoss',
                                use_sigmoid=True,
                                gamma=2.0,
                                alpha=0.25,
                                loss_weight=20.0),
                 loss_dice=dict(type='DiceLoss',
                                use_sigmoid=True,
                                activate=True,
                                naive_dice=True,
                                loss_weight=1.0),
                 train_cfg=None,
                 test_cfg=None,
                 init_cfg=None,
                 **kwargs):
        super(AnchorFreeHead, self).__init__(init_cfg)
        self.num_things_classes = num_things_classes
        self.num_stuff_classes = num_stuff_classes
        self.num_classes = self.num_things_classes + self.num_stuff_classes
        self.num_queries = num_queries

        pixel_decoder.update(in_channels=in_channels,
                             feat_channels=feat_channels,
                             out_channels=out_channels)
        self.pixel_decoder = build_plugin_layer(pixel_decoder)[1]
        self.transformer_decoder = build_transformer_layer_sequence(
            transformer_decoder)
        self.decoder_embed_dims = self.transformer_decoder.embed_dims
        pixel_decoder_type = pixel_decoder.get('type')
        if pixel_decoder_type == 'PixelDecoder' and (
                self.decoder_embed_dims != in_channels[-1]
                or enforce_decoder_input_project):
            self.decoder_input_proj = Conv2d(in_channels[-1],
                                             self.decoder_embed_dims,
                                             kernel_size=1)
        else:
            self.decoder_input_proj = nn.Identity()
        self.decoder_pe = build_positional_encoding(positional_encoding)
        self.query_embed = nn.Embedding(self.num_queries, out_channels)

        self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
        self.mask_embed = nn.Sequential(
            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
            nn.Linear(feat_channels, out_channels))

        self.test_cfg = test_cfg
        self.train_cfg = train_cfg
        if train_cfg:
            self.assigner = build_assigner(train_cfg.assigner)
            self.sampler = build_sampler(train_cfg.sampler, context=self)

        self.class_weight = loss_cls.class_weight
        self.loss_cls = build_loss(loss_cls)
        self.loss_mask = build_loss(loss_mask)
        self.loss_dice = build_loss(loss_dice)
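
The cls_embed linear and the three-layer mask_embed MLP defined above are how MaskFormer-style heads turn decoder queries into predictions: each query yields a class distribution plus an embedding that is dotted with the pixel decoder's per-pixel features to produce one mask per query. A hedged sketch of that final step (argument names are illustrative, and the einsum follows the usual MaskFormer formulation rather than this exact class):

import torch

def queries_to_predictions(query_feats, pixel_feats, cls_embed, mask_embed):
    # query_feats: (B, num_queries, feat_channels) from the transformer decoder
    # pixel_feats: (B, out_channels, H, W) from the pixel decoder
    cls_pred = cls_embed(query_feats)   # (B, num_queries, num_classes + 1)
    mask_emb = mask_embed(query_feats)  # (B, num_queries, out_channels)
    # one mask per query: dot product between query embedding and each pixel
    mask_pred = torch.einsum('bqc,bchw->bqhw', mask_emb, pixel_feats)
    return cls_pred, mask_pred
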
Example n. 19
    def __init__(self,
                 num_classes,
                 in_channels,
                 stacked_convs=4,
                 feat_channels=256,
                 approx_anchor_generator=dict(
                     type='AnchorGenerator',
                     octave_base_scale=4,
                     scales_per_octave=3,
                     ratios=[0.5, 1.0, 2.0],
                     strides=[8, 16, 32, 64, 128]),
                 square_anchor_generator=dict(
                     type='AnchorGenerator',
                     ratios=[1.0],
                     scales=[4],
                     strides=[8, 16, 32, 64, 128]),
                 conv_cfg=None,
                 norm_cfg=None,
                 bbox_coder=dict(
                     type='BucketingBBoxCoder',
                     num_buckets=14,
                     scale_factor=3.0),
                 reg_decoded_bbox=False,
                 background_label=None,
                 train_cfg=None,
                 test_cfg=None,
                 loss_cls=dict(
                     type='FocalLoss',
                     use_sigmoid=True,
                     gamma=2.0,
                     alpha=0.25,
                     loss_weight=1.0),
                 loss_bbox_cls=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=True,
                     loss_weight=1.5),
                 loss_bbox_reg=dict(
                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)):
        super(SABLRetinaHead, self).__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.feat_channels = feat_channels
        self.num_buckets = bbox_coder['num_buckets']
        self.side_num = int(np.ceil(self.num_buckets / 2))

        assert (approx_anchor_generator['octave_base_scale'] ==
                square_anchor_generator['scales'][0])
        assert (approx_anchor_generator['strides'] ==
                square_anchor_generator['strides'])

        self.approx_anchor_generator = build_anchor_generator(
            approx_anchor_generator)
        self.square_anchor_generator = build_anchor_generator(
            square_anchor_generator)
        self.approxs_per_octave = (
            self.approx_anchor_generator.num_base_anchors[0])

        # one anchor per location
        self.num_anchors = 1
        self.stacked_convs = stacked_convs
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg

        self.reg_decoded_bbox = reg_decoded_bbox
        self.background_label = (
            num_classes if background_label is None else background_label)

        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        self.sampling = loss_cls['type'] not in [
            'FocalLoss', 'GHMC', 'QualityFocalLoss'
        ]
        if self.use_sigmoid_cls:
            self.cls_out_channels = num_classes
        else:
            self.cls_out_channels = num_classes + 1

        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox_cls = build_loss(loss_bbox_cls)
        self.loss_bbox_reg = build_loss(loss_bbox_reg)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # use PseudoSampler when sampling is False
            if self.sampling and hasattr(self.train_cfg, 'sampler'):
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)

        self.fp16_enabled = False
        self._init_layers()
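
The side_num above is just the per-side half of num_buckets, rounded up; with the default BucketingBBoxCoder setting the arithmetic works out as:

import numpy as np

num_buckets = 14
side_num = int(np.ceil(num_buckets / 2))
assert side_num == 7  # bucket logits predicted per box side
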
Example n. 20
    def __init__(self,
                 num_classes,
                 in_channels,
                 num_fcs=2,
                 transformer=dict(
                     type='Transformer',
                     embed_dims=256,
                     num_heads=8,
                     num_encoder_layers=6,
                     num_decoder_layers=6,
                     feedforward_channels=2048,
                     dropout=0.1,
                     act_cfg=dict(type='ReLU', inplace=True),
                     norm_cfg=dict(type='LN'),
                     num_fcs=2,
                     pre_norm=False,
                     return_intermediate_dec=True),
                 positional_encoding=dict(
                     type='SinePositionalEncoding',
                     num_feats=128,
                     normalize=True),
                 loss_cls=dict(
                     type='CrossEntropyLoss',
                     bg_cls_weight=0.1,
                     use_sigmoid=False,
                     loss_weight=1.0,
                     class_weight=1.0),
                 loss_bbox=dict(type='L1Loss', loss_weight=5.0),
                 loss_iou=dict(type='GIoULoss', loss_weight=2.0),
                 train_cfg=dict(
                     assigner=dict(
                         type='HungarianAssigner',
                         cls_weight=1.,
                         bbox_weight=5.,
                         iou_weight=2.,
                         iou_calculator=dict(type='BboxOverlaps2D'),
                         iou_mode='giou')),
                 test_cfg=dict(max_per_img=100),
                 **kwargs):
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since it brings inconvenience when the initialization of
        # `AnchorFreeHead` is called.
        super(AnchorFreeHead, self).__init__()
        use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        assert not use_sigmoid_cls, 'setting use_sigmoid_cls as True is ' \
            'not supported in DETR, since background is needed for the ' \
            'matching process.'
        assert 'embed_dims' in transformer \
            and 'num_feats' in positional_encoding
        num_feats = positional_encoding['num_feats']
        embed_dims = transformer['embed_dims']
        assert num_feats * 2 == embed_dims, 'embed_dims should' \
            f' be exactly 2 times of num_feats. Found {embed_dims}' \
            f' and {num_feats}.'
        assert test_cfg is not None and 'max_per_img' in test_cfg

        class_weight = loss_cls.get('class_weight', None)
        if class_weight is not None:
            assert isinstance(class_weight, float), 'Expected ' \
                'class_weight to have type float. Found ' \
                f'{type(class_weight)}.'
            # NOTE following the official DETR repo, bg_cls_weight means
            # relative classification weight of the no-object class.
            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
            assert isinstance(bg_cls_weight, float), 'Expected ' \
                'bg_cls_weight to have type float. Found ' \
                f'{type(bg_cls_weight)}.'
            class_weight = torch.ones(num_classes + 1) * class_weight
            # set the background class as the last index
            class_weight[num_classes] = bg_cls_weight
            loss_cls.update({'class_weight': class_weight})
            if 'bg_cls_weight' in loss_cls:
                loss_cls.pop('bg_cls_weight')
            self.bg_cls_weight = bg_cls_weight

        if train_cfg:
            assert 'assigner' in train_cfg, 'assigner should be provided '\
                'when train_cfg is set.'
            assigner = train_cfg['assigner']
            assert loss_cls['loss_weight'] == assigner['cls_weight'], \
                'The classification weight for loss and matcher should be' \
                'exactly the same.'
            assert loss_bbox['loss_weight'] == assigner['bbox_weight'], \
                'The regression L1 weight for loss and matcher should be' \
                'exactly the same.'
            assert loss_iou['loss_weight'] == assigner['iou_weight'], \
                'The regression iou weight for loss and matcher should be' \
                'exactly the same.'
            self.assigner = build_assigner(assigner)
            # DETR sampling=False, so use PseudoSampler
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.num_classes = num_classes
        self.cls_out_channels = num_classes + 1
        self.in_channels = in_channels
        self.num_fcs = num_fcs
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.use_sigmoid_cls = use_sigmoid_cls
        self.embed_dims = embed_dims
        self.num_query = test_cfg['max_per_img']
        self.fp16_enabled = False
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)
        self.loss_iou = build_loss(loss_iou)
        self.act_cfg = transformer.get('act_cfg',
                                       dict(type='ReLU', inplace=True))
        self.activate = build_activation_layer(self.act_cfg)
        self.positional_encoding = build_positional_encoding(
            positional_encoding)
        self.transformer = build_transformer(transformer)
        self._init_layers()
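
The class_weight branch above expands a scalar weight into a per-class vector and then replaces the entry for the background (no-object) class, which DETR keeps at index num_classes. As a standalone sketch (function name illustrative):

import torch

def build_detr_class_weight(num_classes, class_weight=1.0, bg_cls_weight=0.1):
    weight = torch.ones(num_classes + 1) * class_weight
    weight[num_classes] = bg_cls_weight  # background sits at the last index
    return weight

# e.g. for COCO: build_detr_class_weight(80) -> 81 entries, the last one 0.1
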
Example n. 21
    def __init__(self,
                 num_classes,
                 in_channels,
                 feat_channels=256,
                 stacked_convs=2,
                 strides=[8, 16, 32],
                 use_depthwise=False,
                 dcn_on_last_conv=False,
                 conv_bias='auto',
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
                 act_cfg=dict(type='Swish'),
                 loss_cls=dict(type='CrossEntropyLoss',
                               use_sigmoid=True,
                               reduction='sum',
                               loss_weight=1.0),
                 loss_bbox=dict(type='IoULoss',
                                mode='square',
                                eps=1e-16,
                                reduction='sum',
                                loss_weight=5.0),
                 loss_obj=dict(type='CrossEntropyLoss',
                               use_sigmoid=True,
                               reduction='sum',
                               loss_weight=1.0),
                 loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
                 train_cfg=None,
                 test_cfg=None,
                 init_cfg=dict(type='Kaiming',
                               layer='Conv2d',
                               a=math.sqrt(5),
                               distribution='uniform',
                               mode='fan_in',
                               nonlinearity='leaky_relu')):

        super().__init__(init_cfg=init_cfg)
        self.num_classes = num_classes
        self.cls_out_channels = num_classes
        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.stacked_convs = stacked_convs
        self.strides = strides
        self.use_depthwise = use_depthwise
        self.dcn_on_last_conv = dcn_on_last_conv
        assert conv_bias == 'auto' or isinstance(conv_bias, bool)
        self.conv_bias = conv_bias
        self.use_sigmoid_cls = True

        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg

        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)
        self.loss_obj = build_loss(loss_obj)

        self.use_l1 = False  # This flag will be modified by hooks.
        self.loss_l1 = build_loss(loss_l1)

        self.prior_generator = MlvlPointGenerator(strides, offset=0)

        self.test_cfg = test_cfg
        self.train_cfg = train_cfg

        self.sampling = False
        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # sampling=False so use PseudoSampler
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)

        self.fp16_enabled = False
        self._init_layers()
Example n. 22
    def __init__(self,
                 num_classes,
                 in_channels,
                 feat_channels=256,
                 anchor_generator=dict(type='AnchorGenerator',
                                       scales=[8, 16, 32],
                                       ratios=[0.5, 1.0, 2.0],
                                       strides=[4, 8, 16, 32, 64]),
                 bbox_coder=dict(type='DeltaXYWHBBoxCoder',
                                 target_means=(.0, .0, .0, .0),
                                 target_stds=(1.0, 1.0, 1.0, 1.0)),
                 reg_decoded_bbox=False,
                 background_label=None,
                 loss_cls=dict(type='CrossEntropyLoss',
                               use_sigmoid=True,
                               loss_weight=1.0),
                 loss_bbox=dict(type='SmoothL1Loss',
                                beta=1.0 / 9.0,
                                loss_weight=1.0),
                 train_cfg=None,
                 test_cfg=None):
        super(AnchorHead, self).__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.feat_channels = feat_channels
        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        # TODO better way to determine whether sample or not
        self.sampling = loss_cls['type'] not in [
            'FocalLoss', 'GHMC', 'QualityFocalLoss'
        ]
        if self.use_sigmoid_cls:
            self.cls_out_channels = num_classes
        else:
            self.cls_out_channels = num_classes + 1

        if self.cls_out_channels <= 0:
            raise ValueError(f'num_classes={num_classes} is too small')
        self.reg_decoded_bbox = reg_decoded_bbox

        self.background_label = (num_classes if background_label is None else
                                 background_label)
        # background_label should be either 0 or num_classes
        assert (self.background_label == 0
                or self.background_label == num_classes)

        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # use PseudoSampler when sampling is False
            if self.sampling and hasattr(self.train_cfg, 'sampler'):
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.fp16_enabled = False

        self.anchor_generator = build_anchor_generator(anchor_generator)
        # The number of base anchors per level is usually the same,
        # except for SSD detectors.
        self.num_anchors = self.anchor_generator.num_base_anchors[0]
        self._init_layers()
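The use_sigmoid_cls branch above encodes the usual channel-count rule for dense heads: a sigmoid-based loss predicts one independent logit per foreground class, while a softmax loss reserves one extra channel for the background class. The rule in isolation (a minimal sketch, not part of the mmdet API):

def classifier_out_channels(num_classes, use_sigmoid):
    # Sigmoid heads emit one logit per foreground class; softmax heads
    # need one additional channel for the background class.
    return num_classes if use_sigmoid else num_classes + 1

assert classifier_out_channels(80, use_sigmoid=True) == 80
assert classifier_out_channels(80, use_sigmoid=False) == 81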
Example n. 23
    def __init__(self,
                 num_classes,
                 in_channels,
                 out_channels=(1024, 512, 256),
                 anchor_generator=dict(type='YOLOAnchorGenerator',
                                       base_sizes=[[(116, 90), (156, 198),
                                                    (373, 326)],
                                                   [(30, 61), (62, 45),
                                                    (59, 119)],
                                                   [(10, 13), (16, 30),
                                                    (33, 23)]],
                                       strides=[32, 16, 8]),
                 bbox_coder=dict(type='YOLOBBoxCoder'),
                 featmap_strides=[32, 16, 8],
                 one_hot_smoother=0.,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', requires_grad=True),
                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
                 loss_cls=dict(type='CrossEntropyLoss',
                               use_sigmoid=True,
                               loss_weight=1.0),
                 loss_conf=dict(type='CrossEntropyLoss',
                                use_sigmoid=True,
                                loss_weight=1.0),
                 loss_xy=dict(type='CrossEntropyLoss',
                              use_sigmoid=True,
                              loss_weight=1.0),
                 loss_wh=dict(type='MSELoss', loss_weight=1.0),
                 train_cfg=None,
                 test_cfg=None):
        super(YOLOV3Head, self).__init__()
        # Check params
        assert (len(in_channels) == len(out_channels) == len(featmap_strides))

        self.num_classes = num_classes
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.featmap_strides = featmap_strides
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            if hasattr(self.train_cfg, 'sampler'):
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)

        self.one_hot_smoother = one_hot_smoother

        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg

        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.anchor_generator = build_anchor_generator(anchor_generator)

        self.loss_cls = build_loss(loss_cls)
        self.loss_conf = build_loss(loss_conf)
        self.loss_xy = build_loss(loss_xy)
        self.loss_wh = build_loss(loss_wh)
        # The number of base anchors per level is usually the same,
        # except for SSD detectors.
        self.num_anchors = self.anchor_generator.num_base_anchors[0]
        assert len(
            self.anchor_generator.num_base_anchors) == len(featmap_strides)
        self._init_layers()
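With the default YOLOAnchorGenerator config above, each of the three levels carries three base boxes, so num_base_anchors is [3, 3, 3], self.num_anchors is 3, and the final assert matches the three featmap_strides. A quick check (assuming build_anchor_generator is importable from mmdet.core, as in mmdet 2.x):

from mmdet.core import build_anchor_generator

gen = build_anchor_generator(
    dict(type='YOLOAnchorGenerator',
         base_sizes=[[(116, 90), (156, 198), (373, 326)],
                     [(30, 61), (62, 45), (59, 119)],
                     [(10, 13), (16, 30), (33, 23)]],
         strides=[32, 16, 8]))
# One entry per feature level; every level has three anchors here.
assert gen.num_base_anchors == [3, 3, 3]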
Example n. 24
    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None):
        """
        Args:
            img (Tensor): of shape (N, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.

            img_metas (list[dict]): list of image info dicts where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                `mmdet/datasets/pipelines/formatting.py:Collect`.

            gt_bboxes (list[Tensor]): each item is the ground-truth boxes for
                one image in [tl_x, tl_y, br_x, br_y] format.

            gt_labels (list[Tensor]): class indices corresponding to each box.

            gt_bboxes_ignore (None | list[Tensor]): specifies which bounding
                boxes can be ignored when computing the loss.

            gt_masks (None | Tensor): true segmentation masks for each box,
                used if the architecture supports a segmentation task.

            proposals: override RPN proposals with custom proposals. Used when
                `with_rpn` is False.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        x = self.extract_feat(img)

        losses = dict()

        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                            gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_cfg = self.train_cfg.get('rpn_proposal',
                                              self.test_cfg.rpn)
            proposal_inputs = rpn_outs + (img_metas, proposal_cfg)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        for i in range(self.num_stages):
            self.current_stage = i
            rcnn_train_cfg = self.train_cfg.rcnn[i]
            lw = self.train_cfg.stage_loss_weights[i]

            # assign gts and sample proposals
            sampling_results = []
            if self.with_bbox or self.with_mask:
                bbox_assigner = build_assigner(rcnn_train_cfg.assigner)
                bbox_sampler = build_sampler(rcnn_train_cfg.sampler,
                                             context=self)
                num_imgs = img.size(0)
                if gt_bboxes_ignore is None:
                    gt_bboxes_ignore = [None for _ in range(num_imgs)]

                for j in range(num_imgs):
                    assign_result = bbox_assigner.assign(
                        proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j],
                        gt_labels[j])

                    sampling_result = bbox_sampler.sample(
                        assign_result,
                        proposal_list[j],
                        gt_bboxes[j],
                        gt_labels[j],
                        feats=[lvl_feat[j][None] for lvl_feat in x])
                    sampling_results.append(sampling_result)

            # bbox head forward and loss
            bbox_roi_extractor = self.bbox_roi_extractor[i]
            bbox_head = self.bbox_head[i]

            rois = bbox2roi([res.bboxes for res in sampling_results])

            if len(rois) == 0:
                # If no RoIs were sampled (no proposals and no ground-truth
                # boxes), the bbox and mask losses cannot be computed
                continue

            bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
                                            rois)
            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)
            cls_score, bbox_pred = bbox_head(bbox_feats)

            bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes,
                                                gt_labels, rcnn_train_cfg)
            loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
            for name, value in loss_bbox.items():
                losses['s{}.{}'.format(
                    i, name)] = (value * lw if 'loss' in name else value)

            # mask head forward and loss
            if self.with_mask:
                if not self.share_roi_extractor:
                    mask_roi_extractor = self.mask_roi_extractor[i]
                    pos_rois = bbox2roi(
                        [res.pos_bboxes for res in sampling_results])
                    mask_feats = mask_roi_extractor(
                        x[:mask_roi_extractor.num_inputs], pos_rois)
                    if self.with_shared_head:
                        mask_feats = self.shared_head(mask_feats)
                else:
                    # reuse positive bbox feats
                    pos_inds = []
                    device = bbox_feats.device
                    for res in sampling_results:
                        pos_inds.append(
                            torch.ones(res.pos_bboxes.shape[0],
                                       device=device,
                                       dtype=torch.uint8))
                        pos_inds.append(
                            torch.zeros(res.neg_bboxes.shape[0],
                                        device=device,
                                        dtype=torch.uint8))
                    pos_inds = torch.cat(pos_inds)
                    mask_feats = bbox_feats[pos_inds.type(torch.bool)]
                mask_head = self.mask_head[i]
                mask_pred = mask_head(mask_feats)
                mask_targets = mask_head.get_target(sampling_results, gt_masks,
                                                    rcnn_train_cfg)
                pos_labels = torch.cat(
                    [res.pos_gt_labels for res in sampling_results])
                loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels)
                for name, value in loss_mask.items():
                    losses['s{}.{}'.format(
                        i, name)] = (value * lw if 'loss' in name else value)

            # refine bboxes
            if i < self.num_stages - 1:
                pos_is_gts = [res.pos_is_gt for res in sampling_results]
                roi_labels = bbox_targets[0]  # bbox_targets is a tuple
                with torch.no_grad():
                    proposal_list = bbox_head.refine_bboxes(
                        rois, roi_labels, bbox_pred, pos_is_gts, img_metas)

        return losses
Example n. 25
    def forward_train(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      gt_semantic_seg=None,
                      proposals=None):
        x = self.extract_feat(img)

        losses = dict()

        # RPN part, the same as normal two-stage detectors
        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                            gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_cfg = self.train_cfg.get('rpn_proposal',
                                              self.test_cfg.rpn)
            proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        # semantic segmentation part
        # 2 outputs: segmentation prediction and embedded features
        if self.with_semantic:
            semantic_pred, semantic_feat = self.semantic_head(x)
            loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_seg)
            losses['loss_semantic_seg'] = loss_seg
        else:
            semantic_feat = None

        for i in range(self.num_stages):
            self.current_stage = i
            rcnn_train_cfg = self.train_cfg.rcnn[i]
            lw = self.train_cfg.stage_loss_weights[i]

            # assign gts and sample proposals
            sampling_results = []
            bbox_assigner = build_assigner(rcnn_train_cfg.assigner)
            bbox_sampler = build_sampler(rcnn_train_cfg.sampler, context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]

            for j in range(num_imgs):
                assign_result = bbox_assigner.assign(proposal_list[j],
                                                     gt_bboxes[j],
                                                     gt_bboxes_ignore[j],
                                                     gt_labels[j])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[j],
                    gt_bboxes[j],
                    gt_labels[j],
                    feats=[lvl_feat[j][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

            # bbox head forward and loss
            loss_bbox, rois, bbox_targets, bbox_pred = \
                self._bbox_forward_train(
                    i, x, sampling_results, gt_bboxes, gt_labels,
                    rcnn_train_cfg, semantic_feat)
            roi_labels = bbox_targets[0]

            for name, value in loss_bbox.items():
                losses['s{}.{}'.format(
                    i, name)] = (value * lw if 'loss' in name else value)

            # mask head forward and loss
            if self.with_mask:
                # interleaved execution: use regressed bboxes by the box branch
                # to train the mask branch
                if self.interleaved:
                    pos_is_gts = [res.pos_is_gt for res in sampling_results]
                    with torch.no_grad():
                        proposal_list = self.bbox_head[i].refine_bboxes(
                            rois, roi_labels, bbox_pred, pos_is_gts, img_meta)
                        # re-assign and re-sample from the refined proposals;
                        # the number of RoIs (e.g. 512) stays the same
                        sampling_results = []
                        for j in range(num_imgs):
                            assign_result = bbox_assigner.assign(
                                proposal_list[j], gt_bboxes[j],
                                gt_bboxes_ignore[j], gt_labels[j])
                            sampling_result = bbox_sampler.sample(
                                assign_result,
                                proposal_list[j],
                                gt_bboxes[j],
                                gt_labels[j],
                                feats=[lvl_feat[j][None] for lvl_feat in x])
                            sampling_results.append(sampling_result)
                loss_mask = self._mask_forward_train(i, x, sampling_results,
                                                     gt_masks, rcnn_train_cfg,
                                                     semantic_feat)
                for name, value in loss_mask.items():
                    losses['s{}.{}'.format(
                        i, name)] = (value * lw if 'loss' in name else value)

            # refine bboxes (same as Cascade R-CNN)
            if i < self.num_stages - 1 and not self.interleaved:
                pos_is_gts = [res.pos_is_gt for res in sampling_results]
                with torch.no_grad():
                    proposal_list = self.bbox_head[i].refine_bboxes(
                        rois, roi_labels, bbox_pred, pos_is_gts, img_meta)

        return losses
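Both cascade-style training loops above store each stage's losses under an s{stage}. prefix and multiply only the entries whose name contains 'loss' by the stage weight, so plain metrics (e.g. accuracy) are logged unweighted. The renaming rule in isolation (a sketch with illustrative values):

import torch

def weight_stage_losses(stage, lw, loss_dict):
    # Prefix each key with the stage index; scale only loss terms by the
    # stage weight so metric entries pass through unchanged.
    return {
        's{}.{}'.format(stage, name): value * lw if 'loss' in name else value
        for name, value in loss_dict.items()
    }

out = weight_stage_losses(0, 0.5, {'loss_cls': torch.tensor(2.0),
                                   'acc': torch.tensor(90.0)})
assert out['s0.loss_cls'].item() == 1.0
assert out['s0.acc'].item() == 90.0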
Example n. 26
    def __init__(self,
                 num_classes,
                 in_channels,
                 regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512),
                                 (512, INF)),
                 center_sampling=False,
                 center_sample_radius=1.5,
                 sync_num_pos=True,
                 gradient_mul=0.1,
                 bbox_norm_type='reg_denom',
                 loss_cls_fl=dict(type='FocalLoss',
                                  use_sigmoid=True,
                                  gamma=2.0,
                                  alpha=0.25,
                                  loss_weight=1.0),
                 use_vfl=True,
                 loss_cls=dict(type='VarifocalLoss',
                               use_sigmoid=True,
                               alpha=0.75,
                               gamma=2.0,
                               iou_weighted=True,
                               loss_weight=1.0),
                 loss_bbox=dict(type='GIoULoss', loss_weight=1.5),
                 loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0),
                 norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
                 use_atss=True,
                 anchor_generator=dict(type='AnchorGenerator',
                                       ratios=[1.0],
                                       octave_base_scale=8,
                                       scales_per_octave=1,
                                       center_offset=0.0,
                                       strides=[8, 16, 32, 64, 128]),
                 **kwargs):
        # dcn base offsets, adapted from reppoints_head.py
        self.num_dconv_points = 9
        self.dcn_kernel = int(np.sqrt(self.num_dconv_points))
        self.dcn_pad = int((self.dcn_kernel - 1) / 2)
        dcn_base = np.arange(-self.dcn_pad,
                             self.dcn_pad + 1).astype(np.float64)
        dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
        dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
        dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape(
            (-1))
        self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1)

        super(FCOSHead, self).__init__(num_classes,
                                       in_channels,
                                       norm_cfg=norm_cfg,
                                       **kwargs)
        self.regress_ranges = regress_ranges
        self.reg_denoms = [
            regress_range[-1] for regress_range in regress_ranges
        ]
        self.reg_denoms[-1] = self.reg_denoms[-2] * 2
        self.center_sampling = center_sampling
        self.center_sample_radius = center_sample_radius
        self.sync_num_pos = sync_num_pos
        self.bbox_norm_type = bbox_norm_type
        self.gradient_mul = gradient_mul
        self.use_vfl = use_vfl
        if self.use_vfl:
            self.loss_cls = build_loss(loss_cls)
        else:
            self.loss_cls = build_loss(loss_cls_fl)
        self.loss_bbox = build_loss(loss_bbox)
        self.loss_bbox_refine = build_loss(loss_bbox_refine)

        # for getting ATSS targets
        self.use_atss = use_atss
        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        self.anchor_generator = build_anchor_generator(anchor_generator)
        self.anchor_center_offset = anchor_generator['center_offset']
        self.num_anchors = self.anchor_generator.num_base_anchors[0]
        self.sampling = False
        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
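The DCN base offset assembled at the top of this constructor is just the row-major (y, x) coordinate grid of a 3x3 kernel centred at zero, reshaped to (1, 18, 1, 1) so it can be added to per-location offset predictions by broadcasting. Reproduced standalone:

import numpy as np
import torch

dcn_kernel = 3                    # int(sqrt(9)) points per side
dcn_pad = (dcn_kernel - 1) // 2   # = 1
base = np.arange(-dcn_pad, dcn_pad + 1).astype(np.float64)  # [-1, 0, 1]
ys = np.repeat(base, dcn_kernel)  # each y repeated across a row
xs = np.tile(base, dcn_kernel)    # x cycles within each row
offset = np.stack([ys, xs], axis=1).reshape(-1)  # (y0, x0, y1, x1, ...)
assert offset[:6].tolist() == [-1, -1, -1, 0, -1, 1]
dcn_base_offset = torch.tensor(offset).view(1, -1, 1, 1)
assert dcn_base_offset.shape == (1, 18, 1, 1)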
Example n. 27
    def forward_train(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None):
        x = self.extract_feat(img)

        losses = dict()

        if self.with_rpn:
            rpn_outs = self.rpn_head(x)
            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                          self.train_cfg.rpn)
            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                            gt_bboxes_ignore=gt_bboxes_ignore)
            losses.update(rpn_losses)

            proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
            proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        else:
            proposal_list = proposals

        for i in range(self.num_stages):
            self.current_stage = i
            rcnn_train_cfg = self.train_cfg.rcnn[i]
            lw = self.train_cfg.stage_loss_weights[i]

            # assign gts and sample proposals
            sampling_results = []
            if self.with_bbox or self.with_mask:
                bbox_assigner = build_assigner(rcnn_train_cfg.assigner)
                bbox_sampler = build_sampler(rcnn_train_cfg.sampler,
                                             context=self)
                num_imgs = img.size(0)
                if gt_bboxes_ignore is None:
                    gt_bboxes_ignore = [None for _ in range(num_imgs)]

                for j in range(num_imgs):
                    assign_result = bbox_assigner.assign(
                        proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j],
                        gt_labels[j])
                    sampling_result = bbox_sampler.sample(
                        assign_result,
                        proposal_list[j],
                        gt_bboxes[j],
                        gt_labels[j],
                        feats=[lvl_feat[j][None] for lvl_feat in x])
                    sampling_results.append(sampling_result)

            # bbox head forward and loss
            bbox_roi_extractor = self.bbox_roi_extractor[i]
            bbox_head = self.bbox_head[i]

            rois = bbox2roi([res.bboxes for res in sampling_results])
            bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
                                            rois)
            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)
            cls_score, bbox_pred = bbox_head(bbox_feats)

            bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes,
                                                gt_labels, rcnn_train_cfg)
            loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
            for name, value in loss_bbox.items():
                losses['s{}.{}'.format(
                    i, name)] = (value * lw if 'loss' in name else value)

            # mask head forward and loss
            if self.with_mask:
                if not self.share_roi_extractor:
                    mask_roi_extractor = self.mask_roi_extractor[i]
                    pos_rois = bbox2roi(
                        [res.pos_bboxes for res in sampling_results])
                    mask_feats = mask_roi_extractor(
                        x[:mask_roi_extractor.num_inputs], pos_rois)
                    if self.with_shared_head:
                        mask_feats = self.shared_head(mask_feats)
                else:
                    # reuse positive bbox feats
                    pos_inds = []
                    device = bbox_feats.device
                    for res in sampling_results:
                        pos_inds.append(
                            torch.ones(res.pos_bboxes.shape[0],
                                       device=device,
                                       dtype=torch.uint8))
                        pos_inds.append(
                            torch.zeros(res.neg_bboxes.shape[0],
                                        device=device,
                                        dtype=torch.uint8))
                    pos_inds = torch.cat(pos_inds)
                    # cast to bool: indexing with uint8 masks is deprecated
                    mask_feats = bbox_feats[pos_inds.type(torch.bool)]
                mask_head = self.mask_head[i]
                mask_pred = mask_head(mask_feats)
                mask_targets = mask_head.get_target(sampling_results, gt_masks,
                                                    rcnn_train_cfg)
                pos_labels = torch.cat(
                    [res.pos_gt_labels for res in sampling_results])
                loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels)
                for name, value in loss_mask.items():
                    losses['s{}.{}'.format(
                        i, name)] = (value * lw if 'loss' in name else value)

            # refine bboxes
            if i < self.num_stages - 1:
                pos_is_gts = [res.pos_is_gt for res in sampling_results]
                roi_labels = bbox_targets[0]  # bbox_targets is a tuple
                with torch.no_grad():
                    proposal_list = bbox_head.refine_bboxes(
                        rois, roi_labels, bbox_pred, pos_is_gts, img_meta)

        return losses
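When the RoI extractor is shared, the branch above rebuilds the positive/negative layout of the concatenated RoI batch as a boolean mask and slices the bbox features with it, instead of re-extracting features for the positive boxes. The same construction in isolation (illustrative sizes):

import torch

# Suppose two images contributed (3 pos, 2 neg) and (1 pos, 4 neg) RoIs,
# concatenated in that order, as in the sampling_results loop above.
counts = [(3, 2), (1, 4)]
pos_inds = torch.cat([
    torch.cat([torch.ones(p, dtype=torch.bool),
               torch.zeros(n, dtype=torch.bool)])
    for p, n in counts
])
bbox_feats = torch.randn(10, 256, 7, 7)  # one feature map per RoI
mask_feats = bbox_feats[pos_inds]        # keeps only the 4 positives
assert mask_feats.shape[0] == int(pos_inds.sum())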
Example n. 28
    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None):
        if self.use_consistent_supervision:
            x, y = self.extract_feat(img)
            gt_bboxes_auxiliary = [gt.clone() for gt in gt_bboxes]
            gt_labels_auxiliary = [label.clone() for label in gt_labels]
        else:
            x = self.extract_feat(img)
        outs = self.bbox_head(x)
        loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, self.train_cfg)

        losses = self.bbox_head.loss(*loss_inputs,
                                     gt_bboxes_ignore=gt_bboxes_ignore)
        if self.use_consistent_supervision:
            proposal_cfg = self.train_cfg.auxiliary.proposal
            proposal_inputs = outs + (img_metas, proposal_cfg)
            proposal_list = self.bbox_head.get_bboxes_auxiliary(
                *proposal_inputs)

            bbox_assigner = build_assigner(self.train_cfg.auxiliary.assigner)
            bbox_sampler = build_sampler(self.train_cfg.auxiliary.sampler,
                                         context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for i in range(num_imgs):

                assign_result = bbox_assigner.assign(proposal_list[i],
                                                     gt_bboxes_auxiliary[i],
                                                     gt_bboxes_ignore[i],
                                                     gt_labels_auxiliary[i])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[i],
                    gt_bboxes_auxiliary[i],
                    gt_labels_auxiliary[i],
                    feats=[lvl_feat[i][None] for lvl_feat in x])
                sampling_results.append(sampling_result)
            rois = bbox2roi([res.bboxes for res in sampling_results])
            bbox_feats_raw = self.auxiliary_bbox_roi_extractor(
                y[:self.auxiliary_bbox_roi_extractor.num_inputs], rois)
            cls_score_auxiliary, bbox_pred_auxiliary = self.auxiliary_bbox_head(
                bbox_feats_raw)

            bbox_targets = self.auxiliary_bbox_head.get_target(
                sampling_results, gt_bboxes, gt_labels,
                self.train_cfg.auxiliary.rcnn)

            loss_bbox_auxiliary = self.auxiliary_bbox_head.loss(
                cls_score_auxiliary,
                bbox_pred_auxiliary,
                *bbox_targets,
                alpha=0.25,
                num_level=3)
            losses.update(loss_bbox_auxiliary)

        return losses
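Every example in this collection funnels sampled boxes through bbox2roi before the RoI extractor. Its job is to prepend each box with the index of the image it came from, so the extractor can address the correct sample in the batch. A minimal reimplementation under that assumption (a sketch; mmdet's own version also covers edge cases such as empty box lists):

import torch

def bbox2roi_sketch(bbox_list):
    # Convert per-image (n_i, 4) boxes into one (sum n_i, 5) tensor whose
    # first column is the image index within the batch.
    rois = []
    for img_id, bboxes in enumerate(bbox_list):
        inds = bboxes.new_full((bboxes.size(0), 1), img_id)
        rois.append(torch.cat([inds, bboxes[:, :4]], dim=-1))
    return torch.cat(rois, dim=0)

rois = bbox2roi_sketch([torch.rand(2, 4), torch.rand(3, 4)])
assert rois.shape == (5, 5)
assert rois[2:, 0].eq(1).all()  # boxes 2..4 came from image 1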
Example n. 29
    def __init__(self,
                 num_classes,
                 in_channels,
                 anchor_generator=dict(type='YOLOAnchorGenerator',
                                       base_sizes=[[(32, 32), (48, 48),
                                                    (24, 32), (32, 48)],
                                                   [(64, 64), (72, 72),
                                                    (72, 96), (96, 96)],
                                                   [(72, 96), (96, 96),
                                                    (128, 128), (96, 128)]],
                                       strides=[16, 32, 64]),
                 bbox_coder=dict(type='YOLOBBoxCoder'),
                 featmap_strides=[16, 32, 64],
                 one_hot_smoother=0.,
                 loss_cls=dict(type='CrossEntropyLoss',
                               use_sigmoid=True,
                               loss_weight=1.0),
                 loss_conf=dict(type='CrossEntropyLoss',
                                use_sigmoid=True,
                                loss_weight=1.0),
                 loss_xy=dict(type='CrossEntropyLoss',
                              use_sigmoid=True,
                              loss_weight=1.0),
                 loss_wh=dict(type='MSELoss', loss_weight=1.0),
                 loss_iou=dict(type='CIoULoss', loss_weight=2.0),
                 train_cfg=None,
                 test_cfg=None):
        super(CZ_CoarseHead, self).__init__()
        # Check params
        assert (len(in_channels) == len(featmap_strides))

        self.num_classes = num_classes
        self.in_channels = in_channels
        self.featmap_strides = featmap_strides
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.sampling = False
        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            if hasattr(self.train_cfg, 'sampler'):
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)

        self.one_hot_smoother = one_hot_smoother

        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.anchor_generator = build_anchor_generator(anchor_generator)

        self.loss_cls = build_loss(loss_cls)
        self.loss_conf = build_loss(loss_conf)
        self.loss_xy = build_loss(loss_xy)
        self.loss_wh = build_loss(loss_wh)
        self.loss_iou = build_loss(loss_iou)
        # The number of base anchors per level is usually the same,
        # except for SSD detectors.
        self.num_anchors = self.anchor_generator.num_base_anchors[0]
        assert len(
            self.anchor_generator.num_base_anchors) == len(featmap_strides)
        self._init_layers()
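Examples 22, 23, and 29 share the same sampler fallback: use train_cfg.sampler when the config defines one, otherwise fall back to a PseudoSampler, which keeps every assigned box instead of subsampling. The selection logic in isolation (a sketch; the SimpleNamespace configs are stand-ins for mmcv's attribute-style config objects):

from types import SimpleNamespace

def select_sampler_cfg(train_cfg):
    # Prefer an explicit sampler from the training config; otherwise fall
    # back to PseudoSampler, which performs no real subsampling.
    if hasattr(train_cfg, 'sampler'):
        return train_cfg.sampler
    return dict(type='PseudoSampler')

with_sampler = SimpleNamespace(assigner=dict(type='GridAssigner'),
                               sampler=dict(type='RandomSampler', num=256))
without_sampler = SimpleNamespace(assigner=dict(type='GridAssigner'))
assert select_sampler_cfg(with_sampler)['type'] == 'RandomSampler'
assert select_sampler_cfg(without_sampler)['type'] == 'PseudoSampler'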