def extract_feat(self, img, gt_bboxes):
    if self.visual_modality:
        x = super().extract_feat(img)[-1]
        feats = self.maxpool(self.extractor([x], bbox2roi(gt_bboxes)))
        return feats.view(feats.size(0), -1)
    return None
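# --- Illustrative sketch (standalone; assumes mmdet's bbox2roi convention) ---
# Throughout this file, bbox2roi turns a list of per-image (n_i, 4) box
# tensors into one (sum(n_i), 5) RoI tensor whose first column is the image
# index within the batch. A minimal standalone equivalent:
import torch


def bbox2roi_sketch(bbox_list):
    """Concatenate per-image boxes into (N, 5) rois: [img_idx, x1, y1, x2, y2]."""
    rois = []
    for img_idx, bboxes in enumerate(bbox_list):
        inds = bboxes.new_full((bboxes.size(0), 1), img_idx)
        rois.append(torch.cat([inds, bboxes[:, :4]], dim=-1))
    return torch.cat(rois, dim=0) if rois else torch.zeros(0, 5)


# Example: two images with 2 and 1 boxes -> one (3, 5) roi tensor.
# rois = bbox2roi_sketch([torch.rand(2, 4), torch.rand(1, 4)])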
def test_sabl_bbox_head_loss():
    """Tests bbox head loss when truth is empty and non-empty."""
    self = SABLHead(
        num_classes=4,
        cls_in_channels=3,
        reg_in_channels=3,
        cls_out_channels=3,
        reg_offset_out_channels=3,
        reg_cls_out_channels=3,
        roi_feat_size=7)

    # Dummy proposals
    proposal_list = [
        torch.Tensor([[23.6667, 23.8757, 228.6326, 153.8874]]),
    ]
    target_cfg = mmcv.Config(dict(pos_weight=1))

    # Test bbox loss when truth is empty
    gt_bboxes = [torch.empty((0, 4))]
    gt_labels = [torch.LongTensor([])]
    sampling_results = _dummy_bbox_sampling(proposal_list, gt_bboxes,
                                            gt_labels)
    bbox_targets = self.get_targets(sampling_results, gt_bboxes, gt_labels,
                                    target_cfg)
    labels, label_weights, bbox_targets, bbox_weights = bbox_targets

    # Create dummy features "extracted" for each sampled bbox
    num_sampled = sum(len(res.bboxes) for res in sampling_results)
    rois = bbox2roi([res.bboxes for res in sampling_results])
    dummy_feats = torch.rand(num_sampled, 3, 7, 7)
    cls_scores, bbox_preds = self.forward(dummy_feats)

    losses = self.loss(cls_scores, bbox_preds, rois, labels, label_weights,
                       bbox_targets, bbox_weights)
    assert losses.get('loss_cls', 0) > 0, 'cls-loss should be non-zero'
    assert losses.get('loss_bbox_cls', 0) == 0, \
        'empty gt bbox-cls-loss should be zero'
    assert losses.get('loss_bbox_reg', 0) == 0, \
        'empty gt bbox-reg-loss should be zero'

    # Test bbox loss when truth is non-empty
    gt_bboxes = [
        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
    ]
    gt_labels = [torch.LongTensor([2])]
    sampling_results = _dummy_bbox_sampling(proposal_list, gt_bboxes,
                                            gt_labels)
    rois = bbox2roi([res.bboxes for res in sampling_results])
    bbox_targets = self.get_targets(sampling_results, gt_bboxes, gt_labels,
                                    target_cfg)
    labels, label_weights, bbox_targets, bbox_weights = bbox_targets

    # Create dummy features "extracted" for each sampled bbox
    num_sampled = sum(len(res.bboxes) for res in sampling_results)
    dummy_feats = torch.rand(num_sampled, 3, 7, 7)
    cls_scores, bbox_preds = self.forward(dummy_feats)

    losses = self.loss(cls_scores, bbox_preds, rois, labels, label_weights,
                       bbox_targets, bbox_weights)
    assert losses.get('loss_bbox_cls', 0) > 0, \
        'non-empty gt bbox-cls-loss should be non-zero'
    assert losses.get('loss_bbox_reg', 0) > 0, \
        'non-empty gt bbox-reg-loss should be non-zero'
def aug_test(self, features, proposal_list, img_metas, rescale=False):
    """Test with augmentations.

    If rescale is False, then returned bboxes and masks will fit the scale
    of imgs[0].
    """
    rcnn_test_cfg = self.test_cfg
    aug_bboxes = []
    aug_scores = []
    for x, img_meta in zip(features, img_metas):
        # only one image in the batch
        img_shape = img_meta[0]['img_shape']
        scale_factor = img_meta[0]['scale_factor']
        flip = img_meta[0]['flip']
        flip_direction = img_meta[0]['flip_direction']

        proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
                                 scale_factor, flip, flip_direction)
        # "ms" in variable names means multi-stage
        ms_scores = []

        rois = bbox2roi([proposals])
        for i in range(self.num_stages):
            bbox_results = self._bbox_forward(i, x, rois)
            ms_scores.append(bbox_results['cls_score'])

            if i < self.num_stages - 1:
                bbox_label = bbox_results['cls_score'][:, :-1].argmax(dim=1)
                rois = self.bbox_head[i].regress_by_class(
                    rois, bbox_label, bbox_results['bbox_pred'],
                    img_meta[0])

        cls_score = sum(ms_scores) / float(len(ms_scores))
        bboxes, scores = self.bbox_head[-1].get_bboxes(
            rois,
            cls_score,
            bbox_results['bbox_pred'],
            img_shape,
            scale_factor,
            rescale=False,
            cfg=None)
        aug_bboxes.append(bboxes)
        aug_scores.append(scores)

    # after merging, bboxes will be rescaled to the original image size
    merged_bboxes, merged_scores = merge_aug_bboxes(
        aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
    det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores,
                                            rcnn_test_cfg.score_thr,
                                            rcnn_test_cfg.nms,
                                            rcnn_test_cfg.max_per_img)

    bbox_result = bbox2result(det_bboxes, det_labels,
                              self.bbox_head[-1].num_classes)

    if self.with_mask:
        if det_bboxes.shape[0] == 0:
            segm_result = [[[]
                            for _ in range(self.mask_head[-1].num_classes)]]
        else:
            aug_masks = []
            aug_img_metas = []
            for x, img_meta in zip(features, img_metas):
                img_shape = img_meta[0]['img_shape']
                scale_factor = img_meta[0]['scale_factor']
                flip = img_meta[0]['flip']
                flip_direction = img_meta[0]['flip_direction']
                _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
                                       scale_factor, flip, flip_direction)
                mask_rois = bbox2roi([_bboxes])
                for i in range(self.num_stages):
                    mask_results = self._mask_forward(i, x, mask_rois)
                    aug_masks.append(
                        mask_results['mask_pred'].sigmoid().cpu().numpy())
                    aug_img_metas.append(img_meta)
            merged_masks = merge_aug_masks(aug_masks, aug_img_metas,
                                           self.test_cfg)

            ori_shape = img_metas[0][0]['ori_shape']
            segm_result = self.mask_head[-1].get_seg_masks(
                merged_masks,
                det_bboxes,
                det_labels,
                rcnn_test_cfg,
                ori_shape,
                scale_factor=1.0,
                rescale=False)
        return [(bbox_result, segm_result)]
    else:
        return [bbox_result]
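# --- Illustrative sketch (standalone, not the library's bbox_mapping) ---
# bbox_mapping above projects proposals into each augmented view before RoI
# extraction (scale, then optional flip). For a horizontal flip, the x
# coordinates simply mirror around the image width:
import torch


def hflip_bboxes(bboxes, img_width):
    """Mirror [x1, y1, x2, y2] boxes horizontally, keeping x1 < x2."""
    flipped = bboxes.clone()
    flipped[:, 0] = img_width - bboxes[:, 2]
    flipped[:, 2] = img_width - bboxes[:, 0]
    return flipped


# Example: a box hugging the left edge of a 100-px-wide image moves to the
# right edge: hflip_bboxes(torch.tensor([[0., 10., 20., 30.]]), 100)
# -> tensor([[ 80., 10., 100., 30.]])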
def aug_test(self, imgs, img_metas, proposals=None, rescale=False):
    """Test with augmentations.

    If rescale is False, then returned bboxes and masks will fit the scale
    of imgs[0].
    """
    # recompute feats to save memory
    proposal_list = self.aug_test_rpn(self.extract_feats(imgs), img_metas,
                                      self.test_cfg.rpn)

    rcnn_test_cfg = self.test_cfg.rcnn
    aug_bboxes = []
    aug_scores = []
    for x, img_meta in zip(self.extract_feats(imgs), img_metas):
        # only one image in the batch
        img_shape = img_meta[0]['img_shape']
        scale_factor = img_meta[0]['scale_factor']
        flip = img_meta[0]['flip']

        proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
                                 scale_factor, flip)
        # "ms" in variable names means multi-stage
        ms_scores = []

        rois = bbox2roi([proposals])
        for i in range(self.num_stages):
            bbox_roi_extractor = self.bbox_roi_extractor[i]
            bbox_head = self.bbox_head[i]

            bbox_feats = bbox_roi_extractor(
                x[:len(bbox_roi_extractor.featmap_strides)], rois)
            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)

            cls_score, bbox_pred = bbox_head(bbox_feats)
            ms_scores.append(cls_score)

            if i < self.num_stages - 1:
                bbox_label = cls_score.argmax(dim=1)
                rois = bbox_head.regress_by_class(rois, bbox_label,
                                                  bbox_pred, img_meta[0])

        cls_score = sum(ms_scores) / float(len(ms_scores))
        iou_pred = None
        bboxes, scores = self.bbox_head[-1].get_det_bboxes(
            rois,
            cls_score,
            bbox_pred,
            iou_pred,
            img_shape,
            scale_factor,
            rescale=False,
            cfg=None)
        aug_bboxes.append(bboxes)
        aug_scores.append(scores)

    # after merging, bboxes will be rescaled to the original image size
    merged_bboxes, merged_scores = merge_aug_bboxes(
        aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
    det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores,
                                            rcnn_test_cfg.score_thr,
                                            rcnn_test_cfg.nms,
                                            rcnn_test_cfg.max_per_img)

    bbox_result = bbox2result(det_bboxes, det_labels,
                              self.bbox_head[-1].num_classes)

    if self.with_mask:
        if det_bboxes.shape[0] == 0:
            segm_result = [
                [] for _ in range(self.mask_head[-1].num_classes - 1)
            ]
        else:
            aug_masks = []
            aug_img_metas = []
            for x, img_meta in zip(self.extract_feats(imgs), img_metas):
                img_shape = img_meta[0]['img_shape']
                scale_factor = img_meta[0]['scale_factor']
                flip = img_meta[0]['flip']
                _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
                                       scale_factor, flip)
                mask_rois = bbox2roi([_bboxes])
                for i in range(self.num_stages):
                    mask_feats = self.mask_roi_extractor[i](
                        x[:len(self.mask_roi_extractor[i].featmap_strides)],
                        mask_rois)
                    if self.with_shared_head:
                        mask_feats = self.shared_head(mask_feats)
                    mask_pred = self.mask_head[i](mask_feats)
                    aug_masks.append(mask_pred.sigmoid().cpu().numpy())
                    aug_img_metas.append(img_meta)
            merged_masks = merge_aug_masks(aug_masks, aug_img_metas,
                                           self.test_cfg.rcnn)

            ori_shape = img_metas[0][0]['ori_shape']
            segm_result = self.mask_head[-1].get_seg_masks(
                merged_masks,
                det_bboxes,
                det_labels,
                rcnn_test_cfg,
                ori_shape,
                scale_factor=1.0,
                rescale=False)
        return bbox_result, segm_result
    else:
        return bbox_result
def forward_train(
        self,
        img,
        img_meta,
        gt_bboxes,
        gt_bboxes_ignore,
        gt_labels,
        ref_img,  # images of the reference frame
        ref_bboxes,  # gt bboxes of the reference frame
        gt_pids,  # gt ids of current-frame bboxes mapped to the reference frame
        gt_masks=None,
        proposals=None):
    x = self.extract_feat(img)
    ref_x = self.extract_feat(ref_img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
        losses.update(rpn_losses)

        proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i], gt_pids[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                gt_pids[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        bbox_img_n = [res.bboxes.size(0) for res in sampling_results]
        ref_rois = bbox2roi(ref_bboxes)
        ref_bbox_img_n = [x.size(0) for x in ref_bboxes]
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        ref_bbox_feats = self.bbox_roi_extractor(
            ref_x[:self.bbox_roi_extractor.num_inputs], ref_rois)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        # fetch bbox and object_id targets
        bbox_targets, (ids, id_weights) = self.bbox_head.get_target(
            sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
        losses.update(loss_bbox)

        match_score = self.track_head(bbox_feats, ref_bbox_feats,
                                      bbox_img_n, ref_bbox_img_n)
        loss_match = self.track_head.loss(match_score, ids, id_weights)
        losses.update(loss_match)

    # mask head forward and loss
    if self.with_mask:
        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
        mask_feats = self.mask_roi_extractor(
            x[:self.mask_roi_extractor.num_inputs], pos_rois)
        mask_pred = self.mask_head(mask_feats)

        mask_targets = self.mask_head.get_target(sampling_results, gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels)
        losses.update(loss_mask)

    return losses
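# --- Illustrative sketch (standalone; helper name is hypothetical) ---
# bbox_img_n / ref_bbox_img_n above record how many rois each image
# contributed, so a track head can split the concatenated roi features back
# into per-image chunks and compare each chunk with its own reference frame,
# e.g. via a pairwise similarity matrix:
import torch


def pairwise_match_scores(feats, ref_feats, img_n, ref_img_n):
    """Per-image (n_i x m_i) dot-product match scores between roi embeddings."""
    scores = []
    for f, rf in zip(torch.split(feats, img_n),
                     torch.split(ref_feats, ref_img_n)):
        scores.append(f @ rf.t())
    return scores


# Example: 2 images with (4, 3) current rois vs (2, 5) reference rois:
# s = pairwise_match_scores(torch.rand(7, 256), torch.rand(7, 256),
#                           [4, 3], [2, 5])  # -> [(4, 2), (3, 5)] matrices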
def simple_test(self, img, img_meta, proposals=None, rescale=False):
    """Run inference on a single image.

    Args:
        img (Tensor): must be in shape (N, C, H, W)
        img_meta (list[dict]): a list with one dictionary element.
            See `mmdet/datasets/pipelines/formatting.py:Collect` for
            details of meta dicts.
        proposals : if specified overrides rpn proposals
        rescale (bool): if True returns boxes in original image space

    Returns:
        dict: results
    """
    x = self.extract_feat(img)
    proposal_list = self.simple_test_rpn(
        x, img_meta, self.test_cfg.rpn) if proposals is None else proposals

    img_shape = img_meta[0]['img_shape']
    ori_shape = img_meta[0]['ori_shape']
    scale_factor = img_meta[0]['scale_factor']

    # "ms" in variable names means multi-stage
    ms_bbox_result = {}
    ms_segm_result = {}
    ms_scores = []
    rcnn_test_cfg = self.test_cfg.rcnn

    rois = bbox2roi(proposal_list)
    for i in range(self.num_stages):
        bbox_roi_extractor = self.bbox_roi_extractor[i]
        bbox_head = self.bbox_head[i]

        bbox_feats = bbox_roi_extractor(
            x[:len(bbox_roi_extractor.featmap_strides)], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)

        cls_score, bbox_pred = bbox_head(bbox_feats)
        ms_scores.append(cls_score)

        if i < self.num_stages - 1:
            bbox_label = cls_score.argmax(dim=1)
            rois = bbox_head.regress_by_class(rois, bbox_label, bbox_pred,
                                              img_meta[0])

    # average the cls scores over all stages as the final score; the output
    # boxes come from the final stage
    cls_score = sum(ms_scores) / self.num_stages
    det_bboxes, det_labels = self.bbox_head[-1].get_det_bboxes(
        rois,
        cls_score,
        bbox_pred,  # bbox_pred may be a tuple
        img_shape,
        scale_factor,
        rescale=rescale,
        cfg=rcnn_test_cfg)
    bbox_result = bbox2result(det_bboxes, det_labels,
                              self.bbox_head[-1].num_classes)
    ms_bbox_result['ensemble'] = bbox_result

    if self.with_mask:
        if det_bboxes.shape[0] == 0:
            mask_classes = self.mask_head[-1].num_classes - 1
            segm_result = [[] for _ in range(mask_classes)]
        else:
            if isinstance(scale_factor, float):  # aspect ratio fixed
                _bboxes = (det_bboxes[:, :4] * scale_factor
                           if rescale else det_bboxes)
            else:
                _bboxes = (
                    det_bboxes[:, :4] *
                    torch.from_numpy(scale_factor).to(det_bboxes.device)
                    if rescale else det_bboxes)

            mask_rois = bbox2roi([_bboxes])
            aug_masks = []
            for i in range(self.num_stages):
                mask_roi_extractor = self.mask_roi_extractor[i]
                mask_feats = mask_roi_extractor(
                    x[:len(mask_roi_extractor.featmap_strides)], mask_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
                mask_pred = self.mask_head[i](mask_feats)
                aug_masks.append(mask_pred.sigmoid().cpu().numpy())
            merged_masks = merge_aug_masks(aug_masks,
                                           [img_meta] * self.num_stages,
                                           self.test_cfg.rcnn)
            segm_result = self.mask_head[-1].get_seg_masks(
                merged_masks, _bboxes, det_labels, rcnn_test_cfg, ori_shape,
                scale_factor, rescale)
        ms_segm_result['ensemble'] = segm_result

    if self.with_mask:
        results = (ms_bbox_result['ensemble'], ms_segm_result['ensemble'])
    else:
        results = ms_bbox_result['ensemble']

    return results
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    """
    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.
        img_meta (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.
        gt_bboxes (list[Tensor]): each item are the truth boxes for each
            image in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): class indices corresponding to each box
        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss.
        gt_masks (None | Tensor) : true segmentation masks for each box
            used if the architecture supports a segmentation task.
        proposals : override rpn proposals with custom proposals. Use when
            `with_rpn` is False.

    Returns:
        dict[str, Tensor]: a dictionary of loss components
    """
    # extract_feat returns a tuple; for a ResNet backbone it can contain
    # the outputs of several conv stages
    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        # rpn_outs = rpn_cls_score + rpn_bbox_pred
        # note: rpn_bbox_pred predicts deltas, i.e. (tx, ty, tw, th)
        rpn_outs = self.rpn_head(x)
        # the loss is computed with the train config
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        # loss: generate anchors, match them with gt_bboxes, build
        # positive/negative samples and compute deltas; this yields each
        # anchor's label/label_weights/delta/delta_weights plus
        # pos_inds/neg_inds, from which cross-entropy and SmoothL1Loss
        # produce the loss
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        # rpn_losses is a dict containing loss_rpn_cls and loss_rpn_bbox
        losses.update(rpn_losses)  # merge the rpn losses into the total

        # proposals are generated with the train config
        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        # get_bboxes: the proposals that survive NMS
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    # with_bbox and with_mask are training options, e.g. in
    # faster_rcnn_r50_caffe_c4_1x.py:
    #   train_pipeline = [
    #       dict(type='LoadImageFromFile'),
    #       dict(type='LoadAnnotations', with_bbox=True),
    #       dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    #       dict(type='RandomFlip', flip_ratio=0.5),
    #       dict(type='Normalize', **img_norm_cfg),
    #       dict(type='Pad', size_divisor=32),
    #       dict(type='DefaultFormatBundle'),
    #       dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
    #   ]
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            # the RPN already matched anchors and built positive/negative
            # samples for its own loss, then produced proposals from
            # bbox_pred plus NMS; here in the RCNN part the proposals are
            # matched with gt_bboxes and positive/negative samples are
            # generated for the RCNN loss
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        # rois of all images, each row (img_ind, x1, y1, x2, y2)
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        # see SingleRoIExtractor
        # bbox_feats has shape (roi_num, roi_out_channels, roi_h(7), roi_w(7))
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            # pass the RoI features through another feature-extraction
            # stage, here ResNet's conv5
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(
                        res.pos_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(
                        res.neg_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]
        mask_pred = self.mask_head(mask_feats)

        mask_targets = self.mask_head.get_target(sampling_results, gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    return losses
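# --- Illustrative sketch (standalone; names are hypothetical) ---
# The share_roi_extractor branch above reuses bbox_feats for the mask head
# by building a per-roi positive mask: rois are laid out per image as
# [pos_0, neg_0, pos_1, neg_1, ...], so a matching 0/1 vector selects
# exactly the positive rows.
import torch


def select_positive_feats(bbox_feats, num_pos_per_img, num_neg_per_img):
    """Select the rows of bbox_feats that belong to positive samples."""
    pos_inds = []
    for n_pos, n_neg in zip(num_pos_per_img, num_neg_per_img):
        pos_inds.append(torch.ones(n_pos, dtype=torch.bool))
        pos_inds.append(torch.zeros(n_neg, dtype=torch.bool))
    return bbox_feats[torch.cat(pos_inds)]


# Example: 2 images with (3 pos + 5 neg) and (2 pos + 6 neg) rois each.
# feats = torch.rand(16, 256, 7, 7)
# pos_feats = select_positive_feats(feats, [3, 2], [5, 6])  # (5, 256, 7, 7)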
def simple_test_mask_(self,
                      x,
                      img_metas,
                      det_bboxes,
                      det_labels,
                      semantic_logits,
                      rescale=False):
    ori_shape = img_metas[0]['ori_shape']
    scale_factor = img_metas[0]['scale_factor']
    # np.int was removed from recent NumPy releases; plain int is equivalent
    ref_size = (int(np.round(ori_shape[0] * scale_factor)),
                int(np.round(ori_shape[1] * scale_factor)))

    sem_pred = torch.argmax(semantic_logits, dim=1)[0]
    panoptic_mask = torch.zeros_like(sem_pred, dtype=torch.long)
    cat = [255]

    if det_bboxes.shape[0] == 0:
        intermediate_logits = semantic_logits[0, :self.num_stuff]
    else:
        # if det_bboxes is rescaled to the original image size, we need to
        # rescale it back to the testing scale to obtain RoIs.
        if rescale and not isinstance(scale_factor, float):
            scale_factor = torch.from_numpy(scale_factor).to(
                det_bboxes.device)
        _bboxes = (det_bboxes[:, :4] * scale_factor
                   if rescale else det_bboxes)
        mask_rois = bbox2roi([_bboxes])
        mask_feats = self.mask_roi_extractor(
            x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois)
        if self.with_shared_head:
            mask_feats = self.shared_head(mask_feats)
        mask_pred = self.mask_head(mask_feats)

        confidence = det_bboxes[:, 4]
        idx = torch.argsort(confidence, descending=True)
        bbx_inv = invert_roi_bbx(det_bboxes[:, :4],
                                 tuple(mask_pred.shape[2:]), ref_size)
        bbx_idx = torch.arange(
            0, det_bboxes.size(0), dtype=torch.long,
            device=det_bboxes.device)

        mask_pred = roi_sampling(
            mask_pred, bbx_inv, bbx_idx, ref_size, padding="zero")
        ML_A = mask_pred.new_zeros(mask_pred.shape[0], mask_pred.shape[-2],
                                   mask_pred.shape[-1])
        ML_B = ML_A.clone()
        occupied = torch.zeros_like(sem_pred, dtype=torch.bool)
        i = 0
        for id_i in idx:
            label_i = det_labels[id_i]
            mask_pred_i = mask_pred[id_i, label_i + 1, :, :]
            mask_i = (mask_pred_i.sigmoid() >
                      self.test_cfg.rcnn.mask_thr_binary)
            mask_i = mask_i.type(torch.bool)

            intersection = occupied & mask_i
            if intersection.float().sum() / mask_i.float().sum() \
                    > self.test_cfg.panoptic.overlap_thr:
                continue

            mask_i = mask_i ^ intersection
            occupied += mask_i

            y0 = max(int(det_bboxes[id_i, 1] + 1), 0)
            y1 = min(int((det_bboxes[id_i, 3] - 1).round() + 1),
                     ref_size[0])
            x0 = max(int(det_bboxes[id_i, 0] + 1), 0)
            x1 = min(int((det_bboxes[id_i, 2] - 1).round() + 1),
                     ref_size[1])

            ML_A[i] = 4 * mask_pred_i
            ML_B[i, y0:y1, x0:x1] = semantic_logits[
                0, label_i + self.num_stuff, y0:y1, x0:x1]
            cat.append(label_i.item() + self.num_stuff)
            i = i + 1

        ML_A = ML_A[:i]
        ML_B = ML_B[:i]
        FL = (ML_A.sigmoid() + ML_B.sigmoid()) * (ML_A + ML_B)
        intermediate_logits = torch.cat(
            [semantic_logits[0, :self.num_stuff], FL], dim=0)

    cat = torch.tensor(cat, dtype=torch.long)
    intermediate_mask = torch.argmax(
        F.softmax(intermediate_logits, dim=0), dim=0) + 1

    intermediate_mask = intermediate_mask - self.num_stuff
    intermediate_mask[intermediate_mask <= 0] = 0
    unique = torch.unique(intermediate_mask)
    ignore_val = intermediate_mask.max().item() + 1
    ignore_arr = torch.ones(
        (ignore_val, ), dtype=unique.dtype,
        device=unique.device) * ignore_val
    total_unique = unique.shape[0]
    ignore_arr[unique] = torch.arange(total_unique).cuda(ignore_arr.device)
    panoptic_mask = ignore_arr[intermediate_mask]
    panoptic_mask[intermediate_mask == ignore_val] = 0

    cat_ = cat[unique].long()

    sem_pred[panoptic_mask > 0] = self.num_stuff
    sem_pred[sem_pred >= self.num_stuff] = self.num_stuff
    cls_stuff, area = torch.unique(sem_pred, return_counts=True)
    cls_stuff[area < self.test_cfg.panoptic.min_stuff_area] = self.num_stuff
    cls_stuff = cls_stuff[cls_stuff != self.num_stuff]
    tmp = torch.ones(
        (self.num_stuff + 1, ), dtype=cls_stuff.dtype,
        device=cls_stuff.device) * self.num_stuff
    tmp[cls_stuff] = torch.arange(cls_stuff.shape[0]).cuda(tmp.device)
    new_sem_pred = tmp[sem_pred]
    cat_ = torch.cat((cat_, cls_stuff.cpu().long()), -1)
    bool_mask = new_sem_pred != self.num_stuff
    panoptic_mask[bool_mask] = new_sem_pred[bool_mask] + total_unique

    return panoptic_mask.cpu(), cat_.cpu()
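# --- Illustrative sketch of the logit fusion used above (standalone) ---
# The panoptic fusion combines per-instance mask logits (ML_A) with the
# matching "thing"-class semantic logits (ML_B) as
#     FL = (sigmoid(ML_A) + sigmoid(ML_B)) * (ML_A + ML_B),
# so regions where both heads agree are amplified. A toy demo:
import torch

a = torch.tensor([[-4.0, 0.0, 4.0]])  # instance-mask logits
b = torch.tensor([[-4.0, 0.0, 4.0]])  # semantic logits for the same class
fl = (a.sigmoid() + b.sigmoid()) * (a + b)
# fl ~ [[-0.29, 0.00, 15.71]]: confident agreement dominates the softmax
# over intermediate_logits, while weak or conflicting logits stay small.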
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None,
                  epoch=None,
                  **kwargs):
    """
    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.
        img_meta (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.
        gt_bboxes (list[Tensor]): each item are the truth boxes for each
            image in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): class indices corresponding to each box
        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss.
        gt_masks (None | Tensor) : true segmentation masks for each box
            used if the architecture supports a segmentation task.
        proposals : override rpn proposals with custom proposals. Use when
            `with_rpn` is False.

    Returns:
        dict[str, Tensor]: a dictionary of loss components
    """
    losses = dict()

    # keep the original usage
    if self.dfn_balance is None:
        x = self.extract_feat(img)
    else:
        # count the number of defects in each image
        defect_nums = [0] * img.shape[0]
        for i, gt_label in enumerate(gt_labels):
            defect_cnt = 0
            for label in gt_label:
                if self.dfn_balance.background_id != int(label):
                    defect_cnt += 1
            defect_nums[i] = defect_cnt

        # split the images into defect (1) images and normal (0) images
        dfn_labels = [0 if i == 0 else 1 for i in defect_nums]
        dfn_labels = torch.Tensor(dfn_labels).long().cuda()

        # calculate the dfn loss
        x, dfn_loss = self.extract_defect_feat(img, targets=dfn_labels)

        # balance the losses of the different networks
        dfn_weight = self.get_dfn_weight(epoch)
        loss_dfn = dfn_loss['loss'] * dfn_weight
        losses.update(loss_dfn=loss_dfn)

    # delete the ignored annotations
    if self.ignore_ids is not None:
        for ignore_id in self.ignore_ids:
            for i, gt_label in enumerate(gt_labels):
                keep_ind = gt_label != ignore_id
                gt_labels[i] = gt_labels[i][keep_ind]
                gt_bboxes[i] = gt_bboxes[i][keep_ind]

        keep_ind = [True] * len(gt_labels)
        for i in range(len(gt_labels) - 1, -1, -1):
            if gt_labels[i].shape[0] == 0:
                img_meta.pop(i)
                gt_bboxes.pop(i)
                gt_labels.pop(i)
                keep_ind[i] = False
        img = img[keep_ind]
        x = list(x)
        for i in range(len(x)):
            x[i] = x[i][keep_ind]
        x = tuple(x)

        # if no images remain, set every loss to zero except the dfn loss
        if len(gt_labels) == 0 or len(gt_bboxes) == 0 or img.shape[0] == 0:
            loss_zeros = [
                torch.Tensor([0.]).float().cuda() for i in range(len(x))
            ]
            loss_ones = [
                torch.Tensor([1.]).float().cuda() for i in range(len(x))
            ]
            losses.update(
                loss_rpn_cls=loss_zeros, loss_rpn_bbox=loss_zeros)
            loss_zero = torch.Tensor([0.]).float().cuda()
            loss_one = torch.Tensor([1.]).float().cuda()
            losses['loss_cls'] = loss_zero
            losses['loss_bbox'] = loss_zero
            losses['acc'] = loss_one
            return losses

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(
                        res.pos_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(
                        res.neg_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]

        if mask_feats.shape[0] > 0:
            mask_pred = self.mask_head(mask_feats)
            mask_targets = self.mask_head.get_target(
                sampling_results, gt_masks, self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                            pos_labels)
            losses.update(loss_mask)

    return losses
# core training flow of a two-stage detector
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    """
    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.
        img_metas (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.
        gt_bboxes (list[Tensor]): each item are the truth boxes for each
            image in [tl_x, tl_y, br_x, br_y] format (top-left point,
            bottom-right point).
        gt_labels (list[Tensor]): class indices corresponding to each box
        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss.
        gt_masks (None | Tensor) : true segmentation masks for each box
            used if the architecture supports a segmentation task.
        proposals : override rpn proposals with custom proposals. Use when
            `with_rpn` is False.

    Returns:
        dict[str, Tensor]: a dictionary of loss components
    """
    # forward through the backbone and neck
    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        # rpn_head was built in __init__; since it inherits nn.Module
        # (which implements __call__), calling the instance runs forward
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas,
                                      self.train_cfg.rpn)
        # the loss here lives in rpn_head; rpn_head inherits anchor_head,
        # so the parent anchor_head's loss (the usual anchor machinery)
        # is used and returns a dict
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_metas, proposal_cfg)
        # get proposals: decode anchors into boxes, apply NMS, and keep
        # the top-N candidates
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals
    # proposal_list holds one (num_proposals, 5) tensor per image,
    # e.g. 2 x torch.Size([2000, 5])

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        # bbox_targets: a tuple of length 4 whose elements are tensors:
        #   labels:        torch.Size([1024])
        #   label_weights: torch.Size([1024])
        #   bbox_targets:  torch.Size([1024, 4])
        #   bbox_weights:  torch.Size([1024, 4])
        # the weight tensors are aligned with cls_score / bbox_pred and
        # take values 0 or 1 to mark whether a prediction matched a
        # ground-truth box
        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(
                        res.pos_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(
                        res.neg_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]

        if mask_feats.shape[0] > 0:
            mask_pred = self.mask_head(mask_feats)
            mask_targets = self.mask_head.get_target(
                sampling_results, gt_masks, self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                            pos_labels)
            losses.update(loss_mask)

    return losses
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_bboxes_ignore,
                  gt_labels,
                  gt_masks=None,
                  proposals=None):
    losses = dict()

    x = self.extract_feat(img)

    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
        losses.update(rpn_losses)

        proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
        proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
    else:
        proposal_list = proposals

    if self.with_bbox:
        (pos_proposals, neg_proposals, pos_assigned_gt_inds, pos_gt_bboxes,
         pos_gt_labels) = multi_apply(
             sample_bboxes,
             proposal_list,
             gt_bboxes,
             gt_bboxes_ignore,
             gt_labels,
             cfg=self.train_cfg.rcnn)
        (labels, label_weights, bbox_targets,
         bbox_weights) = self.bbox_head.get_bbox_target(
             pos_proposals, neg_proposals, pos_gt_bboxes, pos_gt_labels,
             self.train_cfg.rcnn)

        rois = bbox2roi([
            torch.cat([pos, neg], dim=0)
            for pos, neg in zip(pos_proposals, neg_proposals)
        ])
        # TODO: a more flexible way to configure which feature maps to use
        roi_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        cls_score, bbox_pred = self.bbox_head(roi_feats)

        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, labels,
                                        label_weights, bbox_targets,
                                        bbox_weights)
        losses.update(loss_bbox)

    if self.with_mask:
        mask_targets = self.mask_head.get_mask_target(
            pos_proposals, pos_assigned_gt_inds, gt_masks,
            self.train_cfg.rcnn)
        pos_rois = bbox2roi(pos_proposals)
        mask_feats = self.mask_roi_extractor(
            x[:self.mask_roi_extractor.num_inputs], pos_rois)
        mask_pred = self.mask_head(mask_feats)
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        torch.cat(pos_gt_labels))
        losses.update(loss_mask)

    return losses
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    num_imgs = img.size(0)
    if gt_bboxes_ignore is None:
        gt_bboxes_ignore = [None for _ in range(num_imgs)]

    for i in range(self.num_stages):
        rcnn_train_cfg = self.train_cfg.rcnn[i]
        lw = self.train_cfg.stage_loss_weights[i]

        # assign gts and sample proposals
        assign_results, sampling_results = multi_apply(
            assign_and_sample,
            proposal_list,
            gt_bboxes,
            gt_bboxes_ignore,
            gt_labels,
            cfg=rcnn_train_cfg)

        # bbox head forward and loss
        bbox_roi_extractor = self.bbox_roi_extractor[i]
        bbox_head = self.bbox_head[i]

        rois = bbox2roi([res.bboxes for res in sampling_results])
        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
                                        rois)
        cls_score, bbox_pred = bbox_head(bbox_feats)

        bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes,
                                            gt_labels, rcnn_train_cfg)
        loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
        for name, value in loss_bbox.items():
            losses['s{}.{}'.format(
                i, name)] = (value * lw if 'loss' in name else value)

        # mask head forward and loss
        if self.with_mask:
            mask_roi_extractor = self.mask_roi_extractor[i]
            mask_head = self.mask_head[i]
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = mask_roi_extractor(
                x[:mask_roi_extractor.num_inputs], pos_rois)
            mask_pred = mask_head(mask_feats)

            mask_targets = mask_head.get_target(sampling_results, gt_masks,
                                                rcnn_train_cfg)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels)
            for name, value in loss_mask.items():
                losses['s{}.{}'.format(
                    i, name)] = (value * lw if 'loss' in name else value)

        # refine bboxes
        if i < self.num_stages - 1:
            pos_is_gts = [res.pos_is_gt for res in sampling_results]
            roi_labels = bbox_targets[0]  # bbox_targets is a tuple
            with torch.no_grad():
                proposal_list = bbox_head.refine_bboxes(
                    rois, roi_labels, bbox_pred, pos_is_gts, img_meta)

    return losses
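# --- Illustrative sketch (standalone; helper name is hypothetical) ---
# The cascade training above prefixes each stage's losses with 's{i}.' and
# scales only the entries whose name contains 'loss' by the per-stage
# weight, leaving metrics such as accuracy untouched:
import torch


def merge_stage_losses(losses, stage_losses, stage_idx, stage_weight):
    """Fold one stage's loss dict into the global dict, weighted."""
    for name, value in stage_losses.items():
        losses['s{}.{}'.format(stage_idx, name)] = (
            value * stage_weight if 'loss' in name else value)
    return losses


# Example with the common cascade weights [1, 0.5, 0.25]:
# merge_stage_losses({}, {'loss_cls': torch.tensor(0.8),
#                         'acc': torch.tensor(92.0)}, 1, 0.5)
# -> {'s1.loss_cls': tensor(0.4000), 's1.acc': tensor(92.)}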
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    # convert gt_masks to gt_obbs
    gt_obbs = gt_mask_bp_obbs_list(gt_masks)
    gt_obbs = [choose_best_Rroi_batch(gt) for gt in gt_obbs]

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, gt_labels, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals (hbb assign)
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn[0].assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn[0].sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        # rbbox targets
        gt_obbs_torch = [
            torch.from_numpy(gt).float().to(rois.device) for gt in gt_obbs
        ]
        rbbox_targets = self.bbox_head.get_target(sampling_results,
                                                  gt_obbs_torch, gt_labels,
                                                  self.train_cfg.rcnn[0])
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *rbbox_targets)
        for name, value in loss_bbox.items():
            losses['s{}.{}'.format(0, name)] = value

    return losses
def aug_test(self, imgs, img_metas, proposals=None, rescale=None):
    proposal_list = self.aug_test_rpn_rotate(
        self.extract_feats(imgs), img_metas, self.test_cfg.rpn)

    rcnn_test_cfg = self.test_cfg.rcnn
    aug_rbboxes = []
    aug_rscores = []
    for x, img_meta in zip(self.extract_feats(imgs), img_metas):
        # only one image in the batch
        img_shape = img_meta[0]['img_shape']
        scale_factor = img_meta[0]['scale_factor']
        flip = img_meta[0]['flip']

        proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
                                 scale_factor, flip)
        angle = img_meta[0]['angle']
        if angle != 0:
            proposals = bbox_rotate_mapping(proposal_list[0][:, :4],
                                            img_shape, angle)
        rois = bbox2roi([proposals])
        # recompute feature maps to save GPU memory
        roi_feats = self.bbox_roi_extractor(
            x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
        if self.with_shared_head:
            roi_feats = self.shared_head(roi_feats)
        cls_score, bbox_pred = self.bbox_head(roi_feats)

        bbox_label = cls_score.argmax(dim=1)
        rrois = self.bbox_head.regress_by_class_rbbox(
            roi2droi(rois), bbox_label, bbox_pred, img_meta[0])
        rrois_enlarge = copy.deepcopy(rrois)
        rrois_enlarge[:, 3] = rrois_enlarge[:, 3] * \
            self.rbbox_roi_extractor.w_enlarge
        rrois_enlarge[:, 4] = rrois_enlarge[:, 4] * \
            self.rbbox_roi_extractor.h_enlarge
        rbbox_feats = self.rbbox_roi_extractor(
            x[:len(self.rbbox_roi_extractor.featmap_strides)],
            rrois_enlarge)
        if self.with_shared_head_rbbox:
            rbbox_feats = self.shared_head_rbbox(rbbox_feats)

        rcls_score, rbbox_pred = self.rbbox_head(rbbox_feats)
        rbboxes, rscores = self.rbbox_head.get_det_rbboxes(
            rrois,
            rcls_score,
            rbbox_pred,
            img_shape,
            scale_factor,
            rescale=rescale,
            cfg=None)
        aug_rbboxes.append(rbboxes)
        aug_rscores.append(rscores)

    merged_rbboxes, merged_rscores = merge_rotate_aug_bboxes(
        aug_rbboxes, aug_rscores, img_metas, rcnn_test_cfg)
    det_rbboxes, det_rlabels = multiclass_nms_rbbox(
        merged_rbboxes, merged_rscores, rcnn_test_cfg.score_thr,
        rcnn_test_cfg.nms, rcnn_test_cfg.max_per_img)

    if rescale:
        _det_rbboxes = det_rbboxes
    else:
        _det_rbboxes = det_rbboxes.clone()
        _det_rbboxes[:, :4] *= img_metas[0][0]['scale_factor']

    rbbox_results = dbbox2result(_det_rbboxes, det_rlabels,
                                 self.rbbox_head.num_classes)
    return rbbox_results
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_polygons,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    x = self.extract_feat(img)
    if self.with_attention:
        x = self.add_attention(x)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    if self.with_deform_adjust:
        x = self.deform_adjust(x, rpn_outs[1])

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_polygons[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_polygons,
                                                 gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
        mask_feats = self.mask_roi_extractor(
            x[:self.mask_roi_extractor.num_inputs], pos_rois)
        mask_pred = self.mask_head(mask_feats)

        mask_targets = self.mask_head.get_target(sampling_results, gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    return losses
def simple_test(self, x, proposal_list, img_metas, rescale=False):
    if self.with_semantic:
        _, semantic_feat = self.semantic_head(x)
    else:
        semantic_feat = None

    img_shape = img_metas[0]['img_shape']
    ori_shape = img_metas[0]['ori_shape']
    scale_factor = img_metas[0]['scale_factor']

    # "ms" in variable names means multi-stage
    ms_bbox_result = {}
    ms_segm_result = {}
    ms_scores = []
    rcnn_test_cfg = self.test_cfg

    rois = bbox2roi(proposal_list)
    for i in range(self.num_stages):
        bbox_head = self.bbox_head[i]
        bbox_results = self._bbox_forward(
            i, x, rois, semantic_feat=semantic_feat)
        ms_scores.append(bbox_results['cls_score'])

        if i < self.num_stages - 1:
            bbox_label = bbox_results['cls_score'].argmax(dim=1)
            rois = bbox_head.regress_by_class(rois, bbox_label,
                                              bbox_results['bbox_pred'],
                                              img_metas[0])

    cls_score = sum(ms_scores) / float(len(ms_scores))
    det_bboxes, det_labels = self.bbox_head[-1].get_bboxes(
        rois,
        cls_score,
        bbox_results['bbox_pred'],
        img_shape,
        scale_factor,
        rescale=rescale,
        cfg=rcnn_test_cfg)
    bbox_result = bbox2result(det_bboxes, det_labels,
                              self.bbox_head[-1].num_classes)
    ms_bbox_result['ensemble'] = bbox_result

    if self.with_mask:
        if det_bboxes.shape[0] == 0:
            mask_classes = self.mask_head[-1].num_classes
            segm_result = [[] for _ in range(mask_classes)]
        else:
            _bboxes = (det_bboxes[:, :4] *
                       det_bboxes.new_tensor(scale_factor)
                       if rescale else det_bboxes)
            mask_rois = bbox2roi([_bboxes])
            aug_masks = []
            mask_roi_extractor = self.mask_roi_extractor[-1]
            mask_feats = mask_roi_extractor(
                x[:len(mask_roi_extractor.featmap_strides)], mask_rois)
            if self.with_semantic and 'mask' in self.semantic_fusion:
                mask_semantic_feat = self.semantic_roi_extractor(
                    [semantic_feat], mask_rois)
                mask_feats += mask_semantic_feat
            last_feat = None
            for i in range(self.num_stages):
                mask_head = self.mask_head[i]
                if self.mask_info_flow:
                    mask_pred, last_feat = mask_head(mask_feats, last_feat)
                else:
                    mask_pred = mask_head(mask_feats)
                aug_masks.append(mask_pred.sigmoid().cpu().numpy())
            merged_masks = merge_aug_masks(aug_masks,
                                           [img_metas] * self.num_stages,
                                           self.test_cfg)
            segm_result = self.mask_head[-1].get_seg_masks(
                merged_masks, _bboxes, det_labels, rcnn_test_cfg, ori_shape,
                scale_factor, rescale)
        ms_segm_result['ensemble'] = segm_result

    if self.with_mask:
        results = (ms_bbox_result['ensemble'], ms_segm_result['ensemble'])
    else:
        results = ms_bbox_result['ensemble']

    return results
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    """
    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.
        img_meta (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.
        gt_bboxes (list[Tensor]): each item are the truth boxes for each
            image in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): class indices corresponding to each box
        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss.
        gt_masks (None | Tensor) : true segmentation masks for each box
            used if the architecture supports a segmentation task.
        proposals : override rpn proposals with custom proposals. Use when
            `with_rpn` is False.

    Returns:
        dict[str, Tensor]: a dictionary of loss components
    """
    x = self.extract_feat(img)

    losses = dict()

    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    for i in range(self.num_stages):
        self.current_stage = i
        rcnn_train_cfg = self.train_cfg.rcnn[i]
        lw = self.train_cfg.stage_loss_weights[i]

        # assign gts and sample proposals
        sampling_results = []
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(rcnn_train_cfg.assigner)
            bbox_sampler = build_sampler(
                rcnn_train_cfg.sampler, context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]

            for j in range(num_imgs):
                assign_result = bbox_assigner.assign(
                    proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j],
                    gt_labels[j])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[j],
                    gt_bboxes[j],
                    gt_labels[j],
                    feats=[lvl_feat[j][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        bbox_roi_extractor = self.bbox_roi_extractor[i]
        bbox_head = self.bbox_head[i]

        rois = bbox2roi([res.bboxes for res in sampling_results])

        if len(rois) == 0:
            # If there are no predicted and/or truth boxes, then we cannot
            # compute head / mask losses
            continue

        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
                                        rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = bbox_head(bbox_feats)

        bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes,
                                            gt_labels, rcnn_train_cfg,
                                            img_meta)
        loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
        for name, value in loss_bbox.items():
            losses['s{}.{}'.format(
                i, name)] = (value * lw if 'loss' in name else value)

        # mask head forward and loss
        if self.with_mask:
            if not self.share_roi_extractor:
                mask_roi_extractor = self.mask_roi_extractor[i]
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = mask_roi_extractor(
                    x[:mask_roi_extractor.num_inputs], pos_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
            else:
                # reuse positive bbox feats
                pos_inds = []
                device = bbox_feats.device
                for res in sampling_results:
                    pos_inds.append(
                        torch.ones(
                            res.pos_bboxes.shape[0],
                            device=device,
                            dtype=torch.uint8))
                    pos_inds.append(
                        torch.zeros(
                            res.neg_bboxes.shape[0],
                            device=device,
                            dtype=torch.uint8))
                pos_inds = torch.cat(pos_inds)
                mask_feats = bbox_feats[pos_inds]
            mask_head = self.mask_head[i]
            mask_pred = mask_head(mask_feats)

            mask_targets = mask_head.get_target(sampling_results, gt_masks,
                                                rcnn_train_cfg)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels)
            for name, value in loss_mask.items():
                losses['s{}.{}'.format(
                    i, name)] = (value * lw if 'loss' in name else value)

        # refine bboxes
        if i < self.num_stages - 1:
            pos_is_gts = [res.pos_is_gt for res in sampling_results]
            roi_labels = bbox_targets[0]  # bbox_targets is a tuple
            with torch.no_grad():
                proposal_list = bbox_head.refine_bboxes(
                    rois, roi_labels, bbox_pred, pos_is_gts, img_meta)

    return losses
def aug_test(self, img_feats, img_metas, proposals=None, rescale=False):
    """Test with augmentations.

    If rescale is False, then returned bboxes and masks will fit the scale
    of imgs[0].
    """
    if self.with_semantic:
        semantic_feats = [
            self.semantic_head(feat)[1] for feat in img_feats
        ]
    else:
        semantic_feats = [None] * len(img_metas)

    # recompute feats to save memory
    proposal_list = self.aug_test_rpn(img_feats, img_metas,
                                      self.test_cfg.rpn)

    rcnn_test_cfg = self.test_cfg
    aug_bboxes = []
    aug_scores = []
    for x, img_meta, semantic in zip(img_feats, img_metas, semantic_feats):
        # only one image in the batch
        img_shape = img_meta[0]['img_shape']
        scale_factor = img_meta[0]['scale_factor']
        flip = img_meta[0]['flip']
        flip_direction = img_meta[0]['flip_direction']

        proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
                                 scale_factor, flip, flip_direction)
        # "ms" in variable names means multi-stage
        ms_scores = []

        rois = bbox2roi([proposals])
        for i in range(self.num_stages):
            bbox_head = self.bbox_head[i]
            bbox_results = self._bbox_forward(
                i, x, rois, semantic_feat=semantic)
            ms_scores.append(bbox_results['cls_score'])

            if i < self.num_stages - 1:
                bbox_label = bbox_results['cls_score'].argmax(dim=1)
                rois = bbox_head.regress_by_class(
                    rois, bbox_label, bbox_results['bbox_pred'],
                    img_meta[0])

        cls_score = sum(ms_scores) / float(len(ms_scores))
        bboxes, scores = self.bbox_head[-1].get_bboxes(
            rois,
            cls_score,
            bbox_results['bbox_pred'],
            img_shape,
            scale_factor,
            rescale=False,
            cfg=None)
        aug_bboxes.append(bboxes)
        aug_scores.append(scores)

    # after merging, bboxes will be rescaled to the original image size
    merged_bboxes, merged_scores = merge_aug_bboxes(
        aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
    det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores,
                                            rcnn_test_cfg.score_thr,
                                            rcnn_test_cfg.nms,
                                            rcnn_test_cfg.max_per_img)

    bbox_result = bbox2result(det_bboxes, det_labels,
                              self.bbox_head[-1].num_classes)

    if self.with_mask:
        if det_bboxes.shape[0] == 0:
            segm_result = [
                [] for _ in range(self.mask_head[-1].num_classes - 1)
            ]
        else:
            aug_masks = []
            aug_img_metas = []
            for x, img_meta, semantic in zip(img_feats, img_metas,
                                             semantic_feats):
                img_shape = img_meta[0]['img_shape']
                scale_factor = img_meta[0]['scale_factor']
                flip = img_meta[0]['flip']
                flip_direction = img_meta[0]['flip_direction']
                _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
                                       scale_factor, flip, flip_direction)
                mask_rois = bbox2roi([_bboxes])
                mask_feats = self.mask_roi_extractor[-1](
                    x[:len(self.mask_roi_extractor[-1].featmap_strides)],
                    mask_rois)
                if self.with_semantic:
                    semantic_feat = semantic
                    mask_semantic_feat = self.semantic_roi_extractor(
                        [semantic_feat], mask_rois)
                    if mask_semantic_feat.shape[-2:] != mask_feats.shape[
                            -2:]:
                        mask_semantic_feat = F.adaptive_avg_pool2d(
                            mask_semantic_feat, mask_feats.shape[-2:])
                    mask_feats += mask_semantic_feat
                last_feat = None
                for i in range(self.num_stages):
                    mask_head = self.mask_head[i]
                    if self.mask_info_flow:
                        mask_pred, last_feat = mask_head(
                            mask_feats, last_feat)
                    else:
                        mask_pred = mask_head(mask_feats)
                    aug_masks.append(mask_pred.sigmoid().cpu().numpy())
                    aug_img_metas.append(img_meta)
            merged_masks = merge_aug_masks(aug_masks, aug_img_metas,
                                           self.test_cfg)

            ori_shape = img_metas[0][0]['ori_shape']
            segm_result = self.mask_head[-1].get_seg_masks(
                merged_masks,
                det_bboxes,
                det_labels,
                rcnn_test_cfg,
                ori_shape,
                scale_factor=1.0,
                rescale=False)
        return bbox_result, segm_result
    else:
        return bbox_result
def aug_test(self, imgs, img_metas, proposals=None, rescale=False):
    """Test with augmentations.

    If rescale is False, then returned bboxes and masks will fit the scale
    of imgs[0].
    """
    # recompute feats to save memory
    base_feats = list(self.extract_feats(imgs))
    proposal_list = self.aug_test_rpn(base_feats, img_metas,
                                      self.test_cfg.rpn)

    rcnn_test_cfg = self.test_cfg.rcnn
    aug_bboxes = []
    aug_scores = []
    aug_variance = []
    for x, img_meta in zip(base_feats, img_metas):
        # only one image in the batch
        img_shape = img_meta[0]['img_shape']
        scale_factor = img_meta[0]['scale_factor']
        flip = img_meta[0]['flip']
        flip_direction = img_meta[0].get('flip_direction', 'horizontal')

        proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
                                 scale_factor, flip, flip_direction)
        # "ms" in variable names means multi-stage
        ms_scores = []

        rois = bbox2roi([proposals])
        for i in range(self.num_stages):
            bbox_roi_extractor = self.bbox_roi_extractor[i]
            bbox_head = self.bbox_head[i]

            bbox_feats = bbox_roi_extractor(
                x[:len(bbox_roi_extractor.featmap_strides)], rois)
            if self.with_shared_head:
                bbox_feats = self.shared_head(bbox_feats)

            cls_score, bbox_pred = bbox_head(bbox_feats)
            ms_scores.append(cls_score)

            if i < self.num_stages - 1:
                bbox_label = cls_score.argmax(dim=1)
                rois = bbox_head.regress_by_class(rois, bbox_label,
                                                  bbox_pred, img_meta[0])

        cls_score = sum(ms_scores) / float(len(ms_scores))
        bboxes, scores = self.bbox_head[-1].get_det_bboxes(
            rois,
            cls_score,
            bbox_pred,
            img_shape,
            scale_factor,
            rescale=False,
            cfg=None)
        # notice: bboxes may be a (bboxes, variance) tuple
        if isinstance(bboxes, tuple):
            aug_bboxes.append(bboxes[0])
            aug_variance.append(bboxes[1])
        else:
            aug_bboxes.append(bboxes)
        aug_scores.append(scores)

    return_variance = rcnn_test_cfg.get('return_variance', False)
    if (len(aug_variance) == 0
            or rcnn_test_cfg.nms.type != 'soft_nms_variance_voting'):
        merged_bboxes, merged_scores = merge_aug_bboxes(
            aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
        det_bboxes, det_labels = multiclass_nms(merged_bboxes,
                                                merged_scores,
                                                rcnn_test_cfg.score_thr,
                                                rcnn_test_cfg.nms,
                                                rcnn_test_cfg.max_per_img)
    else:
        merged_bboxes, merged_scores, merged_variance = \
            merge_aug_bboxes_variance(aug_bboxes, aug_scores, aug_variance,
                                      img_metas, rcnn_test_cfg)
        det_bboxes, det_labels = multiclass_soft_nms_variance_voting(
            merged_bboxes,
            merged_variance,
            merged_scores,
            rcnn_test_cfg.score_thr,
            rcnn_test_cfg.nms,
            rcnn_test_cfg.max_per_img,
            return_variance=return_variance)

    bbox_result = bbox2result(det_bboxes, det_labels,
                              self.bbox_head[-1].num_classes,
                              with_variance=return_variance)

    if self.with_mask:
        if det_bboxes.shape[0] == 0:
            segm_result = [
                [] for _ in range(self.mask_head[-1].num_classes - 1)
            ]
        else:
            aug_masks = []
            aug_img_metas = []
            for x, img_meta in zip(self.extract_feats(imgs), img_metas):
                img_shape = img_meta[0]['img_shape']
                scale_factor = img_meta[0]['scale_factor']
                flip = img_meta[0]['flip']
                flip_direction = img_meta[0].get('flip_direction',
                                                 'horizontal')
                _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
                                       scale_factor, flip, flip_direction)
                mask_rois = bbox2roi([_bboxes])
                for i in range(self.num_stages):
                    mask_feats = self.mask_roi_extractor[i](
                        x[:len(self.mask_roi_extractor[i].featmap_strides
                               )], mask_rois)
                    if self.with_shared_head:
                        mask_feats = self.shared_head(mask_feats)
                    mask_pred = self.mask_head[i](mask_feats)
                    aug_masks.append(mask_pred.sigmoid().cpu().numpy())
                    aug_img_metas.append(img_meta)
            merged_masks = merge_aug_masks(aug_masks, aug_img_metas,
                                           self.test_cfg.rcnn)
            ori_shape = img_metas[0][0]['ori_shape']
            segm_result = self.mask_head[-1].get_seg_masks(
                merged_masks,
                det_bboxes,
                det_labels,
                rcnn_test_cfg,
                ori_shape,
                scale_factor=1.0,
                rescale=False)
        return bbox_result, segm_result
    else:
        return bbox_result
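# A rough sketch (assumptions noted inline, not the actual mmdet helper) of
# what `bbox_mapping` does for a horizontally flipped test augmentation:
# scale the boxes into the augmented image, then mirror the x-coordinates.
# The real helper also handles vertical flips and array scale factors, and
# its exact flip arithmetic varies between versions.
import torch

def bbox_mapping_sketch(bboxes, img_shape, scale_factor, flip,
                        flip_direction='horizontal'):
    new_bboxes = bboxes * scale_factor            # into augmented-image scale
    if flip and flip_direction == 'horizontal':
        w = img_shape[1]                          # img_shape is (h, w, c)
        flipped = new_bboxes.clone()
        flipped[:, 0] = w - new_bboxes[:, 2]      # x1' = W - x2
        flipped[:, 2] = w - new_bboxes[:, 0]      # x2' = W - x1
        new_bboxes = flipped
    return new_bboxes

boxes = torch.tensor([[10., 20., 50., 80.]])
mapped = bbox_mapping_sketch(boxes, img_shape=(480, 640, 3),
                             scale_factor=1.0, flip=True)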
def forward_train(self, img, img_meta, gt_bboxes, gt_labels,
                  gt_bboxes_ignore=None, gt_masks=None, proposals=None):
    """
    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.
        img_meta (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.
        gt_bboxes (list[Tensor]): the truth boxes for each image in
            [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): class indices corresponding to each box.
        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss.
        gt_masks (None | Tensor): true segmentation masks for each box,
            used if the architecture supports a segmentation task.
        proposals: override rpn proposals with custom proposals. Use when
            `with_rpn` is False.

    Returns:
        dict[str, Tensor]: a dictionary of loss components
    """
    x = self.extract_feat(img)
    # x is a tuple; for a ResNet backbone it can contain the outputs of
    # several conv stages
    conv4 = (x[0], )
    conv5 = x[1]

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(conv4)  # the RPN runs on ResNet conv4
        # rpn_outs = rpn_cls_score + rpn_bbox_pred
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    large_seperate_conv_output = (self.large_seperate_conv(conv5), )

    # assign gts and sample proposals
    if self.with_bbox:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # rois are based on the feature-map size, not the original image
        # size; each roi is (roi_batch_ind, x1, y1, x2, y2)
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            large_seperate_conv_output[:self.bbox_roi_extractor.
                                       num_inputs], rois)
        # bbox_feats are based on the feature-map size; to work at the
        # original image size, pass in a scale_factor
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    return losses
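# A minimal, self-contained sketch (hypothetical standalone version, for
# illustration) of the `bbox2roi` convention used throughout: prepend a
# batch-index column to each image's (N, 4) box tensor and concatenate,
# yielding a (total_N, 5) roi tensor of (batch_ind, x1, y1, x2, y2).
import torch

def bbox2roi_sketch(bbox_list):
    """Convert per-image bboxes [(n_i, 4+)] to rois (sum_i n_i, 5)."""
    rois_list = []
    for img_id, bboxes in enumerate(bbox_list):
        inds = bboxes.new_full((bboxes.size(0), 1), img_id)  # batch index
        rois_list.append(torch.cat([inds, bboxes[:, :4]], dim=-1))
    return torch.cat(rois_list, dim=0)

rois = bbox2roi_sketch([torch.rand(2, 4) * 100, torch.rand(3, 4) * 100])
# rois.shape == (5, 5); column 0 holds the image index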
def simple_test(self, img, img_meta, proposals=None, rescale=False):
    x = self.extract_feat(img)
    proposal_list = self.simple_test_rpn(
        x, img_meta, self.test_cfg.rpn) if proposals is None else proposals

    img_shape = img_meta[0]['img_shape']
    ori_shape = img_meta[0]['ori_shape']
    scale_factor = img_meta[0]['scale_factor']

    # "ms" in variable names means multi-stage
    ms_bbox_result = {}
    ms_segm_result = {}
    ms_scores = []
    rcnn_test_cfg = self.test_cfg.rcnn

    rois = bbox2roi(proposal_list)
    for i in range(self.num_stages):
        bbox_roi_extractor = self.bbox_roi_extractor[i]
        bbox_head = self.bbox_head[i]

        bbox_feats = bbox_roi_extractor(
            x[:len(bbox_roi_extractor.featmap_strides)], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)

        cls_score, bbox_pred = bbox_head(bbox_feats)
        ms_scores.append(cls_score)

        if self.test_cfg.keep_all_stages:
            det_bboxes, det_labels = bbox_head.get_det_bboxes(
                rois,
                cls_score,
                bbox_pred,
                img_shape,
                scale_factor,
                rescale=rescale,
                cfg=rcnn_test_cfg)
            bbox_result = bbox2result(det_bboxes, det_labels,
                                      bbox_head.num_classes)
            ms_bbox_result['stage{}'.format(i)] = bbox_result

            if self.with_mask:
                mask_roi_extractor = self.mask_roi_extractor[i]
                mask_head = self.mask_head[i]
                if det_bboxes.shape[0] == 0:
                    mask_classes = mask_head.num_classes - 1
                    segm_result = [[] for _ in range(mask_classes)]
                else:
                    _bboxes = (det_bboxes[:, :4] * scale_factor
                               if rescale else det_bboxes)
                    mask_rois = bbox2roi([_bboxes])
                    mask_feats = mask_roi_extractor(
                        x[:len(mask_roi_extractor.featmap_strides)],
                        mask_rois)
                    if self.with_shared_head:
                        mask_feats = self.shared_head(mask_feats, i)
                    mask_pred = mask_head(mask_feats)
                    segm_result = mask_head.get_seg_masks(
                        mask_pred, _bboxes, det_labels, rcnn_test_cfg,
                        ori_shape, scale_factor, rescale)
                ms_segm_result['stage{}'.format(i)] = segm_result

        if i < self.num_stages - 1:
            bbox_label = cls_score.argmax(dim=1)
            rois = bbox_head.regress_by_class(rois, bbox_label, bbox_pred,
                                              img_meta[0])

    cls_score = sum(ms_scores) / self.num_stages
    # if not self.with_IoU:
    iou_pred = None
    det_bboxes, det_labels = self.bbox_head[-1].get_det_bboxes(
        rois,
        cls_score,
        bbox_pred,
        iou_pred,
        img_shape,
        scale_factor,
        rescale=rescale,
        cfg=rcnn_test_cfg)
    bbox_result = bbox2result(det_bboxes, det_labels,
                              self.bbox_head[-1].num_classes)
    ms_bbox_result['ensemble'] = bbox_result

    if self.with_mask:
        if det_bboxes.shape[0] == 0:
            mask_classes = self.mask_head[-1].num_classes - 1
            segm_result = [[] for _ in range(mask_classes)]
        else:
            if isinstance(scale_factor, float):  # aspect ratio fixed
                _bboxes = (det_bboxes[:, :4] * scale_factor
                           if rescale else det_bboxes)
            else:
                _bboxes = (
                    det_bboxes[:, :4] *
                    torch.from_numpy(scale_factor).to(det_bboxes.device)
                    if rescale else det_bboxes)
            mask_rois = bbox2roi([_bboxes])
            aug_masks = []
            for i in range(self.num_stages):
                mask_roi_extractor = self.mask_roi_extractor[i]
                mask_feats = mask_roi_extractor(
                    x[:len(mask_roi_extractor.featmap_strides)], mask_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
                mask_pred = self.mask_head[i](mask_feats)
                aug_masks.append(mask_pred.sigmoid().cpu().numpy())
            merged_masks = merge_aug_masks(aug_masks,
                                           [img_meta] * self.num_stages,
                                           self.test_cfg.rcnn)
            segm_result = self.mask_head[-1].get_seg_masks(
                merged_masks, _bboxes, det_labels, rcnn_test_cfg,
                ori_shape, scale_factor, rescale)
        ms_segm_result['ensemble'] = segm_result

    if not self.test_cfg.keep_all_stages:
        if self.with_mask:
            results = (ms_bbox_result['ensemble'],
                       ms_segm_result['ensemble'])
        else:
            results = ms_bbox_result['ensemble']
    else:
        if self.with_mask:
            results = {
                stage: (ms_bbox_result[stage], ms_segm_result[stage])
                for stage in ms_bbox_result
            }
        else:
            results = ms_bbox_result

    return results
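# Simplified sketch of the mask-merging step above: assuming every
# stage/augmentation prediction has already been mapped to the same
# orientation, merging reduces to averaging the per-RoI sigmoid mask
# probabilities. The real `merge_aug_masks` also un-flips flipped
# predictions using each img_meta and supports per-input weights.
import numpy as np

aug_masks = [np.random.rand(5, 80, 28, 28) for _ in range(3)]  # 3 stages
merged_masks = np.mean(aug_masks, axis=0)                      # (5, 80, 28, 28)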
def simple_test_seq(self, x, img_metas, det_bboxes, det_labels,
                    rescale=False):
    """Simple test for seq head without augmentation."""
    # image shapes of images in the batch
    ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
    scale_factors = tuple(meta['scale_factor'] for meta in img_metas)

    num_imgs = len(det_bboxes)
    if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
        rec_results = [[[] for _ in range(self.seq_head.num_classes)]
                       for _ in range(num_imgs)]
    else:
        # if det_bboxes were rescaled to the original image size, rescale
        # them back to the testing scale to obtain RoIs
        if rescale and not isinstance(scale_factors[0], float):
            scale_factors = [
                torch.from_numpy(scale_factor).to(det_bboxes[0].device)
                for scale_factor in scale_factors
            ]

        if torch.onnx.is_in_onnx_export():
            # avoid splitting seq_pred with a static number of predictions
            seq_preds = []
            _bboxes = []
            for i, boxes in enumerate(det_bboxes):
                boxes = boxes[:, :4]
                if rescale:
                    boxes *= scale_factors[i]
                _bboxes.append(boxes)
                img_inds = boxes[:, :1].clone() * 0 + i
                seq_rois = torch.cat([img_inds, boxes], dim=-1)
                seq_result = self._seq_forward(x, seq_rois)
                seq_preds.append(seq_result['seq_pred'])
        else:
            _bboxes = [
                det_bboxes[i][:, :4] * scale_factors[i]
                if rescale else det_bboxes[i][:, :4]
                for i in range(len(det_bboxes))
            ]
            seq_rois = bbox2roi(_bboxes)
            seq_results = self._seq_forward(x, seq_rois)
            seq_pred = seq_results['seq_pred']
            # split batch seq prediction back to each image
            num_seq_roi_per_img = [
                det_bbox.shape[0] for det_bbox in det_bboxes
            ]
            seq_preds = seq_pred.split(num_seq_roi_per_img, 0)

        # apply seq post-processing to each image individually
        rec_results = []
        for i in range(num_imgs):
            if det_bboxes[i].shape[0] == 0:
                rec_results.append(
                    [[] for _ in range(self.seq_head.num_classes)])
            else:
                rec_result = self.seq_head.get_rec_seqs(
                    seq_preds[i], _bboxes[i], det_labels[i], self.test_cfg,
                    ori_shapes[i], scale_factors[i], rescale)
                rec_results.append(rec_result)
    return rec_results
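# Minimal sketch of the batch-to-image split used above: RoI-level
# predictions for the whole batch are split back per image by the number of
# detected boxes each image contributed (zero-size splits are allowed).
import torch

num_seq_roi_per_img = [3, 0, 2]                     # boxes per image
seq_pred = torch.rand(sum(num_seq_roi_per_img), 10)
seq_preds = seq_pred.split(num_seq_roi_per_img, 0)  # (3,10), (0,10), (2,10)
assert [p.size(0) for p in seq_preds] == num_seq_roi_per_img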
def simple_test_bboxes(self, x, img_meta, proposals, rcnn_test_cfg,
                       rescale=False):
    """Test only det bboxes without augmentation."""
    rois = bbox2roi(proposals)
    roi_feats = self.bbox_roi_extractor(
        x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
    cls_score, bbox_pred = self.bbox_head(roi_feats)

    img_shape = img_meta[0]['img_shape']
    scale_factor = img_meta[0]['scale_factor']
    is_first = img_meta[0]['is_first']
    det_bboxes, det_labels = self.bbox_head.get_det_bboxes(
        rois,
        cls_score,
        bbox_pred,
        img_shape,
        scale_factor,
        rescale=rescale,
        cfg=rcnn_test_cfg)

    if det_bboxes.nelement() == 0:
        det_obj_ids = np.array([], dtype=np.int64)
        if is_first:
            self.prev_bboxes = None
            self.prev_roi_feats = None
            self.prev_det_labels = None
        return det_bboxes, det_labels, det_obj_ids

    res_det_bboxes = det_bboxes.clone()
    if rescale:
        res_det_bboxes[:, :4] *= scale_factor

    det_rois = bbox2roi([res_det_bboxes])
    det_roi_feats = self.bbox_roi_extractor(
        x[:self.bbox_roi_extractor.num_inputs], det_rois)
    # recompute bbox match feature
    if is_first or (not is_first and self.prev_bboxes is None):
        det_obj_ids = np.arange(det_bboxes.size(0))
        # save bboxes and features for later matching
        self.prev_bboxes = det_bboxes
        self.prev_roi_feats = det_roi_feats
        self.prev_det_labels = det_labels
    else:
        assert self.prev_roi_feats is not None
        # only support one image at a time
        bbox_img_n = [det_bboxes.size(0)]
        prev_bbox_img_n = [self.prev_roi_feats.size(0)]

        match_score = self.track_head(det_roi_feats, self.prev_roi_feats,
                                      bbox_img_n, prev_bbox_img_n)[0]
        match_logprob = torch.nn.functional.log_softmax(match_score, dim=1)
        label_delta = (self.prev_det_labels == det_labels.view(-1,
                                                               1)).float()
        bbox_ious = bbox_overlaps(det_bboxes[:, :4],
                                  self.prev_bboxes[:, :4])
        # compute comprehensive score
        comp_scores = self.track_head.compute_comp_scores(
            match_logprob,
            det_bboxes[:, 4].view(-1, 1),
            bbox_ious,
            label_delta,
            add_bbox_dummy=True)
        match_likelihood, match_ids = torch.max(comp_scores, dim=1)
        # translate match_ids to det_obj_ids: assign a new id to each new
        # object, update the tracking features/bboxes of existing objects,
        # and add the tracking features/bboxes of new objects
        match_ids = match_ids.cpu().numpy().astype(np.int32)
        det_obj_ids = np.ones((match_ids.shape[0]), dtype=np.int32) * (-1)
        best_match_scores = np.ones((self.prev_bboxes.size(0))) * (-100)
        for idx, match_id in enumerate(match_ids):
            if match_id == 0:
                # add a new object
                det_obj_ids[idx] = self.prev_roi_feats.size(0)
                self.prev_roi_feats = torch.cat(
                    (self.prev_roi_feats, det_roi_feats[idx][None]), dim=0)
                self.prev_bboxes = torch.cat(
                    (self.prev_bboxes, det_bboxes[idx][None]), dim=0)
                self.prev_det_labels = torch.cat(
                    (self.prev_det_labels, det_labels[idx][None]), dim=0)
            else:
                # multiple candidates might match the same previous object;
                # keep the one with the largest comprehensive score
                obj_id = match_id - 1
                match_score = comp_scores[idx, match_id]
                if match_score > best_match_scores[obj_id]:
                    det_obj_ids[idx] = obj_id
                    best_match_scores[obj_id] = match_score
                    # update feature
                    self.prev_roi_feats[obj_id] = det_roi_feats[idx]
                    self.prev_bboxes[obj_id] = det_bboxes[idx]
    return det_bboxes, det_labels, det_obj_ids
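# Illustrative sketch of the matching convention above: with
# add_bbox_dummy=True, column 0 of comp_scores is a "new object" dummy, so
# an argmax of 0 starts a new track and an argmax of k > 0 matches previous
# object k - 1. The scores here are made up for the example.
import torch

comp_scores = torch.tensor([[0.2, 0.9, 0.1],    # det 0 -> prev object 0
                            [0.8, 0.3, 0.4]])   # det 1 -> new object
match_likelihood, match_ids = torch.max(comp_scores, dim=1)
for idx, match_id in enumerate(match_ids.tolist()):
    if match_id == 0:
        print('det {}: start a new track'.format(idx))
    else:
        print('det {}: matched to previous object {}'.format(
            idx, match_id - 1))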
def forward_train(self, img, img_meta, gt_bboxes, gt_labels,
                  gt_bboxes_ignore=None, gt_masks=None, proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    # translate gt_masks to gt_obbs
    # gt_obbs = gt_mask_bp_obbs_list(gt_masks)
    # gt_obbs = gt_mask_bp_obbs_list(mask2obb_mask(gt_masks))
    gt_obbs = gt_mask_bp_obbs_list(
        tran2mix_mask(gt_bboxes, gt_masks, gt_labels))

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals (hbb assign)
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn[0].assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn[0].sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        # rbbox targets
        # rbbox_targets = self.bbox_head.get_target(
        #     sampling_results, gt_masks, gt_labels,
        #     self.train_cfg.rcnn[0])
        # rbbox_targets = self.bbox_head.get_target(
        #     sampling_results, mask2obb_mask(gt_masks), gt_labels,
        #     self.train_cfg.rcnn[0])
        rbbox_targets = self.bbox_head.get_target(
            sampling_results, tran2mix_mask(gt_bboxes, gt_masks,
                                            gt_labels), gt_labels,
            self.train_cfg.rcnn[0])
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *rbbox_targets)
        for name, value in loss_bbox.items():
            losses['s{}.{}'.format(0, name)] = value

        pos_is_gts = [res.pos_is_gt for res in sampling_results]
        roi_labels = rbbox_targets[0]
        with torch.no_grad():
            rotated_proposal_list = self.bbox_head.refine_rbboxes(
                roi2droi(rois), roi_labels, bbox_pred, pos_is_gts,
                img_meta)

    # assign gts and sample proposals (rbb assign)
    if self.with_rbbox:
        bbox_assigner = build_assigner(self.train_cfg.rcnn[1].assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn[1].sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            gt_obbs_best_roi = choose_best_Rroi_batch(gt_obbs[i])
            assign_result = bbox_assigner.assign(rotated_proposal_list[i],
                                                 gt_obbs_best_roi,
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                rotated_proposal_list[i],
                torch.from_numpy(gt_obbs_best_roi).float().to(
                    rotated_proposal_list[i].device),
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    if self.with_rbbox:
        # rrois: (batch_ind, x_ctr, y_ctr, w, h, angle)
        rrois = dbbox2roi([res.bboxes for res in sampling_results])
        # feature enlargement
        # rrois[:, 3] = rrois[:, 3] * 1.2
        # rrois[:, 4] = rrois[:, 4] * 1.4
        rrois[:, 3] = rrois[:, 3] * self.rbbox_roi_extractor.w_enlarge
        rrois[:, 4] = rrois[:, 4] * self.rbbox_roi_extractor.h_enlarge
        rbbox_feats = self.rbbox_roi_extractor(
            x[:self.rbbox_roi_extractor.num_inputs], rrois)
        if self.with_shared_head_rbbox:
            rbbox_feats = self.shared_head_rbbox(rbbox_feats)
        cls_score, rbbox_pred = self.rbbox_head(rbbox_feats)

        rbbox_targets = self.rbbox_head.get_target_rbbox(
            sampling_results, gt_obbs, gt_labels, self.train_cfg.rcnn[1])
        loss_rbbox = self.rbbox_head.loss(cls_score, rbbox_pred,
                                          *rbbox_targets)
        for name, value in loss_rbbox.items():
            losses['s{}.{}'.format(1, name)] = value

    # mask head forward and loss
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rrois = dbbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rrois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(res.pos_bboxes.shape[0],
                               device=device,
                               dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(res.neg_bboxes.shape[0],
                                device=device,
                                dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]
        mask_pred = self.mask_head(mask_feats)

        mask_targets = self.mask_head.get_rotate_adaptive_target(
            sampling_results, gt_masks, self.train_cfg.rcnn[1],
            self.mask_roi_extractor.ratio_max)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    return losses
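# Sketch of the rotated-RoI enlargement above: rrois are laid out as
# (batch_ind, x_ctr, y_ctr, w, h, angle), so scaling columns 3 and 4 widens
# and heightens each rotated box about its centre without moving it. The
# factors below are assumptions for the example.
import torch

rrois = torch.tensor([[0., 100., 80., 40., 20., 0.3]])
w_enlarge, h_enlarge = 1.2, 1.4
rrois[:, 3] = rrois[:, 3] * w_enlarge   # widen
rrois[:, 4] = rrois[:, 4] * h_enlarge   # heighten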
def forward_train(self, img, img_meta, gt_bboxes, gt_labels,
                  gt_bboxes_ignore=None, gt_masks=None, proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(res.pos_bboxes.shape[0],
                               device=device,
                               dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(res.neg_bboxes.shape[0],
                                device=device,
                                dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]
        mask_pred = self.mask_head(mask_feats)

        mask_targets = self.mask_head.get_target(sampling_results,
                                                 gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    return losses
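# Minimal sketch of the shared-extractor branch above: instead of re-pooling
# features for the positive boxes, a 0/1 mask over the sampled RoIs
# (positives first, then negatives, per image) selects the positive rows of
# bbox_feats. Counts and shapes here are assumptions for the example.
import torch

pos_counts, neg_counts = [2, 1], [3, 2]            # per-image sampling sizes
pos_inds = torch.cat([
    t for p, n in zip(pos_counts, neg_counts)
    for t in (torch.ones(p, dtype=torch.uint8),
              torch.zeros(n, dtype=torch.uint8))
])
bbox_feats = torch.rand(pos_inds.numel(), 256, 7, 7)
mask_feats = bbox_feats[pos_inds.bool()]           # only positive RoIs
assert mask_feats.shape[0] == sum(pos_counts)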
def forward_train_pair(self, img, img_meta, gt_bboxes, gt_labels,
                       gt_bboxes_ignore=None, gt_masks=None,
                       proposals=None):
    # img: b, 2*c = 6, h, w
    b, c, h, w = img.shape
    img = img.reshape(-1, c // 2, h, w)
    # img: 2*b, c, h, w -> pairs [0, 1], [2, 3], ...
    x = self.extract_feat(img)  # x: tuple of 5 feature levels

    losses = dict()

    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_outs_half = []
        for outs_0 in rpn_outs:
            tmp = []
            for outs_1 in outs_0:
                tmp.append(outs_1[::2, :, :, :])
            rpn_outs_half.append(tmp)
        rpn_loss_inputs = tuple(rpn_outs_half) + (gt_bboxes, img_meta,
                                                  self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        # duplicate the training img_meta per pair: 1,2,3 -> 1,1,2,2,3,3
        img_meta = [img_meta[i // 2] for i in range(2 * len(img_meta))]
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # generate a blank gt for each normal image
    gt_bboxes_ = []
    gt_labels_ = []
    for i in range(len(gt_bboxes)):
        gt_bboxes_.append(gt_bboxes[i])
        gt_bboxes_.append(
            torch.Tensor([[1, 1, 1, 1]]).to(gt_bboxes[i].device))
        gt_labels_.append(gt_labels[i])
        gt_labels_.append(torch.Tensor([[0]]).to(gt_labels[i].device))

    for i in range(self.num_stages):
        self.current_stage = i
        rcnn_train_cfg = self.train_cfg.rcnn[i]
        lw = self.train_cfg.stage_loss_weights[i]

        # assign gts and sample proposals
        sampling_results = []
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(rcnn_train_cfg.assigner)
            bbox_sampler = build_sampler(rcnn_train_cfg.sampler,
                                         context=self)
            assert img.size(0) % 2 == 0
            num_pairs = img.size(0) // 2
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            sampling_results = []
            for j in range(num_pairs):
                i_train = 2 * j
                i_normal = i_train + 1
                assign_result_train = bbox_assigner.assign(
                    proposal_list[i_train], gt_bboxes_[i_train],
                    gt_bboxes_ignore[i_train], gt_labels_[i_train])
                assign_result_normal = bbox_assigner.assign(
                    proposal_list[i_normal], gt_bboxes_[i_normal],
                    gt_bboxes_ignore[i_normal], gt_labels_[i_normal])
                sampling_result_train, sampling_result_normal = \
                    bbox_sampler.pair_sample(
                        assign_result_train,
                        assign_result_normal,
                        proposal_list[i_train],
                        proposal_list[i_normal],
                        gt_bboxes_[i_train],
                        gt_labels_[i_train],
                        feats_train=[
                            lvl_feat[i_train][None] for lvl_feat in x
                        ],
                        feats_normal=[
                            lvl_feat[i_normal][None] for lvl_feat in x
                        ])
                sampling_results.append(sampling_result_train)
                sampling_results.append(sampling_result_normal)

        # bbox head forward and loss
        bbox_roi_extractor = self.bbox_roi_extractor[i]
        bbox_head = self.bbox_head[i]

        rois = bbox2roi([res.bboxes for res in sampling_results])
        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
                                        rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = bbox_head(bbox_feats)

        bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes,
                                            gt_labels, rcnn_train_cfg)
        loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
        for name, value in loss_bbox.items():
            losses['s{}.{}'.format(i, name)] = (
                value * lw if 'loss' in name else value)

        # mask head forward and loss
        if self.with_mask:
            if not self.share_roi_extractor:
                mask_roi_extractor = self.mask_roi_extractor[i]
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = mask_roi_extractor(
                    x[:mask_roi_extractor.num_inputs], pos_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
            else:
                # reuse positive bbox feats
                pos_inds = []
                device = bbox_feats.device
                for res in sampling_results:
                    pos_inds.append(
                        torch.ones(res.pos_bboxes.shape[0],
                                   device=device,
                                   dtype=torch.uint8))
                    pos_inds.append(
                        torch.zeros(res.neg_bboxes.shape[0],
                                    device=device,
                                    dtype=torch.uint8))
                pos_inds = torch.cat(pos_inds)
                mask_feats = bbox_feats[pos_inds]
            mask_head = self.mask_head[i]
            mask_pred = mask_head(mask_feats)

            mask_targets = mask_head.get_target(sampling_results, gt_masks,
                                                rcnn_train_cfg)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels)
            for name, value in loss_mask.items():
                losses['s{}.{}'.format(i, name)] = (
                    value * lw if 'loss' in name else value)

        # refine bboxes
        if i < self.num_stages - 1:
            pos_is_gts = [res.pos_is_gt for res in sampling_results]
            roi_labels = bbox_targets[0]  # bbox_targets is a tuple
            with torch.no_grad():
                proposal_list = bbox_head.refine_bboxes(
                    rois, roi_labels, bbox_pred, pos_is_gts, img_meta)

    return losses
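# Sketch of the stage-wise loss bookkeeping used above: each stage's losses
# are re-keyed as 's{stage}.{name}', and loss terms (keys containing 'loss')
# are scaled by that stage's weight while metrics like accuracy pass through
# unscaled. Plain floats stand in for loss tensors here.
losses = {}
stage, lw = 1, 0.5
loss_bbox = {'loss_cls': 0.8, 'loss_bbox': 0.4, 'acc': 0.9}
for name, value in loss_bbox.items():
    losses['s{}.{}'.format(stage, name)] = (
        value * lw if 'loss' in name else value)
# -> {'s1.loss_cls': 0.4, 's1.loss_bbox': 0.2, 's1.acc': 0.9}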
def simple_test(self, x, proposal_list, img_metas, rescale=False):
    """Test without augmentation."""
    assert self.with_bbox, 'Bbox head must be implemented.'
    num_imgs = len(proposal_list)
    img_shapes = tuple(meta['img_shape'] for meta in img_metas)
    ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
    scale_factors = tuple(meta['scale_factor'] for meta in img_metas)

    # "ms" in variable names means multi-stage
    ms_bbox_result = {}
    ms_segm_result = {}
    ms_scores = []
    rcnn_test_cfg = self.test_cfg

    rois = bbox2roi(proposal_list)
    for i in range(self.num_stages):
        bbox_results = self._bbox_forward(i, x, rois)

        # split batch bbox prediction back to each image
        cls_score = bbox_results['cls_score']
        bbox_pred = bbox_results['bbox_pred']
        num_proposals_per_img = tuple(
            len(proposals) for proposals in proposal_list)
        rois = rois.split(num_proposals_per_img, 0)
        cls_score = cls_score.split(num_proposals_per_img, 0)
        bbox_pred = bbox_pred.split(num_proposals_per_img, 0)
        ms_scores.append(cls_score)

        if i < self.num_stages - 1:
            bbox_label = [s[:, :-1].argmax(dim=1) for s in cls_score]
            rois = torch.cat([
                self.bbox_head[i].regress_by_class(rois[j], bbox_label[j],
                                                   bbox_pred[j],
                                                   img_metas[j])
                for j in range(num_imgs)
            ])

    # average scores of each image by stages
    cls_score = [
        sum([score[i] for score in ms_scores]) / float(len(ms_scores))
        for i in range(num_imgs)
    ]

    # apply bbox post-processing to each image individually
    det_bboxes = []
    det_labels = []
    for i in range(num_imgs):
        det_bbox, det_label = self.bbox_head[-1].get_bboxes(
            rois[i],
            cls_score[i],
            bbox_pred[i],
            img_shapes[i],
            scale_factors[i],
            rescale=rescale,
            cfg=rcnn_test_cfg)
        det_bboxes.append(det_bbox)
        det_labels.append(det_label)
    bbox_results = [
        bbox2result(det_bboxes[i], det_labels[i],
                    self.bbox_head[-1].num_classes)
        for i in range(num_imgs)
    ]
    ms_bbox_result['ensemble'] = bbox_results

    if self.with_mask:
        if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
            mask_classes = self.mask_head[-1].num_classes
            segm_results = [[[] for _ in range(mask_classes)]
                            for _ in range(num_imgs)]
        else:
            if rescale and not isinstance(scale_factors[0], float):
                scale_factors = [
                    torch.from_numpy(scale_factor).to(
                        det_bboxes[0].device)
                    for scale_factor in scale_factors
                ]
            _bboxes = [
                det_bboxes[i][:, :4] * scale_factors[i]
                if rescale else det_bboxes[i][:, :4]
                for i in range(len(det_bboxes))
            ]
            mask_rois = bbox2roi(_bboxes)
            num_mask_rois_per_img = tuple(
                _bbox.size(0) for _bbox in _bboxes)
            aug_masks = []
            for i in range(self.num_stages):
                mask_results = self._mask_forward(i, x, mask_rois)
                mask_pred = mask_results['mask_pred']
                # split batch mask prediction back to each image
                mask_pred = mask_pred.split(num_mask_rois_per_img, 0)
                aug_masks.append(
                    [m.sigmoid().cpu().numpy() for m in mask_pred])

            # apply mask post-processing to each image individually
            segm_results = []
            for i in range(num_imgs):
                if det_bboxes[i].shape[0] == 0:
                    segm_results.append(
                        [[]
                         for _ in range(self.mask_head[-1].num_classes)])
                else:
                    aug_mask = [mask[i] for mask in aug_masks]
                    merged_masks = merge_aug_masks(
                        aug_mask, [[img_metas[i]]] * self.num_stages,
                        rcnn_test_cfg)
                    segm_result = self.mask_head[-1].get_seg_masks(
                        merged_masks, _bboxes[i], det_labels[i],
                        rcnn_test_cfg, ori_shapes[i], scale_factors[i],
                        rescale)
                    segm_results.append(segm_result)
        ms_segm_result['ensemble'] = segm_results

    if self.with_mask:
        results = list(
            zip(ms_bbox_result['ensemble'], ms_segm_result['ensemble']))
    else:
        results = ms_bbox_result['ensemble']

    return results
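# Sketch of the rescale handling seen in the test paths above: when
# scale_factor is a per-axis ndarray rather than a float, it is moved onto
# the box tensor's device before multiplying detections back from the
# original image scale to the network input scale. Values are made up.
import numpy as np
import torch

det_bboxes = torch.rand(3, 5)                       # (x1, y1, x2, y2, score)
scale_factor = np.array([1.2, 1.1, 1.2, 1.1], dtype=np.float32)
_bboxes = det_bboxes[:, :4] * torch.from_numpy(scale_factor).to(
    det_bboxes.device)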
def forward_train(self, img, img_metas, gt_bboxes, gt_labels,
                  gt_bboxes_ignore=None, gt_masks=None, proposals=None):
    """
    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.
        img_metas (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.
        gt_bboxes (list[Tensor]): the truth boxes for each image in
            [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): class indices corresponding to each box.
        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss.
        gt_masks (None | Tensor): true segmentation masks for each box,
            used if the architecture supports a segmentation task.
        proposals: override rpn proposals with custom proposals. Use when
            `with_rpn` is False.

    Returns:
        dict[str, Tensor]: a dictionary of loss components
    """
    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_metas, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(res.pos_bboxes.shape[0],
                               device=device,
                               dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(res.neg_bboxes.shape[0],
                                device=device,
                                dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]

        if mask_feats.shape[0] > 0:
            mask_pred = self.mask_head(mask_feats)
            mask_targets = self.mask_head.get_target(
                sampling_results, gt_masks, self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                            pos_labels)
            losses.update(loss_mask)

    return losses
def forward_train(self, img, img_meta, gt_bboxes, gt_labels,
                  gt_bboxes_ignore=None, gt_masks=None, proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    for i in range(self.num_stages):
        self.current_stage = i
        rcnn_train_cfg = self.train_cfg.rcnn[i]
        lw = self.train_cfg.stage_loss_weights[i]

        # assign gts and sample proposals
        sampling_results = []
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(rcnn_train_cfg.assigner)
            bbox_sampler = build_sampler(rcnn_train_cfg.sampler,
                                         context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]
            for j in range(num_imgs):
                assign_result = bbox_assigner.assign(
                    proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j],
                    gt_labels[j])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[j],
                    gt_bboxes[j],
                    gt_labels[j],
                    feats=[lvl_feat[j][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        bbox_roi_extractor = self.bbox_roi_extractor[i]
        bbox_head = self.bbox_head[i]

        rois = bbox2roi([res.bboxes for res in sampling_results])
        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
                                        rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = bbox_head(bbox_feats)

        bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes,
                                            gt_labels, rcnn_train_cfg)
        loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
        for name, value in loss_bbox.items():
            losses['s{}.{}'.format(
                i, name)] = (value * lw if 'loss' in name else value)

        # mask head forward and loss
        if self.with_mask:
            if not self.share_roi_extractor:
                mask_roi_extractor = self.mask_roi_extractor[i]
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = mask_roi_extractor(
                    x[:mask_roi_extractor.num_inputs], pos_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
            else:
                # reuse positive bbox feats
                pos_inds = []
                device = bbox_feats.device
                for res in sampling_results:
                    pos_inds.append(
                        torch.ones(res.pos_bboxes.shape[0],
                                   device=device,
                                   dtype=torch.uint8))
                    pos_inds.append(
                        torch.zeros(res.neg_bboxes.shape[0],
                                    device=device,
                                    dtype=torch.uint8))
                pos_inds = torch.cat(pos_inds)
                mask_feats = bbox_feats[pos_inds]
            mask_head = self.mask_head[i]
            mask_pred = mask_head(mask_feats)

            mask_targets = mask_head.get_target(sampling_results, gt_masks,
                                                rcnn_train_cfg)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels)
            for name, value in loss_mask.items():
                losses['s{}.{}'.format(
                    i, name)] = (value * lw if 'loss' in name else value)

        # refine bboxes
        if i < self.num_stages - 1:
            pos_is_gts = [res.pos_is_gt for res in sampling_results]
            roi_labels = bbox_targets[0]  # bbox_targets is a tuple
            with torch.no_grad():
                proposal_list = bbox_head.refine_bboxes(
                    rois, roi_labels, bbox_pred, pos_is_gts, img_meta)

    return losses
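# A rough, self-contained sketch (assuming standard delta decoding; the real
# refine_bboxes also indexes per-class deltas and drops gt-injected rois) of
# how a stage's bbox_pred refines boxes for the next stage. The refinement
# runs under torch.no_grad so it does not receive gradients.
import torch

def delta2bbox_sketch(boxes, deltas):
    # boxes: (N, 4) as x1, y1, x2, y2; deltas: (N, 4) as dx, dy, dw, dh
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    cx = boxes[:, 0] + 0.5 * w
    cy = boxes[:, 1] + 0.5 * h
    ncx = cx + deltas[:, 0] * w          # shift centre
    ncy = cy + deltas[:, 1] * h
    nw = w * deltas[:, 2].exp()          # rescale size
    nh = h * deltas[:, 3].exp()
    return torch.stack(
        [ncx - 0.5 * nw, ncy - 0.5 * nh, ncx + 0.5 * nw, ncy + 0.5 * nh],
        dim=-1)

with torch.no_grad():
    rois_xyxy = torch.tensor([[10., 10., 50., 60.]])
    deltas = torch.tensor([[0.1, -0.05, 0.2, 0.0]])
    refined = delta2bbox_sketch(rois_xyxy, deltas)  # next-stage proposals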
def forward_train(self, img, img_metas, gt_bboxes, gt_labels,
                  gt_bboxes_ignore=None, gt_masks=None, proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_metas, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    if self.with_bbox:
        # assign gts and sample proposals
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

        # bbox head forward and loss
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

        # Grid head forward and loss
        sampling_results = self._random_jitter(sampling_results, img_metas)
        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
        grid_feats = self.grid_roi_extractor(
            x[:self.grid_roi_extractor.num_inputs], pos_rois)
        if self.with_shared_head:
            grid_feats = self.shared_head(grid_feats)
        # Accelerate training
        max_sample_num_grid = self.train_cfg.rcnn.get('max_num_grid', 192)
        sample_idx = torch.randperm(
            grid_feats.shape[0])[:min(grid_feats.shape[0],
                                      max_sample_num_grid)]
        grid_feats = grid_feats[sample_idx]
        grid_pred = self.grid_head(grid_feats)

        grid_targets = self.grid_head.get_target(sampling_results,
                                                 self.train_cfg.rcnn)
        grid_targets = grid_targets[sample_idx]
        loss_grid = self.grid_head.loss(grid_pred, grid_targets)
        losses.update(loss_grid)

    return losses
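# Sketch of the grid-head subsampling trick above: to cap the per-step cost,
# a random permutation selects at most max_num_grid positive RoIs, and the
# same indices subsample the targets so predictions and targets stay
# aligned. Shapes below are assumptions for the example.
import torch

max_sample_num_grid = 192
grid_feats = torch.rand(300, 256, 14, 14)
grid_targets = torch.rand(300, 9, 56, 56)          # target shape is assumed
sample_idx = torch.randperm(
    grid_feats.shape[0])[:min(grid_feats.shape[0], max_sample_num_grid)]
grid_feats = grid_feats[sample_idx]
grid_targets = grid_targets[sample_idx]            # same indices keep alignment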