def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  gt_semantic_seg=None):
    x = self.extract_feat(img)

    losses = dict()

    # semantic head forward and loss
    semantic_logits = self.semantic_head(x[:4])
    loss_seg = self.semantic_head.loss(semantic_logits, gt_semantic_seg)
    losses.update(loss_seg)

    # RPN forward and loss
    rpn_outs = self.rpn_head(x)
    rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas, self.train_cfg.rpn)
    rpn_losses = self.rpn_head.loss(
        *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
    losses.update(rpn_losses)

    proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn)
    proposal_inputs = rpn_outs + (img_metas, proposal_cfg)
    proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)

    # assign gts and sample proposals
    sampling_results = self.assign_result(x, proposal_list, img, gt_bboxes,
                                          gt_labels, gt_bboxes_ignore)

    # bbox head forward and loss
    rois = bbox2roi([res.bboxes for res in sampling_results])
    bbox_feats = self.bbox_roi_extractor(
        x[:self.bbox_roi_extractor.num_inputs], rois)
    if self.with_shared_head:
        bbox_feats = self.shared_head(bbox_feats)
    cls_score, bbox_pred = self.bbox_head(bbox_feats)
    bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes,
                                             gt_labels, self.train_cfg.rcnn)
    loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
    losses.update(loss_bbox)

    # mask head forward and loss (positive RoIs only)
    pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
    mask_feats = self.mask_roi_extractor(
        x[:self.mask_roi_extractor.num_inputs], pos_rois)
    if self.with_shared_head:
        mask_feats = self.shared_head(mask_feats)
    if mask_feats.shape[0] > 0:
        mask_pred = self.mask_head(mask_feats)
        mask_targets = self.mask_head.get_target(sampling_results, gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels)
        losses.update(loss_mask)

    return losses
def simple_test_mask(self,
                     x,
                     img_metas,
                     det_bboxes,
                     det_labels,
                     semantic_logits,
                     rescale=False):
    ori_shape = img_metas[0]['ori_shape']
    scale_factor = img_metas[0]['scale_factor']
    if det_bboxes.shape[0] == 0:
        segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
    else:
        # if det_bboxes is rescaled to the original image size, we need to
        # rescale it back to the testing scale to obtain RoIs.
        if rescale and not isinstance(scale_factor, float):
            scale_factor = torch.from_numpy(scale_factor).to(
                det_bboxes.device)
        _bboxes = (
            det_bboxes[:, :4] * scale_factor if rescale else det_bboxes)
        mask_rois = bbox2roi([_bboxes])
        mask_feats = self.mask_roi_extractor(
            x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois)
        if self.with_shared_head:
            mask_feats = self.shared_head(mask_feats)
        mask_pred = self.mask_head(mask_feats)
        segm_result = self.mask_head.get_seg_masks(mask_pred, _bboxes,
                                                   det_labels,
                                                   self.test_cfg.rcnn,
                                                   ori_shape, scale_factor,
                                                   rescale)
    return segm_result
async def async_test_bboxes(self,
                            x,
                            img_metas,
                            proposals,
                            rcnn_test_cfg,
                            rescale=False,
                            bbox_semaphore=None,
                            global_lock=None):
    """Async test only det bboxes without augmentation."""
    rois = bbox2roi(proposals)
    roi_feats = self.bbox_roi_extractor(
        x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
    if self.with_shared_head:
        roi_feats = self.shared_head(roi_feats)
    sleep_interval = rcnn_test_cfg.get('async_sleep_interval', 0.017)

    async with completed(
            __name__, 'bbox_head_forward', sleep_interval=sleep_interval):
        cls_score, bbox_pred = self.bbox_head(roi_feats)

    img_shape = img_metas[0]['img_shape']
    scale_factor = img_metas[0]['scale_factor']
    det_bboxes, det_labels = self.bbox_head.get_det_bboxes(
        rois,
        cls_score,
        bbox_pred,
        img_shape,
        scale_factor,
        rescale=rescale,
        cfg=rcnn_test_cfg)
    return det_bboxes, det_labels
def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels):
    if det_bboxes.shape[0] == 0:
        segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
    else:
        aug_masks = []
        for x, img_meta in zip(feats, img_metas):
            img_shape = img_meta[0]['img_shape']
            scale_factor = img_meta[0]['scale_factor']
            flip = img_meta[0]['flip']
            _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
                                   scale_factor, flip)
            mask_rois = bbox2roi([_bboxes])
            mask_feats = self.mask_roi_extractor(
                x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
            mask_pred = self.mask_head(mask_feats)
            # convert to numpy array to save memory
            aug_masks.append(mask_pred.sigmoid().cpu().numpy())
        merged_masks = merge_aug_masks(aug_masks, img_metas,
                                       self.test_cfg.rcnn)

        ori_shape = img_metas[0][0]['ori_shape']
        segm_result = self.mask_head.get_seg_masks(
            merged_masks,
            det_bboxes,
            det_labels,
            self.test_cfg.rcnn,
            ori_shape,
            scale_factor=1.0,
            rescale=False)
    return segm_result
async def async_test_mask(self,
                          x,
                          img_metas,
                          det_bboxes,
                          det_labels,
                          rescale=False,
                          mask_test_cfg=None):
    # image shape of the first image in the batch (only one)
    ori_shape = img_metas[0]['ori_shape']
    scale_factor = img_metas[0]['scale_factor']
    if det_bboxes.shape[0] == 0:
        segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
    else:
        _bboxes = (
            det_bboxes[:, :4] * scale_factor if rescale else det_bboxes)
        mask_rois = bbox2roi([_bboxes])
        mask_feats = self.mask_roi_extractor(
            x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois)
        if self.with_shared_head:
            mask_feats = self.shared_head(mask_feats)
        if mask_test_cfg and mask_test_cfg.get('async_sleep_interval'):
            sleep_interval = mask_test_cfg['async_sleep_interval']
        else:
            sleep_interval = 0.035
        async with completed(
                __name__, 'mask_head_forward',
                sleep_interval=sleep_interval):
            mask_pred = self.mask_head(mask_feats)
        segm_result = self.mask_head.get_seg_masks(
            mask_pred, _bboxes, det_labels, self.test_cfg.rcnn, ori_shape,
            scale_factor, rescale)
    return segm_result
def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg):
    aug_bboxes = []
    aug_scores = []
    for x, img_meta in zip(feats, img_metas):
        # only one image in the batch
        img_shape = img_meta[0]['img_shape']
        scale_factor = img_meta[0]['scale_factor']
        flip = img_meta[0]['flip']
        # TODO more flexible
        proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
                                 scale_factor, flip)
        rois = bbox2roi([proposals])
        # recompute feature maps to save GPU memory
        roi_feats = self.bbox_roi_extractor(
            x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
        if self.with_shared_head:
            roi_feats = self.shared_head(roi_feats)
        cls_score, bbox_pred = self.bbox_head(roi_feats)
        bboxes, scores = self.bbox_head.get_det_bboxes(
            rois,
            cls_score,
            bbox_pred,
            img_shape,
            scale_factor,
            rescale=False,
            cfg=None)
        aug_bboxes.append(bboxes)
        aug_scores.append(scores)
    # after merging, bboxes will be rescaled to the original image size
    merged_bboxes, merged_scores = merge_aug_bboxes(
        aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
    det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores,
                                            rcnn_test_cfg.score_thr,
                                            rcnn_test_cfg.nms,
                                            rcnn_test_cfg.max_per_img)
    return det_bboxes, det_labels
def forward_dummy(self, img):
    """Used for computing network flops.

    See `mmdetection/tools/get_flops.py`
    """
    outs = ()
    # backbone
    x = self.extract_feat(img)
    # rpn
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        outs = outs + (rpn_outs, )
    proposals = torch.randn(1000, 4).to(device=img.device)
    # bbox head
    rois = bbox2roi([proposals])
    if self.with_bbox:
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)
        outs = outs + (cls_score, bbox_pred)
    # mask head
    if self.with_mask:
        mask_rois = rois[:100]
        mask_feats = self.mask_roi_extractor(
            x[:self.mask_roi_extractor.num_inputs], mask_rois)
        if self.with_shared_head:
            mask_feats = self.shared_head(mask_feats)
        mask_pred = self.mask_head(mask_feats)
        outs = outs + (mask_pred, )
    return outs
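# Hedged usage sketch (illustrative, not part of this module): `forward_dummy`
# only needs an input tensor of the right shape, so a FLOP counter can trace
# the whole network on a random image. The builder call and input size below
# are assumptions about the surrounding tooling, not its exact interface.
#
#   model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
#   dummy_img = torch.randn(1, 3, 800, 1333)
#   outs = model.forward_dummy(dummy_img)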
def simple_test_bboxes(self,
                       x,
                       img_metas,
                       proposals,
                       rcnn_test_cfg,
                       rescale=False):
    rois = bbox2roi(proposals)
    roi_feats = self.bbox_roi_extractor(
        x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
    if self.with_shared_head:
        roi_feats = self.shared_head(roi_feats)
    cls_score, bbox_pred = self.bbox_head(roi_feats)
    img_shape = img_metas[0]['img_shape']
    scale_factor = img_metas[0]['scale_factor']
    det_bboxes, det_labels = self.bbox_head.get_det_bboxes(
        rois,
        cls_score,
        bbox_pred,
        img_shape,
        scale_factor,
        rescale=rescale,
        cfg=rcnn_test_cfg)
    return det_bboxes, det_labels
def simple_test_mask_(self,
                      x,
                      img_metas,
                      det_bboxes,
                      det_labels,
                      semantic_logits,
                      rescale=False):
    ori_shape = img_metas[0]['ori_shape']
    scale_factor = img_metas[0]['scale_factor']
    # resize the semantic logits to the reference (testing-scale) resolution
    ref_size = (int(np.round(ori_shape[0] * scale_factor)),
                int(np.round(ori_shape[1] * scale_factor)))
    semantic_logits = F.interpolate(
        semantic_logits, size=ref_size, mode="bilinear", align_corners=False)
    sem_pred = torch.argmax(semantic_logits, dim=1)[0]
    panoptic_mask = torch.zeros_like(sem_pred, dtype=torch.long)
    cat = [255]
    if det_bboxes.shape[0] == 0:
        intermediate_logits = semantic_logits[0, :self.num_stuff]
    else:
        # if det_bboxes is rescaled to the original image size, we need to
        # rescale it back to the testing scale to obtain RoIs.
        if rescale and not isinstance(scale_factor, float):
            scale_factor = torch.from_numpy(scale_factor).to(
                det_bboxes.device)
        _bboxes = (
            det_bboxes[:, :4] * scale_factor if rescale else det_bboxes)
        mask_rois = bbox2roi([_bboxes])
        mask_feats = self.mask_roi_extractor(
            x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois)
        if self.with_shared_head:
            mask_feats = self.shared_head(mask_feats)
        mask_pred = self.mask_head(mask_feats)

        # paste the RoI mask logits back to image resolution, then process
        # instances in order of decreasing detection confidence
        confidence = det_bboxes[:, 4]
        idx = torch.argsort(confidence, descending=True)
        bbx_inv = invert_roi_bbx(det_bboxes[:, :4],
                                 tuple(mask_pred.shape[2:]), ref_size)
        bbx_idx = torch.arange(
            0, det_bboxes.size(0), dtype=torch.long,
            device=det_bboxes.device)
        mask_pred = roi_sampling(
            mask_pred, bbx_inv, bbx_idx, ref_size, padding="zero")

        ML_A = mask_pred.new_zeros(mask_pred.shape[0], mask_pred.shape[-2],
                                   mask_pred.shape[-1])
        ML_B = ML_A.clone()
        occupied = torch.zeros_like(sem_pred, dtype=torch.bool)
        i = 0
        for id_i in idx:
            label_i = det_labels[id_i]
            mask_pred_i = mask_pred[id_i, label_i + 1, :, :]
            mask_i = (mask_pred_i.sigmoid() >
                      self.test_cfg.rcnn.mask_thr_binary)
            mask_i = mask_i.type(torch.bool)

            # drop instances that overlap too much with already placed ones
            intersection = occupied & mask_i
            if intersection.float().sum() / mask_i.float().sum(
            ) > self.test_cfg.panoptic.overlap_thr:
                continue
            mask_i = mask_i ^ intersection
            occupied |= mask_i

            y0 = max(int(det_bboxes[id_i, 1] + 1), 0)
            y1 = min(int((det_bboxes[id_i, 3] - 1).round() + 1), ref_size[0])
            x0 = max(int(det_bboxes[id_i, 0] + 1), 0)
            x1 = min(int((det_bboxes[id_i, 2] - 1).round() + 1), ref_size[1])

            # ML_A: scaled instance mask logits, ML_B: semantic logits
            # cropped to the detection box of the same class
            ML_A[i] = 4 * mask_pred_i
            ML_B[i, y0:y1, x0:x1] = semantic_logits[
                0, label_i + self.num_stuff, y0:y1, x0:x1]
            cat.append(label_i.item() + self.num_stuff)
            i = i + 1

        # fuse instance and semantic logits for the surviving instances
        ML_A = ML_A[:i]
        ML_B = ML_B[:i]
        FL = (ML_A.sigmoid() + ML_B.sigmoid()) * (ML_A + ML_B)
        intermediate_logits = torch.cat(
            [semantic_logits[0, :self.num_stuff], FL], dim=0)

    # pick, per pixel, between stuff classes and fused instance channels
    cat = torch.tensor(cat, dtype=torch.long)
    intermediate_mask = torch.argmax(
        F.softmax(intermediate_logits, dim=0), dim=0) + 1
    intermediate_mask = intermediate_mask - self.num_stuff
    intermediate_mask[intermediate_mask <= 0] = 0

    # relabel the surviving instance ids contiguously
    unique = torch.unique(intermediate_mask)
    ignore_val = intermediate_mask.max().item() + 1
    ignore_arr = torch.ones(
        (ignore_val, ), dtype=unique.dtype,
        device=unique.device) * ignore_val
    total_unique = unique.shape[0]
    ignore_arr[unique] = torch.arange(total_unique).cuda(ignore_arr.device)
    panoptic_mask = ignore_arr[intermediate_mask]
    panoptic_mask[intermediate_mask == ignore_val] = 0
    cat_ = cat[unique].long()

    # fill the remaining pixels with stuff predictions, discarding stuff
    # segments smaller than min_stuff_area
    sem_pred[panoptic_mask > 0] = self.num_stuff
    sem_pred[sem_pred >= self.num_stuff] = self.num_stuff
    cls_stuff, area = torch.unique(sem_pred, return_counts=True)
    cls_stuff[area < self.test_cfg.panoptic.min_stuff_area] = self.num_stuff
    cls_stuff = cls_stuff[cls_stuff != self.num_stuff]
    tmp = torch.ones(
        (self.num_stuff + 1, ), dtype=cls_stuff.dtype,
        device=cls_stuff.device) * self.num_stuff
    tmp[cls_stuff] = torch.arange(cls_stuff.shape[0]).cuda(tmp.device)
    new_sem_pred = tmp[sem_pred]
    cat_ = torch.cat((cat_, cls_stuff.cpu().long()), -1)
    bool_mask = new_sem_pred != self.num_stuff
    panoptic_mask[bool_mask] = new_sem_pred[bool_mask] + total_unique

    return panoptic_mask.cpu(), cat_.cpu()
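# A minimal, self-contained sketch of the logit fusion used in
# `simple_test_mask_` above (tensor names and sizes are illustrative): each
# surviving instance contributes scaled mask logits ML_A and box-cropped
# semantic logits ML_B, and the fused logits weight their sum by the
# agreement of both heads, FL = (sigmoid(ML_A) + sigmoid(ML_B)) * (ML_A + ML_B).
def _fused_logits_example():
    import torch
    ml_a = torch.randn(3, 64, 64)  # per-instance mask logits (already scaled)
    ml_b = torch.randn(3, 64, 64)  # semantic logits cropped to each instance box
    fl = (ml_a.sigmoid() + ml_b.sigmoid()) * (ml_a + ml_b)
    return fl  # shape (3, 64, 64): one fused logit map per instance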
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    """
    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.

        img_metas (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.

        gt_bboxes (list[Tensor]): each item are the truth boxes for each
            image in [tl_x, tl_y, br_x, br_y] format.

        gt_labels (list[Tensor]): class indices corresponding to each box

        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss.

        gt_masks (None | Tensor): true segmentation masks for each box,
            used if the architecture supports a segmentation task.

        proposals: override rpn proposals with custom proposals. Use when
            `with_rpn` is False.

    Returns:
        dict[str, Tensor]: a dictionary of loss components
    """
    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_metas, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(
                        res.pos_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(
                        res.neg_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]

        if mask_feats.shape[0] > 0:
            mask_pred = self.mask_head(mask_feats)
            mask_targets = self.mask_head.get_target(
                sampling_results, gt_masks, self.train_cfg.rcnn)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                            pos_labels)
            losses.update(loss_mask)

    return losses