def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None, epoch=None, **kwargs): """ Args: img (Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_meta (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmdet/datasets/pipelines/formatting.py:Collect`. gt_bboxes (list[Tensor]): each item are the truth boxes for each image in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be ignored when computing the loss. gt_masks (None | Tensor) : true segmentation masks for each box used if the architecture supports a segmentation task. proposals : override rpn proposals with custom proposals. Use when `with_rpn` is False. Returns: dict[str, Tensor]: a dictionary of loss components """ # x = self.extract_feat(img) losses = dict() # keep the original usage if self.dfn_balance is None: x = self.extract_feat(img) else: # count the number of defects in each image defect_nums = [0] * img.shape[0] for i, gt_label in enumerate(gt_labels): defect_cnt = 0 for label in gt_label: if self.dfn_balance.background_id != int(label): defect_cnt += 1 defect_nums[i] = defect_cnt # split the images into defect(1) image and normal(0) image dfn_labels = [0 if i == 0 else 1 for i in defect_nums] dfn_labels = torch.Tensor(dfn_labels).long().cuda() # calculate dfn loss x, dfn_loss = self.extract_defect_feat(img, targets=dfn_labels) # balance different network loss dfn_weight = self.get_dfn_weight(epoch) loss_dfn = dfn_loss['loss'] loss_dfn = loss_dfn * dfn_weight losses.update(loss_dfn=loss_dfn) # delete the ignored annotations if self.ignore_ids is not None: for ignore_id in self.ignore_ids: for i, gt_label in enumerate(gt_labels): keep_ind = gt_label != ignore_id gt_labels[i] = gt_labels[i][keep_ind] gt_bboxes[i] = gt_bboxes[i][keep_ind] keep_ind = [True] * len(gt_labels) for i in range(len(gt_labels) - 1, -1, -1): if gt_labels[i].shape[0] == 0: img_meta.pop(i) gt_bboxes.pop(i) gt_labels.pop(i) keep_ind[i] = False img = img[keep_ind] x = list(x) for i in range(len(x)): x[i] = x[i][keep_ind] x = tuple(x) # if have no any images, then set all loss to be 0 except dfn loss if len(gt_labels) == 0 or len( gt_bboxes) == 0 or img.shape[0] == 0: loss_zeros = [ torch.Tensor([0.]).float().cuda() for i in range(len(x)) ] loss_ones = [ torch.Tensor([1.]).float().cuda() for i in range(len(x)) ] losses.update(loss_rpn_cls=loss_zeros, loss_rpn_bbox=loss_zeros) loss_zero = torch.Tensor([0.]).float().cuda() loss_one = torch.Tensor([1.]).float().cuda() losses['loss_cls'] = loss_zero losses['loss_bbox'] = loss_zero losses['acc'] = loss_one return losses # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # assign gts and sample proposals if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) # mask head forward and loss if self.with_mask: if not self.share_roi_extractor: pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones(res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros(res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] if mask_feats.shape[0] > 0: mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target( sampling_results, gt_masks, self.train_cfg.rcnn) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) losses.update(loss_mask) return losses
def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): x = self.extract_feat(img) losses = dict() # trans gt_masks to gt_obbs # gt_obbs = gt_mask_bp_obbs_list(gt_masks) gt_obbs = gt_mask_bp_obbs_list(mask2obb_mask(gt_bboxes, gt_masks)) # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # assign gts and sample proposals (hbb assign) if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn[0].assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn[0].sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats) ## rbbox # rbbox_targets = self.bbox_head.get_target( # sampling_results, gt_masks, gt_labels, self.train_cfg.rcnn[0]) rbbox_targets = self.bbox_head.get_target( sampling_results, mask2obb_mask(gt_bboxes, gt_masks), gt_labels, self.train_cfg.rcnn[0]) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *rbbox_targets) # losses.update(loss_bbox) for name, value in loss_bbox.items(): losses['s{}.{}'.format(0, name)] = (value) pos_is_gts = [res.pos_is_gt for res in sampling_results] roi_labels = rbbox_targets[0] with torch.no_grad(): # import pdb # pdb.set_trace() rotated_proposal_list = self.bbox_head.refine_rbboxes( roi2droi(rois), roi_labels, bbox_pred, pos_is_gts, img_meta) # import pdb # pdb.set_trace() # assign gts and sample proposals (rbb assign) if self.with_rbbox: bbox_assigner = build_assigner(self.train_cfg.rcnn[1].assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn[1].sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): gt_obbs_best_roi = choose_best_Rroi_batch(gt_obbs[i]) assign_result = bbox_assigner.assign(rotated_proposal_list[i], gt_obbs_best_roi, gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, rotated_proposal_list[i], torch.from_numpy(gt_obbs_best_roi).float().to( rotated_proposal_list[i].device), gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) if self.with_rbbox: # (batch_ind, x_ctr, y_ctr, w, h, angle) rrois = dbbox2roi([res.bboxes for res in sampling_results]) # feat enlarge # rrois[:, 3] = rrois[:, 3] * 1.2 # rrois[:, 4] = rrois[:, 4] * 1.4 rrois[:, 3] = rrois[:, 3] * self.rbbox_roi_extractor.w_enlarge rrois[:, 4] = rrois[:, 4] * self.rbbox_roi_extractor.h_enlarge rbbox_feats = self.rbbox_roi_extractor( x[:self.rbbox_roi_extractor.num_inputs], rrois) if self.with_shared_head_rbbox: rbbox_feats = self.shared_head_rbbox(rbbox_feats) cls_score, rbbox_pred = self.rbbox_head(rbbox_feats) rbbox_targets = self.rbbox_head.get_target_rbbox( sampling_results, gt_obbs, gt_labels, self.train_cfg.rcnn[1]) loss_rbbox = self.rbbox_head.loss(cls_score, rbbox_pred, *rbbox_targets) for name, value in loss_rbbox.items(): losses['s{}.{}'.format(1, name)] = (value) # hbb mask forward and loss pos_rrois = dbbox2roi([res.pos_bboxes for res in sampling_results]) pos_rois = rrois2rois(pos_rrois) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target_hbb(sampling_results, gt_masks, self.train_cfg.rcnn[1]) pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) loss_mask['loss_mask_hbb'] = loss_mask.pop('loss_mask') losses.update(loss_mask) # obb mask forward and loss mask_feats = self.mask_rroi_extractor( x[:self.mask_rroi_extractor.num_inputs], pos_rrois) mask_pred = self.mask_rhead(mask_feats) mask_targets = self.mask_rhead.get_rotate_adaptive_target( sampling_results, gt_masks, self.train_cfg.rcnn[1], self.mask_rroi_extractor.ratio_max) pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_rhead.loss(mask_pred, mask_targets, pos_labels) loss_mask['loss_mask_obb'] = loss_mask.pop('loss_mask') losses.update(loss_mask) return losses
def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): x = self.extract_feat(img) losses = dict() # trans gt_masks to gt_obbs gt_obbs = gt_mask_bp_obbs_list(gt_masks) gt_obbs = [choose_best_Rroi_batch(gt) for gt in gt_obbs] # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, gt_labels, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # assign gts and sample proposals (hbb assign) if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn[0].assigner) bbox_sampler = build_sampler( self.train_cfg.rcnn[0].sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats) ## rbbox gt_obbs_torch = [torch.from_numpy(gt).float().to(rois.device) for gt in gt_obbs] rbbox_targets = self.bbox_head.get_target( sampling_results, gt_obbs_torch, gt_labels, self.train_cfg.rcnn[0]) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, rois[:, 1:], *rbbox_targets) # loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, sampling_results, # gt_obbs_torch, *rbbox_targets) # loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, # *rbbox_targets) # losses.update(loss_bbox) for name, value in loss_bbox.items(): losses['s{}.{}'.format(0, name)] = (value) return losses
def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, gt_semantic_seg=None, proposals=None): x = self.extract_feat(img) losses = dict() # RPN part, the same as normal two-stage detectors if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # semantic segmentation part # 2 outputs: segmentation prediction and embedded features if self.with_semantic: semantic_pred, semantic_feat = self.semantic_head(x) loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_seg) losses['loss_semantic_seg'] = loss_seg else: semantic_feat = None for i in range(self.num_stages): self.current_stage = i rcnn_train_cfg = self.train_cfg.rcnn[i] lw = self.train_cfg.stage_loss_weights[i] # assign gts and sample proposals sampling_results = [] bbox_assigner = build_assigner(rcnn_train_cfg.assigner) bbox_sampler = build_sampler(rcnn_train_cfg.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] for j in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j], gt_labels[j]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[j], gt_bboxes[j], gt_labels[j], feats=[lvl_feat[j][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss loss_bbox, rois, bbox_targets, bbox_pred = \ self._bbox_forward_train( i, x, sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg, semantic_feat) roi_labels = bbox_targets[0] for name, value in loss_bbox.items(): losses['s{}.{}'.format( i, name)] = (value * lw if 'loss' in name else value) # mask head forward and loss if self.with_mask: # interleaved execution: use regressed bboxes by the box branch # to train the mask branch if self.interleaved: pos_is_gts = [res.pos_is_gt for res in sampling_results] with torch.no_grad(): proposal_list = self.bbox_head[i].refine_bboxes( rois, roi_labels, bbox_pred, pos_is_gts, img_meta) # re-assign and sample 512 RoIs from 512 RoIs sampling_results = [] for j in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j], gt_labels[j]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[j], gt_bboxes[j], gt_labels[j], feats=[lvl_feat[j][None] for lvl_feat in x]) sampling_results.append(sampling_result) loss_mask = self._mask_forward_train(i, x, sampling_results, gt_masks, rcnn_train_cfg, semantic_feat) for name, value in loss_mask.items(): losses['s{}.{}'.format( i, name)] = (value * lw if 'loss' in name else value) # refine bboxes (same as Cascade R-CNN) if i < self.num_stages - 1 and not self.interleaved: pos_is_gts = [res.pos_is_gt for res in sampling_results] with torch.no_grad(): proposal_list = self.bbox_head[i].refine_bboxes( rois, roi_labels, bbox_pred, pos_is_gts, img_meta) return losses
def __init__(self, num_classes, in_channels, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8, 16, 32], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', clip_border=True, target_means=(.0, .0, .0, .0), target_stds=(1.0, 1.0, 1.0, 1.0)), reg_decoded_bbox=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), train_cfg=None, test_cfg=None): super(AnchorHead, self).__init__() self.in_channels = in_channels self.num_classes = num_classes self.feat_channels = feat_channels self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) # TODO better way to determine whether sample or not self.sampling = loss_cls['type'] not in [ 'FocalLoss', 'GHMC', 'QualityFocalLoss' ] if self.use_sigmoid_cls: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 if self.cls_out_channels <= 0: raise ValueError(f'num_classes={num_classes} is too small') self.reg_decoded_bbox = reg_decoded_bbox self.bbox_coder = build_bbox_coder(bbox_coder) self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.train_cfg = train_cfg self.test_cfg = test_cfg if self.train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) # use PseudoSampler when sampling is False if self.sampling and hasattr(self.train_cfg, 'sampler'): sampler_cfg = self.train_cfg.sampler else: sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.fp16_enabled = False self.anchor_generator = build_anchor_generator(anchor_generator) # usually the numbers of anchors for each level are the same # except SSD detectors self.num_anchors = self.anchor_generator.num_base_anchors[0] self._init_layers()
def __init__( self, num_classes, in_channels, num_query=100, num_reg_fcs=2, transformer=None, sync_cls_avg_factor=False, positional_encoding=dict(type='SinePositionalEncoding', num_feats=128, normalize=True), loss_cls=dict(type='CrossEntropyLoss', bg_cls_weight=0.1, use_sigmoid=False, loss_weight=1.0, class_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=5.0), loss_iou=dict(type='GIoULoss', loss_weight=2.0), train_cfg=dict(assigner=dict( type='HungarianAssigner', cls_cost=dict(type='ClassificationCost', weight=1.), reg_cost=dict(type='BBoxL1Cost', weight=5.0), iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))), test_cfg=dict(max_per_img=100), init_cfg=None, **kwargs): # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, # since it brings inconvenience when the initialization of # `AnchorFreeHead` is called. super(AnchorFreeHead, self).__init__(init_cfg) self.bg_cls_weight = 0 self.sync_cls_avg_factor = sync_cls_avg_factor class_weight = loss_cls.get('class_weight', None) if class_weight is not None and (self.__class__ is DETRHead): assert isinstance(class_weight, float), 'Expected ' \ 'class_weight to have type float. Found ' \ f'{type(class_weight)}.' # NOTE following the official DETR rep0, bg_cls_weight means # relative classification weight of the no-object class. bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) assert isinstance(bg_cls_weight, float), 'Expected ' \ 'bg_cls_weight to have type float. Found ' \ f'{type(bg_cls_weight)}.' class_weight = torch.ones(num_classes + 1) * class_weight # set background class as the last indice class_weight[num_classes] = bg_cls_weight loss_cls.update({'class_weight': class_weight}) if 'bg_cls_weight' in loss_cls: loss_cls.pop('bg_cls_weight') self.bg_cls_weight = bg_cls_weight if train_cfg: assert 'assigner' in train_cfg, 'assigner should be provided '\ 'when train_cfg is set.' assigner = train_cfg['assigner'] assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \ 'The classification weight for loss and matcher should be' \ 'exactly the same.' assert loss_bbox['loss_weight'] == assigner['reg_cost'][ 'weight'], 'The regression L1 weight for loss and matcher ' \ 'should be exactly the same.' assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \ 'The regression iou weight for loss and matcher should be' \ 'exactly the same.' self.assigner = build_assigner(assigner) # DETR sampling=False, so use PseudoSampler sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.num_query = num_query self.num_classes = num_classes self.in_channels = in_channels self.num_reg_fcs = num_reg_fcs self.train_cfg = train_cfg self.test_cfg = test_cfg self.fp16_enabled = False self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_iou = build_loss(loss_iou) if self.loss_cls.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 self.act_cfg = transformer.get('act_cfg', dict(type='ReLU', inplace=True)) self.activate = build_activation_layer(self.act_cfg) self.positional_encoding = build_positional_encoding( positional_encoding) self.transformer = build_transformer(transformer) self.embed_dims = self.transformer.embed_dims assert 'num_feats' in positional_encoding num_feats = positional_encoding['num_feats'] assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ f' and {num_feats}.' self._init_layers()
def forward_train( self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None ): x = self.extract_feat(img) losses dict() if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore ) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal2', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposals_list = proposals if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.bbox_sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign( proposals_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i] ) sampling_result = bbox_sampler.sample( assign_result, proposals_list[i], gt_bboxes[i], gt_bboxes_ignore[i], feats=[lvl_feat[i][None] for lvl_feat in x] ) sampling_results.append(sampling_result) if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois ) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats) bbox_target = self.bbox_head.get_target( sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn ) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_target) losses.update(loss_bbox) if self.with_mask: if not self.share_roi_extractor: pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois ) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones(res.pos_bboxes.shape[0]), device=device, dtype=torch.uint8 ) pos_inds.append( torch.zeros( res.neg_bboxes.shape[0], device=device, dtype=torch.uint8 ) ) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] if mask_feats.shape[0] > 0: mask_pred = self.mask_head(mask_head) mask_targets = self.mask_head.get_target( sampling_results, gt_masks, self.train_cfg.rcnn ) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results] ) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) losses.update(loss_mask) return losses
def __init__(self, num_classes, in_channels, regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512), (512, INF)), center_sampling=False, center_sample_radius=1.5, sync_num_pos=True, gradient_mul=0.1, bbox_norm_type='reg_denom', loss_cls_fl=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), use_vfl=True, loss_cls=dict(type='VarifocalLoss', use_sigmoid=True, alpha=0.75, gamma=2.0, iou_weighted=True, loss_weight=1.0), loss_bbox=dict(type='GIoULoss', loss_weight=1.5), loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0), norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), use_atss=True, reg_cls_branch=False, anchor_generator=dict(type='AnchorGenerator', ratios=[1.0], octave_base_scale=8, scales_per_octave=1, center_offset=0.0, strides=[8, 16, 32, 64, 128]), **kwargs): # dcn base offsets, adapted from reppoints_head.py self.num_out = len(anchor_generator["strides"]) self.anchor_generator = build_anchor_generator(anchor_generator) self.num_anchors = self.anchor_generator.num_base_anchors[0] self.reg_cls_branch = reg_cls_branch super(FCOSHead, self).__init__(num_classes, in_channels, norm_cfg=norm_cfg, **kwargs) self.regress_ranges = regress_ranges self.reg_denoms = [ regress_range[-1] for regress_range in regress_ranges ] self.reg_denoms[-1] = self.reg_denoms[-2] * 2 self.center_sampling = center_sampling self.center_sample_radius = center_sample_radius self.sync_num_pos = sync_num_pos self.bbox_norm_type = bbox_norm_type self.gradient_mul = gradient_mul self.use_vfl = use_vfl if self.use_vfl: self.loss_cls = build_loss(loss_cls) else: self.loss_cls = build_loss(loss_cls_fl) self.loss_bbox = build_loss(loss_bbox) self.loss_bbox_refine = build_loss(loss_bbox_refine) # for getting ATSS targets self.use_atss = use_atss self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) self.anchor_center_offset = anchor_generator['center_offset'] self.num_anchors = self.anchor_generator.num_base_anchors[0] self.sampling = False if self.train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self)
def forward_train(self, img, img_meta, gt_bboxes, gt_polygons, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): x = self.extract_feat(img) if self.with_attention: x = self.add_attention(x) losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals if self.with_deform_adjust: x = self.deform_adjust(x, rpn_outs[1]) # assign gts and sample proposals if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_polygons[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) cls_score, bbox_pred = self.bbox_head(bbox_feats) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_polygons, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) # mask head forward and loss if self.with_mask: pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target(sampling_results, gt_masks, self.train_cfg.rcnn) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) losses.update(loss_mask) return losses
def forward_train_pair(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): # img: b, 2*c = 6, h, w b, c, h, w = img.shape img = img.reshape(-1, c // 2, h, w) # img: 2*b, c, h, w [0,1] , [2,3] , ...... x = self.extract_feat(img) # x : tuple,5 layer losses = dict() if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_outs_half = [] for outs_0 in rpn_outs: tmp = [] for outs_1 in outs_0: tmp.append(outs_1[::2, :, :, :]) rpn_outs_half.append(tmp) rpn_loss_inputs = tuple(rpn_outs_half) + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) # copy train img_meta to pair. 1,2,3->1,1,2,2,3,3 img_meta = [img_meta[i // 2] for i in range(2 * len(img_meta))] proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # generate blank gt for normal img gt_bboxes_ = [] gt_labels_ = [] for i in range(len(gt_bboxes)): gt_bboxes_.append(gt_bboxes[i]) gt_bboxes_.append(torch.Tensor([[1, 1, 1, 1]]).to(gt_bboxes[i].device)) gt_labels_.append(gt_labels[i]) gt_labels_.append(torch.Tensor([[0]]).to(gt_labels[i].device)) for i in range(self.num_stages): self.current_stage = i rcnn_train_cfg = self.train_cfg.rcnn[i] lw = self.train_cfg.stage_loss_weights[i] # assign gts and sample proposals sampling_results = [] if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(rcnn_train_cfg.assigner) bbox_sampler = build_sampler( rcnn_train_cfg.sampler, context=self) assert img.size(0) % 2 == 0 num_pairs = img.size(0) // 2 num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for j in range(num_pairs): i_train = 2 * j i_normal = i_train + 1 assign_result_train = bbox_assigner.assign(proposal_list[i_train], gt_bboxes_[i_train], gt_bboxes_ignore[i_train], gt_labels_[i_train]) assign_result_normal = bbox_assigner.assign(proposal_list[i_normal], gt_bboxes_[i_normal], gt_bboxes_ignore[i_normal], gt_labels_[i_normal]) sampling_result_train, sampling_results_normal = bbox_sampler.pair_sample( assign_result_train, assign_result_normal, proposal_list[i_train], proposal_list[i_normal], gt_bboxes_[i_train], gt_labels_[i_train], feats_train=[lvl_feat[i_train][None] for lvl_feat in x], feats_normal=[lvl_feat[i_normal][None] for lvl_feat in x]) sampling_results.append(sampling_result_train) sampling_results.append(sampling_results_normal) # bbox head forward and loss bbox_roi_extractor = self.bbox_roi_extractor[i] bbox_head = self.bbox_head[i] rois = bbox2roi([res.bboxes for res in sampling_results]) bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = bbox_head(bbox_feats) bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg) loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets) for name, value in loss_bbox.items(): losses['s{}.{}'.format(i, name)] = ( value * lw if 'loss' in name else value) # mask head forward and loss if self.with_mask: if not self.share_roi_extractor: mask_roi_extractor = self.mask_roi_extractor[i] pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = mask_roi_extractor( x[:mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: # reuse positive bbox feats pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones( res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros( res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] mask_head = self.mask_head[i] mask_pred = mask_head(mask_feats) mask_targets = mask_head.get_target(sampling_results, gt_masks, rcnn_train_cfg) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels) for name, value in loss_mask.items(): losses['s{}.{}'.format(i, name)] = ( value * lw if 'loss' in name else value) # refine bboxes if i < self.num_stages - 1: pos_is_gts = [res.pos_is_gt for res in sampling_results] roi_labels = bbox_targets[0] # bbox_targets is a tuple # print('stage:{}'.format(i)) # ic(proposal_list[1]) with torch.no_grad(): proposal_list = bbox_head.refine_bboxes( rois, roi_labels, bbox_pred, pos_is_gts, img_meta) # ic(proposal_list[1]) # exit(0) return losses
def __init__( self, num_classes, in_channels, feat_channels=256, approx_anchor_generator=dict( type='AnchorGenerator', octave_base_scale=8, scales_per_octave=3, ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), square_anchor_generator=dict( type='AnchorGenerator', ratios=[1.0], scales=[8], strides=[4, 8, 16, 32, 64]), anchor_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0] ), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0] ), reg_decoded_bbox=False, deform_groups=4, loc_filter_thr=0.01, train_cfg=None, test_cfg=None, loss_loc=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0), init_cfg=dict(type='Normal', layer='Conv2d', std=0.01, override=dict(type='Normal', name='conv_loc', std=0.01, bias_prob=0.01))): # yapf: disable super(AnchorHead, self).__init__(init_cfg) self.in_channels = in_channels self.num_classes = num_classes self.feat_channels = feat_channels self.deform_groups = deform_groups self.loc_filter_thr = loc_filter_thr # build approx_anchor_generator and square_anchor_generator assert (approx_anchor_generator['octave_base_scale'] == square_anchor_generator['scales'][0]) assert (approx_anchor_generator['strides'] == square_anchor_generator['strides']) self.approx_anchor_generator = build_anchor_generator( approx_anchor_generator) self.square_anchor_generator = build_anchor_generator( square_anchor_generator) self.approxs_per_octave = self.approx_anchor_generator \ .num_base_anchors[0] self.reg_decoded_bbox = reg_decoded_bbox # one anchor per location self.num_anchors = 1 self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) self.loc_focal_loss = loss_loc['type'] in ['FocalLoss'] self.sampling = loss_cls['type'] not in ['FocalLoss'] self.ga_sampling = train_cfg is not None and hasattr( train_cfg, 'ga_sampler') if self.use_sigmoid_cls: self.cls_out_channels = self.num_classes else: self.cls_out_channels = self.num_classes + 1 # build bbox_coder self.anchor_coder = build_bbox_coder(anchor_coder) self.bbox_coder = build_bbox_coder(bbox_coder) # build losses self.loss_loc = build_loss(loss_loc) self.loss_shape = build_loss(loss_shape) self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.train_cfg = train_cfg self.test_cfg = test_cfg if self.train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) # use PseudoSampler when sampling is False if self.sampling and hasattr(self.train_cfg, 'sampler'): sampler_cfg = self.train_cfg.sampler else: sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.ga_assigner = build_assigner(self.train_cfg.ga_assigner) if self.ga_sampling: ga_sampler_cfg = self.train_cfg.ga_sampler else: ga_sampler_cfg = dict(type='PseudoSampler') self.ga_sampler = build_sampler(ga_sampler_cfg, context=self) self.fp16_enabled = False self._init_layers()
def _get_targets_single(self, flat_anchors, valid_flags, num_level_anchors, gt_bboxes, gt_bboxes_ignore, gt_labels, img_meta, label_channels=1, unmap_outputs=True): """Compute regression and classification targets for anchors in a single image. Args: flat_anchors (Tensor): Multi-level anchors of the image, which are concatenated into a single tensor of shape (num_anchors ,4) valid_flags (Tensor): Multi level valid flags of the image, which are concatenated into a single tensor of shape (num_anchors,). gt_bboxes (Tensor): Ground truth bboxes of the image, shape (num_gts, 4). img_meta (dict): Meta info of the image. gt_bboxes_ignore (Tensor): Ground truth bboxes to be ignored, shape (num_ignored_gts, 4). img_meta (dict): Meta info of the image. gt_labels (Tensor): Ground truth labels of each box, shape (num_gts,). label_channels (int): Channel of label. unmap_outputs (bool): Whether to map outputs back to the original set of anchors. Returns: tuple: labels_list (list[Tensor]): Labels of each level label_weights_list (list[Tensor]): Label weights of each level bbox_targets_list (list[Tensor]): BBox targets of each level bbox_weights_list (list[Tensor]): BBox weights of each level num_total_pos (int): Number of positive samples in all images num_total_neg (int): Number of negative samples in all images """ inside_flags = anchor_inside_flags(flat_anchors, valid_flags, img_meta['img_shape'][:2], self.train_cfg.allowed_border) if not inside_flags.any(): return (None, ) * 7 # assign gt and sample anchors anchors = flat_anchors[inside_flags, :] # assign_result = self.assigner.assign( # anchors, gt_bboxes, gt_bboxes_ignore, # None if self.sampling else gt_labels) # sampling_result = self.sampler.sample(assign_result, anchors, # gt_bboxes) num_level_anchors_inside = get_num_level_anchors_inside( num_level_anchors, inside_flags) if self.sampling: bbox_assigner = build_assigner(self.train_cfg.assigner) bbox_sampler = build_sampler(self.train_cfg.sampler) if self.train_cfg.assigner['type'] == 'ATSSAssigner': assign_result = bbox_assigner.assign(anchors, num_level_anchors_inside, gt_bboxes, gt_bboxes_ignore, gt_labels) else: assign_result = bbox_assigner.assign(anchors, gt_bboxes, gt_bboxes_ignore, gt_labels) sampling_result = bbox_sampler.sample(assign_result, anchors, gt_bboxes, gt_labels) else: assign_result = self.assigner.assign( anchors, gt_bboxes, gt_bboxes_ignore, None if self.sampling else gt_labels) sampling_result = self.sampler.sample(assign_result, anchors, gt_bboxes) num_valid_anchors = anchors.shape[0] bbox_targets = torch.zeros_like(anchors) bbox_weights = torch.zeros_like(anchors) labels = anchors.new_full((num_valid_anchors, ), self.background_label, dtype=torch.long) label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds if len(pos_inds) > 0: if not self.reg_decoded_bbox: pos_bbox_targets = self.bbox_coder.encode( sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) else: pos_bbox_targets = sampling_result.pos_gt_bboxes bbox_targets[pos_inds, :] = pos_bbox_targets bbox_weights[pos_inds, :] = 1.0 if gt_labels is None: # only rpn gives gt_labels as None, this time FG is 1 labels[pos_inds] = 1 else: labels[pos_inds] = gt_labels[ sampling_result.pos_assigned_gt_inds] if self.train_cfg.pos_weight <= 0: label_weights[pos_inds] = 1.0 else: label_weights[pos_inds] = self.train_cfg.pos_weight if len(neg_inds) > 0: label_weights[neg_inds] = 1.0 # map up to original set of anchors if unmap_outputs: num_total_anchors = flat_anchors.size(0) labels = unmap(labels, num_total_anchors, inside_flags, fill=self.background_label) # fill bg label label_weights = unmap(label_weights, num_total_anchors, inside_flags) bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds, sampling_result)
def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): """ Args: img (Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_meta (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and my also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmdet/datasets/pipelines/formatting.py:Collect`. gt_bboxes (list[Tensor]): each item are the truth boxes for each image in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be ignored when computing the loss. gt_masks (None | Tensor) : true segmentation masks for each box used if the architecture supports a segmentation task. proposals : override rpn proposals with custom proposals. Use when `with_rpn` is False. Returns: dict[str, Tensor]: a dictionary of loss components """ x = self.extract_feat(img) # x得到的是一个元组,对于resnet,可包含多个conv层的结果 conv4 = (x[0], ) conv5 = x[1] losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(conv4) # rpn使用的是resnet的conv4 # rpn_outs = rpn_cls_score + rpn_bbox_pred rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals large_seperate_conv_output = (self.large_seperate_conv(conv5), ) # assign gts and sample proposals if self.with_bbox: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) # rois是基于特征图尺寸的,不是基于原图尺寸的 # roi:(roi_batch_inds,x1,y1,x2,y2) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( large_seperate_conv_output[:self.bbox_roi_extractor. num_inputs], rois) # bbox_feats是基于特征图尺寸的,如果需要基于原图尺寸,需要传入scale_factor参数 cls_score, bbox_pred = self.bbox_head(bbox_feats) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) return losses
def __init__(self, num_classes, in_channels, out_channels=(1024, 512, 256), anchor_generator=dict(type='YOLOAnchorGenerator', base_sizes=[[(116, 90), (156, 198), (373, 326)], [(30, 61), (62, 45), (59, 119)], [(10, 13), (16, 30), (33, 23)]], strides=[32, 16, 8]), bbox_coder=dict(type='YOLOBBoxCoder'), featmap_strides=[32, 16, 8], one_hot_smoother=0., conv_cfg=None, norm_cfg=dict(type='BN', requires_grad=True), act_cfg=dict(type='LeakyReLU', negative_slope=0.1), loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_conf=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_xy=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_wh=dict(type='MSELoss', loss_weight=1.0), train_cfg=None, test_cfg=None): super(YOLOV3Head, self).__init__() # Check params assert (len(in_channels) == len(out_channels) == len(featmap_strides)) self.num_classes = num_classes self.in_channels = in_channels self.out_channels = out_channels self.featmap_strides = featmap_strides self.train_cfg = train_cfg self.test_cfg = test_cfg if self.train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) if hasattr(self.train_cfg, 'sampler'): sampler_cfg = self.train_cfg.sampler else: sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.one_hot_smoother = one_hot_smoother self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.bbox_coder = build_bbox_coder(bbox_coder) self.anchor_generator = build_anchor_generator(anchor_generator) self.loss_cls = build_loss(loss_cls) self.loss_conf = build_loss(loss_conf) self.loss_xy = build_loss(loss_xy) self.loss_wh = build_loss(loss_wh) # usually the numbers of anchors for each level are the same # except SSD detectors self.num_anchors = self.anchor_generator.num_base_anchors[0] assert len( self.anchor_generator.num_base_anchors) == len(featmap_strides) self._init_layers()
def __init__(self, num_classes, in_channels, stacked_convs=4, feat_channels=256, approx_anchor_generator=dict(type='AnchorGenerator', octave_base_scale=4, scales_per_octave=3, ratios=[0.5, 1.0, 2.0], strides=[8, 16, 32, 64, 128]), square_anchor_generator=dict(type='AnchorGenerator', ratios=[1.0], scales=[4], strides=[8, 16, 32, 64, 128]), conv_cfg=None, norm_cfg=None, bbox_coder=dict(type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), reg_decoded_bbox=False, train_cfg=None, test_cfg=None, loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox_cls=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), loss_bbox_reg=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5), init_cfg=dict(type='Normal', layer='Conv2d', std=0.01, override=dict(type='Normal', name='retina_cls', std=0.01, bias_prob=0.01))): super(SABLRetinaHead, self).__init__(init_cfg) self.in_channels = in_channels self.num_classes = num_classes self.feat_channels = feat_channels self.num_buckets = bbox_coder['num_buckets'] self.side_num = int(np.ceil(self.num_buckets / 2)) assert (approx_anchor_generator['octave_base_scale'] == square_anchor_generator['scales'][0]) assert (approx_anchor_generator['strides'] == square_anchor_generator['strides']) self.approx_anchor_generator = build_anchor_generator( approx_anchor_generator) self.square_anchor_generator = build_anchor_generator( square_anchor_generator) self.approxs_per_octave = ( self.approx_anchor_generator.num_base_anchors[0]) # one anchor per location self.num_anchors = 1 self.stacked_convs = stacked_convs self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.reg_decoded_bbox = reg_decoded_bbox self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) self.sampling = loss_cls['type'] not in [ 'FocalLoss', 'GHMC', 'QualityFocalLoss' ] if self.use_sigmoid_cls: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 self.bbox_coder = build_bbox_coder(bbox_coder) self.loss_cls = build_loss(loss_cls) self.loss_bbox_cls = build_loss(loss_bbox_cls) self.loss_bbox_reg = build_loss(loss_bbox_reg) self.train_cfg = train_cfg self.test_cfg = test_cfg if self.train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) # use PseudoSampler when sampling is False if self.sampling and hasattr(self.train_cfg, 'sampler'): sampler_cfg = self.train_cfg.sampler else: sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.fp16_enabled = False self._init_layers()
def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): """ Args: img (Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_meta (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and my also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmdet/datasets/pipelines/formatting.py:Collect`. gt_bboxes (list[Tensor]): each item are the truth boxes for each image in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be ignored when computing the loss. gt_masks (None | Tensor) : true segmentation masks for each box used if the architecture supports a segmentation task. proposals : override rpn proposals with custom proposals. Use when `with_rpn` is False. Returns: dict[str, Tensor]: a dictionary of loss components """ x = self.extract_feat(img) # pdb.set_trace() losses = dict() if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals for i in range(self.num_stages): self.current_stage = i # pdb.set_trace() rcnn_train_cfg = self.train_cfg.rcnn[i] lw = self.train_cfg.stage_loss_weights[i] # assign gts and sample proposals sampling_results = [] if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(rcnn_train_cfg.assigner) bbox_sampler = build_sampler(rcnn_train_cfg.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] for j in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j], gt_labels[j]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[j], gt_bboxes[j], gt_labels[j], feats=[lvl_feat[j][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss bbox_roi_extractor = self.bbox_roi_extractor[i] bbox_head = self.bbox_head[i] rois = bbox2roi([res.bboxes for res in sampling_results]) if len(rois) == 0: # If there are no predicted and/or truth boxes, then we cannot # compute head / mask losses continue bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = bbox_head(bbox_feats) bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg, img_meta) loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets) for name, value in loss_bbox.items(): losses['s{}.{}'.format( i, name)] = (value * lw if 'loss' in name else value) # mask head forward and loss if self.with_mask: if not self.share_roi_extractor: mask_roi_extractor = self.mask_roi_extractor[i] pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = mask_roi_extractor( x[:mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: # reuse positive bbox feats pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones(res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros(res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] mask_head = self.mask_head[i] mask_pred = mask_head(mask_feats) mask_targets = mask_head.get_target(sampling_results, gt_masks, rcnn_train_cfg) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels) for name, value in loss_mask.items(): losses['s{}.{}'.format( i, name)] = (value * lw if 'loss' in name else value) # refine bboxes if i < self.num_stages - 1: pos_is_gts = [res.pos_is_gt for res in sampling_results] roi_labels = bbox_targets[0] # bbox_targets is a tuple with torch.no_grad(): proposal_list = bbox_head.refine_bboxes( rois, roi_labels, bbox_pred, pos_is_gts, img_meta) return losses
def forward_train(self, imgs, img_meta, imgs_2, img_meta_2, imgs_3, img_meta_3, gt_bboxes, gt_bboxes_2, gt_bboxes_3, gt_labels, gt_labels_2, gt_labels_3, gt_bboxes_ignore=None, gt_masks=None, gt_masks_2=None, proposals=None): # self.print_iterations() assert imgs.shape[ 1] == 3 and imgs_2.shape[1] == 3 # make sure channel size is 3 x = self.extract_feat(imgs) x_2 = self.extract_feat(imgs_2) x_3 = self.extract_feat(imgs_3) losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_outs_2 = self.rpn_head_2(x_2) rpn_outs_3 = self.rpn_head_3(x_3) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_loss_inputs_2 = rpn_outs_2 + (gt_bboxes_2, img_meta, self.train_cfg.rpn) rpn_loss_inputs_3 = rpn_outs_3 + (gt_bboxes_3, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore, iteration=self.iteration) rpn_losses_2 = self.rpn_head_2.loss( *rpn_loss_inputs_2, gt_bboxes_ignore=gt_bboxes_ignore, iteration=self.iteration, img_meta_2=img_meta_2) rpn_losses_3 = self.rpn_head_3.loss( *rpn_loss_inputs_3, gt_bboxes_ignore=gt_bboxes_ignore, iteration=self.iteration, img_meta_3=img_meta_3) losses.update(rpn_losses) losses.update(rpn_losses_2) losses.update(rpn_losses_3) proposal_inputs = rpn_outs + (img_meta, self.train_cfg.rpn_proposal) proposal_inputs_2 = rpn_outs_2 + (img_meta, self.train_cfg.rpn_proposal) proposal_inputs_3 = rpn_outs_3 + (img_meta, self.train_cfg.rpn_proposal) proposal_list, anchors = self.rpn_head.get_bboxes(*proposal_inputs) proposal_list_2, anchors_2 = self.rpn_head_2.get_bboxes( *proposal_inputs_2, img_meta_2=img_meta_2) proposal_list_3, anchors_3 = self.rpn_head_3.get_bboxes( *proposal_inputs_3, img_meta_3=img_meta_3) # self.rpn_head.visualize_anchor_boxes(imgs, rpn_outs[0], img_meta, slice_num=45, shuffle=True) # debug only # self.visualize_proposals(imgs, proposal_list, gt_bboxes, img_meta, slice_num=None, isProposal=True) #debug only # self.visualize_proposals(imgs, anchors, gt_bboxes, img_meta, slice_num=None, isProposal=False) #debug only # self.visualize_gt_bboxes(imgs, gt_bboxes, img_meta) #debug only # breakpoint() # self.visualize_gt_bboxes(imgs_2, gt_bboxes_2, img_meta_2) #debug only # breakpoint() # self.visualize_gt_bboxes_masks(imgs, gt_bboxes, img_meta, gt_masks) # debug only else: proposal_list = proposals # assign gts and sample proposals if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = imgs.size(0) gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): gt_bboxes_cur_pat = gt_bboxes[i] gt_bboxes_ignore_cur_pat = gt_bboxes_ignore[i] gt_labels_cur_pat = gt_labels[i] assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes_cur_pat, gt_bboxes_ignore_cur_pat, gt_labels_cur_pat) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes_cur_pat, gt_labels_cur_pat, feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) bbox_assigner_2 = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler_2 = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs_2 = imgs_2.size(0) gt_bboxes_ignore_2 = [None for _ in range(num_imgs_2)] sampling_results_2 = [] for i in range(num_imgs_2): gt_bboxes_cur_pat_2 = gt_bboxes_2[i] gt_bboxes_ignore_cur_pat_2 = gt_bboxes_ignore_2[i] gt_labels_cur_pat_2 = gt_labels_2[i] assign_result_2 = bbox_assigner_2.assign( proposal_list_2[i], gt_bboxes_cur_pat_2, gt_bboxes_ignore_cur_pat_2, gt_labels_cur_pat_2) sampling_result_2 = bbox_sampler_2.sample( assign_result_2, proposal_list_2[i], gt_bboxes_cur_pat_2, gt_labels_cur_pat_2, feats=[lvl_feat[i][None] for lvl_feat in x_2]) sampling_results_2.append(sampling_result_2) bbox_assigner_3 = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler_3 = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs_3 = imgs_3.size(0) gt_bboxes_ignore_3 = [None for _ in range(num_imgs_3)] sampling_results_3 = [] for i in range(num_imgs_3): gt_bboxes_cur_pat_3 = gt_bboxes_3[i] gt_bboxes_ignore_cur_pat_3 = gt_bboxes_ignore_3[i] gt_labels_cur_pat_3 = gt_labels_3[i] assign_result_3 = bbox_assigner_3.assign( proposal_list_3[i], gt_bboxes_cur_pat_3, gt_bboxes_ignore_cur_pat_3, gt_labels_cur_pat_3) sampling_result_3 = bbox_sampler_3.sample( assign_result_3, proposal_list_3[i], gt_bboxes_cur_pat_3, gt_labels_cur_pat_3, feats=[lvl_feat[i][None] for lvl_feat in x_3]) sampling_results_3.append(sampling_result_3) # bbox head forward and loss if self.with_bbox: rois = bbox2roi3D([res.bboxes for res in sampling_results]) rois_2 = bbox2roi3D([res.bboxes for res in sampling_results_2]) rois_3 = bbox2roi3D([res.bboxes for res in sampling_results_3]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) bbox_feats_2 = self.bbox_roi_extractor( x_2[:self.bbox_roi_extractor.num_inputs], rois_2) bbox_feats_3 = self.bbox_roi_extractor( x_3[:self.bbox_roi_extractor.num_inputs], rois_3) cls_score, bbox_pred = self.bbox_head(bbox_feats) cls_score_2, bbox_pred_2 = self.bbox_head(bbox_feats_2) cls_score_3, bbox_pred_3 = self.bbox_head(bbox_feats_3) cls_score = torch.cat((cls_score, cls_score_2, cls_score_3), 0) bbox_pred = torch.cat((bbox_pred, bbox_pred_2, bbox_pred_3), 0) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) bbox_targets_2 = self.bbox_head.get_target(sampling_results_2, gt_bboxes_2, gt_labels_2, self.train_cfg.rcnn) bbox_targets_3 = self.bbox_head.get_target(sampling_results_3, gt_bboxes_3, gt_labels_3, self.train_cfg.rcnn) bbox_targets_combined = [] for bbox_target, bbox_target_2, bbox_target_3 in zip( bbox_targets, bbox_targets_2, bbox_targets_3): bbox_targets_combined.append( torch.cat((bbox_target, bbox_target_2, bbox_target_3), 0)) bbox_targets_combined = tuple(bbox_targets_combined) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets_combined) losses.update(loss_bbox) # prepare upscaled data for refinement head upscaled_factor = img_meta_2[0]['ori_shape'][0] / img_meta[0][ 'ori_shape'][0] # convert parameterized adjustments to actual bounding boxes coordinates pred_bboxes_2 = self.bbox_head.convert_adjustments_to_bboxes( rois_2, bbox_pred_2, img_meta_2[0] ['img_shape'])[:, 6:].cpu().detach().numpy() / upscaled_factor pred_cls_score_2 = cls_score_2[:, 1, None].cpu().detach().numpy() pred_bboxes_2 = np.concatenate((pred_bboxes_2, pred_cls_score_2), axis=1) pred_bboxes_2 = [torch.from_numpy(pred_bboxes_2).cuda()] bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = imgs.size(0) gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results_refinement = [] for i in range(num_imgs): gt_bboxes_cur_pat = gt_bboxes[i] gt_bboxes_ignore_cur_pat = gt_bboxes_ignore[i] gt_labels_cur_pat = gt_labels[i] assign_result = bbox_assigner.assign(pred_bboxes_2[i], gt_bboxes_cur_pat, gt_bboxes_ignore_cur_pat, gt_labels_cur_pat) sampling_result = bbox_sampler.sample( assign_result, pred_bboxes_2[i], gt_bboxes_cur_pat, gt_labels_cur_pat, feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results_refinement.append(sampling_result) rois_refinement = bbox2roi3D( [res.bboxes for res in sampling_results_refinement]) bbox_feats_refinement = self.bbox_roi_extractor_refinement( x[:self.bbox_roi_extractor_refinement.num_inputs], rois_refinement) # prepare second upscaled data for refinement head upscaled_factor = img_meta_3[0]['ori_shape'][0] / img_meta[0][ 'ori_shape'][0] pred_bboxes_3 = self.bbox_head.convert_adjustments_to_bboxes( rois_3, bbox_pred_3, img_meta_3[0] ['img_shape'])[:, 6:].cpu().detach().numpy() / upscaled_factor pred_cls_score_3 = cls_score_3[:, 1, None].cpu().detach().numpy() pred_bboxes_3 = np.concatenate((pred_bboxes_3, pred_cls_score_3), axis=1) pred_bboxes_3 = [torch.from_numpy(pred_bboxes_3).cuda()] bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = imgs.size(0) gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results_refinement_2 = [] for i in range(num_imgs): gt_bboxes_cur_pat = gt_bboxes[i] gt_bboxes_ignore_cur_pat = gt_bboxes_ignore[i] gt_labels_cur_pat = gt_labels[i] assign_result = bbox_assigner.assign(pred_bboxes_3[i], gt_bboxes_cur_pat, gt_bboxes_ignore_cur_pat, gt_labels_cur_pat) sampling_result = bbox_sampler.sample( assign_result, pred_bboxes_3[i], gt_bboxes_cur_pat, gt_labels_cur_pat, feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results_refinement_2.append(sampling_result) rois_refinement_2 = bbox2roi3D( [res.bboxes for res in sampling_results_refinement_2]) bbox_feats_refinement_2 = self.bbox_roi_extractor_refinement( x[:self.bbox_roi_extractor_refinement.num_inputs], rois_refinement_2) # training refinement head refined_bbox_pred = self.refinement_head(bbox_feats_refinement) refined_bbox_pred_2 = self.refinement_head(bbox_feats_refinement_2) refined_bbox_pred_combined = torch.cat( (refined_bbox_pred, refined_bbox_pred_2)) bbox_targets_refinement = self.refinement_head.get_target( sampling_results_refinement, gt_bboxes, gt_labels, self.train_cfg.rcnn) bbox_targets_refinement_2 = self.refinement_head.get_target( sampling_results_refinement_2, gt_bboxes, gt_labels, self.train_cfg.rcnn) bbox_targets_refinement_combined = [] for bbox_target_refinement, bbox_target_refinement_2 in zip( bbox_targets_refinement, bbox_targets_refinement_2): bbox_targets_refinement_combined.append( torch.cat( (bbox_target_refinement, bbox_target_refinement_2))) loss_refinement = self.refinement_head.loss( refined_bbox_pred_combined, *bbox_targets_refinement_combined) losses.update(loss_refinement) # mask head forward and loss if self.with_mask: # TODO: not yet implemented pos_rois = bbox2roi3D([res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target(sampling_results, gt_masks, self.train_cfg.rcnn) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) pos_rois_refined = bbox2roi3D( [res.pos_bboxes for res in sampling_results_refinement]) mask_feats_refined = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois_refined) mask_pred_refined = self.mask_head(mask_feats_refined) mask_targets_refined = self.mask_head.get_target( sampling_results_refinement, gt_masks, self.train_cfg.rcnn) pos_labels_refined = torch.cat( [res.pos_gt_labels for res in sampling_results_refinement]) mask_pred_combined = torch.cat((mask_pred, mask_pred_refined)) mask_targets_combined = torch.cat( (mask_targets, mask_targets_refined)) pos_labels_combined = torch.cat((pos_labels, pos_labels_refined)) loss_mask = self.mask_head.loss(mask_pred_combined, mask_targets_combined, pos_labels_combined) losses.update(loss_mask) # self.save_losses_plt(losses) #debug only... self.iteration += 1 return losses
def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): """ Args: img (Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_meta (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and my also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmdet/datasets/pipelines/formatting.py:Collect`. gt_bboxes (list[Tensor]): each item are the truth boxes for each image in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be ignored when computing the loss. gt_masks (None | Tensor) : true segmentation masks for each box used if the architecture supports a segmentation task. proposals : override rpn proposals with custom proposals. Use when `with_rpn` is False. Returns: dict[str, Tensor]: a dictionary of loss components """ x = self.extract_feat(img) # x得到的是一个元组,对于resnet,可包含多个conv层的结果 losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) # rpn_outs = rpn_cls_score + rpn_bbox_pred # 注意:rpn_bbox_pred预测的是delta,是(tx,ty,tw,th) # 计算loss时使用的是train的配置 rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) # loss:生成anchor,将anchor与gt_bbox匹配,生成正负样本,计算delta # 得到每个anchor的label/label_weights/delta/delta_weights,以及pos_inds/neg_inds # 利用这些使用交叉熵和SmoothL1Loss,得到loss rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) # rpn_losses 字典类型,包含loss_rpn_cls,loss_rpn_bbox losses.update(rpn_losses) # 将rpn_loss加入到总体的loss中 # 得到proposal时使用的是train的配置 proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) # get_bboxes:得到nms之后的proposals proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # assign gts and sample proposals ''' with_bbox和with_mask是训练时的选项 eg.在faster_rcnn_r50_caffe_c4_1x.py中 train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), ] ''' if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler( self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): # 在RPN部分完成了anchor的匹配和正负样本生成,用于计算loss # 并根据bbox_pred和NMS得到proposal # 在RCNN部分,将proposal和gt_bbox匹配并生成正负样本,用于计算这部分的loss assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) # 所有图的roi,(img_ind, x1, y1, x2, y2) # TODO: a more flexible way to decide which feature maps to use # todo:看SingleRoIExtractor bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) # bbox_feats尺寸(roi_num, roi_out_channels, roi_w(7), roi_h(7)) if self.with_shared_head: # 将ROI再经过一段特征提取层,这里用的是Resnet的conv5 bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) # mask head forward and loss if self.with_mask: if not self.share_roi_extractor: pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones( res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros( res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target(sampling_results, gt_masks, self.train_cfg.rcnn) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) losses.update(loss_mask) return losses
def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): """ Args: img (Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_meta (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmdet/datasets/pipelines/formatting.py:Collect`. gt_bboxes (list[Tensor]): each item are the truth boxes for each image in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be ignored when computing the loss. gt_masks (None | Tensor) : true segmentation masks for each box used if the architecture supports a segmentation task. proposals : override rpn proposals with custom proposals. Use when `with_rpn` is False. Returns: dict[str, Tensor]: a dictionary of loss components """ #img=-img #img=F.max_pool2d(img,kernel_size=5,stride=1,padding=2) #img=-img #print('using min pooling for training') x = self.extract_feat(img) losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # assign gts and sample proposals if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) # mask head forward and loss if self.with_mask: if not self.share_roi_extractor: pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones(res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros(res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] if mask_feats.shape[0] > 0: mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target( sampling_results, gt_masks, self.train_cfg.rcnn) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) losses.update(loss_mask) return losses
def forward_train(self, img, img_meta, gt_bboxes, gt_bboxes_ignore, gt_labels, ref_img, # images of reference frame ref_bboxes, # gt bbox of reference frame gt_pids, # gt ids of current frame bbox mapped to reference frame gt_masks=None, proposals=None): x = self.extract_feat(img) ref_x = self.extract_feat(ref_img) losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs) losses.update(rpn_losses) proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # assign gts and sample proposals if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler( self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i], gt_pids[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], gt_pids[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) bbox_img_n = [res.bboxes.size(0) for res in sampling_results] ref_rois = bbox2roi(ref_bboxes) ref_bbox_img_n = [x.size(0) for x in ref_bboxes] # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) ref_bbox_feats = self.bbox_roi_extractor( ref_x[:self.bbox_roi_extractor.num_inputs], ref_rois) cls_score, bbox_pred = self.bbox_head(bbox_feats) # fetch bbox and object_id targets bbox_targets, (ids, id_weights) = self.bbox_head.get_target( sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) match_score = self.track_head(bbox_feats, ref_bbox_feats, bbox_img_n, ref_bbox_img_n) loss_match = self.track_head.loss(match_score, ids, id_weights) losses.update(loss_match) # mask head forward and loss if self.with_mask: pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target( sampling_results, gt_masks, self.train_cfg.rcnn) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) losses.update(loss_mask) return losses
def atss_target_single(self, flat_anchors, valid_flags, num_level_anchors, gt_bboxes, gt_bboxes_ignore, gt_labels, img_meta, cfg, label_channels=1, unmap_outputs=True): inside_flags = anchor_inside_flags(flat_anchors, valid_flags, img_meta['img_shape'][:2], cfg.allowed_border) if not inside_flags.any(): return (None, ) * 6 # assign gt and sample anchors anchors = flat_anchors[inside_flags.type(torch.bool), :] num_level_anchors_inside = self.get_num_level_anchors_inside( num_level_anchors, inside_flags) bbox_assigner = build_assigner(cfg.assigner) assign_result = bbox_assigner.assign(anchors, num_level_anchors_inside, gt_bboxes, gt_bboxes_ignore, gt_labels) bbox_sampler = PseudoSampler() sampling_result = bbox_sampler.sample(assign_result, anchors, gt_bboxes) num_valid_anchors = anchors.shape[0] bbox_targets = torch.zeros_like(anchors) bbox_weights = torch.zeros_like(anchors) labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long) label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds if len(pos_inds) > 0: pos_bbox_targets = bbox2delta(sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes, self.target_means, self.target_stds) bbox_targets[pos_inds, :] = pos_bbox_targets bbox_weights[pos_inds, :] = 1.0 if gt_labels is None: labels[pos_inds] = 1 else: labels[pos_inds] = gt_labels[ sampling_result.pos_assigned_gt_inds] if cfg.pos_weight <= 0: label_weights[pos_inds] = 1.0 else: label_weights[pos_inds] = cfg.pos_weight if len(neg_inds) > 0: label_weights[neg_inds] = 1.0 # map up to original set of anchors if unmap_outputs: inside_flags = inside_flags.type(torch.bool) num_total_anchors = flat_anchors.size(0) anchors = unmap(anchors, num_total_anchors, inside_flags) labels = unmap(labels, num_total_anchors, inside_flags) label_weights = unmap(label_weights, num_total_anchors, inside_flags) bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) return (anchors, labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds)
def forward_train(self, img, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): """ Args: img (Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_metas (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmdet/datasets/pipelines/formatting.py:Collect`. gt_bboxes (list[Tensor]): each item are the truth boxes for each image in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be ignored when computing the loss. gt_masks (None | Tensor) : true segmentation masks for each box used if the architecture supports a segmentation task. proposals : override rpn proposals with custom proposals. Use when `with_rpn` is False. Returns: dict[str, Tensor]: a dictionary of loss components """ x = self.extract_feat( img ) # backbone 和 neck 阶段 提取特征 由于基本上会用rpn,所以会是一个tuple 里面都是[bs, c, h, w]的特征 losses = dict() # RPN forward and loss RPN网络阶段并计算损失 if self.with_rpn: # 分别得到分类结果 和 回归结果 rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas, self.train_cfg.rpn) # rpn_head 继承的是anchor——head 可以计算损失 rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore ) # 得到一个字典 key分别是loss_rpn_cls 和 loss_rpn_bbox losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_metas, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals #类似于fast rcnn 就会直接得到提取准备的proposals # assign gts and sample proposals if self.with_bbox or self.with_mask: # 区分正负样本并进行采样 bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results ]) # 主要是给每一个bbox添加图片id 便于roi pooling # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( # ROI pooing过程 x[:self.bbox_roi_extractor.num_inputs], rois) # [ bs*512, 256, 7, 7] if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) #感觉这里是对特征进行了一个变换过程 cls_score, bbox_pred = self.bbox_head( bbox_feats) # 进行refine的head 进行分类和位置的精调 bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) # mask head forward and loss mask-RCNN网络的head部分 if self.with_mask: if not self.share_roi_extractor: pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones(res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros(res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] if mask_feats.shape[0] > 0: mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target( sampling_results, gt_masks, self.train_cfg.rcnn) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) losses.update(loss_mask) return losses
def __init__(self, num_classes, in_channels, regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512), (512, INF)), center_sampling=False, center_sample_radius=1.5, sync_num_pos=True, gradient_mul=0.1, bbox_norm_type='reg_denom', loss_cls_fl=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), use_vfl=True, loss_cls=dict(type='VarifocalLoss', use_sigmoid=True, alpha=0.75, gamma=2.0, iou_weighted=True, loss_weight=1.0), loss_bbox=dict(type='GIoULoss', loss_weight=1.5), loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0), norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), use_atss=True, anchor_generator=dict(type='AnchorGenerator', ratios=[1.0], octave_base_scale=8, scales_per_octave=1, center_offset=0.0, strides=[8, 16, 32, 64, 128]), **kwargs): # dcn base offsets, adapted from reppoints_head.py self.num_dconv_points = 9 self.dcn_kernel = int(np.sqrt(self.num_dconv_points)) self.dcn_pad = int((self.dcn_kernel - 1) / 2) dcn_base = np.arange(-self.dcn_pad, self.dcn_pad + 1).astype(np.float64) dcn_base_y = np.repeat(dcn_base, self.dcn_kernel) dcn_base_x = np.tile(dcn_base, self.dcn_kernel) dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape( (-1)) self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1) super(FCOSHead, self).__init__(num_classes, in_channels, norm_cfg=norm_cfg, **kwargs) self.regress_ranges = regress_ranges self.reg_denoms = [ regress_range[-1] for regress_range in regress_ranges ] self.reg_denoms[-1] = self.reg_denoms[-2] * 2 self.center_sampling = center_sampling self.center_sample_radius = center_sample_radius self.sync_num_pos = sync_num_pos self.bbox_norm_type = bbox_norm_type self.gradient_mul = gradient_mul self.use_vfl = use_vfl if self.use_vfl: self.loss_cls = build_loss(loss_cls) else: self.loss_cls = build_loss(loss_cls_fl) self.loss_bbox = build_loss(loss_bbox) self.loss_bbox_refine = build_loss(loss_bbox_refine) # for getting ATSS targets self.use_atss = use_atss self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) self.anchor_generator = build_anchor_generator(anchor_generator) self.anchor_center_offset = anchor_generator['center_offset'] self.num_anchors = self.anchor_generator.num_base_anchors[0] self.sampling = False if self.train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self)
def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): x = self.extract_feat(img) losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # assign gts and sample proposals if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss CHANGE = False if CHANGE: rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred, bbox_iou = self.bbox_head(bbox_feats) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(gt_bboxes, cls_score, bbox_pred, bbox_iou, *bbox_targets) losses.update(loss_bbox) else: rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) # mask head forward and loss if self.with_mask: if not self.share_roi_extractor: pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones(res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros(res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] #mask_pred, mask_edge_pred = self.mask_head(mask_feats) mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target(sampling_results, gt_masks, self.train_cfg.rcnn) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) losses.update(loss_mask) #edge_targets, edge_weight = self.mask_head.get_edge(mask_targets) #loss_edge = self.mask_head.loss_e(mask_edge_pred, edge_targets, edge_weight) #losses.update(loss_edge) # mask iou head forward and loss pos_mask_pred = mask_pred[range(mask_pred.size(0)), pos_labels] mask_iou_pred = self.mask_iou_head(mask_feats, pos_mask_pred) pos_mask_iou_pred = mask_iou_pred[range(mask_iou_pred.size(0)), pos_labels] mask_iou_targets = self.mask_iou_head.get_target( sampling_results, gt_masks, pos_mask_pred, mask_targets, self.train_cfg.rcnn) loss_mask_iou = self.mask_iou_head.loss(pos_mask_iou_pred, mask_iou_targets) losses.update(loss_mask_iou) return losses
def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): x = self.extract_feat(img) losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # assign gts and sample proposals if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) if self.with_mask: if not self.share_roi_extractor: mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones(res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros(res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] mask_pred, mask_refine = self.mask_head(mask_feats) mask_targets = self.mask_head.get_all_target( sampling_results, gt_masks, self.train_cfg.rcnn) # fetched_mask = self.mask_head.fetch_mask(mask_pred.detach(), mask_cls_pred) # bbox_feats = F.adaptive_max_pool2d(mask_feats, (7, 7)) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats, mask_refine.detach()) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) labels = torch.cat([res.labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_refine, mask_targets, labels) losses.update(loss_mask) # Modified by Pleaplusone # _,_,H,W = bbox_feats.size() # seg_feats = F.adaptive_max_pool2d(mask_pred, (H,W)) # bbox_feats = torch.cat([bbox_feats, seg_feats], dim=1) # bbox_feats = F.adaptive_max_pool2d(mask_feats, (7,7)) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) return losses
def __init__(self, num_classes, in_channels, point_feat_channels=256, num_points=9, gradient_mul=0.1, point_strides=[8, 16, 32, 64, 128], point_base_scale=4, loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox_init=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.5), loss_bbox_refine=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), use_grid_points=False, center_init=True, transform_method='moment', moment_mul=0.01, **kwargs): self.num_points = num_points self.point_feat_channels = point_feat_channels self.use_grid_points = use_grid_points self.center_init = center_init # we use deform conv to extract points features self.dcn_kernel = int(np.sqrt(num_points)) self.dcn_pad = int((self.dcn_kernel - 1) / 2) assert self.dcn_kernel * self.dcn_kernel == num_points, \ 'The points number should be a square number.' assert self.dcn_kernel % 2 == 1, \ 'The points number should be an odd square number.' dcn_base = np.arange(-self.dcn_pad, self.dcn_pad + 1).astype(np.float64) dcn_base_y = np.repeat(dcn_base, self.dcn_kernel) dcn_base_x = np.tile(dcn_base, self.dcn_kernel) dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape( (-1)) self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1) super().__init__(num_classes, in_channels, loss_cls=loss_cls, **kwargs) self.gradient_mul = gradient_mul self.point_base_scale = point_base_scale self.point_strides = point_strides self.point_generators = [PointGenerator() for _ in self.point_strides] self.sampling = loss_cls['type'] not in ['FocalLoss'] if self.train_cfg: self.init_assigner = build_assigner(self.train_cfg.init.assigner) self.refine_assigner = build_assigner( self.train_cfg.refine.assigner) # use PseudoSampler when sampling is False if self.sampling and hasattr(self.train_cfg, 'sampler'): sampler_cfg = self.train_cfg.sampler else: sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.transform_method = transform_method if self.transform_method == 'moment': self.moment_transfer = nn.Parameter(data=torch.zeros(2), requires_grad=True) self.moment_mul = moment_mul self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) if self.use_sigmoid_cls: self.cls_out_channels = self.num_classes else: self.cls_out_channels = self.num_classes + 1 self.loss_bbox_init = build_loss(loss_bbox_init) self.loss_bbox_refine = build_loss(loss_bbox_refine)
def forward(self, imgs, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): N = imgs.shape[0] H = imgs.shape[3] W = imgs.shape[4] T = self.T tgt_t = int((T - 1) / 2) imgs = imgs.view(N * T, 3, H, W) extracted_feat = self.extract_feat(imgs) losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(extracted_feat) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) rpn_outs_tgt = self.rpn_head(extracted_feat[N * tgt_t:N * (tgt_t + 1)]) proposal_inputs_tgt = rpn_outs_tgt + (img_meta, proposal_cfg) proposal_list_tgt = self.rpn_head.get_bboxes(*proposal_inputs_tgt) if losses is not None: # TODO test rpn outs. rpn_loss_inputs = rpn_outs[N * tgt_t:N * (tgt_t + 1)] + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) # assign gts and sample proposals bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(N)] sampling_results = [] for i in range(N): if gt_bboxes[i] is None or len(gt_bboxes[i]) == 0: continue assign_result = bbox_assigner.assign(proposal_list_tgt[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list_tgt[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in extracted_feat]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi(proposal_list) bbox_feats = self.bbox_roi_extractor( extracted_feat[:self.bbox_roi_extractor.num_inputs], rois) # TODO: a more flexible way to decide which feature maps to use rois_tgt = bbox2roi([res.bboxes for res in sampling_results]) print(bbox_feats.shape) bbox_feats_tgt = self.bbox_roi_extractor( extracted_feat[N * tgt_t:N * (tgt_t + 1)][:self.bbox_roi_extractor.num_inputs], rois_tgt) bbox_feats = self.box_feat_conv(bbox_feats) bbox_feats_tgt = self.box_feat_conv(bbox_feats_tgt) ref_keys, ref_vals = self.kv_enc(bbox_feats) cur_key, cur_val = self.kv_enc(bbox_feats_tgt) cur_val = self.sp_att(ref_keys, ref_vals, cur_key, cur_val) cur_val = self.ch_att(ref_keys, ref_vals, cur_key, cur_val) cls_score, bbox_pred = self.bbox_head(cur_val) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) return losses
def forward_train( self, img, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None, ): x = self.extract_feat(img) losses = dict() # RPN forward and loss if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get("rpn_proposal", self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_metas, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals if self.with_bbox: # assign gts and sample proposals bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x], ) sampling_results.append(sampling_result) # bbox head forward and loss rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats) bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) # Grid head forward and loss sampling_results = self._random_jitter(sampling_results, img_metas) pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) grid_feats = self.grid_roi_extractor( x[:self.grid_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: grid_feats = self.shared_head(grid_feats) # Accelerate training max_sample_num_grid = self.train_cfg.rcnn.get("max_num_grid", 192) sample_idx = torch.randperm( grid_feats.shape[0])[:min(grid_feats. shape[0], max_sample_num_grid)] grid_feats = grid_feats[sample_idx] grid_pred = self.grid_head(grid_feats) grid_targets = self.grid_head.get_target(sampling_results, self.train_cfg.rcnn) grid_targets = grid_targets[sample_idx] loss_grid = self.grid_head.loss(grid_pred, grid_targets) losses.update(loss_grid) return losses
def forward_train(self, img, img_meta, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): x = self.extract_feat(img) losses = dict() if self.with_rpn: rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn) rpn_losses = self.rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_meta, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals for i in range(self.num_stages): self.current_stage = i rcnn_train_cfg = self.train_cfg.rcnn[i] lw = self.train_cfg.stage_loss_weights[i] # assign gts and sample proposals sampling_results = [] if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(rcnn_train_cfg.assigner) bbox_sampler = build_sampler(rcnn_train_cfg.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] for j in range(num_imgs): assign_result = bbox_assigner.assign( proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j], gt_labels[j]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[j], gt_bboxes[j], gt_labels[j], feats=[lvl_feat[j][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss bbox_roi_extractor = self.bbox_roi_extractor[i] bbox_head = self.bbox_head[i] rois = bbox2roi([res.bboxes for res in sampling_results]) bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = bbox_head(bbox_feats) bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg) loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets) for name, value in loss_bbox.items(): losses['s{}.{}'.format( i, name)] = (value * lw if 'loss' in name else value) # mask head forward and loss if self.with_mask: if not self.share_roi_extractor: mask_roi_extractor = self.mask_roi_extractor[i] pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = mask_roi_extractor( x[:mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: # reuse positive bbox feats pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones(res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros(res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] mask_head = self.mask_head[i] mask_pred = mask_head(mask_feats) mask_targets = mask_head.get_target(sampling_results, gt_masks, rcnn_train_cfg) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels) for name, value in loss_mask.items(): losses['s{}.{}'.format( i, name)] = (value * lw if 'loss' in name else value) # refine bboxes if i < self.num_stages - 1: pos_is_gts = [res.pos_is_gt for res in sampling_results] roi_labels = bbox_targets[0] # bbox_targets is a tuple with torch.no_grad(): proposal_list = bbox_head.refine_bboxes( rois, roi_labels, bbox_pred, pos_is_gts, img_meta) return losses
def forward_train(self, # 核心双阶段检测器的流程 img, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore=None, gt_masks=None, proposals=None): """ Args: img (Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_metas (list[dict]): list of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmdet/datasets/pipelines/formatting.py:Collect`. gt_bboxes (list[Tensor]): each item are the truth boxes for each image in [tl_x, tl_y, br_x, br_y] format(left top point, right bottom point). gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be ignored when computing the loss. gt_masks (None | Tensor) : true segmentation masks for each box used if the architecture supports a segmentation task. proposals : override rpn proposals with custom proposals. Use when `with_rpn` is False. Returns: dict[str, Tensor]: a dictionary of loss components """ # 前向 backbone 和 neck x = self.extract_feat(img) losses = dict() # RPN forward and loss if self.with_rpn: # rpn_head在上面__init__函数里面build了,返回了rpn_head对象, # 但是因为都继承了nn.Module(实现了__call__),可以直接用实例名字调用里面的forward函数,从而进行了前向传播 rpn_outs = self.rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas, self.train_cfg.rpn) # 这里loss是rpn_head里面的实现了,因为rpn_head继承了anchor_head, # 所以用了父类anchor_head实现的loss,其实就是anchor的那一套,返回是一个字典 rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_metas, proposal_cfg) # 得到proposal,把anchor转化为对应的框的信息,然后NMS再取top-N个候选框 proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals ''' ============ proposal_list 2 torch.Size([2000, 5]) ****** torch.Size([2000, 5]) ============ ============ ''' # assign gts and sample proposals if self.with_bbox or self.with_mask: bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) bbox_sampler = build_sampler( self.train_cfg.rcnn.sampler, context=self) num_imgs = img.size(0) if gt_bboxes_ignore is None: gt_bboxes_ignore = [None for _ in range(num_imgs)] sampling_results = [] for i in range(num_imgs): assign_result = bbox_assigner.assign(proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], gt_labels[i]) sampling_result = bbox_sampler.sample( assign_result, proposal_list[i], gt_bboxes[i], gt_labels[i], feats=[lvl_feat[i][None] for lvl_feat in x]) sampling_results.append(sampling_result) # bbox head forward and loss if self.with_bbox: rois = bbox2roi([res.bboxes for res in sampling_results]) # TODO: a more flexible way to decide which feature maps to use bbox_feats = self.bbox_roi_extractor( x[:self.bbox_roi_extractor.num_inputs], rois) if self.with_shared_head: bbox_feats = self.shared_head(bbox_feats) cls_score, bbox_pred = self.bbox_head(bbox_feats) # bbox_targets: 长度为4的tuple,元素是tensor ''' bbox_targets:长度为4 四个元素分别的维度 labels: torch.Size([1024]) label_weights: torch.Size([1024]) bbox_targets torch.Size([1024, 4]) bbox_weights torch.Size([1024, 4]) ''' # tensor维度为1维与tensor长度和cls_score,bbox_pred一致,值为0或1,用来表示预测是否命中真实框 bbox_targets = self.bbox_head.get_target(sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn) loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, *bbox_targets) losses.update(loss_bbox) # mask head forward and loss if self.with_mask: if not self.share_roi_extractor: pos_rois = bbox2roi( [res.pos_bboxes for res in sampling_results]) mask_feats = self.mask_roi_extractor( x[:self.mask_roi_extractor.num_inputs], pos_rois) if self.with_shared_head: mask_feats = self.shared_head(mask_feats) else: pos_inds = [] device = bbox_feats.device for res in sampling_results: pos_inds.append( torch.ones( res.pos_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds.append( torch.zeros( res.neg_bboxes.shape[0], device=device, dtype=torch.uint8)) pos_inds = torch.cat(pos_inds) mask_feats = bbox_feats[pos_inds] if mask_feats.shape[0] > 0: mask_pred = self.mask_head(mask_feats) mask_targets = self.mask_head.get_target( sampling_results, gt_masks, self.train_cfg.rcnn) pos_labels = torch.cat( [res.pos_gt_labels for res in sampling_results]) loss_mask = self.mask_head.loss(mask_pred, mask_targets, pos_labels) losses.update(loss_mask) return losses