def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        # this variant's loss consumes only the first and last entries
        # of the target tuple
        bbox_targets = (bbox_targets[0], ) + bbox_targets[3:]
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(
                        res.pos_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(
                        res.neg_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]
        mask_pred = self.mask_head(mask_feats)
        mask_targets = self.mask_head.get_target(sampling_results,
                                                 gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    return losses
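# For reference, a minimal self-contained sketch of the `bbox2roi` helper used
# above: it prepends the batch index to each per-image box so RoI extractors can
# operate on one concatenated tensor. Illustration of the idea only, not
# necessarily the exact mmdet implementation.
import torch


def bbox2roi_sketch(bbox_list):
    """Convert per-image [N_i, 4] boxes into one [sum(N_i), 5] RoI tensor."""
    rois_list = []
    for img_id, bboxes in enumerate(bbox_list):
        if bboxes.size(0) > 0:
            img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
            rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1)
        else:
            rois = bboxes.new_zeros((0, 5))
        rois_list.append(rois)
    return torch.cat(rois_list, 0)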
def __init__(self,
             num_classes,
             in_channels,
             num_query=100,
             num_reg_fcs=2,
             transformer=None,
             sync_cls_avg_factor=False,
             positional_encoding=dict(
                 type='SinePositionalEncoding',
                 num_feats=128,
                 normalize=True),
             loss_cls=dict(
                 type='CrossEntropyLoss',
                 bg_cls_weight=0.1,
                 use_sigmoid=False,
                 loss_weight=1.0,
                 class_weight=1.0),
             loss_bbox=dict(type='L1Loss', loss_weight=5.0),
             loss_iou=dict(type='GIoULoss', loss_weight=2.0),
             train_cfg=dict(
                 assigner=dict(
                     type='HungarianAssigner',
                     cls_cost=dict(type='ClassificationCost', weight=1.),
                     reg_cost=dict(type='BBoxL1Cost', weight=5.0),
                     iou_cost=dict(
                         type='IoUCost', iou_mode='giou', weight=2.0))),
             test_cfg=dict(max_per_img=100),
             init_cfg=None,
             **kwargs):
    # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
    # since it brings inconvenience when the initialization of
    # `AnchorFreeHead` is called.
    super(AnchorFreeHead, self).__init__(init_cfg)
    self.bg_cls_weight = 0
    self.sync_cls_avg_factor = sync_cls_avg_factor
    class_weight = loss_cls.get('class_weight', None)
    if class_weight is not None and (self.__class__ is DETRHead):
        assert isinstance(class_weight, float), 'Expected ' \
            'class_weight to have type float. Found ' \
            f'{type(class_weight)}.'
        # NOTE following the official DETR repo, bg_cls_weight means
        # relative classification weight of the no-object class.
        bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
        assert isinstance(bg_cls_weight, float), 'Expected ' \
            'bg_cls_weight to have type float. Found ' \
            f'{type(bg_cls_weight)}.'
        class_weight = torch.ones(num_classes + 1) * class_weight
        # set background class as the last index
        class_weight[num_classes] = bg_cls_weight
        loss_cls.update({'class_weight': class_weight})
        if 'bg_cls_weight' in loss_cls:
            loss_cls.pop('bg_cls_weight')
        self.bg_cls_weight = bg_cls_weight

    if train_cfg:
        assert 'assigner' in train_cfg, 'assigner should be provided ' \
            'when train_cfg is set.'
        assigner = train_cfg['assigner']
        assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
            'The classification weight for loss and matcher should be ' \
            'exactly the same.'
        assert loss_bbox['loss_weight'] == assigner['reg_cost']['weight'], \
            'The regression L1 weight for loss and matcher ' \
            'should be exactly the same.'
        assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \
            'The regression iou weight for loss and matcher should be ' \
            'exactly the same.'
        self.assigner = build_assigner(assigner)
        # DETR sampling=False, so use PseudoSampler
        sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

    self.num_query = num_query
    self.num_classes = num_classes
    self.in_channels = in_channels
    self.num_reg_fcs = num_reg_fcs
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    self.fp16_enabled = False
    self.loss_cls = build_loss(loss_cls)
    self.loss_bbox = build_loss(loss_bbox)
    self.loss_iou = build_loss(loss_iou)

    if self.loss_cls.use_sigmoid:
        self.cls_out_channels = num_classes
    else:
        self.cls_out_channels = num_classes + 1
    self.act_cfg = transformer.get('act_cfg',
                                   dict(type='ReLU', inplace=True))
    self.activate = build_activation_layer(self.act_cfg)
    self.positional_encoding = build_positional_encoding(
        positional_encoding)
    self.transformer = build_transformer(transformer)
    self.embed_dims = self.transformer.embed_dims
    assert 'num_feats' in positional_encoding
    num_feats = positional_encoding['num_feats']
    assert num_feats * 2 == self.embed_dims, 'embed_dims should' \
        f' be exactly 2 times of num_feats. Found {self.embed_dims}' \
        f' and {num_feats}.'
    self._init_layers()
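# A small standalone illustration (made-up class count, not DETRHead itself) of
# how the classification weight vector above is laid out: one weight per
# foreground class plus a down-weighted "no object" entry at the last index.
import torch

num_classes = 3      # hypothetical number of foreground classes
class_weight = 1.0
bg_cls_weight = 0.1

weights = torch.ones(num_classes + 1) * class_weight
weights[num_classes] = bg_cls_weight  # background is the last index
print(weights)  # tensor([1.0000, 1.0000, 1.0000, 0.1000])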
def forward_train(self,
                  img,
                  img_t,
                  img_meta,
                  gt_bboxes,
                  gt_bboxes_ignore,
                  gt_labels,
                  gt_masks=None,
                  proposals=None):
    x = self.extract_feat(img, img_t)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
        losses.update(rpn_losses)

        # Edited by Yuan: the original code used the same RPN NMS config
        # during training and testing. They differ now, so more proposals
        # can be passed to the next stage during training.
        proposal_inputs = rpn_outs + (img_meta, self.train_cfg.rpn.nms)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
        # print('number of proposals: {}'.format(len(proposal_list[0])))
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        num_imgs = img.size(0)
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_upper_neck:
            bbox_feats = self.upper_neck(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        if not self.shared_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_upper_neck:
                mask_feats = self.upper_neck(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(
                        res.pos_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(
                        res.neg_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]
        mask_pred = self.mask_head(mask_feats)
        mask_targets = self.mask_head.get_target(sampling_results,
                                                 gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    return losses
def forward_debug(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    """
    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.
        img_meta (list[dict]): list of image info dicts where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.
        gt_bboxes (list[Tensor]): each item is the ground-truth boxes of
            one image in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): class indices corresponding to each box.
        gt_bboxes_ignore (None | list[Tensor]): specifies which bounding
            boxes can be ignored when computing the loss.
        gt_masks (None | Tensor): true segmentation masks for each box,
            used if the architecture supports a segmentation task.
        proposals: override rpn proposals with custom proposals. Use when
            `with_rpn` is False.

    Returns:
        dict[str, Tensor]: a dictionary of loss components
    """
    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        # print(cls_score.shape, bbox_targets[0].shape, bbox_targets[2].shape)
        # print(cls_score[0], bbox_pred[0], bbox_targets[0][0], bbox_targets[2][0])
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(
                        res.pos_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(
                        res.neg_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]
        mask_pred = self.mask_head(mask_feats)
        mask_targets = self.mask_head.get_target(sampling_results,
                                                 gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    return losses, proposal_list, sampling_results, rois
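# Standalone sketch of the shared-RoI-extractor trick used above: when the bbox
# and mask heads share one extractor, positive RoI features are selected from
# the already-computed bbox features with an indicator mask instead of a second
# extractor pass. Shapes are invented; a bool mask replaces the legacy uint8 one.
import torch

pos_counts, neg_counts = [2, 1], [3, 2]  # per-image sampling results (pos, neg)
bbox_feats = torch.randn(sum(pos_counts) + sum(neg_counts), 256, 7, 7)

pos_inds = []
for n_pos, n_neg in zip(pos_counts, neg_counts):
    # samplers lay out each image's RoIs as positives followed by negatives
    pos_inds.append(torch.ones(n_pos, dtype=torch.bool))
    pos_inds.append(torch.zeros(n_neg, dtype=torch.bool))
pos_inds = torch.cat(pos_inds)

mask_feats = bbox_feats[pos_inds]  # features of positive RoIs only
print(mask_feats.shape)            # torch.Size([3, 256, 7, 7])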
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    # collect per-image mixup weights from the metas
    # (assumes either every image carries a mix_weight or none does)
    mix_weight = []
    for i in range(len(img_meta)):
        if img_meta[i]['mix_weight'] is not None:
            mix_weight.append(
                torch.tensor(img_meta[i]['mix_weight']).cuda())
        else:
            mix_weight = None

    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        if mix_weight is not None:
            # propagate each gt's mixup weight to the positive RoIs
            # assigned to it; the remaining RoIs keep weight 1
            mix_inds = []
            for i in range(len(sampling_results)):
                mix_ind = torch.ones(
                    sampling_results[i].bboxes.shape[0]).cuda()
                for j in range(
                        len(sampling_results[i].pos_assigned_gt_inds)):
                    mix_ind[j] = mix_weight[i][
                        sampling_results[i].pos_assigned_gt_inds[j]]
                mix_inds.append(mix_ind)
            mix_inds = torch.cat(mix_inds, 0)
        else:
            mix_inds = None
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets, mix_inds)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
        mask_feats = self.mask_roi_extractor(
            x[:self.mask_roi_extractor.num_inputs], pos_rois)
        mask_pred = self.mask_head(mask_feats)

        mask_targets = self.mask_head.get_target(sampling_results,
                                                 gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    return losses
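# Toy illustration (invented sizes) of the mixup-weight gather above: each
# positive RoI inherits the weight of the ground-truth box it was assigned to,
# while negatives keep weight 1.
import torch

mix_weight = torch.tensor([0.7, 0.3])           # one weight per gt box
pos_assigned_gt_inds = torch.tensor([0, 0, 1])  # gt index per positive RoI
num_rois = 5                                    # 3 positives then 2 negatives

mix_ind = torch.ones(num_rois)
mix_ind[:len(pos_assigned_gt_inds)] = mix_weight[pos_assigned_gt_inds]
print(mix_ind)  # tensor([0.7000, 0.7000, 0.3000, 1.0000, 1.0000])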
def __init__(self,
             num_classes,
             in_channels,
             point_feat_channels=256,
             num_points=9,
             gradient_mul=0.1,
             point_strides=[8, 16, 32, 64, 128],
             point_base_scale=4,
             loss_cls=dict(
                 type='FocalLoss',
                 use_sigmoid=True,
                 gamma=2.0,
                 alpha=0.25,
                 loss_weight=1.0),
             loss_bbox_init=dict(
                 type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.5),
             loss_bbox_refine=dict(
                 type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
             use_grid_points=False,
             center_init=True,
             transform_method='moment',
             moment_mul=0.01,
             **kwargs):
    self.num_points = num_points
    self.point_feat_channels = point_feat_channels
    self.use_grid_points = use_grid_points
    self.center_init = center_init

    # we use deform conv to extract points features
    self.dcn_kernel = int(np.sqrt(num_points))
    self.dcn_pad = int((self.dcn_kernel - 1) / 2)
    assert self.dcn_kernel * self.dcn_kernel == num_points, \
        'The points number should be a square number.'
    assert self.dcn_kernel % 2 == 1, \
        'The points number should be an odd square number.'
    dcn_base = np.arange(-self.dcn_pad,
                         self.dcn_pad + 1).astype(np.float64)
    dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
    dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
    dcn_base_offset = np.stack([dcn_base_y, dcn_base_x],
                               axis=1).reshape((-1))
    self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1)

    super().__init__(num_classes, in_channels, loss_cls=loss_cls, **kwargs)

    self.gradient_mul = gradient_mul
    self.point_base_scale = point_base_scale
    self.point_strides = point_strides
    self.point_generators = [PointGenerator() for _ in self.point_strides]

    self.sampling = loss_cls['type'] not in ['FocalLoss']
    if self.train_cfg:
        self.init_assigner = build_assigner(self.train_cfg.init.assigner)
        self.refine_assigner = build_assigner(
            self.train_cfg.refine.assigner)
        # use PseudoSampler when sampling is False
        if self.sampling and hasattr(self.train_cfg, 'sampler'):
            sampler_cfg = self.train_cfg.sampler
        else:
            sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

    self.transform_method = transform_method
    if self.transform_method == 'moment':
        self.moment_transfer = nn.Parameter(
            data=torch.zeros(2), requires_grad=True)
        self.moment_mul = moment_mul

    self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
    if self.use_sigmoid_cls:
        self.cls_out_channels = self.num_classes
    else:
        self.cls_out_channels = self.num_classes + 1
    self.loss_bbox_init = build_loss(loss_bbox_init)
    self.loss_bbox_refine = build_loss(loss_bbox_refine)
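# Quick standalone check (illustration only) of the deformable-conv base
# offsets built above for num_points=9: a 3x3 grid of (y, x) displacements in
# {-1, 0, 1}, flattened to length 18.
import numpy as np

dcn_kernel, dcn_pad = 3, 1
dcn_base = np.arange(-dcn_pad, dcn_pad + 1).astype(np.float64)
dcn_base_y = np.repeat(dcn_base, dcn_kernel)  # [-1,-1,-1, 0,0,0, 1,1,1]
dcn_base_x = np.tile(dcn_base, dcn_kernel)    # [-1, 0, 1, -1,0,1, -1,0,1]
dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape(-1)
print(dcn_base_offset.shape)  # (18,)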
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    # trans gt_masks to gt_obbs
    gt_obbs = gt_mask_bp_obbs_list(gt_masks)

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, gt_labels, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals (hbb assign)
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn[0].assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn[0].sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        # rbbox targets
        rbbox_targets = self.bbox_head.get_target(sampling_results,
                                                  gt_masks, gt_labels,
                                                  self.train_cfg.rcnn[0])
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *rbbox_targets)
        for name, value in loss_bbox.items():
            losses['s{}.{}'.format(0, name)] = value

        pos_is_gts = [res.pos_is_gt for res in sampling_results]
        roi_labels = rbbox_targets[0]
        with torch.no_grad():
            rotated_proposal_list = self.bbox_head.refine_rbboxes(
                roi2droi(rois), roi_labels, bbox_pred, pos_is_gts,
                img_meta)

    # assign gts and sample proposals (rbb assign)
    if self.with_rbbox:
        bbox_assigner = build_assigner(self.train_cfg.rcnn[1].assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn[1].sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            gt_obbs_best_roi = choose_best_Rroi_batch(gt_obbs[i])
            assign_result = bbox_assigner.assign(rotated_proposal_list[i],
                                                 gt_obbs_best_roi,
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                rotated_proposal_list[i],
                torch.from_numpy(gt_obbs_best_roi).float().to(
                    rotated_proposal_list[i].device),
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    if self.with_rbbox:
        # (batch_ind, x_ctr, y_ctr, w, h, angle)
        rrois = dbbox2roi([res.bboxes for res in sampling_results])
        # feat enlarge
        # rrois[:, 3] = rrois[:, 3] * 1.2
        # rrois[:, 4] = rrois[:, 4] * 1.4
        rrois[:, 3] = rrois[:, 3] * self.rbbox_roi_extractor.w_enlarge
        rrois[:, 4] = rrois[:, 4] * self.rbbox_roi_extractor.h_enlarge
        rbbox_feats = self.rbbox_roi_extractor(
            x[:self.rbbox_roi_extractor.num_inputs], rrois)
        if self.with_shared_head_rbbox:
            rbbox_feats = self.shared_head_rbbox(rbbox_feats)
        cls_score, rbbox_pred = self.rbbox_head(rbbox_feats)
        rbbox_targets = self.rbbox_head.get_target_rbbox(
            sampling_results, gt_obbs, gt_labels, self.train_cfg.rcnn[1])
        loss_rbbox = self.rbbox_head.loss(cls_score, rbbox_pred,
                                          *rbbox_targets)
        for name, value in loss_rbbox.items():
            losses['s{}.{}'.format(1, name)] = value

    return losses
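# Minimal sketch (with invented enlargement factors) of the rotated-RoI
# widening step above: rrois are (batch_ind, x_ctr, y_ctr, w, h, angle), and
# w/h are scaled before feature extraction so the extractor sees extra context.
import torch

rrois = torch.tensor([[0., 100., 80., 40., 20., 0.3]])
w_enlarge, h_enlarge = 1.2, 1.4  # hypothetical values of the extractor attrs
rrois[:, 3] = rrois[:, 3] * w_enlarge  # w: 40 -> 48
rrois[:, 4] = rrois[:, 4] * h_enlarge  # h: 20 -> 28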
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_bboxes_ignore,
                  gt_labels,
                  gt_masks=None,
                  proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
        losses.update(rpn_losses)

        proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
        proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        # the action head consumes both RoI features and the FPN features
        pred = self.action_head(bbox_feats,
                                x[:self.bbox_roi_extractor.num_inputs])
        loss_bbox = self.action_head.loss(pred, gt_labels)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
        mask_feats = self.mask_roi_extractor(
            x[:self.mask_roi_extractor.num_inputs], pos_rois)
        mask_pred = self.mask_head(mask_feats)

        mask_targets = self.mask_head.get_target(
            sampling_results, gt_masks, self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    return losses
def forward_train_vis(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None):
    losses = dict()

    # separate gt: the first `text_num` boxes of each image are text lines
    # (one per mask), the remaining ones are characters
    num_imgs = img.size(0)
    text_gt_bboxes = []
    text_gt_labels = []
    char_gt_bboxes = []
    char_gt_labels = []
    for img_i in range(num_imgs):
        text_num = gt_masks[img_i].shape[0]
        # text line gt
        text_gt_bboxes.append(gt_bboxes[img_i][:text_num])
        text_gt_labels.append(gt_labels[img_i][:text_num])
        # character gt
        char_gt_bboxes.append(gt_bboxes[img_i][text_num:])
        char_gt_labels.append(gt_labels[img_i][text_num:])

    x = self.extract_feat(img)

    # RPN forward and loss
    rpn_outs = self.rpn_head(x)
    stage_num = len(rpn_outs[0])

    # text line proposals
    text_rpn_outs = ([], [])
    for stage_i in range(stage_num):
        text_rpn_outs[0].append(rpn_outs[0][stage_i])
        text_rpn_outs[1].append(rpn_outs[1][stage_i])
    text_rpn_loss_inputs = text_rpn_outs + (text_gt_bboxes, img_meta,
                                            self.train_cfg.rpn)
    text_rpn_losses = self.rpn_head.loss(
        *text_rpn_loss_inputs,
        gt_bboxes_ignore=gt_bboxes_ignore,
        type='text')
    losses.update(text_rpn_losses)
    text_proposal_inputs = text_rpn_outs + (
        img_meta, self.train_cfg.text_rpn_proposal)
    text_proposal_list = self.rpn_head.get_bboxes(
        *text_proposal_inputs, type='text')

    # character proposals
    char_rpn_outs = ([], [])
    for stage_i in range(stage_num):
        char_rpn_outs[0].append(rpn_outs[2][stage_i])
        char_rpn_outs[1].append(rpn_outs[3][stage_i])
    char_rpn_loss_inputs = char_rpn_outs + (char_gt_bboxes, img_meta,
                                            self.train_cfg.rpn)
    char_rpn_losses = self.rpn_head.loss(
        *char_rpn_loss_inputs,
        gt_bboxes_ignore=gt_bboxes_ignore,
        type='char')
    losses.update(char_rpn_losses)
    char_proposal_inputs = char_rpn_outs + (
        img_meta, self.train_cfg.char_rpn_proposal)
    char_proposal_list = self.rpn_head.get_bboxes(
        *char_proposal_inputs, type='char')

    # assign gts and sample proposals
    bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
    bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                 context=self)
    if gt_bboxes_ignore is None:
        gt_bboxes_ignore = [None for _ in range(num_imgs)]
    text_sampling_results, char_sampling_results = [], []
    for i in range(num_imgs):
        # sample text line proposals
        text_assign_result = bbox_assigner.assign(text_proposal_list[i],
                                                  text_gt_bboxes[i],
                                                  gt_bboxes_ignore[i],
                                                  text_gt_labels[i])
        text_sampling_result = bbox_sampler.sample(
            text_assign_result,
            text_proposal_list[i],
            text_gt_bboxes[i],
            text_gt_labels[i],
            feats=[lvl_feat[i][None] for lvl_feat in x])
        text_sampling_results.append(text_sampling_result)
        # sample character proposals
        char_assign_result = bbox_assigner.assign(char_proposal_list[i],
                                                  char_gt_bboxes[i],
                                                  gt_bboxes_ignore[i],
                                                  char_gt_labels[i])
        char_sampling_result = bbox_sampler.sample(
            char_assign_result,
            char_proposal_list[i],
            char_gt_bboxes[i],
            char_gt_labels[i],
            feats=[lvl_feat[i][None] for lvl_feat in x])
        char_sampling_results.append(char_sampling_result)

    # text detection module
    text_rois = bbox2roi([res.bboxes for res in text_sampling_results])
    text_bbox_feats = self.text_bbox_roi_extractor(
        x[:self.text_bbox_roi_extractor.num_inputs], text_rois)
    text_cls_score, text_bbox_pred = self.text_bbox_head(text_bbox_feats)
    text_bbox_targets = self.text_bbox_head.get_target(
        text_sampling_results, text_gt_bboxes, text_gt_labels,
        self.train_cfg.rcnn)
    text_loss_bbox = self.text_bbox_head.loss(text_cls_score,
                                              text_bbox_pred,
                                              *text_bbox_targets,
                                              type='text')
    losses.update(text_loss_bbox)

    pos_rois = bbox2roi([res.pos_bboxes for res in text_sampling_results])
    text_mask_feats = self.text_mask_roi_extractor(
        x[:self.text_mask_roi_extractor.num_inputs], pos_rois)
    mask_pred = self.text_mask_head(text_mask_feats)
    mask_targets = self.text_mask_head.get_target(text_sampling_results,
                                                  gt_masks,
                                                  self.train_cfg.rcnn)
    pos_labels = torch.cat(
        [res.pos_gt_labels for res in text_sampling_results])
    loss_mask = self.text_mask_head.loss(mask_pred, mask_targets,
                                         pos_labels)
    losses.update(loss_mask)

    # character-based recognition module
    char_rois = bbox2roi([res.bboxes for res in char_sampling_results])
    char_bbox_feats = self.char_bbox_roi_extractor(
        x[:self.char_bbox_roi_extractor.num_inputs], char_rois)
    char_cls_score, char_bbox_pred = self.char_bbox_head(
        char_bbox_feats)  # the input may be a tuple
    char_bbox_targets = self.char_bbox_head.get_target(
        char_sampling_results, char_gt_bboxes, char_gt_labels,
        self.train_cfg.rcnn)
    char_loss_bbox = self.char_bbox_head.loss(char_cls_score,
                                              char_bbox_pred,
                                              *char_bbox_targets,
                                              type='char')
    losses.update(char_loss_bbox)

    return losses
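# Toy illustration of the gt split used above: the first `text_num` boxes of an
# image are text-line instances (one per mask), the rest are character boxes.
# Shapes are invented for the example.
import torch

gt_masks_i = torch.zeros(2, 32, 32)             # 2 text-line masks
gt_bboxes_i = torch.arange(20.).reshape(5, 4)   # 2 text lines + 3 chars
text_num = gt_masks_i.shape[0]

text_gt = gt_bboxes_i[:text_num]
char_gt = gt_bboxes_i[text_num:]
print(text_gt.shape, char_gt.shape)  # torch.Size([2, 4]) torch.Size([3, 4])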
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  gt_semantic_seg=None):
    # change the neck
    segm_pred, proto, x = self.extract_feat(img)
    losses = dict()

    cls_scores, bbox_preds, bbox_embeds = self.bbox_head(x)
    outs = (cls_scores, bbox_preds)
    loss_inputs = outs + (gt_bboxes, gt_labels, img_metas,
                          self.train_cfg.bbox_head)
    loss_bbox = self.bbox_head.loss(
        *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
    losses.update(loss_bbox)

    # proposal
    proposal_cfg = self.train_cfg.get('proposals')
    proposal_inputs = outs + (bbox_embeds, img_metas, proposal_cfg)
    proposal_list = self.bbox_head.get_proposals(*proposal_inputs)
    proposal_bbox, proposal_labels, proposal_embeds = map(
        list, zip(*proposal_list))

    # assign gts and sample proposals
    assigner = build_assigner(self.train_cfg.mask_head.assigner)
    sampler = build_sampler(self.train_cfg.mask_head.sampler, context=self)
    num_imgs = img.size(0)
    if gt_bboxes_ignore is None:
        gt_bboxes_ignore = [None for _ in range(num_imgs)]
    sampling_results = []
    for i in range(num_imgs):
        assign_result = assigner.assign(proposal_bbox[i], gt_bboxes[i],
                                        gt_bboxes_ignore[i], gt_labels[i])
        sampling_result = sampler.sample(
            assign_result,
            proposal_bbox[i],
            gt_bboxes[i],
            gt_labels[i],
            feats=[lvl_feat[i][None] for lvl_feat in x])
        sampling_results.append(sampling_result)

    # mask rois
    pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
    pos_gt_rois = bbox2roi([res.pos_gt_bboxes for res in sampling_results])
    pos_embed_list = [
        proposal_embeds[i][res.pos_inds]
        for i, res in enumerate(sampling_results)
    ]
    pos_embeds = torch.cat(pos_embed_list)
    assert pos_embeds.size(0) == pos_gt_rois.size(0)

    # crop the rois from proto_masks, then calculate the loss,
    # using pos_bboxes but not pos_gt_rois
    mask_targets = self.get_target(
        sampling_results, gt_masks,
        stride=self.train_cfg.proto.final_stride)
    mask_preds = self.extract_proto(
        proto,
        pos_rois,
        pos_embeds,
        stride=self.train_cfg.proto.stride,
        final_stride=self.train_cfg.proto.final_stride)

    # for segm
    # segm_targets = self.segm_target(segm_pred, gt_semantic_seg)
    loss_mask = self.mask_loss(mask_preds, mask_targets, segm_pred,
                               gt_semantic_seg)
    losses.update(loss_mask)
    return losses
def forward_train_pair(self,
                       img,
                       img_meta,
                       gt_bboxes,
                       gt_labels,
                       gt_bboxes_ignore=None,
                       gt_masks=None,
                       proposals=None):
    # img: (b, 2*c=6, h, w) -> (2*b, c, h, w); pairs are [0, 1], [2, 3], ...
    b, c, h, w = img.shape
    img = img.reshape(-1, c // 2, h, w)
    x = self.extract_feat(img)  # x: tuple of 5 feature levels

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        # [cls_score (per level, b,c,h,w), bbox_pred, shape_pred, loc_pred]
        rpn_outs = self.rpn_head(x)
        # keep only the even (train-image) entries for the RPN loss
        rpn_outs_half = []
        for outs_0 in rpn_outs:
            tmp = []
            for outs_1 in outs_0:
                tmp.append(outs_1[::2, :, :, :])
            rpn_outs_half.append(tmp)
        rpn_loss_inputs = tuple(rpn_outs_half) + (gt_bboxes, img_meta,
                                                  self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        # duplicate the train img_meta for each pair: 1,2,3 -> 1,1,2,2,3,3
        img_meta = [img_meta[i // 2] for i in range(2 * len(img_meta))]
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals
    # proposal_list: batch * [det_bboxes], det_bboxes: [n, 5] (x, y, x, y, score)

    # generate blank gt for the normal image of each pair
    gt_bboxes_ = []
    gt_labels_ = []
    for i in range(len(gt_bboxes)):
        gt_bboxes_.append(gt_bboxes[i])
        gt_bboxes_.append(
            torch.Tensor([[1, 1, 1, 1]]).to(gt_bboxes[i].device))
        gt_labels_.append(gt_labels[i])
        gt_labels_.append(torch.Tensor([[0]]).to(gt_labels[i].device))

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        assert img.size(0) % 2 == 0
        num_pairs = img.size(0) // 2
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_pairs):
            i_train = 2 * i
            i_normal = i_train + 1
            assign_result_train = bbox_assigner.assign(
                proposal_list[i_train], gt_bboxes_[i_train],
                gt_bboxes_ignore[i_train], gt_labels_[i_train])
            assign_result_normal = bbox_assigner.assign(
                proposal_list[i_normal], gt_bboxes_[i_normal],
                gt_bboxes_ignore[i_normal], gt_labels_[i_normal])
            sampling_result_train, sampling_result_normal = \
                bbox_sampler.pair_sample(
                    assign_result_train,
                    assign_result_normal,
                    proposal_list[i_train],
                    proposal_list[i_normal],
                    gt_bboxes_[i_train],
                    gt_labels_[i_train],
                    feats_train=[lvl_feat[i_train][None] for lvl_feat in x],
                    feats_normal=[lvl_feat[i_normal][None] for lvl_feat in x])
            sampling_results.append(sampling_result_train)
            sampling_results.append(sampling_result_normal)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # mask head forward and loss
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(
                        res.pos_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(
                        res.neg_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]
        mask_pred = self.mask_head(mask_feats)
        mask_targets = self.mask_head.get_target(sampling_results,
                                                 gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    return losses
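# Shape sketch for the pairing trick above (dummy tensors): a batch of stacked
# image pairs (B, 2*C, H, W) is reshaped so consecutive entries 2k, 2k+1 hold
# the "train" and "normal" image of pair k, and img_meta is duplicated to match.
import torch

img = torch.randn(2, 6, 64, 64)       # B=2 pairs, 2*C=6 channels
b, c, h, w = img.shape
img = img.reshape(-1, c // 2, h, w)   # -> (4, 3, 64, 64)
img_meta = ['m0', 'm1']
img_meta = [img_meta[i // 2] for i in range(2 * len(img_meta))]
print(img.shape, img_meta)  # torch.Size([4, 3, 64, 64]) ['m0', 'm0', 'm1', 'm1']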
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    if self.with_bbox:
        # assign gts and sample proposals
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

        # bbox head forward and loss
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)
        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

        # dense local regression head
        sampling_results = self._random_jitter(sampling_results, img_meta)
        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
        reg_feats = self.reg_roi_extractor(
            x[:self.reg_roi_extractor.num_inputs], pos_rois)
        if self.with_shared_head:
            reg_feats = self.shared_head(reg_feats)

        # accelerate training by subsampling positive RoIs
        max_sample_num_reg = self.train_cfg.rcnn.get('max_num_reg', 192)
        sample_idx = torch.randperm(
            reg_feats.shape[0])[:min(reg_feats.shape[0],
                                     max_sample_num_reg)]
        reg_feats = reg_feats[sample_idx]
        pos_gt_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        pos_gt_labels = pos_gt_labels[sample_idx]

        if not self.MASK_ON:
            # object detection branch
            reg_pred, reg_masks_pred = self.D2Det_head(reg_feats)
            reg_points, reg_targets, reg_masks = \
                self.D2Det_head.get_target(sampling_results)
            reg_targets = reg_targets[sample_idx]
            reg_points = reg_points[sample_idx]
            reg_masks = reg_masks[sample_idx]

            # decode predicted and target corners from the grid points
            x1 = reg_points[:, 0, :, :] - reg_pred[:, 0, :, :] * reg_points[:, 2, :, :]
            x2 = reg_points[:, 0, :, :] + reg_pred[:, 1, :, :] * reg_points[:, 2, :, :]
            y1 = reg_points[:, 1, :, :] - reg_pred[:, 2, :, :] * reg_points[:, 3, :, :]
            y2 = reg_points[:, 1, :, :] + reg_pred[:, 3, :, :] * reg_points[:, 3, :, :]
            pos_decoded_bbox_preds = torch.stack([x1, y1, x2, y2], dim=1)

            x1_1 = reg_points[:, 0, :, :] - reg_targets[:, 0, :, :]
            x2_1 = reg_points[:, 0, :, :] + reg_targets[:, 1, :, :]
            y1_1 = reg_points[:, 1, :, :] - reg_targets[:, 2, :, :]
            y2_1 = reg_points[:, 1, :, :] + reg_targets[:, 3, :, :]
            pos_decoded_target_preds = torch.stack(
                [x1_1, y1_1, x2_1, y2_1], dim=1)

            loss_reg = self.loss_roi_reg(
                pos_decoded_bbox_preds.permute(0, 2, 3, 1).reshape(-1, 4),
                pos_decoded_target_preds.permute(0, 2, 3, 1).reshape(-1, 4),
                weight=reg_masks.reshape(-1, 1))
            loss_mask = self.loss_roi_mask(
                reg_masks_pred.reshape(
                    -1, reg_masks.shape[2] * reg_masks.shape[3]),
                reg_masks.reshape(
                    -1, reg_masks.shape[2] * reg_masks.shape[3]))
            losses.update(dict(loss_reg=loss_reg, loss_mask=loss_mask))
        else:
            # instance segmentation branch
            reg_pred, reg_masks_pred, reg_instances_pred, reg_iou = \
                self.D2Det_head(reg_feats, pos_gt_labels)
            reg_points, reg_targets, reg_masks, reg_instances = \
                self.D2Det_head.get_target_mask(sampling_results, gt_masks,
                                                self.train_cfg.rcnn)
            reg_targets = reg_targets[sample_idx]
            reg_points = reg_points[sample_idx]
            reg_masks = reg_masks[sample_idx]
            reg_instances = reg_instances[sample_idx]

            x1 = reg_points[:, 0, :, :] - reg_pred[:, 0, :, :] * reg_points[:, 2, :, :]
            x2 = reg_points[:, 0, :, :] + reg_pred[:, 1, :, :] * reg_points[:, 2, :, :]
            y1 = reg_points[:, 1, :, :] - reg_pred[:, 2, :, :] * reg_points[:, 3, :, :]
            y2 = reg_points[:, 1, :, :] + reg_pred[:, 3, :, :] * reg_points[:, 3, :, :]
            pos_decoded_bbox_preds = torch.stack([x1, y1, x2, y2], dim=1)

            x1_1 = reg_points[:, 0, :, :] - reg_targets[:, 0, :, :]
            x2_1 = reg_points[:, 0, :, :] + reg_targets[:, 1, :, :]
            y1_1 = reg_points[:, 1, :, :] - reg_targets[:, 2, :, :]
            y2_1 = reg_points[:, 1, :, :] + reg_targets[:, 3, :, :]
            pos_decoded_target_preds = torch.stack(
                [x1_1, y1_1, x2_1, y2_1], dim=1)

            loss_reg = self.loss_roi_reg(
                pos_decoded_bbox_preds.permute(0, 2, 3, 1).reshape(-1, 4),
                pos_decoded_target_preds.permute(0, 2, 3, 1).reshape(-1, 4),
                weight=reg_masks.reshape(-1, 1))
            loss_mask = self.loss_roi_mask(
                reg_masks_pred.reshape(
                    -1, reg_masks.shape[1] * reg_masks.shape[2]),
                reg_masks.reshape(
                    -1, reg_masks.shape[1] * reg_masks.shape[2]))
            loss_instance = self.loss_roi_instance(reg_instances_pred,
                                                   reg_instances,
                                                   pos_gt_labels)
            reg_iou_targets = self.D2Det_head.get_target_maskiou(
                sampling_results, gt_masks,
                reg_instances_pred[pos_gt_labels > 0, pos_gt_labels],
                reg_instances, sample_idx)
            reg_iou_weights = ((reg_iou_targets > 0.1) &
                               (reg_iou_targets <= 1.0)).float()
            loss_iou = self.loss_iou(
                reg_iou[pos_gt_labels > 0, pos_gt_labels],
                reg_iou_targets,
                weight=reg_iou_weights)
            losses.update(
                dict(
                    loss_reg=loss_reg,
                    loss_mask=loss_mask,
                    loss_instance=loss_instance,
                    loss_iou=loss_iou))

    return losses
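# Numeric toy example (fabricated values) of the box decoding used above: at
# each RoI grid point, the head predicts (l, r, t, b) as fractions of the cell
# extent, and box corners are recovered from the point coordinates.
import torch

# reg_points channels: (px, py, cell_w, cell_h); reg_pred channels: (l, r, t, b)
reg_points = torch.tensor([10., 20., 4., 6.])
reg_pred = torch.tensor([0.5, 1.0, 0.25, 0.75])

x1 = reg_points[0] - reg_pred[0] * reg_points[2]  # 10 - 0.5 * 4 = 8
x2 = reg_points[0] + reg_pred[1] * reg_points[2]  # 10 + 1.0 * 4 = 14
y1 = reg_points[1] - reg_pred[2] * reg_points[3]  # 20 - 0.25 * 6 = 18.5
y2 = reg_points[1] + reg_pred[3] * reg_points[3]  # 20 + 0.75 * 6 = 24.5
print(x1.item(), y1.item(), x2.item(), y2.item())  # 8.0 18.5 14.0 24.5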
def __init__(self,
             num_classes=80,
             in_channels=(512, 1024, 512, 256, 256, 256),
             anchor_generator=dict(
                 type='SSDAnchorGenerator',
                 scale_major=False,
                 input_size=300,
                 strides=[8, 16, 32, 64, 100, 300],
                 ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
                 basesize_ratio_range=(0.1, 0.9)),
             background_label=None,
             bbox_coder=dict(
                 type='DeltaXYWHBBoxCoder',
                 target_means=[.0, .0, .0, .0],
                 target_stds=[1.0, 1.0, 1.0, 1.0],
             ),
             reg_decoded_bbox=False,
             train_cfg=None,
             test_cfg=None):
    super(AnchorHead, self).__init__()
    self.num_classes = num_classes
    self.in_channels = in_channels
    self.cls_out_channels = num_classes + 1  # add background class
    self.anchor_generator = build_anchor_generator(anchor_generator)
    num_anchors = self.anchor_generator.num_base_anchors

    reg_convs = []
    cls_convs = []
    for i in range(len(in_channels)):
        reg_convs.append(
            nn.Conv2d(
                in_channels[i],
                num_anchors[i] * 4,
                kernel_size=3,
                padding=1))
        cls_convs.append(
            nn.Conv2d(
                in_channels[i],
                num_anchors[i] * (num_classes + 1),
                kernel_size=3,
                padding=1))
    self.reg_convs = nn.ModuleList(reg_convs)
    self.cls_convs = nn.ModuleList(cls_convs)

    self.background_label = (
        num_classes if background_label is None else background_label)
    # background_label should be either 0 or num_classes
    assert (self.background_label == 0
            or self.background_label == num_classes)

    self.bbox_coder = build_bbox_coder(bbox_coder)
    self.reg_decoded_bbox = reg_decoded_bbox
    self.use_sigmoid_cls = False
    self.cls_focal_loss = False
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    # set sampling=False for anchor_target
    self.sampling = False
    if self.train_cfg:
        self.assigner = build_assigner(self.train_cfg.assigner)
        # SSD sampling=False, so use PseudoSampler
        sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)
    self.fp16_enabled = False
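# Back-of-the-envelope check (SSD300-like anchor counts, for illustration) of
# the per-level conv output channels created above: 4 box deltas per anchor and
# num_classes + 1 scores per anchor.
num_classes = 80
num_anchors = [4, 6, 6, 6, 4, 4]  # typical per-level anchor counts for SSD300
for i, n in enumerate(num_anchors):
    print(i, 'reg channels:', n * 4, 'cls channels:', n * (num_classes + 1))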
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_pids,
                  ref_img,
                  ref_bboxes,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    # RPN forward and loss
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # assign gts and sample proposals
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i], gt_pids[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                gt_pids[i],
                ref_bboxes[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # bbox_img_n = [res.bboxes.size(0) for res in sampling_results]
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    ref_x = self.extract_feat(ref_img)

    if self.with_prop_track:
        """Differences between this implementation and the paper
        `Detect to Track and Track to Detect`:
        1. Based on FPN Faster R-CNN instead of R-FCN, which means:
            a. Correlations are implemented on specific levels.
            b. Concatenated features for the TrackHead come from the
               backbone/neck.
        2. Training with positive proposals instead of only gts.
        """
        track_x = self.extract_corr_feat(x, ref_x)
        # TODO consider training use gt or (gt + props)[NOW]
        # TODO consider whether to include semantic consistence[NO]
        # TODO consider how to calculate the correlation features
        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
        prop_track_feats = self.prop_track_roi_extractor(
            track_x[:self.prop_track_roi_extractor.num_inputs], pos_rois)
        prop_cls, prop_reg = self.prop_track_head(prop_track_feats)
        prop_targets = self.prop_track_head.get_target(
            sampling_results, self.train_cfg.rcnn)
        loss_prop_track = self.prop_track_head.loss(
            prop_cls, prop_reg, *prop_targets)
        losses.update(loss_prop_track)

    if self.with_asso_track:
        """Associative tracking, based on appearance features."""
        ref_rois = bbox2roi(ref_bboxes)
        num_bbox_x = [res.bboxes.size(0) for res in sampling_results]
        num_bbox_ref_x = [res.size(0) for res in ref_bboxes]
        bbox_feats = self.asso_track_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        ref_bbox_feats = self.asso_track_roi_extractor(
            ref_x[:self.asso_track_roi_extractor.num_inputs], ref_rois)
        asso_probs = self.asso_track_head(bbox_feats, ref_bbox_feats,
                                          num_bbox_x, num_bbox_ref_x)
        asso_targets = self.asso_track_head.get_target(
            sampling_results, self.train_cfg.track)
        loss_asso_track = self.asso_track_head.loss(
            asso_probs, *asso_targets)
        losses.update(loss_asso_track)

    return losses
def __init__(self,
             num_classes,
             in_channels,
             feat_channels=256,
             approx_anchor_generator=dict(
                 type='AnchorGenerator',
                 octave_base_scale=8,
                 scales_per_octave=3,
                 ratios=[0.5, 1.0, 2.0],
                 strides=[4, 8, 16, 32, 64]),
             square_anchor_generator=dict(
                 type='AnchorGenerator',
                 ratios=[1.0],
                 scales=[8],
                 strides=[4, 8, 16, 32, 64]),
             anchor_coder=dict(
                 type='DeltaXYWHBBoxCoder',
                 target_means=[.0, .0, .0, .0],
                 target_stds=[1.0, 1.0, 1.0, 1.0]),
             bbox_coder=dict(
                 type='DeltaXYWHBBoxCoder',
                 target_means=[.0, .0, .0, .0],
                 target_stds=[1.0, 1.0, 1.0, 1.0]),
             reg_decoded_bbox=False,
             deform_groups=4,
             loc_filter_thr=0.01,
             train_cfg=None,
             test_cfg=None,
             loss_loc=dict(
                 type='FocalLoss',
                 use_sigmoid=True,
                 gamma=2.0,
                 alpha=0.25,
                 loss_weight=1.0),
             loss_shape=dict(
                 type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
             loss_cls=dict(
                 type='CrossEntropyLoss', use_sigmoid=True,
                 loss_weight=1.0),
             loss_bbox=dict(
                 type='SmoothL1Loss', beta=1.0,
                 loss_weight=1.0)):  # yapf: disable
    super(AnchorHead, self).__init__()
    self.in_channels = in_channels
    self.num_classes = num_classes
    self.feat_channels = feat_channels
    self.deform_groups = deform_groups
    self.loc_filter_thr = loc_filter_thr

    # build approx_anchor_generator and square_anchor_generator
    assert (approx_anchor_generator['octave_base_scale'] ==
            square_anchor_generator['scales'][0])
    assert (approx_anchor_generator['strides'] ==
            square_anchor_generator['strides'])
    self.approx_anchor_generator = build_anchor_generator(
        approx_anchor_generator)
    self.square_anchor_generator = build_anchor_generator(
        square_anchor_generator)
    self.approxs_per_octave = self.approx_anchor_generator \
        .num_base_anchors[0]

    self.reg_decoded_bbox = reg_decoded_bbox

    # one anchor per location
    self.num_anchors = 1
    self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
    self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
    self.sampling = loss_cls['type'] not in ['FocalLoss']
    self.ga_sampling = train_cfg is not None and hasattr(
        train_cfg, 'ga_sampler')
    if self.use_sigmoid_cls:
        self.cls_out_channels = self.num_classes
    else:
        self.cls_out_channels = self.num_classes + 1

    # build bbox_coder
    self.anchor_coder = build_bbox_coder(anchor_coder)
    self.bbox_coder = build_bbox_coder(bbox_coder)

    # build losses
    self.loss_loc = build_loss(loss_loc)
    self.loss_shape = build_loss(loss_shape)
    self.loss_cls = build_loss(loss_cls)
    self.loss_bbox = build_loss(loss_bbox)

    self.train_cfg = train_cfg
    self.test_cfg = test_cfg

    if self.train_cfg:
        self.assigner = build_assigner(self.train_cfg.assigner)
        # use PseudoSampler when sampling is False
        if self.sampling and hasattr(self.train_cfg, 'sampler'):
            sampler_cfg = self.train_cfg.sampler
        else:
            sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

        self.ga_assigner = build_assigner(self.train_cfg.ga_assigner)
        if self.ga_sampling:
            ga_sampler_cfg = self.train_cfg.ga_sampler
        else:
            ga_sampler_cfg = dict(type='PseudoSampler')
        self.ga_sampler = build_sampler(ga_sampler_cfg, context=self)

    self.fp16_enabled = False
    self._init_layers()
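# Tiny sketch of the sampler-selection rule above: a real sampler is only built
# when the classification loss actually samples (i.e. it is not a FocalLoss);
# otherwise a PseudoSampler is used as a no-op stand-in. Plain dicts here
# approximate the mmcv Config objects used in the real code.
loss_cls = dict(type='FocalLoss')
train_cfg = dict(sampler=dict(type='RandomSampler'))

sampling = loss_cls['type'] not in ['FocalLoss']
if sampling and 'sampler' in train_cfg:
    sampler_cfg = train_cfg['sampler']
else:
    sampler_cfg = dict(type='PseudoSampler')
print(sampler_cfg)  # {'type': 'PseudoSampler'}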
def forward_train(
        self,
        img,
        img_meta,
        gt_bboxes,
        gt_labels,
        gt_bboxes_ignore=None,
        gt_masks=None,
        gt_semantic_seg=None,
        gt_semantic_seg_Nx=None,
        proposals=None,
        ref_img=None,  # images of reference frame
        ref_bboxes=None,  # gt bboxes of reference frame
        ref_labels=None,
        ref_masks=None,
        ref_semantic_seg=None,
        ref_semantic_seg_Nx=None,
        ref_obj_ids=None,
        gt_pids=None,  # gt ids of target objs mapped to reference objs
        gt_obj_ids=None,
        # gt_flow=None,
):
    losses = dict()

    # ********************************
    # Initial Flow and Feature Warping
    # ********************************
    flowR2T, _ = self.compute_flow(
        img.clone(), ref_img.clone(), scale_factor=0.25)
    x = self.extract_feat(img)
    ref_x = self.extract_feat(ref_img)
    x = self.extra_neck(x, ref_x, flowR2T)

    # **********************************
    # FCN Semantic Head forward and loss
    # **********************************
    if hasattr(self, 'panopticFPN') and self.panopticFPN is not None:
        # semantic FCN GT
        gt_semantic_seg = gt_semantic_seg.long()
        gt_semantic_seg = gt_semantic_seg.squeeze(1)
        fcn_output, fcn_score = self.panopticFPN(
            x[0:self.panopticFPN.num_levels])
        loss_fcn = F.cross_entropy(
            fcn_output, gt_semantic_seg, ignore_index=255)
        loss_fcn = {'loss_segm': loss_fcn}
        losses.update(loss_fcn)

    # ***************************
    # RPN forward and loss
    # ***************************
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(
            *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # *******************************
    # assign gts and sample proposals
    # *******************************
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(
            self.train_cfg.rcnn.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

    # *******************************
    # bbox head forward and loss
    # *******************************
    if self.with_bbox:
        rois = bbox2roi([res.bboxes for res in sampling_results])
        # TODO: a more flexible decision which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)

        bbox_targets = self.bbox_head.get_target(
            sampling_results, gt_bboxes, gt_labels, self.train_cfg.rcnn)
        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred,
                                        *bbox_targets)
        losses.update(loss_bbox)

    # *******************************
    # mask head forward and loss
    # *******************************
    if self.with_mask:
        if not self.share_roi_extractor:
            pos_rois = bbox2roi(
                [res.pos_bboxes for res in sampling_results])
            mask_feats = self.mask_roi_extractor(
                x[:self.mask_roi_extractor.num_inputs], pos_rois)
            if self.with_shared_head:
                mask_feats = self.shared_head(mask_feats)
        else:
            pos_inds = []
            device = bbox_feats.device
            for res in sampling_results:
                pos_inds.append(
                    torch.ones(
                        res.pos_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
                pos_inds.append(
                    torch.zeros(
                        res.neg_bboxes.shape[0],
                        device=device,
                        dtype=torch.uint8))
            pos_inds = torch.cat(pos_inds)
            mask_feats = bbox_feats[pos_inds]
        mask_pred = self.mask_head(mask_feats)
        mask_targets = self.mask_head.get_target(
            sampling_results, gt_masks, self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])
        loss_mask = self.mask_head.loss(mask_pred, mask_targets,
                                        pos_labels)
        losses.update(loss_mask)

    # ***************************************
    # PANOPTIC HEAD - Only for BATCH SIZE: 1
    # ***************************************
    if hasattr(self.train_cfg, 'loss_pano_weight'):
        # extract gt rois for the panoptic head
        gt_rois = bbox2roi(gt_bboxes)  # [#bbox, 5]
        cls_idx = gt_labels[0]  # [#bbox]; batch size must be 1
        # fcn_score: [1, 20, 200, 400]
        # compute mask logits with gt rois
        mask_feats = self.mask_roi_extractor(
            x[:self.mask_roi_extractor.num_inputs],
            gt_rois)  # [#bbox, 256, 14, 14]
        mask_score = self.mask_head(
            mask_feats)  # [#bbox, #things+1, 28, 28], #things+1 = 9
        nobj, _, H, W = mask_score.shape
        mask_score = mask_score.gather(
            1, cls_idx.view(-1, 1, 1, 1).expand(
                -1, -1, H, W))  # [#bbox, 1, 28, 28]

        # compute panoptic logits
        seg_stuff_logits, seg_inst_logits = self.seg_term(
            cls_idx, fcn_score, gt_rois)
        mask_logits = self.mask_term(mask_score, gt_rois, cls_idx,
                                     fcn_score)
        # panoptic_logits: [1, #stuff+#bbox, 200, 400]
        panoptic_logits = torch.cat(
            [seg_stuff_logits, (seg_inst_logits + mask_logits)], dim=1)

        # generate gt for the panoptic head
        # added for panoptic gt generation: gt_masks_4x
        gt_masks_4x = gt_masks[0][:, ::4, ::4]
        with torch.no_grad():
            # gt_semantic_seg_Nx[0]: [1, 200, 400]
            # gt_masks_4x: [#bbox, 200, 400]
            panoptic_gt = self.mask_matching(gt_semantic_seg_Nx[0],
                                             gt_masks_4x)
        panoptic_gt = panoptic_gt.long()
        panoptic_loss = F.cross_entropy(
            panoptic_logits, panoptic_gt, ignore_index=255)
        pano_loss = {
            'loss_pano': panoptic_loss * self.train_cfg.loss_pano_weight
        }
        losses.update(pano_loss)

    return losses
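# Standalone illustration (random data) of the gather used in the panoptic head
# above: from per-class mask logits [N, C, H, W], keep only the channel of each
# RoI's gt class, giving [N, 1, H, W].
import torch

N, C, H, W = 3, 9, 28, 28
mask_score = torch.randn(N, C, H, W)
cls_idx = torch.tensor([2, 0, 5])  # gt class per RoI
picked = mask_score.gather(1, cls_idx.view(-1, 1, 1, 1).expand(-1, -1, H, W))
print(picked.shape)  # torch.Size([3, 1, 28, 28])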
def forward_train(self,
                  imgs,
                  img_meta,
                  imgs_2,
                  img_meta_2,
                  gt_bboxes,
                  gt_bboxes_2,
                  gt_labels,
                  gt_labels_2,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  gt_masks_2=None,
                  proposals=None):
    # self.print_iterations()
    assert imgs.shape[1] == 3 and imgs_2.shape[1] == 3  # channel size must be 3
    x = self.extract_feat(imgs)
    x_2 = self.extract_feat(imgs_2)
    losses = dict()

    # RPN forward and loss for both resolutions
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_outs_2 = self.rpn_head(x_2)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_loss_inputs_2 = rpn_outs_2 + (gt_bboxes_2, img_meta,
                                          self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore,
                                        iteration=self.iteration)
        rpn_losses_2 = self.rpn_head.loss(*rpn_loss_inputs_2,
                                          gt_bboxes_ignore=gt_bboxes_ignore,
                                          iteration=self.iteration,
                                          img_meta_2=img_meta_2)
        losses.update(rpn_losses)
        losses.update(rpn_losses_2)
        proposal_inputs = rpn_outs + (img_meta, self.train_cfg.rpn_proposal)
        proposal_inputs_2 = rpn_outs_2 + (img_meta,
                                          self.train_cfg.rpn_proposal)
        proposal_list, anchors = self.rpn_head.get_bboxes(*proposal_inputs)
        proposal_list_2, anchors_2 = self.rpn_head.get_bboxes(
            *proposal_inputs_2, img_meta_2=img_meta_2)
        # self.rpn_head.visualize_anchor_boxes(imgs, rpn_outs[0], img_meta, slice_num=45, shuffle=True)  # debug only
        # self.visualize_proposals(imgs, proposal_list, gt_bboxes, img_meta, slice_num=None, isProposal=True)  # debug only
        # self.visualize_proposals(imgs, anchors, gt_bboxes, img_meta, slice_num=None, isProposal=False)  # debug only
        # self.visualize_gt_bboxes(imgs, gt_bboxes, img_meta)  # debug only
        # self.visualize_gt_bboxes(imgs_2, gt_bboxes_2, img_meta_2)  # debug only
        # self.visualize_gt_bboxes_masks(imgs, gt_bboxes, img_meta, gt_masks)  # debug only
    else:
        proposal_list = proposals

    # assign gts and sample proposals for both resolutions
    if self.with_bbox or self.with_mask:
        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        num_imgs = imgs.size(0)
        # keep user-provided ignore boxes instead of discarding them
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

        bbox_assigner_2 = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler_2 = build_sampler(self.train_cfg.rcnn.sampler,
                                       context=self)
        num_imgs_2 = imgs_2.size(0)
        gt_bboxes_ignore_2 = [None for _ in range(num_imgs_2)]
        sampling_results_2 = []
        for i in range(num_imgs_2):
            assign_result_2 = bbox_assigner_2.assign(proposal_list_2[i],
                                                     gt_bboxes_2[i],
                                                     gt_bboxes_ignore_2[i],
                                                     gt_labels_2[i])
            sampling_result_2 = bbox_sampler_2.sample(
                assign_result_2,
                proposal_list_2[i],
                gt_bboxes_2[i],
                gt_labels_2[i],
                feats=[lvl_feat[i][None] for lvl_feat in x_2])
            sampling_results_2.append(sampling_result_2)

    # bbox head forward and loss
    if self.with_bbox:
        rois = bbox2roi3D([res.bboxes for res in sampling_results])
        rois_2 = bbox2roi3D([res.bboxes for res in sampling_results_2])
        # TODO: a more flexible way to decide which feature maps to use
        bbox_feats = self.bbox_roi_extractor(
            x[:self.bbox_roi_extractor.num_inputs], rois)
        bbox_feats_2 = self.bbox_roi_extractor(
            x_2[:self.bbox_roi_extractor.num_inputs], rois_2)
        cls_score, bbox_pred = self.bbox_head(bbox_feats)
        cls_score_2, bbox_pred_2 = self.bbox_head(bbox_feats_2)
        cls_score_combined = torch.cat((cls_score, cls_score_2), 0)
        bbox_pred_combined = torch.cat((bbox_pred, bbox_pred_2), 0)
        bbox_targets = self.bbox_head.get_target(sampling_results,
                                                 gt_bboxes, gt_labels,
                                                 self.train_cfg.rcnn)
        bbox_targets_2 = self.bbox_head.get_target(sampling_results_2,
                                                   gt_bboxes_2, gt_labels_2,
                                                   self.train_cfg.rcnn)
        bbox_targets_combined = tuple(
            torch.cat((t, t_2), 0)
            for t, t_2 in zip(bbox_targets, bbox_targets_2))
        loss_bbox = self.bbox_head.loss(cls_score_combined,
                                        bbox_pred_combined,
                                        *bbox_targets_combined)
        losses.update(loss_bbox)

        # prepare upscaled data for the refinement head
        upscaled_factor = (img_meta_2[0]['ori_shape'][0] /
                           img_meta[0]['ori_shape'][0])
        # convert parameterized adjustments to actual bbox coordinates
        pred_bboxes_2 = self.bbox_head.convert_adjustments_to_bboxes(
            rois_2, bbox_pred_2,
            img_meta_2[0]['img_shape'])[:, 6:].cpu().detach().numpy(
            ) / upscaled_factor
        pred_cls_score_2 = cls_score_2[:, 1, None].cpu().detach().numpy()
        pred_bboxes_2 = np.concatenate((pred_bboxes_2, pred_cls_score_2),
                                       axis=1)
        pred_bboxes_2 = [torch.from_numpy(pred_bboxes_2).cuda()]

        bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner)
        bbox_sampler = build_sampler(self.train_cfg.rcnn.sampler,
                                     context=self)
        sampling_results_refinement = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(pred_bboxes_2[i],
                                                 gt_bboxes[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                pred_bboxes_2[i],
                gt_bboxes[i],
                gt_labels[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results_refinement.append(sampling_result)
        rois_refinement = bbox2roi3D(
            [res.bboxes for res in sampling_results_refinement])
        bbox_feats_refinement = self.bbox_roi_extractor_refinement(
            x[:self.bbox_roi_extractor_refinement.num_inputs],
            rois_refinement)

        # train the refinement head
        refined_bbox_pred = self.refinement_head(bbox_feats_refinement)
        bbox_targets_refinement = self.refinement_head.get_target(
            sampling_results_refinement, gt_bboxes, gt_labels,
            self.train_cfg.rcnn)
        loss_refinement = self.refinement_head.loss(
            refined_bbox_pred, *bbox_targets_refinement)
        losses.update(loss_refinement)

    # mask head forward and loss
    if self.with_mask:
        pos_rois = bbox2roi3D([res.pos_bboxes for res in sampling_results])
        mask_feats = self.mask_roi_extractor(
            x[:self.mask_roi_extractor.num_inputs], pos_rois)
        mask_pred = self.mask_head(mask_feats)
        mask_targets = self.mask_head.get_target(sampling_results, gt_masks,
                                                 self.train_cfg.rcnn)
        pos_labels = torch.cat(
            [res.pos_gt_labels for res in sampling_results])

        pos_rois_refined = bbox2roi3D(
            [res.pos_bboxes for res in sampling_results_refinement])
        mask_feats_refined = self.mask_roi_extractor(
            x[:self.mask_roi_extractor.num_inputs], pos_rois_refined)
        mask_pred_refined = self.mask_head(mask_feats_refined)
        mask_targets_refined = self.mask_head.get_target(
            sampling_results_refinement, gt_masks, self.train_cfg.rcnn)
        pos_labels_refined = torch.cat(
            [res.pos_gt_labels for res in sampling_results_refinement])

        mask_pred_combined = torch.cat((mask_pred, mask_pred_refined))
        mask_targets_combined = torch.cat(
            (mask_targets, mask_targets_refined))
        pos_labels_combined = torch.cat((pos_labels, pos_labels_refined))
        loss_mask = self.mask_head.loss(mask_pred_combined,
                                        mask_targets_combined,
                                        pos_labels_combined)
        losses.update(loss_mask)

    # self.save_losses_plt(losses)  # debug only
    self.iteration += 1
    return losses
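# Hedged sketch (not part of the detector above): how an mmdet-style trainer
# typically reduces the `losses` dict returned by forward_train() to a single
# scalar for backprop. `parse_losses` is a hypothetical standalone helper
# mirroring mmdet's BaseDetector._parse_losses; names here are illustrative.
import torch

def parse_losses(losses):
    log_vars = {}
    for name, value in losses.items():
        if isinstance(value, torch.Tensor):
            log_vars[name] = value.mean()
        elif isinstance(value, list):
            # some heads return one tensor per feature level
            log_vars[name] = sum(v.mean() for v in value)
        else:
            raise TypeError(f'{name} is not a tensor or list of tensors')
    # only entries whose key contains 'loss' contribute to the objective;
    # the rest (e.g. accuracy) are logged but not optimized
    total_loss = sum(v for k, v in log_vars.items() if 'loss' in k)
    return total_loss, log_vars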
def __init__(self,
             in_channels,
             feat_channels,
             out_channels,
             num_things_classes=80,
             num_stuff_classes=53,
             num_queries=100,
             pixel_decoder=None,
             enforce_decoder_input_project=False,
             transformer_decoder=None,
             positional_encoding=None,
             loss_cls=dict(type='CrossEntropyLoss',
                           use_sigmoid=False,
                           loss_weight=1.0,
                           class_weight=[1.0] * 133 + [0.1]),
             loss_mask=dict(type='FocalLoss',
                            use_sigmoid=True,
                            gamma=2.0,
                            alpha=0.25,
                            loss_weight=20.0),
             loss_dice=dict(type='DiceLoss',
                            use_sigmoid=True,
                            activate=True,
                            naive_dice=True,
                            loss_weight=1.0),
             train_cfg=None,
             test_cfg=None,
             init_cfg=None,
             **kwargs):
    super(AnchorFreeHead, self).__init__(init_cfg)
    self.num_things_classes = num_things_classes
    self.num_stuff_classes = num_stuff_classes
    self.num_classes = self.num_things_classes + self.num_stuff_classes
    self.num_queries = num_queries

    pixel_decoder.update(in_channels=in_channels,
                         feat_channels=feat_channels,
                         out_channels=out_channels)
    self.pixel_decoder = build_plugin_layer(pixel_decoder)[1]
    self.transformer_decoder = build_transformer_layer_sequence(
        transformer_decoder)
    self.decoder_embed_dims = self.transformer_decoder.embed_dims
    pixel_decoder_type = pixel_decoder.get('type')
    if pixel_decoder_type == 'PixelDecoder' and (
            self.decoder_embed_dims != in_channels[-1]
            or enforce_decoder_input_project):
        self.decoder_input_proj = Conv2d(in_channels[-1],
                                         self.decoder_embed_dims,
                                         kernel_size=1)
    else:
        self.decoder_input_proj = nn.Identity()
    self.decoder_pe = build_positional_encoding(positional_encoding)
    self.query_embed = nn.Embedding(self.num_queries, out_channels)

    self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
    self.mask_embed = nn.Sequential(
        nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
        nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
        nn.Linear(feat_channels, out_channels))

    self.test_cfg = test_cfg
    self.train_cfg = train_cfg
    if train_cfg:
        self.assigner = build_assigner(train_cfg.assigner)
        self.sampler = build_sampler(train_cfg.sampler, context=self)

    self.class_weight = loss_cls.class_weight
    self.loss_cls = build_loss(loss_cls)
    self.loss_mask = build_loss(loss_mask)
    self.loss_dice = build_loss(loss_dice)
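# Hedged sketch: what the mask_embed MLP built above is for, shown standalone.
# In MaskFormer-style heads, per-query embeddings are dotted with per-pixel
# features to form mask logits; shapes and names below are illustrative.
import torch
import torch.nn as nn

feat_channels, out_channels, num_queries = 256, 256, 100
mask_embed = nn.Sequential(
    nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
    nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
    nn.Linear(feat_channels, out_channels))
query_feats = torch.randn(num_queries, feat_channels)  # decoder outputs
per_pixel_feats = torch.randn(out_channels, 32, 32)    # pixel decoder outputs
mask_logits = torch.einsum('qc,chw->qhw', mask_embed(query_feats),
                           per_pixel_feats)
assert mask_logits.shape == (num_queries, 32, 32)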
def __init__(self,
             num_classes,
             in_channels,
             stacked_convs=4,
             feat_channels=256,
             approx_anchor_generator=dict(type='AnchorGenerator',
                                          octave_base_scale=4,
                                          scales_per_octave=3,
                                          ratios=[0.5, 1.0, 2.0],
                                          strides=[8, 16, 32, 64, 128]),
             square_anchor_generator=dict(type='AnchorGenerator',
                                          ratios=[1.0],
                                          scales=[4],
                                          strides=[8, 16, 32, 64, 128]),
             conv_cfg=None,
             norm_cfg=None,
             bbox_coder=dict(type='BucketingBBoxCoder',
                             num_buckets=14,
                             scale_factor=3.0),
             reg_decoded_bbox=False,
             background_label=None,
             train_cfg=None,
             test_cfg=None,
             loss_cls=dict(type='FocalLoss',
                           use_sigmoid=True,
                           gamma=2.0,
                           alpha=0.25,
                           loss_weight=1.0),
             loss_bbox_cls=dict(type='CrossEntropyLoss',
                                use_sigmoid=True,
                                loss_weight=1.5),
             loss_bbox_reg=dict(type='SmoothL1Loss',
                                beta=1.0 / 9.0,
                                loss_weight=1.5)):
    super(SABLRetinaHead, self).__init__()
    self.in_channels = in_channels
    self.num_classes = num_classes
    self.feat_channels = feat_channels
    self.num_buckets = bbox_coder['num_buckets']
    self.side_num = int(np.ceil(self.num_buckets / 2))

    assert (approx_anchor_generator['octave_base_scale'] ==
            square_anchor_generator['scales'][0])
    assert (approx_anchor_generator['strides'] ==
            square_anchor_generator['strides'])

    self.approx_anchor_generator = build_anchor_generator(
        approx_anchor_generator)
    self.square_anchor_generator = build_anchor_generator(
        square_anchor_generator)
    self.approxs_per_octave = (
        self.approx_anchor_generator.num_base_anchors[0])

    # one anchor per location
    self.num_anchors = 1
    self.stacked_convs = stacked_convs
    self.conv_cfg = conv_cfg
    self.norm_cfg = norm_cfg

    self.reg_decoded_bbox = reg_decoded_bbox
    self.background_label = (
        num_classes if background_label is None else background_label)

    self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
    self.sampling = loss_cls['type'] not in [
        'FocalLoss', 'GHMC', 'QualityFocalLoss'
    ]
    if self.use_sigmoid_cls:
        self.cls_out_channels = num_classes
    else:
        self.cls_out_channels = num_classes + 1

    self.bbox_coder = build_bbox_coder(bbox_coder)
    self.loss_cls = build_loss(loss_cls)
    self.loss_bbox_cls = build_loss(loss_bbox_cls)
    self.loss_bbox_reg = build_loss(loss_bbox_reg)

    self.train_cfg = train_cfg
    self.test_cfg = test_cfg

    if self.train_cfg:
        self.assigner = build_assigner(self.train_cfg.assigner)
        # use PseudoSampler when sampling is False
        if self.sampling and hasattr(self.train_cfg, 'sampler'):
            sampler_cfg = self.train_cfg.sampler
        else:
            sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

    self.fp16_enabled = False
    self._init_layers()
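# Hedged sketch: the side-aware bucketing arithmetic used above, in isolation.
# With the default num_buckets=14, each side of a box is covered by
# ceil(14 / 2) = 7 buckets, so the head predicts 7 bucket logits per side.
import numpy as np

num_buckets = 14
side_num = int(np.ceil(num_buckets / 2))
assert side_num == 7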
def __init__(self,
             num_classes,
             in_channels,
             num_fcs=2,
             transformer=dict(type='Transformer',
                              embed_dims=256,
                              num_heads=8,
                              num_encoder_layers=6,
                              num_decoder_layers=6,
                              feedforward_channels=2048,
                              dropout=0.1,
                              act_cfg=dict(type='ReLU', inplace=True),
                              norm_cfg=dict(type='LN'),
                              num_fcs=2,
                              pre_norm=False,
                              return_intermediate_dec=True),
             positional_encoding=dict(type='SinePositionalEncoding',
                                      num_feats=128,
                                      normalize=True),
             loss_cls=dict(type='CrossEntropyLoss',
                           bg_cls_weight=0.1,
                           use_sigmoid=False,
                           loss_weight=1.0,
                           class_weight=1.0),
             loss_bbox=dict(type='L1Loss', loss_weight=5.0),
             loss_iou=dict(type='GIoULoss', loss_weight=2.0),
             train_cfg=dict(assigner=dict(
                 type='HungarianAssigner',
                 cls_weight=1.,
                 bbox_weight=5.,
                 iou_weight=2.,
                 iou_calculator=dict(type='BboxOverlaps2D'),
                 iou_mode='giou')),
             test_cfg=dict(max_per_img=100),
             **kwargs):
    # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
    # since it brings inconvenience when the initialization of
    # `AnchorFreeHead` is called.
    super(AnchorFreeHead, self).__init__()
    use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
    assert not use_sigmoid_cls, 'setting use_sigmoid_cls as True is ' \
        'not supported in DETR, since background is needed for the ' \
        'matching process.'
    assert 'embed_dims' in transformer \
        and 'num_feats' in positional_encoding
    num_feats = positional_encoding['num_feats']
    embed_dims = transformer['embed_dims']
    assert num_feats * 2 == embed_dims, 'embed_dims should' \
        f' be exactly 2 times of num_feats. Found {embed_dims}' \
        f' and {num_feats}.'
    assert test_cfg is not None and 'max_per_img' in test_cfg

    self.bg_cls_weight = 0  # default; overwritten below if class_weight set
    class_weight = loss_cls.get('class_weight', None)
    if class_weight is not None:
        assert isinstance(class_weight, float), 'Expected ' \
            'class_weight to have type float. Found ' \
            f'{type(class_weight)}.'
        # NOTE following the official DETR repo, bg_cls_weight means
        # relative classification weight of the no-object class.
        bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
        assert isinstance(bg_cls_weight, float), 'Expected ' \
            'bg_cls_weight to have type float. Found ' \
            f'{type(bg_cls_weight)}.'
        class_weight = torch.ones(num_classes + 1) * class_weight
        # set background class as the last index
        class_weight[num_classes] = bg_cls_weight
        loss_cls.update({'class_weight': class_weight})
        if 'bg_cls_weight' in loss_cls:
            loss_cls.pop('bg_cls_weight')
        self.bg_cls_weight = bg_cls_weight

    if train_cfg:
        assert 'assigner' in train_cfg, 'assigner should be provided ' \
            'when train_cfg is set.'
        assigner = train_cfg['assigner']
        assert loss_cls['loss_weight'] == assigner['cls_weight'], \
            'The classification weight for loss and matcher should be ' \
            'exactly the same.'
        assert loss_bbox['loss_weight'] == assigner['bbox_weight'], \
            'The regression L1 weight for loss and matcher should be ' \
            'exactly the same.'
        assert loss_iou['loss_weight'] == assigner['iou_weight'], \
            'The regression iou weight for loss and matcher should be ' \
            'exactly the same.'
        self.assigner = build_assigner(assigner)
        # DETR sampling=False, so use PseudoSampler
        sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

    self.num_classes = num_classes
    self.cls_out_channels = num_classes + 1
    self.in_channels = in_channels
    self.num_fcs = num_fcs
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    self.use_sigmoid_cls = use_sigmoid_cls
    self.embed_dims = embed_dims
    self.num_query = test_cfg['max_per_img']
    self.fp16_enabled = False
    self.loss_cls = build_loss(loss_cls)
    self.loss_bbox = build_loss(loss_bbox)
    self.loss_iou = build_loss(loss_iou)
    self.act_cfg = transformer.get('act_cfg',
                                   dict(type='ReLU', inplace=True))
    self.activate = build_activation_layer(self.act_cfg)
    self.positional_encoding = build_positional_encoding(
        positional_encoding)
    self.transformer = build_transformer(transformer)
    self._init_layers()
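# Hedged example: a train_cfg whose matcher weights satisfy the consistency
# asserts in the constructor above (each loss weight must equal the matching
# HungarianAssigner cost weight). Values are the defaults shown above.
loss_cls_weight, loss_bbox_weight, loss_iou_weight = 1.0, 5.0, 2.0
train_cfg = dict(
    assigner=dict(type='HungarianAssigner',
                  cls_weight=loss_cls_weight,    # == loss_cls['loss_weight']
                  bbox_weight=loss_bbox_weight,  # == loss_bbox['loss_weight']
                  iou_weight=loss_iou_weight,    # == loss_iou['loss_weight']
                  iou_calculator=dict(type='BboxOverlaps2D'),
                  iou_mode='giou'))
assert train_cfg['assigner']['cls_weight'] == loss_cls_weight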
def __init__(self,
             num_classes,
             in_channels,
             feat_channels=256,
             stacked_convs=2,
             strides=[8, 16, 32],
             use_depthwise=False,
             dcn_on_last_conv=False,
             conv_bias='auto',
             conv_cfg=None,
             norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
             act_cfg=dict(type='Swish'),
             loss_cls=dict(type='CrossEntropyLoss',
                           use_sigmoid=True,
                           reduction='sum',
                           loss_weight=1.0),
             loss_bbox=dict(type='IoULoss',
                            mode='square',
                            eps=1e-16,
                            reduction='sum',
                            loss_weight=5.0),
             loss_obj=dict(type='CrossEntropyLoss',
                           use_sigmoid=True,
                           reduction='sum',
                           loss_weight=1.0),
             loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
             train_cfg=None,
             test_cfg=None,
             init_cfg=dict(type='Kaiming',
                           layer='Conv2d',
                           a=math.sqrt(5),
                           distribution='uniform',
                           mode='fan_in',
                           nonlinearity='leaky_relu')):
    super().__init__(init_cfg=init_cfg)
    self.num_classes = num_classes
    self.cls_out_channels = num_classes
    self.in_channels = in_channels
    self.feat_channels = feat_channels
    self.stacked_convs = stacked_convs
    self.strides = strides
    self.use_depthwise = use_depthwise
    self.dcn_on_last_conv = dcn_on_last_conv
    assert conv_bias == 'auto' or isinstance(conv_bias, bool)
    self.conv_bias = conv_bias
    self.use_sigmoid_cls = True

    self.conv_cfg = conv_cfg
    self.norm_cfg = norm_cfg
    self.act_cfg = act_cfg

    self.loss_cls = build_loss(loss_cls)
    self.loss_bbox = build_loss(loss_bbox)
    self.loss_obj = build_loss(loss_obj)

    self.use_l1 = False  # This flag will be modified by hooks.
    self.loss_l1 = build_loss(loss_l1)

    self.prior_generator = MlvlPointGenerator(strides, offset=0)

    self.test_cfg = test_cfg
    self.train_cfg = train_cfg

    self.sampling = False
    if self.train_cfg:
        self.assigner = build_assigner(self.train_cfg.assigner)
        # sampling=False so use PseudoSampler
        sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

    self.fp16_enabled = False
    self._init_layers()
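# Hedged sketch: what the point-based prior generator above provides per
# level, namely one prior per feature-map location (offset=0 means cell
# corners). This is a plain re-derivation of the counts, not the
# MlvlPointGenerator API; the feature-map sizes are illustrative.
strides = [8, 16, 32]
featmap_sizes = [(80, 80), (40, 40), (20, 20)]  # e.g. for a 640x640 input
num_priors = [h * w for (h, w) in featmap_sizes]
assert num_priors == [6400, 1600, 400]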
def __init__(self,
             num_classes,
             in_channels,
             feat_channels=256,
             anchor_generator=dict(type='AnchorGenerator',
                                   scales=[8, 16, 32],
                                   ratios=[0.5, 1.0, 2.0],
                                   strides=[4, 8, 16, 32, 64]),
             bbox_coder=dict(type='DeltaXYWHBBoxCoder',
                             target_means=(.0, .0, .0, .0),
                             target_stds=(1.0, 1.0, 1.0, 1.0)),
             reg_decoded_bbox=False,
             background_label=None,
             loss_cls=dict(type='CrossEntropyLoss',
                           use_sigmoid=True,
                           loss_weight=1.0),
             loss_bbox=dict(type='SmoothL1Loss',
                            beta=1.0 / 9.0,
                            loss_weight=1.0),
             train_cfg=None,
             test_cfg=None):
    super(AnchorHead, self).__init__()
    self.in_channels = in_channels
    self.num_classes = num_classes
    self.feat_channels = feat_channels
    self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
    # TODO better way to determine whether sample or not
    self.sampling = loss_cls['type'] not in [
        'FocalLoss', 'GHMC', 'QualityFocalLoss'
    ]
    if self.use_sigmoid_cls:
        self.cls_out_channels = num_classes
    else:
        self.cls_out_channels = num_classes + 1
    if self.cls_out_channels <= 0:
        raise ValueError(f'num_classes={num_classes} is too small')
    self.reg_decoded_bbox = reg_decoded_bbox

    self.background_label = (num_classes if background_label is None else
                             background_label)
    # background_label should be either 0 or num_classes
    assert (self.background_label == 0
            or self.background_label == num_classes)

    self.bbox_coder = build_bbox_coder(bbox_coder)
    self.loss_cls = build_loss(loss_cls)
    self.loss_bbox = build_loss(loss_bbox)
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    if self.train_cfg:
        self.assigner = build_assigner(self.train_cfg.assigner)
        # use PseudoSampler when sampling is False
        if self.sampling and hasattr(self.train_cfg, 'sampler'):
            sampler_cfg = self.train_cfg.sampler
        else:
            sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)
    self.fp16_enabled = False

    self.anchor_generator = build_anchor_generator(anchor_generator)
    # usually the numbers of anchors for each level are the same
    # except SSD detectors
    self.num_anchors = self.anchor_generator.num_base_anchors[0]
    self._init_layers()
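# Hedged sketch: the sampling-vs-PseudoSampler decision above, stated in
# isolation. Losses that internally reweight all anchors (FocalLoss and
# friends) skip explicit positive/negative sampling.
loss_cls = dict(type='FocalLoss', use_sigmoid=True, loss_weight=1.0)
sampling = loss_cls['type'] not in ['FocalLoss', 'GHMC', 'QualityFocalLoss']
assert sampling is False  # FocalLoss -> PseudoSampler fallback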
def __init__(self,
             num_classes,
             in_channels,
             out_channels=(1024, 512, 256),
             anchor_generator=dict(type='YOLOAnchorGenerator',
                                   base_sizes=[[(116, 90), (156, 198),
                                                (373, 326)],
                                               [(30, 61), (62, 45),
                                                (59, 119)],
                                               [(10, 13), (16, 30),
                                                (33, 23)]],
                                   strides=[32, 16, 8]),
             bbox_coder=dict(type='YOLOBBoxCoder'),
             featmap_strides=[32, 16, 8],
             one_hot_smoother=0.,
             conv_cfg=None,
             norm_cfg=dict(type='BN', requires_grad=True),
             act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
             loss_cls=dict(type='CrossEntropyLoss',
                           use_sigmoid=True,
                           loss_weight=1.0),
             loss_conf=dict(type='CrossEntropyLoss',
                            use_sigmoid=True,
                            loss_weight=1.0),
             loss_xy=dict(type='CrossEntropyLoss',
                          use_sigmoid=True,
                          loss_weight=1.0),
             loss_wh=dict(type='MSELoss', loss_weight=1.0),
             train_cfg=None,
             test_cfg=None):
    super(YOLOV3Head, self).__init__()
    # Check params
    assert (len(in_channels) == len(out_channels) == len(featmap_strides))

    self.num_classes = num_classes
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.featmap_strides = featmap_strides
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    if self.train_cfg:
        self.assigner = build_assigner(self.train_cfg.assigner)
        if hasattr(self.train_cfg, 'sampler'):
            sampler_cfg = self.train_cfg.sampler
        else:
            sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

    self.one_hot_smoother = one_hot_smoother

    self.conv_cfg = conv_cfg
    self.norm_cfg = norm_cfg
    self.act_cfg = act_cfg

    self.bbox_coder = build_bbox_coder(bbox_coder)
    self.anchor_generator = build_anchor_generator(anchor_generator)

    self.loss_cls = build_loss(loss_cls)
    self.loss_conf = build_loss(loss_conf)
    self.loss_xy = build_loss(loss_xy)
    self.loss_wh = build_loss(loss_wh)

    # usually the numbers of anchors for each level are the same
    # except SSD detectors
    self.num_anchors = self.anchor_generator.num_base_anchors[0]
    assert len(
        self.anchor_generator.num_base_anchors) == len(featmap_strides)
    self._init_layers()
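# Hedged sketch: how num_anchors falls out of the default base_sizes above.
# Each level lists 3 anchor shapes, so num_base_anchors is 3 at every level,
# matching the "same count per level" comment in the constructor.
base_sizes = [[(116, 90), (156, 198), (373, 326)],
              [(30, 61), (62, 45), (59, 119)],
              [(10, 13), (16, 30), (33, 23)]]
num_base_anchors = [len(sizes) for sizes in base_sizes]
assert num_base_anchors == [3, 3, 3]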
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    """
    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.
        img_metas (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.
        gt_bboxes (list[Tensor]): the ground-truth boxes of each image,
            in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): class indices corresponding to each box
        gt_bboxes_ignore (None | list[Tensor]): specify which bounding
            boxes can be ignored when computing the loss.
        gt_masks (None | Tensor): true segmentation masks for each box,
            used if the architecture supports a segmentation task.
        proposals: override rpn proposals with custom proposals. Use when
            `with_rpn` is False.

    Returns:
        dict[str, Tensor]: a dictionary of loss components
    """
    x = self.extract_feat(img)

    losses = dict()

    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_metas, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    for i in range(self.num_stages):
        self.current_stage = i
        rcnn_train_cfg = self.train_cfg.rcnn[i]
        lw = self.train_cfg.stage_loss_weights[i]

        # assign gts and sample proposals
        sampling_results = []
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(rcnn_train_cfg.assigner)
            bbox_sampler = build_sampler(rcnn_train_cfg.sampler,
                                         context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]

            for j in range(num_imgs):
                assign_result = bbox_assigner.assign(
                    proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j],
                    gt_labels[j])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[j],
                    gt_bboxes[j],
                    gt_labels[j],
                    feats=[lvl_feat[j][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        bbox_roi_extractor = self.bbox_roi_extractor[i]
        bbox_head = self.bbox_head[i]

        rois = bbox2roi([res.bboxes for res in sampling_results])
        if len(rois) == 0:
            # If there are no predicted and/or truth boxes, then we cannot
            # compute head / mask losses
            continue

        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
                                        rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = bbox_head(bbox_feats)

        bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes,
                                            gt_labels, rcnn_train_cfg)
        loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
        for name, value in loss_bbox.items():
            losses['s{}.{}'.format(
                i, name)] = (value * lw if 'loss' in name else value)

        # mask head forward and loss
        if self.with_mask:
            if not self.share_roi_extractor:
                mask_roi_extractor = self.mask_roi_extractor[i]
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = mask_roi_extractor(
                    x[:mask_roi_extractor.num_inputs], pos_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
            else:
                # reuse positive bbox feats
                pos_inds = []
                device = bbox_feats.device
                for res in sampling_results:
                    pos_inds.append(
                        torch.ones(res.pos_bboxes.shape[0],
                                   device=device,
                                   dtype=torch.uint8))
                    pos_inds.append(
                        torch.zeros(res.neg_bboxes.shape[0],
                                    device=device,
                                    dtype=torch.uint8))
                pos_inds = torch.cat(pos_inds)
                mask_feats = bbox_feats[pos_inds.type(torch.bool)]
            mask_head = self.mask_head[i]
            mask_pred = mask_head(mask_feats)
            mask_targets = mask_head.get_target(sampling_results, gt_masks,
                                                rcnn_train_cfg)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels)
            for name, value in loss_mask.items():
                losses['s{}.{}'.format(
                    i, name)] = (value * lw if 'loss' in name else value)

        # refine bboxes
        if i < self.num_stages - 1:
            pos_is_gts = [res.pos_is_gt for res in sampling_results]
            roi_labels = bbox_targets[0]  # bbox_targets is a tuple
            with torch.no_grad():
                proposal_list = bbox_head.refine_bboxes(
                    rois, roi_labels, bbox_pred, pos_is_gts, img_metas)

    return losses
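# Hedged sketch: the 's{stage}.{name}' loss bookkeeping used above, shown in
# isolation. Only entries whose name contains 'loss' are scaled by the
# per-stage weight; metrics such as accuracy pass through unscaled. The
# stage weights below are illustrative.
import torch

losses = {}
stage_loss_weights = [1.0, 0.5, 0.25]
for i, lw in enumerate(stage_loss_weights):
    loss_bbox = {'loss_cls': torch.tensor(1.0), 'acc': torch.tensor(90.0)}
    for name, value in loss_bbox.items():
        losses[f's{i}.{name}'] = value * lw if 'loss' in name else value
assert losses['s1.loss_cls'].item() == 0.5   # weighted by lw=0.5
assert losses['s1.acc'].item() == 90.0       # metric left unscaled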
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  gt_semantic_seg=None,
                  proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    # RPN part, the same as normal two-stage detectors
    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_cfg = self.train_cfg.get('rpn_proposal',
                                          self.test_cfg.rpn)
        proposal_inputs = rpn_outs + (img_meta, proposal_cfg)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    # semantic segmentation part
    # 2 outputs: segmentation prediction and embedded features
    if self.with_semantic:
        semantic_pred, semantic_feat = self.semantic_head(x)
        loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_seg)
        losses['loss_semantic_seg'] = loss_seg
    else:
        semantic_feat = None

    for i in range(self.num_stages):
        self.current_stage = i
        rcnn_train_cfg = self.train_cfg.rcnn[i]
        lw = self.train_cfg.stage_loss_weights[i]

        # assign gts and sample proposals
        sampling_results = []
        bbox_assigner = build_assigner(rcnn_train_cfg.assigner)
        bbox_sampler = build_sampler(rcnn_train_cfg.sampler, context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]

        for j in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[j],
                                                 gt_bboxes[j],
                                                 gt_bboxes_ignore[j],
                                                 gt_labels[j])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[j],
                gt_bboxes[j],
                gt_labels[j],
                feats=[lvl_feat[j][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

        # bbox head forward and loss
        loss_bbox, rois, bbox_targets, bbox_pred = \
            self._bbox_forward_train(i, x, sampling_results, gt_bboxes,
                                     gt_labels, rcnn_train_cfg,
                                     semantic_feat)
        roi_labels = bbox_targets[0]

        for name, value in loss_bbox.items():
            losses['s{}.{}'.format(
                i, name)] = (value * lw if 'loss' in name else value)

        # mask head forward and loss
        if self.with_mask:
            # interleaved execution: use regressed bboxes by the box branch
            # to train the mask branch
            if self.interleaved:
                pos_is_gts = [res.pos_is_gt for res in sampling_results]
                with torch.no_grad():
                    proposal_list = self.bbox_head[i].refine_bboxes(
                        rois, roi_labels, bbox_pred, pos_is_gts, img_meta)
                # re-assign and sample 512 RoIs from 512 RoIs
                sampling_results = []
                for j in range(num_imgs):
                    assign_result = bbox_assigner.assign(
                        proposal_list[j], gt_bboxes[j],
                        gt_bboxes_ignore[j], gt_labels[j])
                    sampling_result = bbox_sampler.sample(
                        assign_result,
                        proposal_list[j],
                        gt_bboxes[j],
                        gt_labels[j],
                        feats=[lvl_feat[j][None] for lvl_feat in x])
                    sampling_results.append(sampling_result)
            loss_mask = self._mask_forward_train(i, x, sampling_results,
                                                 gt_masks, rcnn_train_cfg,
                                                 semantic_feat)
            for name, value in loss_mask.items():
                losses['s{}.{}'.format(
                    i, name)] = (value * lw if 'loss' in name else value)

        # refine bboxes (same as Cascade R-CNN)
        if i < self.num_stages - 1 and not self.interleaved:
            pos_is_gts = [res.pos_is_gt for res in sampling_results]
            with torch.no_grad():
                proposal_list = self.bbox_head[i].refine_bboxes(
                    rois, roi_labels, bbox_pred, pos_is_gts, img_meta)

    return losses
def __init__(self,
             num_classes,
             in_channels,
             regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512),
                             (512, INF)),
             center_sampling=False,
             center_sample_radius=1.5,
             sync_num_pos=True,
             gradient_mul=0.1,
             bbox_norm_type='reg_denom',
             loss_cls_fl=dict(type='FocalLoss',
                              use_sigmoid=True,
                              gamma=2.0,
                              alpha=0.25,
                              loss_weight=1.0),
             use_vfl=True,
             loss_cls=dict(type='VarifocalLoss',
                           use_sigmoid=True,
                           alpha=0.75,
                           gamma=2.0,
                           iou_weighted=True,
                           loss_weight=1.0),
             loss_bbox=dict(type='GIoULoss', loss_weight=1.5),
             loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0),
             norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
             use_atss=True,
             anchor_generator=dict(type='AnchorGenerator',
                                   ratios=[1.0],
                                   octave_base_scale=8,
                                   scales_per_octave=1,
                                   center_offset=0.0,
                                   strides=[8, 16, 32, 64, 128]),
             **kwargs):
    # dcn base offsets, adapted from reppoints_head.py
    self.num_dconv_points = 9
    self.dcn_kernel = int(np.sqrt(self.num_dconv_points))
    self.dcn_pad = int((self.dcn_kernel - 1) / 2)
    dcn_base = np.arange(-self.dcn_pad,
                         self.dcn_pad + 1).astype(np.float64)
    dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
    dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
    dcn_base_offset = np.stack([dcn_base_y, dcn_base_x],
                               axis=1).reshape((-1))
    self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1)

    super(FCOSHead, self).__init__(num_classes,
                                   in_channels,
                                   norm_cfg=norm_cfg,
                                   **kwargs)
    self.regress_ranges = regress_ranges
    self.reg_denoms = [
        regress_range[-1] for regress_range in regress_ranges
    ]
    self.reg_denoms[-1] = self.reg_denoms[-2] * 2
    self.center_sampling = center_sampling
    self.center_sample_radius = center_sample_radius
    self.sync_num_pos = sync_num_pos
    self.bbox_norm_type = bbox_norm_type
    self.gradient_mul = gradient_mul
    self.use_vfl = use_vfl
    if self.use_vfl:
        self.loss_cls = build_loss(loss_cls)
    else:
        self.loss_cls = build_loss(loss_cls_fl)
    self.loss_bbox = build_loss(loss_bbox)
    self.loss_bbox_refine = build_loss(loss_bbox_refine)

    # for getting ATSS targets
    self.use_atss = use_atss
    self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
    self.anchor_generator = build_anchor_generator(anchor_generator)
    self.anchor_center_offset = anchor_generator['center_offset']
    self.num_anchors = self.anchor_generator.num_base_anchors[0]
    self.sampling = False
    if self.train_cfg:
        self.assigner = build_assigner(self.train_cfg.assigner)
        sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)
def forward_train(self,
                  img,
                  img_meta,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None,
                  gt_masks=None,
                  proposals=None):
    x = self.extract_feat(img)

    losses = dict()

    if self.with_rpn:
        rpn_outs = self.rpn_head(x)
        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
                                      self.train_cfg.rpn)
        rpn_losses = self.rpn_head.loss(*rpn_loss_inputs,
                                        gt_bboxes_ignore=gt_bboxes_ignore)
        losses.update(rpn_losses)

        proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
        proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
    else:
        proposal_list = proposals

    for i in range(self.num_stages):
        self.current_stage = i
        rcnn_train_cfg = self.train_cfg.rcnn[i]
        lw = self.train_cfg.stage_loss_weights[i]

        # assign gts and sample proposals
        sampling_results = []
        if self.with_bbox or self.with_mask:
            bbox_assigner = build_assigner(rcnn_train_cfg.assigner)
            bbox_sampler = build_sampler(rcnn_train_cfg.sampler,
                                         context=self)
            num_imgs = img.size(0)
            if gt_bboxes_ignore is None:
                gt_bboxes_ignore = [None for _ in range(num_imgs)]

            for j in range(num_imgs):
                assign_result = bbox_assigner.assign(
                    proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j],
                    gt_labels[j])
                sampling_result = bbox_sampler.sample(
                    assign_result,
                    proposal_list[j],
                    gt_bboxes[j],
                    gt_labels[j],
                    feats=[lvl_feat[j][None] for lvl_feat in x])
                sampling_results.append(sampling_result)

        # bbox head forward and loss
        bbox_roi_extractor = self.bbox_roi_extractor[i]
        bbox_head = self.bbox_head[i]

        rois = bbox2roi([res.bboxes for res in sampling_results])
        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
                                        rois)
        if self.with_shared_head:
            bbox_feats = self.shared_head(bbox_feats)
        cls_score, bbox_pred = bbox_head(bbox_feats)

        bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes,
                                            gt_labels, rcnn_train_cfg)
        loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets)
        for name, value in loss_bbox.items():
            losses['s{}.{}'.format(
                i, name)] = (value * lw if 'loss' in name else value)

        # mask head forward and loss
        if self.with_mask:
            if not self.share_roi_extractor:
                mask_roi_extractor = self.mask_roi_extractor[i]
                pos_rois = bbox2roi(
                    [res.pos_bboxes for res in sampling_results])
                mask_feats = mask_roi_extractor(
                    x[:mask_roi_extractor.num_inputs], pos_rois)
                if self.with_shared_head:
                    mask_feats = self.shared_head(mask_feats)
            else:
                # reuse positive bbox feats
                pos_inds = []
                device = bbox_feats.device
                for res in sampling_results:
                    pos_inds.append(
                        torch.ones(res.pos_bboxes.shape[0],
                                   device=device,
                                   dtype=torch.uint8))
                    pos_inds.append(
                        torch.zeros(res.neg_bboxes.shape[0],
                                    device=device,
                                    dtype=torch.uint8))
                pos_inds = torch.cat(pos_inds)
                # cast to bool: uint8 tensor indexing is deprecated in
                # recent PyTorch
                mask_feats = bbox_feats[pos_inds.type(torch.bool)]
            mask_head = self.mask_head[i]
            mask_pred = mask_head(mask_feats)
            mask_targets = mask_head.get_target(sampling_results, gt_masks,
                                                rcnn_train_cfg)
            pos_labels = torch.cat(
                [res.pos_gt_labels for res in sampling_results])
            loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels)
            for name, value in loss_mask.items():
                losses['s{}.{}'.format(
                    i, name)] = (value * lw if 'loss' in name else value)

        # refine bboxes
        if i < self.num_stages - 1:
            pos_is_gts = [res.pos_is_gt for res in sampling_results]
            roi_labels = bbox_targets[0]  # bbox_targets is a tuple
            with torch.no_grad():
                proposal_list = bbox_head.refine_bboxes(
                    rois, roi_labels, bbox_pred, pos_is_gts, img_meta)

    return losses
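# Hedged sketch: reusing positive bbox feats via a boolean mask, as in the
# share_roi_extractor branch above. uint8 indexing is deprecated in recent
# PyTorch, hence the cast to bool; shapes below are illustrative.
import torch

bbox_feats = torch.randn(6, 256, 7, 7)  # 4 positive + 2 negative RoIs
pos_inds = torch.cat([torch.ones(4, dtype=torch.uint8),
                      torch.zeros(2, dtype=torch.uint8)])
mask_feats = bbox_feats[pos_inds.to(torch.bool)]
assert mask_feats.shape[0] == 4  # only positive RoIs reach the mask head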
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None):
    if self.use_consistent_supervision:
        x, y = self.extract_feat(img)
        gt_bboxes_auxiliary = [gt.clone() for gt in gt_bboxes]
        gt_labels_auxiliary = [label.clone() for label in gt_labels]
    else:
        x = self.extract_feat(img)
    outs = self.bbox_head(x)
    loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, self.train_cfg)
    losses = self.bbox_head.loss(*loss_inputs,
                                 gt_bboxes_ignore=gt_bboxes_ignore)

    if self.use_consistent_supervision:
        proposal_cfg = self.train_cfg.auxiliary.proposal
        proposal_inputs = outs + (img_metas, proposal_cfg)
        proposal_list = self.bbox_head.get_bboxes_auxiliary(
            *proposal_inputs)

        bbox_assigner = build_assigner(self.train_cfg.auxiliary.assigner)
        bbox_sampler = build_sampler(self.train_cfg.auxiliary.sampler,
                                     context=self)
        num_imgs = img.size(0)
        if gt_bboxes_ignore is None:
            gt_bboxes_ignore = [None for _ in range(num_imgs)]
        sampling_results = []
        for i in range(num_imgs):
            assign_result = bbox_assigner.assign(proposal_list[i],
                                                 gt_bboxes_auxiliary[i],
                                                 gt_bboxes_ignore[i],
                                                 gt_labels_auxiliary[i])
            sampling_result = bbox_sampler.sample(
                assign_result,
                proposal_list[i],
                gt_bboxes_auxiliary[i],
                gt_labels_auxiliary[i],
                feats=[lvl_feat[i][None] for lvl_feat in x])
            sampling_results.append(sampling_result)

        rois = bbox2roi([res.bboxes for res in sampling_results])
        bbox_feats_raw = self.auxiliary_bbox_roi_extractor(
            y[:self.auxiliary_bbox_roi_extractor.num_inputs], rois)
        cls_score_auxiliary, bbox_pred_auxiliary = \
            self.auxiliary_bbox_head(bbox_feats_raw)

        bbox_targets = self.auxiliary_bbox_head.get_target(
            sampling_results, gt_bboxes, gt_labels,
            self.train_cfg.auxiliary.rcnn)
        loss_bbox_auxiliary = self.auxiliary_bbox_head.loss(
            cls_score_auxiliary,
            bbox_pred_auxiliary,
            *bbox_targets,
            alpha=0.25,
            num_level=3)
        losses.update(loss_bbox_auxiliary)

    return losses
def __init__(self,
             num_classes,
             in_channels,
             anchor_generator=dict(type='YOLOAnchorGenerator',
                                   base_sizes=[[(32, 32), (48, 48),
                                                (24, 32), (32, 48)],
                                               [(64, 64), (72, 72),
                                                (72, 96), (96, 96)],
                                               [(72, 96), (96, 96),
                                                (128, 128), (96, 128)]],
                                   strides=[16, 32, 64]),
             bbox_coder=dict(type='YOLOBBoxCoder'),
             featmap_strides=[16, 32, 64],
             one_hot_smoother=0.,
             loss_cls=dict(type='CrossEntropyLoss',
                           use_sigmoid=True,
                           loss_weight=1.0),
             loss_conf=dict(type='CrossEntropyLoss',
                            use_sigmoid=True,
                            loss_weight=1.0),
             loss_xy=dict(type='CrossEntropyLoss',
                          use_sigmoid=True,
                          loss_weight=1.0),
             loss_wh=dict(type='MSELoss', loss_weight=1.0),
             loss_iou=dict(type='CIoULoss', loss_weight=2.0),
             train_cfg=None,
             test_cfg=None):
    super(CZ_CoarseHead, self).__init__()
    # Check params
    assert (len(in_channels) == len(featmap_strides))

    self.num_classes = num_classes
    self.in_channels = in_channels
    self.featmap_strides = featmap_strides
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    self.sampling = False
    if self.train_cfg:
        self.assigner = build_assigner(self.train_cfg.assigner)
        if hasattr(self.train_cfg, 'sampler'):
            sampler_cfg = self.train_cfg.sampler
        else:
            sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

    self.one_hot_smoother = one_hot_smoother
    self.bbox_coder = build_bbox_coder(bbox_coder)
    self.anchor_generator = build_anchor_generator(anchor_generator)

    self.loss_cls = build_loss(loss_cls)
    self.loss_conf = build_loss(loss_conf)
    self.loss_xy = build_loss(loss_xy)
    self.loss_wh = build_loss(loss_wh)
    self.loss_iou = build_loss(loss_iou)

    # usually the numbers of anchors for each level are the same
    # except SSD detectors
    self.num_anchors = self.anchor_generator.num_base_anchors[0]
    assert len(
        self.anchor_generator.num_base_anchors) == len(featmap_strides)
    self._init_layers()
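# Hedged sketch: the sampler fallback shared by several heads above. When a
# head does not sample (self.sampling is False) and train_cfg provides no
# sampler, a PseudoSampler config is used, which keeps every assigned
# candidate instead of subsampling positives/negatives. `_Cfg` is a stand-in
# for an mmcv ConfigDict without a 'sampler' entry.
class _Cfg:
    assigner = dict(type='MaxIoUAssigner')

train_cfg = _Cfg()
if hasattr(train_cfg, 'sampler'):
    sampler_cfg = train_cfg.sampler
else:
    sampler_cfg = dict(type='PseudoSampler')
assert sampler_cfg == {'type': 'PseudoSampler'}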