def loss(self, keypoint_scores: List[torch.Tensor],
         keypoint_offsets: List[torch.Tensor], keypoint_types: List[str],
         gt_points: List[torch.Tensor], gt_bboxes: List[torch.Tensor],
         gt_labels: List[torch.Tensor],
         img_metas: List[dict]) -> Dict[str, torch.Tensor]:
    """Compute the loss of a single head.

    Note:
        For multiple heads, we propose concatenating the tensors along
        the batch dimension to speed up this process.

    Args:
        keypoint_scores (List[torch.Tensor]): Keypoint scores for each
            level of each head.
        keypoint_offsets (List[torch.Tensor]): Keypoint offsets for each
            level of each head.
        keypoint_types (List[str]): The types of the keypoint heads.
        gt_points (List[torch.Tensor]): Ground truth points for each image
            with shape (num_gts, 2) in [x, y] format.
        gt_bboxes (List[torch.Tensor]): Ground truth bboxes for each image
            with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (List[torch.Tensor]): Class indices corresponding to
            each box.
        img_metas (List[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.

    Returns:
        Dict[str, torch.Tensor]: Losses of the head.
    """
    featmap_sizes = [score.size()[-2:] for score in keypoint_scores]
    points = self.get_points(featmap_sizes, gt_points[0].dtype,
                             gt_points[0].device)
    keypoint_scores = _flatten_concat(keypoint_scores).permute(
        0, 2, 1)  # [batch, num_points, num_classes]
    keypoint_offsets = _flatten_concat(keypoint_offsets).permute(
        0, 2, 1)  # [batch, num_points, 2]
    score_targets, offset_targets, pos_masks = self.get_targets(
        points, gt_points, gt_bboxes, gt_labels)
    avg_factor = reduce_mean(torch.sum(pos_masks))
    # TODO: Maybe positive samples and negative samples should have
    # different avg factors.
    loss_cls = self.loss_cls(
        keypoint_scores.sigmoid(), score_targets, avg_factor=avg_factor)
    loss_offset = self.loss_offset(
        keypoint_offsets,
        offset_targets,
        weight=pos_masks.expand_as(keypoint_offsets),
        avg_factor=avg_factor)
    return {'loss_point_cls': loss_cls, 'loss_point_offset': loss_offset}
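# Every loss function in this file calls `reduce_mean` to average a scalar
# (typically the number of positive samples) across GPUs, so that the loss
# normalizer is consistent under distributed training. A minimal sketch of
# such a helper, assuming `torch.distributed` is initialized when training
# on multiple GPUs; mmdet ships an equivalent utility:
import torch
import torch.distributed as dist


def reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
    """Return the mean of `tensor` over all processes.

    A no-op when distributed training is not initialized.
    """
    if not (dist.is_available() and dist.is_initialized()):
        return tensor
    tensor = tensor.clone()
    dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM)
    return tensor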
def loss(self,
         cls_scores,
         bbox_preds,
         centernesses,
         gt_bboxes,
         gt_labels,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute losses of the head.

    Args:
        cls_scores (list[Tensor]): Box scores for each scale level with
            shape (N, num_anchors * num_classes, H, W).
        bbox_preds (list[Tensor]): Box energies / deltas for each scale
            level with shape (N, num_anchors * 4, H, W).
        centernesses (list[Tensor]): Centerness for each scale level with
            shape (N, num_anchors * 1, H, W).
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): Class indices corresponding to each box.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        gt_bboxes_ignore (list[Tensor] | None): Bounding boxes that can be
            ignored when computing the loss.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    assert len(featmap_sizes) == self.prior_generator.num_levels

    device = cls_scores[0].device
    anchor_list, valid_flag_list = self.get_anchors(
        featmap_sizes, img_metas, device=device)
    label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1

    cls_reg_targets = self.get_targets(
        anchor_list,
        valid_flag_list,
        gt_bboxes,
        img_metas,
        gt_bboxes_ignore_list=gt_bboxes_ignore,
        gt_labels_list=gt_labels,
        label_channels=label_channels)
    if cls_reg_targets is None:
        return None

    (anchor_list, labels_list, label_weights_list, bbox_targets_list,
     bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets

    num_total_samples = reduce_mean(
        torch.tensor(num_total_pos, dtype=torch.float,
                     device=device)).item()
    num_total_samples = max(num_total_samples, 1.0)

    losses_cls, losses_bbox, loss_centerness, \
        bbox_avg_factor = multi_apply(
            self.loss_single,
            anchor_list,
            cls_scores,
            bbox_preds,
            centernesses,
            labels_list,
            label_weights_list,
            bbox_targets_list,
            num_total_samples=num_total_samples)

    bbox_avg_factor = sum(bbox_avg_factor)
    bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item()
    losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox))
    return dict(
        loss_cls=losses_cls,
        loss_bbox=losses_bbox,
        loss_centerness=loss_centerness)
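# Several of these heads use `multi_apply` to map a per-level (or
# per-image) function over parallel lists and transpose the per-call
# result tuples into per-output lists. A minimal sketch matching that
# usage; mmcv/mmdet ship an equivalent helper:
from functools import partial


def multi_apply(func, *args, **kwargs):
    """Apply `func` to each tuple of parallel arguments and regroup the
    result tuples into a tuple of lists, one list per output."""
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))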
def loss(self,
         cls_scores,
         bbox_preds,
         gt_bboxes,
         gt_labels,
         soft_target,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute losses of the head.

    Args:
        cls_scores (list[Tensor]): Cls and quality scores for each scale
            level with shape (N, num_classes, H, W).
        bbox_preds (list[Tensor]): Box distribution logits for each scale
            level with shape (N, 4 * (n + 1), H, W), where n is the max
            value of the integral set.
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): Class indices corresponding to each box.
        soft_target (list[Tensor]): Soft targets (teacher predictions)
            used for localization distillation, one per scale level.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        gt_bboxes_ignore (list[Tensor] | None): Bounding boxes that can be
            ignored when computing the loss.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    assert len(featmap_sizes) == self.anchor_generator.num_levels

    device = cls_scores[0].device
    anchor_list, valid_flag_list = self.get_anchors(
        featmap_sizes, img_metas, device=device)
    label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1

    cls_reg_targets = self.get_targets(
        anchor_list,
        valid_flag_list,
        gt_bboxes,
        img_metas,
        gt_bboxes_ignore_list=gt_bboxes_ignore,
        gt_labels_list=gt_labels,
        label_channels=label_channels)
    if cls_reg_targets is None:
        return None

    (anchor_list, labels_list, label_weights_list, bbox_targets_list,
     bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets

    num_total_samples = reduce_mean(
        torch.tensor(num_total_pos, dtype=torch.float,
                     device=device)).item()
    num_total_samples = max(num_total_samples, 1.0)

    losses_cls, losses_bbox, losses_dfl, losses_ld, \
        avg_factor = multi_apply(
            self.loss_single,
            anchor_list,
            cls_scores,
            bbox_preds,
            labels_list,
            label_weights_list,
            bbox_targets_list,
            self.anchor_generator.strides,
            soft_target,
            num_total_samples=num_total_samples)

    avg_factor = sum(avg_factor) + 1e-6
    avg_factor = reduce_mean(avg_factor).item()
    losses_bbox = [x / avg_factor for x in losses_bbox]
    losses_dfl = [x / avg_factor for x in losses_dfl]
    return dict(
        loss_cls=losses_cls,
        loss_bbox=losses_bbox,
        loss_dfl=losses_dfl,
        loss_ld=losses_ld)
def loss(self,
         cls_scores,
         bbox_preds,
         centernesses,
         gt_bboxes,
         gt_labels,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute loss of the head.

    Args:
        cls_scores (list[Tensor]): Box scores for each scale level, each
            a 4D-tensor whose channel number is num_points * num_classes.
        bbox_preds (list[Tensor]): Box energies / deltas for each scale
            level, each a 4D-tensor whose channel number is num_points * 4.
        centernesses (list[Tensor]): Centerness for each scale level, each
            a 4D-tensor whose channel number is num_points * 1.
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): Class indices corresponding to each box.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        gt_bboxes_ignore (None | list[Tensor]): Bounding boxes that can be
            ignored when computing the loss.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    assert len(cls_scores) == len(bbox_preds) == len(centernesses)
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
                                       bbox_preds[0].device)
    labels, bbox_targets = self.get_targets(all_level_points, gt_bboxes,
                                            gt_labels)

    num_imgs = cls_scores[0].size(0)
    # flatten cls_scores, bbox_preds and centerness
    flatten_cls_scores = [
        cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
        for cls_score in cls_scores
    ]
    flatten_bbox_preds = [
        bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
        for bbox_pred in bbox_preds
    ]
    flatten_centerness = [
        centerness.permute(0, 2, 3, 1).reshape(-1)
        for centerness in centernesses
    ]
    flatten_cls_scores = torch.cat(flatten_cls_scores)
    flatten_bbox_preds = torch.cat(flatten_bbox_preds)
    flatten_centerness = torch.cat(flatten_centerness)
    flatten_labels = torch.cat(labels)
    flatten_bbox_targets = torch.cat(bbox_targets)
    # repeat points to align with bbox_preds
    flatten_points = torch.cat(
        [points.repeat(num_imgs, 1) for points in all_level_points])

    # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
    bg_class_ind = self.num_classes
    pos_inds = ((flatten_labels >= 0)
                & (flatten_labels < bg_class_ind)).nonzero().reshape(-1)
    num_pos = torch.tensor(
        len(pos_inds), dtype=torch.float, device=bbox_preds[0].device)
    num_pos = max(reduce_mean(num_pos), 1.0)
    loss_cls = self.loss_cls(
        flatten_cls_scores, flatten_labels, avg_factor=num_pos)

    pos_bbox_preds = flatten_bbox_preds[pos_inds]
    pos_centerness = flatten_centerness[pos_inds]
    pos_bbox_targets = flatten_bbox_targets[pos_inds]
    pos_centerness_targets = self.centerness_target(pos_bbox_targets)
    # centerness weighted iou loss
    centerness_denorm = max(
        reduce_mean(pos_centerness_targets.sum().detach()), 1e-6)

    if len(pos_inds) > 0:
        pos_points = flatten_points[pos_inds]
        pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds)
        pos_decoded_target_preds = distance2bbox(pos_points,
                                                 pos_bbox_targets)
        loss_bbox = self.loss_bbox(
            pos_decoded_bbox_preds,
            pos_decoded_target_preds,
            weight=pos_centerness_targets,
            avg_factor=centerness_denorm)
        loss_centerness = self.loss_centerness(
            pos_centerness, pos_centerness_targets, avg_factor=num_pos)
    else:
        loss_bbox = pos_bbox_preds.sum()
        loss_centerness = pos_centerness.sum()

    return dict(
        loss_cls=loss_cls,
        loss_bbox=loss_bbox,
        loss_centerness=loss_centerness)
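# FCOS-style heads regress (left, top, right, bottom) distances from a
# point; `distance2bbox` used above turns those into [x1, y1, x2, y2]
# boxes. A minimal sketch under that convention; mmdet provides an
# equivalent helper (with optional clipping to the image shape):
def distance2bbox(points, distance):
    """Decode distances from `points` (..., 2) to boxes (..., 4)."""
    x1 = points[..., 0] - distance[..., 0]
    y1 = points[..., 1] - distance[..., 1]
    x2 = points[..., 0] + distance[..., 2]
    y2 = points[..., 1] + distance[..., 3]
    return torch.stack([x1, y1, x2, y2], dim=-1)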
def loss_single(self,
                cls_scores,
                bbox_preds,
                gt_bboxes_list,
                gt_labels_list,
                img_metas,
                gt_bboxes_ignore_list=None):
    """Loss function for outputs from a single decoder layer of a single
    feature level.

    Args:
        cls_scores (Tensor): Box score logits from a single decoder layer
            for all images, with shape [bs, num_query, cls_out_channels].
        bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
            for all images, with normalized coordinates (cx, cy, w, h) and
            shape [bs, num_query, 4].
        gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
            with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels_list (list[Tensor]): Ground truth class indices for each
            image with shape (num_gts, ).
        img_metas (list[dict]): List of image meta information.
        gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes
            which can be ignored for each image. Default None.

    Returns:
        tuple[Tensor]: Loss components (loss_cls, loss_bbox, loss_iou) for
            outputs from a single decoder layer.
    """
    num_imgs = cls_scores.size(0)
    cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
    bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
    cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                       gt_bboxes_list, gt_labels_list,
                                       img_metas, gt_bboxes_ignore_list)
    (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
     num_total_pos, num_total_neg) = cls_reg_targets
    labels = torch.cat(labels_list, 0)
    label_weights = torch.cat(label_weights_list, 0)
    bbox_targets = torch.cat(bbox_targets_list, 0)
    bbox_weights = torch.cat(bbox_weights_list, 0)

    # classification loss
    cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
    # construct weighted avg_factor to match the official DETR repo
    cls_avg_factor = num_total_pos * 1.0 + \
        num_total_neg * self.bg_cls_weight
    loss_cls = self.loss_cls(
        cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

    # Compute the average number of gt boxes across all GPUs, for
    # normalization purposes
    num_total_pos = loss_cls.new_tensor([num_total_pos])
    num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

    # construct factors used to rescale bboxes
    factors = []
    for img_meta, bbox_pred in zip(img_metas, bbox_preds):
        img_h, img_w, _ = img_meta['img_shape']
        factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                       img_h]).unsqueeze(0).repeat(
                                           bbox_pred.size(0), 1)
        factors.append(factor)
    factors = torch.cat(factors, 0)

    # DETR regresses the relative positions of boxes (cxcywh) in the
    # image, so the learning target is normalized by the image size. Here
    # we re-scale the boxes for calculating the IoU loss.
    bbox_preds = bbox_preds.reshape(-1, 4)
    bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
    bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

    # regression IoU loss, GIoU loss by default
    loss_iou = self.loss_iou(
        bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

    # regression L1 loss
    loss_bbox = self.loss_bbox(
        bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
    return loss_cls, loss_bbox, loss_iou
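# The DETR loss above needs predictions in absolute [x1, y1, x2, y2] form
# before the IoU loss, hence `bbox_cxcywh_to_xyxy`. A minimal conversion
# sketch; mmdet provides an equivalent helper:
def bbox_cxcywh_to_xyxy(bbox):
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2)."""
    cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1)
    return torch.cat(
        [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], dim=-1)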
def loss(self,
         cls_scores,
         bbox_preds,
         gt_bboxes,
         gt_labels,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute losses of the head.

    Args:
        cls_scores (list[Tensor]): Box scores for each scale level with
            shape (batch, num_anchors * num_classes, h, w).
        bbox_preds (list[Tensor]): Box energies / deltas for each scale
            level with shape (batch, num_anchors * 4, h, w).
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): Class indices corresponding to each box.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        gt_bboxes_ignore (None | list[Tensor]): Bounding boxes that can be
            ignored when computing the loss. Default: None.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    assert len(cls_scores) == 1
    assert self.anchor_generator.num_levels == 1

    device = cls_scores[0].device
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    anchor_list, valid_flag_list = self.get_anchors(
        featmap_sizes, img_metas, device=device)

    # The output level is always 1
    anchor_list = [anchors[0] for anchors in anchor_list]
    valid_flag_list = [valid_flags[0] for valid_flags in valid_flag_list]

    cls_scores_list = levels_to_images(cls_scores)
    bbox_preds_list = levels_to_images(bbox_preds)

    label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
    cls_reg_targets = self.get_targets(
        cls_scores_list,
        bbox_preds_list,
        anchor_list,
        valid_flag_list,
        gt_bboxes,
        img_metas,
        gt_bboxes_ignore_list=gt_bboxes_ignore,
        gt_labels_list=gt_labels,
        label_channels=label_channels)
    if cls_reg_targets is None:
        return None
    (batch_labels, batch_label_weights, num_total_pos, num_total_neg,
     batch_bbox_weights, batch_pos_predicted_boxes,
     batch_target_boxes) = cls_reg_targets

    flatten_labels = batch_labels.reshape(-1)
    batch_label_weights = batch_label_weights.reshape(-1)
    cls_score = cls_scores[0].permute(0, 2, 3,
                                      1).reshape(-1, self.cls_out_channels)

    num_total_samples = (num_total_pos +
                         num_total_neg) if self.sampling else num_total_pos
    num_total_samples = reduce_mean(
        cls_score.new_tensor(num_total_samples)).clamp_(1.0).item()

    # classification loss
    loss_cls = self.loss_cls(
        cls_score,
        flatten_labels,
        batch_label_weights,
        avg_factor=num_total_samples)

    # regression loss
    if batch_pos_predicted_boxes.shape[0] == 0:
        # no positive samples
        loss_bbox = batch_pos_predicted_boxes.sum() * 0
    else:
        loss_bbox = self.loss_bbox(
            batch_pos_predicted_boxes,
            batch_target_boxes,
            batch_bbox_weights.float(),
            avg_factor=num_total_samples)

    return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
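# `levels_to_images` above regroups per-level feature maps of shape
# (N, C, H, W) into one flattened (sum(H*W), C) tensor per image, so that
# targets can be assigned image by image. A minimal sketch of that
# regrouping; mmdet provides an equivalent:
def levels_to_images(mlvl_tensor):
    """Convert per-level tensors [(N, C, H, W), ...] into a list of N
    per-image tensors, each of shape (sum over levels of H*W, C)."""
    batch_size = mlvl_tensor[0].size(0)
    channels = mlvl_tensor[0].size(1)
    batch_list = [[] for _ in range(batch_size)]
    for t in mlvl_tensor:
        t = t.permute(0, 2, 3, 1).reshape(batch_size, -1, channels)
        for img in range(batch_size):
            batch_list[img].append(t[img])
    return [torch.cat(item, 0) for item in batch_list]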
def loss_single(self, cls_scores, mask_preds, gt_labels_list,
                gt_masks_list, img_metas):
    """Loss function for outputs from a single decoder layer.

    Args:
        cls_scores (Tensor): Mask score logits from a single decoder layer
            for all images, with shape
            (batch_size, num_queries, cls_out_channels).
        mask_preds (Tensor): Mask logits for a pixel decoder for all
            images, with shape (batch_size, num_queries, h, w).
        gt_labels_list (list[Tensor]): Ground truth class indices for each
            image, each with shape (n, ), where n is the sum of the number
            of stuff types and the number of instances in an image.
        gt_masks_list (list[Tensor]): Ground truth masks for each image,
            each with shape (n, h, w).
        img_metas (list[dict]): List of image meta information.

    Returns:
        tuple[Tensor]: Loss components for outputs from a single decoder
            layer.
    """
    num_imgs = cls_scores.size(0)
    cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
    mask_preds_list = [mask_preds[i] for i in range(num_imgs)]

    (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
     num_total_pos, num_total_neg) = self.get_targets(
         cls_scores_list, mask_preds_list, gt_labels_list, gt_masks_list,
         img_metas)
    # shape (batch_size, num_queries)
    labels = torch.stack(labels_list, dim=0)
    # shape (batch_size, num_queries)
    label_weights = torch.stack(label_weights_list, dim=0)
    # shape (num_total_gts, h, w)
    mask_targets = torch.cat(mask_targets_list, dim=0)
    # shape (batch_size, num_queries)
    mask_weights = torch.stack(mask_weights_list, dim=0)

    # classification loss
    # shape (batch_size * num_queries, )
    cls_scores = cls_scores.flatten(0, 1)
    labels = labels.flatten(0, 1)
    label_weights = label_weights.flatten(0, 1)

    class_weight = cls_scores.new_ones(self.num_classes + 1)
    class_weight[-1] = self.bg_cls_weight
    loss_cls = self.loss_cls(
        cls_scores,
        labels,
        label_weights,
        avg_factor=class_weight[labels].sum())

    num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos]))
    num_total_masks = max(num_total_masks, 1)

    # extract positive ones
    # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
    mask_preds = mask_preds[mask_weights > 0]
    target_shape = mask_targets.shape[-2:]

    if mask_targets.shape[0] == 0:
        # zero match
        loss_dice = mask_preds.sum()
        loss_mask = mask_preds.sum()
        return loss_cls, loss_mask, loss_dice

    # upsample to the shape of the target
    # shape (num_total_gts, h, w)
    mask_preds = F.interpolate(
        mask_preds.unsqueeze(1),
        target_shape,
        mode='bilinear',
        align_corners=False).squeeze(1)

    # dice loss
    loss_dice = self.loss_dice(
        mask_preds, mask_targets, avg_factor=num_total_masks)

    # mask loss
    # FocalLoss supports input of shape (n, num_class)
    h, w = mask_preds.shape[-2:]
    # shape (num_total_gts, h, w) -> (num_total_gts * h * w, 1)
    mask_preds = mask_preds.reshape(-1, 1)
    # shape (num_total_gts, h, w) -> (num_total_gts * h * w)
    mask_targets = mask_targets.reshape(-1)
    # note that the target is (1 - mask_targets)
    loss_mask = self.loss_mask(
        mask_preds, 1 - mask_targets, avg_factor=num_total_masks * h * w)

    return loss_cls, loss_mask, loss_dice
def loss(self,
         cls_scores,
         bbox_preds,
         objectnesses,
         gt_bboxes,
         gt_labels,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute loss of the head.

    Args:
        cls_scores (list[Tensor]): Box scores for each scale level, each
            a 4D-tensor whose channel number is num_priors * num_classes.
        bbox_preds (list[Tensor]): Box energies / deltas for each scale
            level, each a 4D-tensor whose channel number is num_priors * 4.
        objectnesses (list[Tensor], Optional): Score factors for all scale
            levels, each a 4D-tensor with shape (batch_size, 1, H, W).
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): Class indices corresponding to each box.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        gt_bboxes_ignore (None | list[Tensor]): Bounding boxes that can be
            ignored when computing the loss.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    num_imgs = len(img_metas)
    featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
    mlvl_priors = self.prior_generator.grid_priors(
        featmap_sizes,
        dtype=cls_scores[0].dtype,
        device=cls_scores[0].device,
        with_stride=True)

    flatten_cls_preds = [
        cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                             self.cls_out_channels)
        for cls_pred in cls_scores
    ]
    flatten_bbox_preds = [
        bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
        for bbox_pred in bbox_preds
    ]
    flatten_objectness = [
        objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
        for objectness in objectnesses
    ]

    flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1)
    flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
    flatten_objectness = torch.cat(flatten_objectness, dim=1)
    flatten_priors = torch.cat(mlvl_priors)
    flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds)

    (pos_masks, cls_targets, obj_targets, bbox_targets, l1_targets,
     num_fg_imgs) = multi_apply(
         self._get_target_single, flatten_cls_preds.detach(),
         flatten_objectness.detach(),
         flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1),
         flatten_bboxes.detach(), gt_bboxes, gt_labels)

    # The experimental results show that `reduce_mean` can improve
    # performance on the COCO dataset.
    num_pos = torch.tensor(
        sum(num_fg_imgs),
        dtype=torch.float,
        device=flatten_cls_preds.device)
    num_total_samples = max(reduce_mean(num_pos), 1.0)

    pos_masks = torch.cat(pos_masks, 0)
    cls_targets = torch.cat(cls_targets, 0)
    obj_targets = torch.cat(obj_targets, 0)
    bbox_targets = torch.cat(bbox_targets, 0)
    if self.use_l1:
        l1_targets = torch.cat(l1_targets, 0)

    loss_bbox = self.loss_bbox(
        flatten_bboxes.view(-1, 4)[pos_masks],
        bbox_targets) / num_total_samples
    loss_obj = self.loss_obj(flatten_objectness.view(-1, 1),
                             obj_targets) / num_total_samples
    loss_cls = self.loss_cls(
        flatten_cls_preds.view(-1, self.num_classes)[pos_masks],
        cls_targets) / num_total_samples

    loss_dict = dict(
        loss_cls=loss_cls, loss_bbox=loss_bbox, loss_obj=loss_obj)

    if self.use_l1:
        loss_l1 = self.loss_l1(
            flatten_bbox_preds.view(-1, 4)[pos_masks],
            l1_targets) / num_total_samples
        loss_dict.update(loss_l1=loss_l1)

    return loss_dict
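# YOLOX regresses box centers as stride-relative offsets and sizes in log
# space; `self._bbox_decode` above inverts that encoding. A minimal
# sketch of such a decode, assuming each prior carries
# (x, y, stride_w, stride_h) as produced by grid_priors(with_stride=True):
def bbox_decode(priors, bbox_preds):
    """Decode YOLOX-style (dx, dy, log w, log h) predictions into
    [tl_x, tl_y, br_x, br_y] boxes."""
    xys = bbox_preds[..., :2] * priors[:, 2:] + priors[:, :2]
    whs = bbox_preds[..., 2:].exp() * priors[:, 2:]
    tl = xys - whs / 2  # top-left corner
    br = xys + whs / 2  # bottom-right corner
    return torch.cat([tl, br], dim=-1)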
def loss(self,
         cls_scores,
         bbox_preds,
         bbox_preds_refine,
         gt_bboxes,
         gt_labels,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute loss of the head.

    Args:
        cls_scores (list[Tensor]): Box iou-aware scores for each scale
            level, each a 4D-tensor whose channel number is
            num_points * num_classes.
        bbox_preds (list[Tensor]): Box offsets for each scale level, each
            a 4D-tensor whose channel number is num_points * 4.
        bbox_preds_refine (list[Tensor]): Refined box offsets for each
            scale level, each a 4D-tensor whose channel number is
            num_points * 4.
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): Class indices corresponding to each box.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        gt_bboxes_ignore (None | list[Tensor]): Bounding boxes that can be
            ignored when computing the loss. Default: None.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine)
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
                                       bbox_preds[0].device)
    labels, label_weights, bbox_targets, bbox_weights = self.get_targets(
        cls_scores, all_level_points, gt_bboxes, gt_labels, img_metas,
        gt_bboxes_ignore)

    num_imgs = cls_scores[0].size(0)
    # flatten cls_scores, bbox_preds and bbox_preds_refine
    flatten_cls_scores = [
        cls_score.permute(0, 2, 3,
                          1).reshape(-1,
                                     self.cls_out_channels).contiguous()
        for cls_score in cls_scores
    ]
    flatten_bbox_preds = [
        bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4).contiguous()
        for bbox_pred in bbox_preds
    ]
    flatten_bbox_preds_refine = [
        bbox_pred_refine.permute(0, 2, 3, 1).reshape(-1, 4).contiguous()
        for bbox_pred_refine in bbox_preds_refine
    ]
    flatten_cls_scores = torch.cat(flatten_cls_scores)
    flatten_bbox_preds = torch.cat(flatten_bbox_preds)
    flatten_bbox_preds_refine = torch.cat(flatten_bbox_preds_refine)
    flatten_labels = torch.cat(labels)
    flatten_bbox_targets = torch.cat(bbox_targets)
    # repeat points to align with bbox_preds
    flatten_points = torch.cat(
        [points.repeat(num_imgs, 1) for points in all_level_points])

    # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
    bg_class_ind = self.num_classes
    pos_inds = torch.where(
        ((flatten_labels >= 0) & (flatten_labels < bg_class_ind)) > 0)[0]
    num_pos = len(pos_inds)

    pos_bbox_preds = flatten_bbox_preds[pos_inds]
    pos_bbox_preds_refine = flatten_bbox_preds_refine[pos_inds]
    pos_labels = flatten_labels[pos_inds]

    # sync num_pos across all gpus
    if self.sync_num_pos:
        num_pos_avg_per_gpu = reduce_mean(
            pos_inds.new_tensor(num_pos).float()).item()
        num_pos_avg_per_gpu = max(num_pos_avg_per_gpu, 1.0)
    else:
        num_pos_avg_per_gpu = num_pos

    pos_bbox_targets = flatten_bbox_targets[pos_inds]
    pos_points = flatten_points[pos_inds]

    pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds)
    pos_decoded_target_preds = distance2bbox(pos_points, pos_bbox_targets)
    iou_targets_ini = bbox_overlaps(
        pos_decoded_bbox_preds,
        pos_decoded_target_preds.detach(),
        is_aligned=True).clamp(min=1e-6)
    bbox_weights_ini = iou_targets_ini.clone().detach()
    iou_targets_ini_avg_per_gpu = reduce_mean(
        bbox_weights_ini.sum()).item()
    bbox_avg_factor_ini = max(iou_targets_ini_avg_per_gpu, 1.0)

    if num_pos > 0:
        loss_bbox = self.loss_bbox(
            pos_decoded_bbox_preds,
            pos_decoded_target_preds.detach(),
            weight=bbox_weights_ini,
            avg_factor=bbox_avg_factor_ini)

        pos_decoded_bbox_preds_refine = \
            distance2bbox(pos_points, pos_bbox_preds_refine)
        iou_targets_rf = bbox_overlaps(
            pos_decoded_bbox_preds_refine,
            pos_decoded_target_preds.detach(),
            is_aligned=True).clamp(min=1e-6)
        bbox_weights_rf = iou_targets_rf.clone().detach()
        iou_targets_rf_avg_per_gpu = reduce_mean(
            bbox_weights_rf.sum()).item()
        bbox_avg_factor_rf = max(iou_targets_rf_avg_per_gpu, 1.0)
        loss_bbox_refine = self.loss_bbox_refine(
            pos_decoded_bbox_preds_refine,
            pos_decoded_target_preds.detach(),
            weight=bbox_weights_rf,
            avg_factor=bbox_avg_factor_rf)

        # build IoU-aware cls_score targets
        if self.use_vfl:
            pos_ious = iou_targets_rf.clone().detach()
            cls_iou_targets = torch.zeros_like(flatten_cls_scores)
            cls_iou_targets[pos_inds, pos_labels] = pos_ious
    else:
        loss_bbox = pos_bbox_preds.sum() * 0
        loss_bbox_refine = pos_bbox_preds_refine.sum() * 0
        if self.use_vfl:
            cls_iou_targets = torch.zeros_like(flatten_cls_scores)

    if self.use_vfl:
        loss_cls = self.loss_cls(
            flatten_cls_scores,
            cls_iou_targets,
            avg_factor=num_pos_avg_per_gpu)
    else:
        loss_cls = self.loss_cls(
            flatten_cls_scores,
            flatten_labels,
            weight=label_weights,
            avg_factor=num_pos_avg_per_gpu)

    return dict(
        loss_cls=loss_cls,
        loss_bbox=loss_bbox,
        loss_bbox_rf=loss_bbox_refine)
def loss_single(self, cls_scores, mask_preds, gt_labels_list,
                gt_masks_list, img_metas):
    """Loss function for outputs from a single decoder layer.

    Args:
        cls_scores (Tensor): Mask score logits from a single decoder layer
            for all images, with shape
            (batch_size, num_queries, cls_out_channels). Note that
            `cls_out_channels` should include the background.
        mask_preds (Tensor): Mask logits for a pixel decoder for all
            images, with shape (batch_size, num_queries, h, w).
        gt_labels_list (list[Tensor]): Ground truth class indices for each
            image, each with shape (num_gts, ).
        gt_masks_list (list[Tensor]): Ground truth masks for each image,
            each with shape (num_gts, h, w).
        img_metas (list[dict]): List of image meta information.

    Returns:
        tuple[Tensor]: Loss components for outputs from a single decoder
            layer.
    """
    num_imgs = cls_scores.size(0)
    cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
    mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
    (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
     num_total_pos, num_total_neg) = self.get_targets(
         cls_scores_list, mask_preds_list, gt_labels_list, gt_masks_list,
         img_metas)
    # shape (batch_size, num_queries)
    labels = torch.stack(labels_list, dim=0)
    # shape (batch_size, num_queries)
    label_weights = torch.stack(label_weights_list, dim=0)
    # shape (num_total_gts, h, w)
    mask_targets = torch.cat(mask_targets_list, dim=0)
    # shape (batch_size, num_queries)
    mask_weights = torch.stack(mask_weights_list, dim=0)

    # classification loss
    # shape (batch_size * num_queries, )
    cls_scores = cls_scores.flatten(0, 1)
    labels = labels.flatten(0, 1)
    label_weights = label_weights.flatten(0, 1)

    class_weight = cls_scores.new_tensor(self.class_weight)
    loss_cls = self.loss_cls(
        cls_scores,
        labels,
        label_weights,
        avg_factor=class_weight[labels].sum())

    num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos]))
    num_total_masks = max(num_total_masks, 1)

    # extract positive ones
    # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
    mask_preds = mask_preds[mask_weights > 0]

    if mask_targets.shape[0] == 0:
        # zero match
        loss_dice = mask_preds.sum()
        loss_mask = mask_preds.sum()
        return loss_cls, loss_mask, loss_dice

    with torch.no_grad():
        points_coords = get_uncertain_point_coords_with_randomness(
            mask_preds.unsqueeze(1), None, self.num_points,
            self.oversample_ratio, self.importance_sample_ratio)
        # shape (num_total_gts, h, w) -> (num_total_gts, num_points)
        mask_point_targets = point_sample(
            mask_targets.unsqueeze(1).float(), points_coords).squeeze(1)
    # shape (num_queries, h, w) -> (num_queries, num_points)
    mask_point_preds = point_sample(
        mask_preds.unsqueeze(1), points_coords).squeeze(1)

    # dice loss
    loss_dice = self.loss_dice(
        mask_point_preds, mask_point_targets, avg_factor=num_total_masks)

    # mask loss
    # shape (num_queries, num_points) -> (num_queries * num_points, )
    mask_point_preds = mask_point_preds.reshape(-1)
    # shape (num_total_gts, num_points) -> (num_total_gts * num_points, )
    mask_point_targets = mask_point_targets.reshape(-1)
    loss_mask = self.loss_mask(
        mask_point_preds,
        mask_point_targets,
        avg_factor=num_total_masks * self.num_points)

    return loss_cls, loss_mask, loss_dice
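# In practice, a `loss_single` like the two above is evaluated once per
# decoder layer (deep supervision), with the per-layer losses logged
# separately. A hedged driver sketch, assuming `all_cls_scores` and
# `all_mask_preds` stack the per-layer predictions; mmdet's mask heads
# follow this pattern:
def loss(self, all_cls_scores, all_mask_preds, gt_labels_list,
         gt_masks_list, img_metas):
    """Map `loss_single` over decoder layers and collect named losses."""
    num_dec_layers = len(all_cls_scores)
    all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
    all_gt_masks_list = [gt_masks_list for _ in range(num_dec_layers)]
    img_metas_list = [img_metas for _ in range(num_dec_layers)]
    losses_cls, losses_mask, losses_dice = multi_apply(
        self.loss_single, all_cls_scores, all_mask_preds,
        all_gt_labels_list, all_gt_masks_list, img_metas_list)

    loss_dict = dict()
    # losses from the last decoder layer
    loss_dict['loss_cls'] = losses_cls[-1]
    loss_dict['loss_mask'] = losses_mask[-1]
    loss_dict['loss_dice'] = losses_dice[-1]
    # losses from the other decoder layers (auxiliary supervision)
    for i, (loss_cls_i, loss_mask_i, loss_dice_i) in enumerate(
            zip(losses_cls[:-1], losses_mask[:-1], losses_dice[:-1])):
        loss_dict[f'd{i}.loss_cls'] = loss_cls_i
        loss_dict[f'd{i}.loss_mask'] = loss_mask_i
        loss_dict[f'd{i}.loss_dice'] = loss_dice_i
    return loss_dict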
def loss(self,
         cls_scores,
         bbox_preds,
         gt_bboxes,
         gt_labels,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute losses of the head.

    Args:
        cls_scores (list[Tensor]): Box scores for each scale level with
            shape (N, num_anchors * num_classes, H, W).
        bbox_preds (list[Tensor]): Decoded boxes for each scale level with
            shape (N, num_anchors * 4, H, W) in
            [tl_x, tl_y, br_x, br_y] format.
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): Class indices corresponding to each box.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        gt_bboxes_ignore (list[Tensor] | None): Bounding boxes that can be
            ignored when computing the loss.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    num_imgs = len(img_metas)
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    assert len(featmap_sizes) == self.prior_generator.num_levels

    device = cls_scores[0].device
    anchor_list, valid_flag_list = self.get_anchors(
        featmap_sizes, img_metas, device=device)
    label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1

    flatten_cls_scores = torch.cat([
        cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                              self.cls_out_channels)
        for cls_score in cls_scores
    ], 1)
    flatten_bbox_preds = torch.cat([
        bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) * stride[0]
        for bbox_pred, stride in zip(bbox_preds,
                                     self.prior_generator.strides)
    ], 1)

    cls_reg_targets = self.get_targets(
        flatten_cls_scores,
        flatten_bbox_preds,
        anchor_list,
        valid_flag_list,
        gt_bboxes,
        img_metas,
        gt_bboxes_ignore_list=gt_bboxes_ignore,
        gt_labels_list=gt_labels,
        label_channels=label_channels)
    (anchor_list, labels_list, label_weights_list, bbox_targets_list,
     alignment_metrics_list) = cls_reg_targets

    losses_cls, losses_bbox, \
        cls_avg_factors, bbox_avg_factors = multi_apply(
            self.loss_single,
            anchor_list,
            cls_scores,
            bbox_preds,
            labels_list,
            label_weights_list,
            bbox_targets_list,
            alignment_metrics_list,
            self.prior_generator.strides)

    cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item()
    losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls))

    bbox_avg_factor = reduce_mean(
        sum(bbox_avg_factors)).clamp_(min=1).item()
    losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox))
    return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
def loss(self,
         cls_scores,
         bbox_preds,
         iou_preds,
         gt_bboxes,
         gt_labels,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute losses of the head.

    Args:
        cls_scores (list[Tensor]): Box scores for each scale level with
            shape (N, num_base_priors * num_classes, H, W).
        bbox_preds (list[Tensor]): Box energies / deltas for each scale
            level with shape (N, num_base_priors * 4, H, W).
        iou_preds (list[Tensor]): Score factors for all scale levels, each
            a 4D-tensor with shape (batch_size, 1, H, W).
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): Class indices corresponding to each box.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        gt_bboxes_ignore (list[Tensor] | None): Bounding boxes that can be
            ignored when computing the loss.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    assert len(featmap_sizes) == self.prior_generator.num_levels

    device = cls_scores[0].device
    anchor_list, valid_flag_list = self.get_anchors(
        featmap_sizes, img_metas, device=device)
    label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1

    # calculate common vars for cls and reg assigners at once
    targets_com = self.process_predictions_and_anchors(
        anchor_list, valid_flag_list, cls_scores, bbox_preds, img_metas,
        gt_bboxes_ignore)
    (anchor_list, valid_flag_list, num_level_anchors_list, cls_score_list,
     bbox_pred_list, gt_bboxes_ignore_list) = targets_com

    # classification branch assigner
    cls_targets = self.get_cls_targets(
        anchor_list,
        valid_flag_list,
        num_level_anchors_list,
        cls_score_list,
        bbox_pred_list,
        gt_bboxes,
        img_metas,
        gt_bboxes_ignore_list=gt_bboxes_ignore_list,
        gt_labels_list=gt_labels,
        label_channels=label_channels)
    if cls_targets is None:
        return None

    (cls_anchor_list, labels_list, label_weights_list, bbox_targets_list,
     bbox_weights_list, num_total_pos, num_total_neg) = cls_targets

    num_total_samples = reduce_mean(
        torch.tensor(num_total_pos, dtype=torch.float,
                     device=device)).item()
    num_total_samples = max(num_total_samples, 1.0)

    reweight_factor_per_level = self.calc_reweight_factor(labels_list)

    cls_losses_cls, = multi_apply(
        self.loss_cls_single,
        cls_scores,
        labels_list,
        label_weights_list,
        reweight_factor_per_level,
        num_total_samples=num_total_samples)

    # regression branch assigner
    reg_targets = self.get_reg_targets(
        anchor_list,
        valid_flag_list,
        num_level_anchors_list,
        cls_score_list,
        bbox_pred_list,
        gt_bboxes,
        img_metas,
        gt_bboxes_ignore_list=gt_bboxes_ignore_list,
        gt_labels_list=gt_labels,
        label_channels=label_channels)
    if reg_targets is None:
        return None

    (reg_anchor_list, labels_list, label_weights_list, bbox_targets_list,
     bbox_weights_list, num_total_pos, num_total_neg) = reg_targets

    num_total_samples = reduce_mean(
        torch.tensor(num_total_pos, dtype=torch.float,
                     device=device)).item()
    num_total_samples = max(num_total_samples, 1.0)

    reweight_factor_per_level = self.calc_reweight_factor(labels_list)

    reg_losses_bbox, reg_losses_iou = multi_apply(
        self.loss_reg_single,
        reg_anchor_list,
        bbox_preds,
        iou_preds,
        labels_list,
        label_weights_list,
        bbox_targets_list,
        bbox_weights_list,
        reweight_factor_per_level,
        num_total_samples=num_total_samples)

    return dict(
        loss_cls=cls_losses_cls,
        loss_bbox=reg_losses_bbox,
        loss_iou=reg_losses_iou)
def loss(self,
         cls_scores,
         bbox_preds,
         iou_preds,
         gt_bboxes,
         gt_labels,
         img_metas,
         gt_bboxes_ignore=None):
    """Compute losses of the head.

    Args:
        cls_scores (list[Tensor]): Box scores for each scale level with
            shape (N, num_anchors * num_classes, H, W).
        bbox_preds (list[Tensor]): Box energies / deltas for each scale
            level with shape (N, num_anchors * 4, H, W).
        iou_preds (list[Tensor]): IoU predictions for each scale level
            with shape (N, num_anchors * 1, H, W).
        gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
            shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
        gt_labels (list[Tensor]): Class indices corresponding to each box.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        gt_bboxes_ignore (None | list[Tensor]): Bounding boxes that can be
            ignored when computing the loss. Default: None.

    Returns:
        dict[str, Tensor]: A dictionary of loss components.
    """
    featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
    assert len(featmap_sizes) == self.anchor_generator.num_levels

    device = cls_scores[0].device
    anchor_list, valid_flag_list = self.get_anchors(
        featmap_sizes, img_metas, device=device)
    label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
    cls_reg_targets = self.get_targets(
        anchor_list,
        valid_flag_list,
        gt_bboxes,
        img_metas,
        gt_bboxes_ignore_list=gt_bboxes_ignore,
        gt_labels_list=gt_labels,
        label_channels=label_channels)
    if cls_reg_targets is None:
        return None
    (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
     num_total_pos, num_total_neg) = cls_reg_targets

    # NOTE: the reason for this normalizer is unclear; presumably it
    # avoids having zero samples.
    # num_total_samples = (
    #     num_total_pos + num_total_neg if self.sampling else num_total_pos)
    num_total_samples = reduce_mean(
        torch.tensor(1. * num_total_pos).cuda()).item()
    num_total_samples = max(num_total_samples, 1.0)

    # anchor number of multi levels
    num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
    # concat all level anchors and flags to a single tensor
    concat_anchor_list = []
    for i in range(len(anchor_list)):
        concat_anchor_list.append(torch.cat(anchor_list[i]))
    all_anchor_list = images_to_levels(concat_anchor_list,
                                       num_level_anchors)

    losses_cls, losses_bbox, losses_iou = multi_apply(
        self.loss_single,
        cls_scores,
        bbox_preds,
        iou_preds,
        all_anchor_list,
        labels_list,
        label_weights_list,
        bbox_targets_list,
        bbox_weights_list,
        num_total_samples=num_total_samples)
    return dict(
        loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou)
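# `images_to_levels` above is the inverse regrouping of
# `levels_to_images`: it stacks per-image targets along a batch dimension
# and splits them back into per-level tensors using each level's anchor
# count. A minimal sketch; mmdet provides an equivalent:
def images_to_levels(target, num_levels):
    """Convert per-image targets [(num_anchors, ...), ...] into a list of
    per-level tensors, each of shape (N, anchors_in_level, ...)."""
    target = torch.stack(target, 0)
    level_targets = []
    start = 0
    for n in num_levels:
        end = start + n
        level_targets.append(target[:, start:end])
        start = end
    return level_targets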