def test_smooth_l1_loss(self) -> None:
    inputs = torch.tensor([1, 2, 3], dtype=torch.float32)
    targets = torch.tensor([1.1, 2, 4.5], dtype=torch.float32)
    beta = 0.5
    loss = smooth_l1_loss(inputs, targets, beta=beta, reduction="none").numpy()
    # |x| < beta -> quadratic branch; |x| >= beta -> linear branch
    self.assertTrue(
        np.allclose(loss, [0.5 * 0.1 ** 2 / beta, 0, 1.5 - 0.5 * beta]))
    beta = 0.05
    loss = smooth_l1_loss(inputs, targets, beta=beta, reduction="none").numpy()
    self.assertTrue(
        np.allclose(loss, [0.1 - 0.5 * beta, 0, 1.5 - 0.5 * beta]))
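# For reference, a minimal elementwise smooth-L1 sketch consistent with the
# values asserted in the test above (it mirrors the fvcore/detectron2
# definition, but is written here only as an illustration, not the project's
# actual import):
import torch

def smooth_l1_loss_sketch(input: torch.Tensor, target: torch.Tensor,
                          beta: float, reduction: str = "none") -> torch.Tensor:
    if beta < 1e-5:
        # Pure L1: avoids the divide-by-beta branch when beta is (near-)zero.
        loss = torch.abs(input - target)
    else:
        n = torch.abs(input - target)
        # Quadratic below beta, linear above; the branches meet at |x| = beta.
        loss = torch.where(n < beta, 0.5 * n ** 2 / beta, n - 0.5 * beta)
    if reduction == "mean":
        # Guard against torch.mean on empty inputs, which would yield NaN.
        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
    elif reduction == "sum":
        loss = loss.sum()
    return loss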
def smooth_l1_loss(self):
    if self._no_instances:
        return 0.0 * self.pred_proposal_deltas.sum()
    gt_proposal_deltas = self.box2box_transform.get_deltas(
        self.proposals.tensor, self.gt_boxes.tensor)
    box_dim = gt_proposal_deltas.size(1)  # 4 or 5
    cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
    device = self.pred_proposal_deltas.device
    bg_class_ind = self.pred_class_logits.shape[1] - 1
    fg_inds = nonzero_tuple((self.gt_classes >= 0)
                            & (self.gt_classes < bg_class_ind))[0]
    if cls_agnostic_bbox_reg:
        # pred_proposal_deltas only corresponds to foreground class for agnostic
        gt_class_cols = torch.arange(box_dim, device=device)
    else:
        fg_gt_classes = self.gt_classes[fg_inds]
        gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
            box_dim, device=device)
    loss_box_reg = smooth_l1_loss(
        self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
        gt_proposal_deltas[fg_inds],
        self.smooth_l1_beta,
        reduction="sum",
    )
    loss_box_reg = loss_box_reg / self.gt_classes.numel()
    return loss_box_reg
def smooth_l1_loss_vp_residual(self):
    """
    Compute the smooth L1 loss for viewpoint regression.

    Returns:
        scalar Tensor
    """
    gt_vp_deltas = self.get_vp_deltas()
    device = self.pred_proposal_deltas.device
    bg_class_ind = self.pred_class_logits.shape[1] - 1
    fg_inds = torch.nonzero((self.gt_classes >= 0)
                            & (self.gt_classes < bg_class_ind)).squeeze(1)
    fg_gt_classes = self.gt_classes[fg_inds]
    # Residual logits for class k are laid out in blocks of vp_bins columns;
    # pick the bin column matching each foreground proposal's gt viewpoint.
    res_index_list = []
    for idx in range(len(fg_inds)):
        res_index_list.append(fg_gt_classes[idx] * self.vp_bins
                              + self.gt_viewpoint[fg_inds][idx])
    loss_box_reg = smooth_l1_loss(
        self.viewpoint_res_logits[fg_inds, res_index_list],
        gt_vp_deltas[fg_inds],
        self.smooth_l1_beta,
        reduction="sum",
    )
    loss_box_reg = loss_box_reg / self.gt_classes.numel()
    return loss_box_reg
def smooth_l1_loss_height(self):
    """
    Compute the smooth L1 loss for height regression.

    Returns:
        scalar Tensor
    """
    gt_height_deltas = self.get_h_deltas()  # (dh, dz)
    box_dim = gt_height_deltas.size(1)
    device = self.pred_proposal_deltas.device
    bg_class_ind = self.pred_class_logits.shape[1] - 1
    fg_inds = torch.nonzero((self.gt_classes >= 0)
                            & (self.gt_classes < bg_class_ind)).squeeze(1)
    fg_gt_classes = self.gt_classes[fg_inds]
    # 2 columns per class: select the (dh, dz) block for each gt class
    gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
        box_dim, device=device)
    loss_box_reg = smooth_l1_loss(
        self.height_logits[fg_inds[:, None], gt_class_cols],
        gt_height_deltas[fg_inds],
        self.smooth_l1_beta,
        reduction="sum",
    )
    # The loss is normalized as in the box delta regression task
    loss_box_reg = loss_box_reg / self.gt_classes.numel()
    return loss_box_reg
def smooth_l1_loss(self, gt_classes, gt_anchors_deltas, pred_anchor_deltas):
    """
    Compute the smooth L1 loss for box regression.

    Returns:
        scalar Tensor
    """
    box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]
    pred_anchor_deltas = cat(box_delta_flattened, dim=1).reshape(-1, 4)
    # shapes: (N x R, 4)
    gt_classes = gt_classes.flatten()
    gt_anchors_deltas = gt_anchors_deltas.view(-1, 4)
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum()
    loss_box_reg = smooth_l1_loss(
        pred_anchor_deltas[foreground_idxs],
        gt_anchors_deltas[foreground_idxs],
        beta=self.smooth_l1_loss_beta,
        reduction="sum",
    ) / max(1, num_foreground)
    return loss_box_reg
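# The permute_to_N_HWA_K helper referenced above reshapes a per-level head
# output (N, A*K, H, W) into (N, H*W*A, K). A sketch of what it is assumed to
# do (matching the detectron2 utility of the same name):
import torch

def permute_to_N_HWA_K(tensor: torch.Tensor, K: int) -> torch.Tensor:
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)    # (N, A, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)  # (N, H, W, A, K)
    return tensor.reshape(N, -1, K)         # (N, H*W*A, K)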
def losses(
    self,
    anchors,
    pred_objectness_logits: List[torch.Tensor],
    gt_labels: List[torch.Tensor],
    pred_anchor_deltas: List[torch.Tensor],
    gt_boxes,
):
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))
    anchors = type(anchors[0]).cat(anchors).tensor  # Ax4
    gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
    gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4)
    pos_mask = gt_labels == 1
    localization_loss = smooth_l1_loss(
        cat(pred_anchor_deltas, dim=1)[pos_mask],
        gt_anchor_deltas[pos_mask],
        self.smooth_l1_beta,
        reduction="sum",
    )
    valid_mask = gt_labels >= 0
    objectness_loss = F.binary_cross_entropy_with_logits(
        cat(pred_objectness_logits, dim=1)[valid_mask],
        gt_labels[valid_mask].to(torch.float32),
        reduction="sum",
    )
    normalizer = self.batch_size_per_image * num_images
    return {
        "loss_rpn_cls": objectness_loss / normalizer,
        "loss_rpn_loc": localization_loss / normalizer,
    }
def rpn_losses(gt_labels, gt_anchor_deltas, pred_objectness_logits,
               pred_anchor_deltas, smooth_l1_beta):
    """
    Args:
        gt_labels (Tensor): shape (N,), each element in {-1, 0, 1} representing
            ground-truth objectness labels with: -1 = ignore; 0 = not object;
            1 = object.
        gt_anchor_deltas (Tensor): shape (N, box_dim), row i represents
            ground-truth box2box transform targets (dx, dy, dw, dh) or
            (dx, dy, dw, dh, da) that map anchor i to its matched ground-truth box.
        pred_objectness_logits (Tensor): shape (N,), each element is a predicted
            objectness logit.
        pred_anchor_deltas (Tensor): shape (N, box_dim), each row is a predicted
            box2box transform (dx, dy, dw, dh) or (dx, dy, dw, dh, da).
        smooth_l1_beta (float): The transition point between L1 and L2 loss in
            the smooth L1 loss function. When set to 0, the loss becomes L1.
            When set to +inf, the loss becomes constant 0.

    Returns:
        objectness_loss, localization_loss, both unnormalized (summed over samples).
    """
    pos_masks = gt_labels == 1
    localization_loss = smooth_l1_loss(pred_anchor_deltas[pos_masks],
                                       gt_anchor_deltas[pos_masks],
                                       smooth_l1_beta,
                                       reduction="sum")
    valid_masks = gt_labels >= 0
    objectness_loss = F.binary_cross_entropy_with_logits(
        pred_objectness_logits[valid_masks],
        gt_labels[valid_masks].to(torch.float32),
        reduction="sum",
    )
    return objectness_loss, localization_loss
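# A toy invocation of rpn_losses with hand-built tensors (illustrative only;
# assumes torch and torch.nn.functional as F are imported, and a smooth_l1_loss
# like the sketch earlier in this file):
import torch
import torch.nn.functional as F

gt_labels = torch.tensor([1, 0, -1, 1])          # 2 positives, 1 negative, 1 ignored
gt_anchor_deltas = torch.zeros(4, 4)
pred_objectness_logits = torch.tensor([2.0, -1.5, 0.3, 0.8])
pred_anchor_deltas = torch.full((4, 4), 0.1, requires_grad=True)

obj_loss, loc_loss = rpn_losses(gt_labels, gt_anchor_deltas,
                                pred_objectness_logits, pred_anchor_deltas,
                                smooth_l1_beta=0.0)
# Both losses are sums; the caller is expected to normalize them, e.g. by
# batch_size_per_image * num_images as the RPN `losses` methods above do.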
def losses(self, strides):
    # (N, X)
    pred_objectness_logits = torch.cat(
        [p.view(p.size(0), -1) for p in self.pred_objectness_logits], dim=1)
    # (N, 4, X)
    pred_bboxes = torch.cat(
        [p.view(p.size(0), 4, -1) for p in self.pred_bboxes], dim=2)
    pos_masks = self.gt_labels > 0
    pos_count = pos_masks.sum()
    neg_masks = ~pos_masks
    # Hard negative mining: keep at most 3 negatives per positive
    neg_count = torch.min(neg_masks.sum(), pos_count * 3).item()
    cls_loss = sigmoid_focal_loss_jit(
        pred_objectness_logits, self.gt_labels, alpha=0.25, reduction="none")
    neg_cls_loss, _ = cls_loss[neg_masks].topk(neg_count)
    cls_loss = cls_loss[pos_masks].mean() + neg_cls_loss.mean()
    # (N, X, 4), normalized by stride so all levels regress at a similar scale
    pred_bboxes = pred_bboxes.permute(0, 2, 1) / strides[None, :, None] / 4
    gt_bboxes = self.gt_boxes / strides[None, :, None] / 4
    localization_loss = smooth_l1_loss(
        pred_bboxes[pos_masks], gt_bboxes[pos_masks], 0.11, reduction="mean")
    return {
        "cls_loss": cls_loss,
        "localization_loss": localization_loss,
    }
def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
    """
    Args:
        All boxes are tensors with the same shape Rx(4 or 5).
        gt_classes is a long tensor of shape R, the gt class label of each proposal.
        R shall be the number of proposals.
    """
    box_dim = proposal_boxes.shape[1]  # 4 or 5
    # Regression loss is only computed for foreground proposals (those matched to a GT)
    fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
    if pred_deltas.shape[1] == box_dim:  # cls-agnostic regression
        fg_pred_deltas = pred_deltas[fg_inds]
    else:
        fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
            fg_inds, gt_classes[fg_inds]]

    if self.box_reg_loss_type == "smooth_l1":
        gt_pred_deltas = self.box2box_transform.get_deltas(
            proposal_boxes[fg_inds],
            gt_boxes[fg_inds],
        )
        loss_box_reg = smooth_l1_loss(fg_pred_deltas,
                                      gt_pred_deltas,
                                      self.smooth_l1_beta,
                                      reduction="sum")
    elif self.box_reg_loss_type == "giou":
        fg_pred_boxes = self.box2box_transform.apply_deltas(
            fg_pred_deltas, proposal_boxes[fg_inds])
        loss_box_reg = giou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum")
    elif self.box_reg_loss_type == "diou":
        fg_pred_boxes = self.box2box_transform.apply_deltas(
            fg_pred_deltas, proposal_boxes[fg_inds])
        loss_box_reg = diou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum")
    elif self.box_reg_loss_type == "ciou":
        fg_pred_boxes = self.box2box_transform.apply_deltas(
            fg_pred_deltas, proposal_boxes[fg_inds])
        loss_box_reg = ciou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum")
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

    # The reg loss is normalized using the total number of regions (R), not the number
    # of foreground regions even though the box regression loss is only defined on
    # foreground regions. Why? Because doing so gives equal training influence to
    # each foreground example. To see how, consider two different minibatches:
    #  (1) Contains a single foreground region
    #  (2) Contains 100 foreground regions
    # If we normalize by the number of foreground regions, the single example in
    # minibatch (1) will be given 100 times as much influence as each foreground
    # example in minibatch (2). Normalizing by the total number of regions, R,
    # means that the single example in minibatch (1) and each of the 100 examples
    # in minibatch (2) are given equal influence.
    return loss_box_reg / max(gt_classes.numel(), 1.0)  # return 0 if empty
def losses(self, anchors, pred_objectness_logits, gt_labels,
           pred_anchor_deltas, gt_boxes, loss_weights=None):
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))

    # Log the number of positive/negative anchors per-image that's used in training
    pos_mask = gt_labels == 1
    num_pos_anchors = pos_mask.sum().item()
    num_neg_anchors = (gt_labels == 0).sum().item()
    storage = get_event_storage()
    storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
    storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)

    reduction = "sum" if loss_weights is None else "none"
    if self.box_reg_loss_type == "smooth_l1":
        anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
        gt_anchor_deltas = [
            self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes
        ]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
        localization_loss = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            self.smooth_l1_beta,
            reduction=reduction,
        )
    elif self.box_reg_loss_type == "giou":
        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
        pred_proposals = cat(pred_proposals, dim=1)
        pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
        pos_mask = pos_mask.view(-1)
        localization_loss = giou_loss(pred_proposals[pos_mask],
                                      cat(gt_boxes)[pos_mask],
                                      reduction=reduction)
    else:
        raise ValueError(f"Invalid rpn box reg loss type '{self.box_reg_loss_type}'")

    valid_mask = gt_labels >= 0
    objectness_loss = F.binary_cross_entropy_with_logits(
        cat(pred_objectness_logits, dim=1)[valid_mask],
        gt_labels[valid_mask].to(torch.float32),
        reduction=reduction,
    )
    normalizer = self.batch_size_per_image * num_images
    losses = {
        "loss_rpn_cls": objectness_loss / normalizer,
        "loss_rpn_loc": localization_loss / normalizer,
    }
    losses = {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
    return losses
def losses(
    self,
    anchors,
    pred_objectness_logits: List[torch.Tensor],
    gt_labels: List[torch.Tensor],
    pred_anchor_deltas: List[torch.Tensor],
    gt_boxes,
):
    """
    Return the losses from a set of RPN predictions and their associated
    ground-truth.

    Args:
        anchors (list[Boxes or RotatedBoxes]): anchors for each feature map,
            each has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
        pred_objectness_logits (list[Tensor]): A list of L elements. Element i
            is a tensor of shape (N, Hi*Wi*A) representing the predicted
            objectness logits for all anchors.
        gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
        pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is
            a tensor of shape (N, Hi*Wi*A, 4 or 5) representing the predicted
            "deltas" used to transform anchors to proposals.
        gt_boxes (list[Boxes or RotatedBoxes]): Output of
            :meth:`label_and_sample_anchors`.

    Returns:
        dict[loss name -> loss value]: A dict mapping from loss name to loss
            value. Loss names are: `loss_rpn_cls` for objectness classification
            and `loss_rpn_loc` for proposal localization.
    """
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))
    anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
    gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
    gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)

    # Log the number of positive/negative anchors per-image that's used in training
    pos_mask = gt_labels == 1
    num_pos_anchors = pos_mask.sum().item()
    num_neg_anchors = (gt_labels == 0).sum().item()
    storage = get_event_storage()
    storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
    storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)

    localization_loss = smooth_l1_loss(
        cat(pred_anchor_deltas, dim=1)[pos_mask],
        gt_anchor_deltas[pos_mask],
        self.smooth_l1_beta,
        reduction="sum",
    )
    valid_mask = gt_labels >= 0
    objectness_loss = F.binary_cross_entropy_with_logits(
        cat(pred_objectness_logits, dim=1)[valid_mask],
        gt_labels[valid_mask].to(torch.float32),
        reduction="sum",
    )
    normalizer = self.batch_size_per_image * num_images
    return {
        "loss_rpn_cls": objectness_loss / normalizer,
        "loss_rpn_loc": localization_loss / normalizer,
    }
def smooth_l1_loss(self):
    """
    Compute the smooth L1 loss for box regression.

    Returns:
        scalar Tensor
    """
    if self._no_instances:
        return 0.0 * self.pred_proposal_deltas.sum()
    gt_proposal_deltas = self.box2box_transform.get_deltas(
        self.proposals.tensor, self.gt_boxes.tensor
    )
    box_dim = gt_proposal_deltas.size(1)  # 4 or 5
    cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
    device = self.pred_proposal_deltas.device

    bg_class_ind = self.pred_class_logits.shape[1] - 1

    # Box delta loss is only computed between the prediction for the gt class k
    # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
    # for non-gt classes and background.
    # Empty fg_inds produces a valid loss of zero as long as the size_average
    # arg to smooth_l1_loss is False (otherwise it uses torch.mean internally
    # and would produce a nan loss).
    fg_inds = torch.nonzero(
        (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind), as_tuple=True
    )[0]
    if cls_agnostic_bbox_reg:
        # pred_proposal_deltas only corresponds to foreground class for agnostic
        gt_class_cols = torch.arange(box_dim, device=device)
    else:
        fg_gt_classes = self.gt_classes[fg_inds]
        # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
        # where b is the dimension of box representation (4 or 5)
        # Note that compared to Detectron1,
        # we do not perform bounding box regression for background classes.
        gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(box_dim, device=device)

    loss_box_reg = smooth_l1_loss(
        self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
        gt_proposal_deltas[fg_inds],
        self.smooth_l1_beta,
        reduction="sum",
    )
    # The loss is normalized using the total number of regions (R), not the number
    # of foreground regions even though the box regression loss is only defined on
    # foreground regions. Why? Because doing so gives equal training influence to
    # each foreground example. To see how, consider two different minibatches:
    #  (1) Contains a single foreground region
    #  (2) Contains 100 foreground regions
    # If we normalize by the number of foreground regions, the single example in
    # minibatch (1) will be given 100 times as much influence as each foreground
    # example in minibatch (2). Normalizing by the total number of regions, R,
    # means that the single example in minibatch (1) and each of the 100 examples
    # in minibatch (2) are given equal influence.
    loss_box_reg = loss_box_reg / self.gt_classes.numel()
    return loss_box_reg
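# A small numeric illustration of the normalization argument in the comment
# above (illustrative numbers only): with a per-foreground-example loss of
# ~1.0 and R = 512 sampled regions, dividing by R keeps each foreground
# example's contribution identical across minibatches with very different
# foreground counts.
R = 512
per_fg_loss = 1.0
for num_fg in (1, 100):
    loss_sum = per_fg_loss * num_fg
    print(num_fg, loss_sum / R)  # contribution per fg example: 1/512 either way
    # whereas loss_sum / num_fg would be 1.0 in both cases, giving the lone
    # example of the first minibatch 100x the gradient weight per example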
def losses(self, pred_logits, pred_init_boxes, gt_init_bboxes,
           gt_cls: torch.Tensor, strides):
    """
    Loss computation.

    Args:
        pred_logits: (N, X, C). Classification prediction, where X is the number
            of positions from all feature levels, C is the number of object classes.
        pred_init_boxes: (N, X, 4). Initial box prediction.
        gt_init_bboxes: (N, X, 4). Ground truth for the initial boxes.
        gt_cls: (N, X), Long. GT for box classification; -1 indicates ignoring.
        strides: (X,). Scale factor at each position.

    Returns:
        dict[str, Tensor]: mapping from a named loss to a scalar tensor storing
            the loss. Used during training only. The dict keys are: "loss_cls"
            and "loss_localization".
    """
    valid_idxs = gt_cls >= 0
    foreground_idxs = valid_idxs.logical_and(gt_cls != self.num_classes)
    num_foreground = foreground_idxs.sum().item() / gt_init_bboxes.shape[0]
    get_event_storage().put_scalar("num_foreground", num_foreground)

    gt_cls_target = torch.zeros_like(pred_logits)
    gt_cls_target[foreground_idxs, gt_cls[foreground_idxs]] = 1

    self.loss_normalizer = (
        self.loss_normalizer_momentum * self.loss_normalizer
        + (1 - self.loss_normalizer_momentum) * num_foreground)

    loss_cls = sigmoid_focal_loss_jit(
        pred_logits[valid_idxs],
        gt_cls_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum") / max(1, self.loss_normalizer)

    strides = strides[None].repeat(pred_logits.shape[0], 1)
    # Normalize coordinates by 4x the stride so all levels regress at a similar scale
    coords_norm_init = strides[foreground_idxs].unsqueeze(-1) * 4
    loss_localization_init = smooth_l1_loss(
        pred_init_boxes[foreground_idxs] / coords_norm_init,
        gt_init_bboxes[foreground_idxs] / coords_norm_init,
        0.11, reduction='sum') / max(1, self.loss_normalizer)

    # A refinement-stage localization loss of the same form could be added here:
    # coords_norm = strides[foreground_idxs].unsqueeze(-1) * 4
    # loss_localization = smooth_l1_loss(
    #     pred_boxes[foreground_idxs] / coords_norm,
    #     gt_bboxes[foreground_idxs] / coords_norm,
    #     0.11, reduction="sum") / max(1, self.loss_normalizer)

    return {
        "loss_cls": loss_cls,
        "loss_localization": loss_localization_init
    }
def test_empty_inputs(self) -> None:
    inputs = torch.empty([0, 10], dtype=torch.float32).requires_grad_()
    targets = torch.empty([0, 10], dtype=torch.float32)
    loss = smooth_l1_loss(inputs, targets, beta=0.5, reduction="mean")
    loss.backward()
    self.assertEqual(loss.detach().numpy(), 0.0)
    self.assertIsNotNone(inputs.grad)
def _dense_box_regression_loss(
    anchors: List[Boxes],
    box2box_transform: Box2BoxTransform,
    pred_anchor_deltas: List[torch.Tensor],
    gt_boxes: List[torch.Tensor],
    fg_mask: torch.Tensor,
    box_reg_loss_type="smooth_l1",
    smooth_l1_beta=0.0,
):
    """
    Compute loss for dense multi-level box regression.
    Loss is accumulated over ``fg_mask``.

    Args:
        anchors: #lvl anchor boxes, each is (HixWixA, 4)
        pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4)
        gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A))
        fg_mask: the foreground boolean mask of shape (N, R) to compute loss on
        box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1",
            "giou", "diou", "ciou".
        smooth_l1_beta (float): beta parameter for the smooth L1 regression loss.
            Default to use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
    """
    anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
    if box_reg_loss_type == "smooth_l1":
        gt_anchor_deltas = [box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)
        loss_box_reg = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[fg_mask],
            gt_anchor_deltas[fg_mask],
            beta=smooth_l1_beta,
            reduction="sum",
        )
    elif box_reg_loss_type == "giou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors)
            for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = giou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    elif box_reg_loss_type == "diou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors)
            for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = diou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    elif box_reg_loss_type == "ciou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors)
            for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = ciou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    else:
        raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'")
    return loss_box_reg
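# A hedged usage sketch for _dense_box_regression_loss, assuming the detectron2
# Boxes / Box2BoxTransform types; the toy shapes below are made up (one feature
# level, N = 2 images, R = 3 anchors):
import torch
from detectron2.structures import Boxes
from detectron2.modeling.box_regression import Box2BoxTransform

anchors = [Boxes(torch.tensor([[0., 0., 10., 10.],
                               [5., 5., 15., 15.],
                               [0., 0., 4., 4.]]))]
b2b = Box2BoxTransform(weights=(1.0, 1.0, 1.0, 1.0))
pred_anchor_deltas = [torch.zeros(2, 3, 4, requires_grad=True)]  # (N, R, 4)
gt_boxes = [torch.tensor([[0., 0., 12., 9.]]).repeat(3, 1),      # image 0: (R, 4)
            torch.tensor([[1., 1., 5., 5.]]).repeat(3, 1)]       # image 1: (R, 4)
fg_mask = torch.tensor([[True, False, False],
                        [False, False, True]])                   # (N, R)

loss = _dense_box_regression_loss(anchors, b2b, pred_anchor_deltas,
                                  gt_boxes, fg_mask,
                                  box_reg_loss_type="smooth_l1",
                                  smooth_l1_beta=0.0)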
def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes):
    """
    Args:
        anchors (list[Boxes]): a list of #feature level Boxes
        gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
            Their shapes are (N, R) and (N, R, 4), respectively, where R is
            the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
        pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
            list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4).
            Where K is the number of classes used in `pred_logits`.

    Returns:
        dict[str, Tensor]: mapping from a named loss to a scalar tensor storing
            the loss. Used during training only. The dict keys are: "loss_cls"
            and "loss_box_reg"
    """
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, R)
    anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
    gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
    gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)

    valid_mask = gt_labels >= 0
    pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
    num_pos_anchors = pos_mask.sum().item()
    get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
    # EMA of the number of positives, used as the loss normalizer
    self.loss_normalizer = self.loss_normalizer_momentum * self.loss_normalizer + (
        1 - self.loss_normalizer_momentum) * max(num_pos_anchors, 1)

    # classification and regression loss
    gt_labels_target = F.one_hot(
        gt_labels[valid_mask],
        num_classes=self.num_classes + 1)[:, :-1]  # no loss for the last (background) class
    loss_cls = sigmoid_focal_loss_jit(
        cat(pred_logits, dim=1)[valid_mask],
        gt_labels_target.to(pred_logits[0].dtype),
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )
    loss_box_reg = smooth_l1_loss(
        cat(pred_anchor_deltas, dim=1)[pos_mask],
        gt_anchor_deltas[pos_mask],
        beta=self.smooth_l1_loss_beta,
        reduction="sum",
    )
    return {
        "loss_cls": loss_cls / self.loss_normalizer,
        "loss_box_reg": loss_box_reg / self.loss_normalizer,
    }
def overlap_prob_loss(self):
    loss_overlap_prob = smooth_l1_loss(
        self.pred_overlap_prob[:, 0],  # logit --> sigmoid
        self.overlap_iou,
        self.cls_box_beta,
        reduction="sum",
    )
    return loss_overlap_prob / (self.pred_overlap_prob.size(0) + 1e-6)
def box_reg_loss(self):
    """
    Deprecated
    """
    if self._no_instances:
        return 0.0 * self.pred_proposal_deltas.sum()

    box_dim = self.proposals.tensor.size(1)  # 4 or 5
    cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
    device = self.pred_proposal_deltas.device

    bg_class_ind = self.pred_class_logits.shape[1] - 1
    # Box delta loss is only computed between the prediction for the gt class k
    # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
    # for non-gt classes and background.
    # Empty fg_inds should produce a valid loss of zero because reduction=sum.
    fg_inds = nonzero_tuple((self.gt_classes >= 0)
                            & (self.gt_classes < bg_class_ind))[0]

    if cls_agnostic_bbox_reg:
        # pred_proposal_deltas only corresponds to foreground class for agnostic
        gt_class_cols = torch.arange(box_dim, device=device)
    else:
        # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
        # where b is the dimension of box representation (4 or 5)
        # Note that compared to Detectron1,
        # we do not perform bounding box regression for background classes.
        gt_class_cols = box_dim * self.gt_classes[fg_inds, None] + torch.arange(
            box_dim, device=device)

    if self.box_reg_loss_type == "smooth_l1":
        gt_proposal_deltas = self.box2box_transform.get_deltas(
            self.proposals.tensor, self.gt_boxes.tensor)
        loss_box_reg = smooth_l1_loss(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            gt_proposal_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        fg_pred_boxes = self.box2box_transform.apply_deltas(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            self.proposals.tensor[fg_inds],
        )
        loss_box_reg = giou_loss(
            fg_pred_boxes,
            self.gt_boxes.tensor[fg_inds],
            reduction="sum",
        )
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

    loss_box_reg = loss_box_reg / self.gt_classes.numel()
    return loss_box_reg
def losses(self, gt_classes, gt_anchors_deltas, pred_class_logits,
           pred_anchor_deltas):
    """
    Args:
        For `gt_classes` and `gt_anchors_deltas` parameters, see
            :meth:`RetinaNet.get_ground_truth`.
            Their shapes are (N, R) and (N, R, 4), respectively, where R is
            the total number of anchors across levels, i.e. sum(Hi x Wi x A)
        For `pred_class_logits` and `pred_anchor_deltas`, see
            :meth:`RetinaNetHead.forward`.

    Returns:
        dict[str, Tensor]: mapping from a named loss to a scalar tensor storing
            the loss. Used during training only. The dict keys are: "loss_cls"
            and "loss_box_reg"
    """
    pred_class_logits, pred_anchor_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
        pred_class_logits, pred_anchor_deltas, self.num_classes
    )  # Shapes: (N x R, K) and (N x R, 4), respectively.

    gt_classes = gt_classes.flatten()
    gt_anchors_deltas = gt_anchors_deltas.view(-1, 4)

    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum().item()
    get_event_storage().put_scalar("num_foreground", num_foreground)
    self.loss_normalizer = (
        self.loss_normalizer_momentum * self.loss_normalizer
        + (1 - self.loss_normalizer_momentum) * num_foreground)

    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

    # logits loss
    loss_cls = sigmoid_focal_loss_jit(
        pred_class_logits[valid_idxs],
        gt_classes_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / max(1, self.loss_normalizer)

    # regression loss
    loss_box_reg = smooth_l1_loss(
        pred_anchor_deltas[foreground_idxs],
        gt_anchors_deltas[foreground_idxs],
        beta=self.smooth_l1_loss_beta,
        reduction="sum",
    ) / max(1, self.loss_normalizer)

    return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}
def reg_loss(output, mask, index, target, loss_type="l1", smooth_l1_beta=0.1):
    pred = gather_feature(output, index, use_transform=True)
    mask = mask.unsqueeze(dim=2).expand_as(pred).float()
    if loss_type == "l1":
        loss = F.l1_loss(pred * mask, target * mask, reduction="sum")
    elif loss_type == "smooth_l1":
        loss = smooth_l1_loss(pred * mask, target * mask, smooth_l1_beta,
                              reduction="sum")
    else:
        raise ValueError(f"Invalid reg loss type '{loss_type}'")
    # Normalize by the number of valid entries; the epsilon guards against
    # an all-zero mask.
    loss = loss / (mask.sum() + 1e-4)
    return loss
def rpn_losses(gt_labels, gt_anchor_deltas, pred_objectness_logits,
               pred_anchor_deltas, smooth_l1_beta, cfg, box2box_transform):
    """
    Args:
        gt_labels (Tensor): shape (N,), each element in {-1, 0, 1} representing
            ground-truth objectness labels with: -1 = ignore; 0 = not object;
            1 = object.
        gt_anchor_deltas (Tensor): shape (N, box_dim), row i represents
            ground-truth box2box transform targets (dx, dy, dw, dh) or
            (dx, dy, dw, dh, da) that map anchor i to its matched ground-truth box.
        pred_objectness_logits (Tensor): shape (N,), each element is a predicted
            objectness logit.
        pred_anchor_deltas (Tensor): shape (N, box_dim), each row is a predicted
            box2box transform (dx, dy, dw, dh) or (dx, dy, dw, dh, da).
        smooth_l1_beta (float): The transition point between L1 and L2 loss in
            the smooth L1 loss function. When set to 0, the loss becomes L1.
            When set to +inf, the loss becomes constant 0.
        # Added for DIoU implementation
        cfg (configuration): Hacky way to get which loss to apply for bbox.
        box2box_transform: To get predetermined weights and scale_clamp.

    Returns:
        objectness_loss, localization_loss, both unnormalized (summed over samples).
    """
    pos_masks = gt_labels == 1
    # Will need to improve the configuration part
    reg_loss = cfg.MODEL.RPN_LOSS_TYPE
    if reg_loss == "diou":
        localization_loss = compute_diou(pos_masks, gt_anchor_deltas,
                                         pred_anchor_deltas, box2box_transform,
                                         cfg.SOLVER.IMS_PER_BATCH,
                                         cfg.MODEL.RPN_LOSS_BBOX_WEIGHT)
    else:
        localization_loss = smooth_l1_loss(pred_anchor_deltas[pos_masks],
                                           gt_anchor_deltas[pos_masks],
                                           smooth_l1_beta,
                                           reduction="sum")
    valid_masks = gt_labels >= 0
    objectness_loss = F.binary_cross_entropy_with_logits(
        pred_objectness_logits[valid_masks],
        gt_labels[valid_masks].to(torch.float32),
        reduction="sum",
    )
    return objectness_loss, localization_loss
def overlap_smooth_l1_loss(self, fg_inds):
    overlap_deltas = self.box2box_transform.get_deltas(
        self.proposals.tensor[fg_inds], self.overlap_gt_boxes.tensor)
    # Only regress overlaps whose IoU exceeds the threshold
    trained_idx = torch.nonzero(
        self.overlap_iou > self.overlap_iou_threshold).squeeze(1)
    loss_overlap_reg = smooth_l1_loss(
        self.pred_overlap_deltas[trained_idx],
        overlap_deltas[trained_idx],
        self.smooth_l1_beta,
        reduction="sum",
    )
    if self.uniform_reg_divisor:
        return loss_overlap_reg / (self.gt_classes.numel() + 1e-6)
    else:
        return (loss_overlap_reg / (trained_idx.size(0) + 1e-6)
                * self.loss_overlap_reg_coeff)
def box_reg_loss(self):
    """
    Changed _no_instances handling and normalization.
    """
    if self._no_instances:
        print('No instance in box reg loss')
        return self.pred_proposal_deltas.sum() * 0.

    box_dim = self.gt_boxes.tensor.size(1)  # 4 or 5
    cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
    device = self.pred_proposal_deltas.device
    bg_class_ind = self.pred_class_logits.shape[1] - 1
    fg_inds = nonzero_tuple((self.gt_classes >= 0)
                            & (self.gt_classes < bg_class_ind))[0]
    if cls_agnostic_bbox_reg:
        gt_class_cols = torch.arange(box_dim, device=device)
    else:
        fg_gt_classes = self.gt_classes[fg_inds]
        gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
            box_dim, device=device)

    if self.box_reg_loss_type == "smooth_l1":
        gt_proposal_deltas = self.box2box_transform.get_deltas(
            self.proposals.tensor, self.gt_boxes.tensor)
        loss_box_reg = smooth_l1_loss(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            gt_proposal_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        loss_box_reg = giou_loss(
            self._predict_boxes()[fg_inds[:, None], gt_class_cols],
            self.gt_boxes.tensor[fg_inds],
            reduction="sum",
        )
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

    if self.fix_norm_reg:
        loss_box_reg = loss_box_reg / self.box_batch_size
    else:
        loss_box_reg = loss_box_reg / self.gt_classes.numel()
    return loss_box_reg
def z_rcnn_loss(z_pred, instances, src_boxes, loss_weight=1.0, smooth_l1_beta=0.0):
    """
    Compute the z_pred loss.

    Args:
        z_pred (Tensor): A tensor of shape (B, C) or (B, 1) for class-specific
            or class-agnostic, where B is the total number of foreground regions
            in all images and C is the number of foreground classes.
        instances (list[Instances]): A list of N Instances, where N is the number
            of images in the batch. The ground-truth labels (class, box, mask, ...)
            associated with each instance are stored in fields.
        src_boxes (Tensor): A tensor of shape (B, 4); the source boxes whose
            heights normalize the ground-truth depth extent.

    Returns:
        loss (Tensor): A scalar tensor containing the loss.
    """
    cls_agnostic_z = z_pred.size(1) == 1
    total_num = z_pred.size(0)

    gt_classes = []
    gt_dz = []
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        if not cls_agnostic_z:
            gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
            gt_classes.append(gt_classes_per_image)
        gt_dz.append(instances_per_image.gt_dz)

    if len(gt_dz) == 0:
        return z_pred.sum() * 0

    gt_dz = cat(gt_dz, dim=0)
    assert gt_dz.numel() > 0
    # Regress the log-ratio of the gt depth extent to the source box height
    src_heights = src_boxes[:, 3] - src_boxes[:, 1]
    dz = torch.log(gt_dz / src_heights)

    if cls_agnostic_z:
        z_pred = z_pred[:, 0]
    else:
        indices = torch.arange(total_num)
        gt_classes = cat(gt_classes, dim=0)
        z_pred = z_pred[indices, gt_classes]

    loss_z_reg = smooth_l1_loss(z_pred, dz, smooth_l1_beta, reduction="sum")
    # Normalize by the number of foreground regions. gt_dz has the same length
    # as gt_classes in the class-specific case, and remains well-defined in the
    # class-agnostic case, where gt_classes is never populated.
    loss_z_reg = loss_weight * loss_z_reg / gt_dz.numel()
    return loss_z_reg
def plane_rcnn_loss(plane_pred, instances, loss_weight=1.0, smooth_l1_beta=0.0,
                    plane_normal_only=False):
    """
    Compute the plane_param loss.

    Args:
        plane_pred (Tensor): A tensor of shape (B, D), where B is the total
            number of foreground regions in all images and D is the plane
            parameter dimension.
        instances (list[Instances]): A list of N Instances, where N is the number
            of images in the batch. The ground-truth labels (class, box, mask, ...)
            associated with each instance are stored in fields.

    Returns:
        loss (Tensor): A scalar tensor containing the loss.
    """
    gt_param = []
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        gt_param.append(instances_per_image.gt_planes)

    if len(gt_param) == 0:
        return plane_pred.sum() * 0

    gt_param = cat(gt_param, dim=0)
    if plane_normal_only:
        # Supervise only the plane normal direction: compare unit vectors
        gt_param = F.normalize(gt_param, p=2, dim=1)
    assert len(plane_pred) > 0

    loss_plane_reg = smooth_l1_loss(plane_pred, gt_param, smooth_l1_beta,
                                    reduction="sum")
    loss_plane_reg = loss_weight * loss_plane_reg / len(plane_pred)
    return loss_plane_reg
def losses(self, init_gt_classes, init_reg_targets, refine_gt_classes,
           refine_reg_targets, pred_class_logits, pred_box_reg_init,
           pred_box_reg, pred_center_score, strides, pred_ratio):
    strides = strides.repeat(pred_class_logits[0].shape[0])  # (N*X,)
    pred_class_logits, pred_box_reg_init, pred_box_reg, pred_center_score, pred_ratio = \
        permute_and_concat(pred_class_logits, pred_box_reg_init, pred_box_reg,
                           pred_center_score, pred_ratio, self.num_classes)
    # Shapes: (N x R) and (N x R, 4), (N x R) respectively.

    init_gt_classes = init_gt_classes.flatten()
    init_reg_targets = init_reg_targets.view(-1, 4)
    init_foreground_idxs = (init_gt_classes >= 0) & (init_gt_classes != self.num_classes)
    init_pos_inds = torch.nonzero(init_foreground_idxs).squeeze(1)

    num_gpus = get_num_gpus()
    # sync num_pos from all gpus
    init_total_num_pos = reduce_sum(
        init_pos_inds.new_tensor([init_pos_inds.numel()])).item()
    init_num_pos_avg_per_gpu = max(init_total_num_pos / float(num_gpus), 1.0)

    refine_gt_classes = refine_gt_classes.flatten()
    refine_reg_targets = refine_reg_targets.view(-1, 4)
    refine_foreground_idxs = (refine_gt_classes >= 0) & (refine_gt_classes != self.num_classes)
    refine_pos_inds = torch.nonzero(refine_foreground_idxs).squeeze(1)
    # sync num_pos from all gpus
    refine_total_num_pos = reduce_sum(
        refine_pos_inds.new_tensor([refine_pos_inds.numel()])).item()
    refine_num_pos_avg_per_gpu = max(refine_total_num_pos / float(num_gpus), 1.0)

    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[refine_foreground_idxs,
                      refine_gt_classes[refine_foreground_idxs]] = 1

    # logits loss
    cls_loss = sigmoid_focal_loss_jit(
        pred_class_logits,
        gt_classes_target,
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / refine_num_pos_avg_per_gpu

    init_foreground_targets = init_reg_targets[init_foreground_idxs]
    gt_ratio_1 = (init_foreground_targets[:, 0] + init_foreground_targets[:, 2]) \
        / (init_foreground_targets[:, 1] + init_foreground_targets[:, 3])
    gt_ratio_2 = 1 / gt_ratio_1
    gt_ratios = torch.stack((gt_ratio_1, gt_ratio_2), dim=1)
    gt_ratio = gt_ratios.min(dim=1)[0]
    gt_center_score = compute_centerness_targets(
        init_reg_targets[init_foreground_idxs], gt_ratio)

    # average sum_centerness_targets from all gpus,
    # which is used to normalize centerness-weighed reg loss
    sum_centerness_targets_avg_per_gpu = \
        reduce_sum(gt_center_score.sum()).item() / float(num_gpus)
    reg_loss_init = iou_loss(
        pred_box_reg_init[init_foreground_idxs],
        init_reg_targets[init_foreground_idxs],
        gt_center_score,
        loss_type=self.iou_loss_type
    ) / sum_centerness_targets_avg_per_gpu

    coords_norm_refine = strides[refine_foreground_idxs].unsqueeze(-1) * 4
    reg_loss = smooth_l1_loss(
        pred_box_reg[refine_foreground_idxs] / coords_norm_refine,
        refine_reg_targets[refine_foreground_idxs] / coords_norm_refine,
        0.11, reduction="sum") / max(1, refine_num_pos_avg_per_gpu)
    # An IoU-based alternative for the refine stage:
    # reg_loss = iou_loss(
    #     pred_box_reg[refine_foreground_idxs], refine_reg_targets[refine_foreground_idxs], 1,
    #     loss_type=self.iou_loss_type
    # ) / sum_centerness_targets_avg_per_gpu

    centerness_loss = F.binary_cross_entropy_with_logits(
        torch.pow(torch.abs(pred_center_score[init_foreground_idxs]),
                  pred_ratio[init_foreground_idxs]),
        gt_center_score,
        reduction='sum'
    ) / init_num_pos_avg_per_gpu

    return dict(cls_loss=cls_loss, reg_loss_init=reg_loss_init,
                reg_loss=reg_loss, centerness_loss=centerness_loss)
def losses(
    self,
    gt_class_info,
    gt_delta_info,
    gt_mask_info,
    num_fg,
    pred_logits,
    pred_deltas,
    pred_masks,
):
    """
    Args:
        For `gt_class_info`, `gt_delta_info`, `gt_mask_info` and `num_fg` parameters, see
            :meth:`TensorMask.get_ground_truth`.
        For `pred_logits`, `pred_deltas` and `pred_masks`, see
            :meth:`TensorMaskHead.forward`.

    Returns:
        losses (dict[str: Tensor]): mapping from a named loss to a scalar tensor
            storing the loss. Used during training only. The potential dict keys are:
            "loss_cls", "loss_box_reg" and "loss_mask".
    """
    gt_classes_target, gt_valid_inds = gt_class_info
    gt_deltas, gt_fg_inds = gt_delta_info
    gt_masks, gt_mask_inds = gt_mask_info
    loss_normalizer = torch.tensor(max(1, num_fg), dtype=torch.float32,
                                   device=self.device)

    # classification and regression
    pred_logits, pred_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
        pred_logits, pred_deltas, self.num_classes
    )
    loss_cls = (
        sigmoid_focal_loss_star_jit(
            pred_logits[gt_valid_inds],
            gt_classes_target[gt_valid_inds],
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )
        / loss_normalizer
    )

    if num_fg == 0:
        loss_box_reg = pred_deltas.sum() * 0
    else:
        loss_box_reg = (
            smooth_l1_loss(pred_deltas[gt_fg_inds], gt_deltas, beta=0.0, reduction="sum")
            / loss_normalizer
        )
    losses = {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}

    # mask prediction
    if self.mask_on:
        loss_mask = 0
        for lvl in range(self.num_levels):
            cur_level_factor = 2 ** lvl if self.bipyramid_on else 1
            for anc in range(self.num_anchors):
                cur_gt_mask_inds = gt_mask_inds[lvl][anc]
                if cur_gt_mask_inds is None:
                    loss_mask += pred_masks[lvl][anc][0, 0, 0, 0] * 0
                else:
                    cur_mask_size = self.mask_sizes[anc] * cur_level_factor
                    # TODO maybe there are numerical issues when mask sizes are large
                    cur_size_divider = torch.tensor(
                        self.mask_loss_weight / (cur_mask_size ** 2),
                        dtype=torch.float32,
                        device=self.device,
                    )
                    cur_pred_masks = pred_masks[lvl][anc][
                        cur_gt_mask_inds[:, 0],  # N
                        :,                       # V x U
                        cur_gt_mask_inds[:, 1],  # H
                        cur_gt_mask_inds[:, 2],  # W
                    ]
                    loss_mask += F.binary_cross_entropy_with_logits(
                        cur_pred_masks.view(-1, cur_mask_size, cur_mask_size),  # V, U
                        gt_masks[lvl][anc].to(dtype=torch.float32),
                        reduction="sum",
                        weight=cur_size_divider,
                        pos_weight=self.mask_pos_weight,
                    )
        losses["loss_mask"] = loss_mask / loss_normalizer
    return losses
def fast_rcnn_losses(gt_classes, gt_proposal_deltas, pred_class_logits,
                     pred_proposal_deltas, smooth_l1_beta,
                     gt_light_direction=None, pred_light_direction=None):
    """
    When box dimension is 4:
        Computes the classification and box delta losses defined in the Fast R-CNN paper.
    When box dimension is 5:
        Computes the same losses for Fast R-CNN with rotated boxes.

    Args:
        gt_classes (Tensor): A tensor of shape (R,) storing ground-truth
            classification labels in [0, K], including K fg class and 1 bg class.
        gt_proposal_deltas (Tensor): Shape (R, box_dim), row i represents
            ground-truth box2box transform targets (dx, dy, dw, dh) or
            (dx, dy, dw, dh, da) that map object instance i to its matched
            ground-truth box.
        pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing predicted
            classification logits for the K+1-way classification problem. Each row
            corresponds to a predicted object instance.
        pred_proposal_deltas (Tensor): shape depends on whether we are doing
            cls-agnostic or cls-specific regression, and the box dimensions.
            When box_dim is 4:
            1. cls-specific: Shape (R, 4 * K), each row stores a list of
               class-specific predicted box2box transform
               [dx_0, dy_0, dw_0, dh_0, ..., dx_k, dy_k, dw_k, dh_k, ...]
               for each class k in [0, K). (No predictions for the background class.)
            2. cls-agnostic: Shape (R, 4), each row stores the class-agnostic
               (foreground) predicted box2box transform.
            When box_dim is 5:
            1. cls-specific: Shape (R, 5 * K), each row stores a list of
               class-specific predicted rotated box2box transform
               [dx_0, dy_0, dw_0, dh_0, da_0, ..., dx_k, dy_k, dw_k, dh_k, da_k, ...]
               for each class k in [0, K). (No predictions for the background class.)
            2. cls-agnostic: Shape (R, 5), each row stores the class-agnostic
               (foreground) predicted rotated box2box transform.
        smooth_l1_beta (float): The transition point between L1 and L2 loss in
            the smooth L1 loss function. When set to 0, the loss becomes L1.
            When set to +inf, the loss becomes constant 0.

    Returns:
        loss_cls, loss_box_reg, loss_light_reg (Tensor): Scalar loss values
            (loss_light_reg is None when no light-direction prediction is given).
    """
    box_dim = gt_proposal_deltas.size(1)
    cls_agnostic_bbox_reg = pred_proposal_deltas.size(1) == box_dim
    device = pred_class_logits.device

    loss_cls = F.cross_entropy(pred_class_logits, gt_classes, reduction="mean")

    bg_class_ind = pred_class_logits.shape[1] - 1
    # Box delta loss is only computed between the prediction for the gt class k
    # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
    # for non-gt classes and background.
    # Empty fg_inds produces a valid loss of zero as long as the size_average
    # arg to smooth_l1_loss is False (otherwise it uses torch.mean internally
    # and would produce a nan loss).
    fg_inds = torch.nonzero((gt_classes >= 0) & (gt_classes < bg_class_ind)).squeeze(1)
    if cls_agnostic_bbox_reg:
        # pred_proposal_deltas only corresponds to foreground class for agnostic
        gt_class_cols = torch.arange(box_dim, device=device)
    else:
        fg_gt_classes = gt_classes[fg_inds]
        # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
        # where b is the dimension of box representation (4 or 5)
        # Note that compared to Detectron1,
        # we do not perform bounding box regression for background classes.
        gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
            box_dim, device=device)

    loss_box_reg = smooth_l1_loss(
        pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
        gt_proposal_deltas[fg_inds],
        smooth_l1_beta,
        reduction="sum",
    )

    if isinstance(pred_light_direction, torch.Tensor):
        pred_light_direction_ = pred_light_direction[fg_inds[:, None], gt_class_cols]
        gt_light_direction_ = gt_light_direction[fg_inds]
        # Regress the light direction as an angle: each prediction stores the
        # endpoints (x2, y2) and (x1, y1) of a direction segment.
        x2 = pred_light_direction_[:, 0]
        y2 = pred_light_direction_[:, 1]
        x1 = pred_light_direction_[:, 2]
        y1 = pred_light_direction_[:, 3]
        pred_angle = torch.atan2(y2 - y1, x2 - x1)
        gt_angle = torch.atan2(
            gt_light_direction_[:, 3] - gt_light_direction_[:, 1],
            gt_light_direction_[:, 2] - gt_light_direction_[:, 0])
        loss_light_reg = smooth_l1_loss(
            pred_angle,
            gt_angle,
            smooth_l1_beta,
            reduction='sum',
        )
        loss_light_reg = loss_light_reg / gt_classes.numel()
    else:
        loss_light_reg = None

    # The loss is normalized using the total number of regions (R), not the number
    # of foreground regions even though the box regression loss is only defined on
    # foreground regions. Why? Because doing so gives equal training influence to
    # each foreground example. To see how, consider two different minibatches:
    #  (1) Contains a single foreground region
    #  (2) Contains 100 foreground regions
    # If we normalize by the number of foreground regions, the single example in
    # minibatch (1) will be given 100 times as much influence as each foreground
    # example in minibatch (2). Normalizing by the total number of regions, R,
    # means that the single example in minibatch (1) and each of the 100 examples
    # in minibatch (2) are given equal influence.
    loss_box_reg = loss_box_reg / gt_classes.numel()

    return loss_cls, loss_box_reg, loss_light_reg
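# A quick numeric check of the atan2-based angle target used above
# (illustrative only): a segment from (0, 0) to (1, 1) points at 45 degrees,
# and the smooth L1 between two such angles behaves like any scalar regression.
import math
import torch

seg = torch.tensor([[1.0, 1.0, 0.0, 0.0]])  # (x2, y2, x1, y1)
angle = torch.atan2(seg[:, 1] - seg[:, 3], seg[:, 0] - seg[:, 2])
assert abs(angle.item() - math.pi / 4) < 1e-6
# Note: atan2 returns values in (-pi, pi], so a plain smooth L1 between two
# angles does not account for wrap-around near +-pi.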
def losses(
    self,
    anchors: List[Boxes],
    pred_objectness_logits: List[torch.Tensor],
    gt_labels: List[torch.Tensor],
    pred_anchor_deltas: List[torch.Tensor],
    gt_boxes: List[torch.Tensor],
) -> Dict[str, torch.Tensor]:
    """
    Return the losses from a set of RPN predictions and their associated
    ground-truth.

    Args:
        anchors (list[Boxes or RotatedBoxes]): anchors for each feature map,
            each has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
        pred_objectness_logits (list[Tensor]): A list of L elements. Element i
            is a tensor of shape (N, Hi*Wi*A) representing the predicted
            objectness logits for all anchors.
        gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
        pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is
            a tensor of shape (N, Hi*Wi*A, 4 or 5) representing the predicted
            "deltas" used to transform anchors to proposals.
        gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`.

    Returns:
        dict[loss name -> loss value]: A dict mapping from loss name to loss
            value. Loss names are: `loss_rpn_cls` for objectness classification
            and `loss_rpn_loc` for proposal localization.
    """
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))

    # Log the number of positive/negative anchors per-image that's used in training
    pos_mask = gt_labels == 1
    num_pos_anchors = pos_mask.sum().item()
    num_neg_anchors = (gt_labels == 0).sum().item()
    storage = get_event_storage()
    storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
    storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)

    if self.box_reg_loss_type == "smooth_l1":
        anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
        gt_anchor_deltas = [
            self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes
        ]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
        localization_loss = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            self.smooth_l1_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
        pred_proposals = cat(pred_proposals, dim=1)
        pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
        pos_mask = pos_mask.view(-1)
        localization_loss = giou_loss(pred_proposals[pos_mask],
                                      cat(gt_boxes)[pos_mask],
                                      reduction="sum")
    elif self.box_reg_loss_type == "diou":
        anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
        gt_anchor_deltas = [
            self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes
        ]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
        localization_loss = compute_diou(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            self.box2box_transform.weights,
            self.box2box_transform.scale_clamp)
    # elif self.box_reg_loss_type == "diou_bbox":
    #     pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
    #     pred_proposals = cat(pred_proposals, dim=1)
    #     pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
    #     pos_mask = pos_mask.view(-1)
    #     localization_loss = giou_loss(
    #         pred_proposals[pos_mask], cat(gt_boxes)[pos_mask]
    #     )
    elif self.box_reg_loss_type == "diou_mmdet":
        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
        pred_proposals = cat(pred_proposals, dim=1)
        pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
        pos_mask = pos_mask.view(-1)
        localization_loss = compute_diou_mmdet(pred_proposals[pos_mask],
                                               cat(gt_boxes)[pos_mask])
    elif self.box_reg_loss_type == "ciou_mmdet":
        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
        pred_proposals = cat(pred_proposals, dim=1)
        pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
        pos_mask = pos_mask.view(-1)
        localization_loss = compute_ciou_mmdet(pred_proposals[pos_mask],
                                               cat(gt_boxes)[pos_mask])
    else:
        raise ValueError(f"Invalid rpn box reg loss type '{self.box_reg_loss_type}'")

    valid_mask = gt_labels >= 0
    objectness_loss = F.binary_cross_entropy_with_logits(
        cat(pred_objectness_logits, dim=1)[valid_mask],
        gt_labels[valid_mask].to(torch.float32),
        reduction="sum",
    )
    normalizer = self.batch_size_per_image * num_images
    losses = {
        "loss_rpn_cls": objectness_loss / normalizer,
        "loss_rpn_loc": localization_loss / normalizer,
    }
    losses = {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
    return losses
def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes):
    """
    Args:
        anchors (list[Boxes]): a list of #feature level Boxes
        gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
            Their shapes are (N, R) and (N, R, 4), respectively, where R is
            the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
        pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
            list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4).
            Where K is the number of classes used in `pred_logits`.

    Returns:
        dict[str, Tensor]: mapping from a named loss to a scalar tensor storing
            the loss. Used during training only. The dict keys are: "loss_cls"
            and "loss_box_reg"
    """
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, R)
    anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
    gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
    gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)

    valid_mask = gt_labels >= 0
    pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
    num_pos_anchors = pos_mask.sum().item()
    get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
    self.loss_normalizer = self.loss_normalizer_momentum * self.loss_normalizer + (
        1 - self.loss_normalizer_momentum
    ) * max(num_pos_anchors, 1)

    # classification loss: the stock sigmoid focal loss is replaced by a
    # class-balanced (CB) focal loss; the original version is kept for reference.
    # gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
    #     :, :-1
    # ]  # no loss for the last (background) class
    # loss_cls = sigmoid_focal_loss_jit(
    #     cat(pred_logits, dim=1)[valid_mask],
    #     gt_labels_target.to(pred_logits[0].dtype),
    #     alpha=self.focal_loss_alpha,
    #     gamma=self.focal_loss_gamma,
    #     reduction="sum",
    # )
    gt_labels_target = gt_labels[valid_mask]
    pred_logits = cat(pred_logits, dim=1)[valid_mask]
    # Count samples per class in this batch for the CB re-weighting
    unique_labels, count = torch.unique(gt_labels_target, return_counts=True)
    samples_per_cls = torch.zeros(self.num_classes + 1, dtype=torch.int64).cuda()
    samples_per_cls[unique_labels] = count
    loss_cls = CB_loss(
        gt_labels_target,
        pred_logits,
        samples_per_cls=samples_per_cls,
        no_of_classes=self.num_classes,
        loss_type="focal",
        beta=self.cb_loss_beta,
        gamma=self.focal_loss_gamma
    )

    if self.box_reg_loss_type == "smooth_l1":
        loss_box_reg = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            beta=self.smooth_l1_loss_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        pred_boxes = [
            self.box2box_transform.apply_deltas(k, anchors)
            for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = giou_loss(
            torch.stack(pred_boxes)[pos_mask], torch.stack(gt_boxes)[pos_mask],
            reduction="sum"
        )
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

    return {
        "loss_cls": loss_cls,
        "loss_box_reg": loss_box_reg / self.loss_normalizer,
    }