Example #1
    def test_smooth_l1_loss(self) -> None:
        inputs = torch.tensor([1, 2, 3], dtype=torch.float32)
        targets = torch.tensor([1.1, 2, 4.5], dtype=torch.float32)
        beta = 0.5
        loss = smooth_l1_loss(inputs, targets, beta=beta,
                              reduction="none").numpy()
        self.assertTrue(
            np.allclose(loss, [0.5 * 0.1**2 / beta, 0, 1.5 - 0.5 * beta]))

        beta = 0.05
        loss = smooth_l1_loss(inputs, targets, beta=beta,
                              reduction="none").numpy()
        self.assertTrue(
            np.allclose(loss, [0.1 - 0.5 * beta, 0, 1.5 - 0.5 * beta]))
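The assertions above encode the piecewise definition of the loss: quadratic, 0.5 * x**2 / beta, while |x| < beta, and linear, |x| - 0.5 * beta, beyond it. For reference, a minimal sketch of a smooth_l1_loss with this behavior (the examples on this page appear to use fvcore's implementation, which follows the same definition):

import torch

def smooth_l1_loss(input, target, beta: float, reduction: str = "none"):
    if beta < 1e-5:
        # A (near-)zero beta would divide by zero in the quadratic branch;
        # the loss degenerates to plain L1.
        loss = torch.abs(input - target)
    else:
        n = torch.abs(input - target)
        # Quadratic below beta, linear above; the two branches meet at |x| = beta.
        loss = torch.where(n < beta, 0.5 * n ** 2 / beta, n - 0.5 * beta)
    if reduction == "mean":
        # Guard against empty inputs: the mean of an empty tensor is NaN
        # (see the empty-input test in Example #14 below).
        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
    elif reduction == "sum":
        loss = loss.sum()
    return loss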
Example #2
    def smooth_l1_loss(self):
        if self._no_instances:
            return 0.0 * self.pred_proposal_deltas.sum()
        gt_proposal_deltas = self.box2box_transform.get_deltas(
            self.proposals.tensor, self.gt_boxes.tensor)
        box_dim = gt_proposal_deltas.size(1)  # 4 or 5
        cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
        device = self.pred_proposal_deltas.device

        bg_class_ind = self.pred_class_logits.shape[1] - 1

        fg_inds = nonzero_tuple((self.gt_classes >= 0)
                                & (self.gt_classes < bg_class_ind))[0]
        if cls_agnostic_bbox_reg:
            # pred_proposal_deltas only corresponds to foreground class for agnostic
            gt_class_cols = torch.arange(box_dim, device=device)
        else:
            fg_gt_classes = self.gt_classes[fg_inds]
            gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
                box_dim, device=device)

        loss_box_reg = smooth_l1_loss(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            gt_proposal_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction="sum",
        )
        loss_box_reg = loss_box_reg / self.gt_classes.numel()
        return loss_box_reg
Example #3
    def smooth_l1_loss_vp_residual(self):
        """
        Compute the smooth L1 loss for viewpoint regression.

        Returns:
            scalar Tensor
        """

        gt_vp_deltas = self.get_vp_deltas()
        device = self.pred_proposal_deltas.device

        bg_class_ind = self.pred_class_logits.shape[1] - 1

        fg_inds = torch.nonzero((self.gt_classes >= 0)
                                & (self.gt_classes < bg_class_ind)).squeeze(1)
        fg_gt_classes = self.gt_classes[fg_inds]
        res_index_list = list()
        for idx, logit in enumerate(self.viewpoint_res_logits[fg_inds]):
            res_index_list.append(fg_gt_classes[idx] * self.vp_bins +
                                  self.gt_viewpoint[fg_inds][idx])

        loss_box_reg = smooth_l1_loss(
            self.viewpoint_res_logits[fg_inds, res_index_list],
            gt_vp_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction="sum",
        )
        loss_box_reg = loss_box_reg / self.gt_classes.numel()
        return loss_box_reg
Example #4
    def smooth_l1_loss_height(self):
        """
        Compute the smooth L1 loss for height regression.

        Returns:
            scalar Tensor
        """
        gt_height_deltas = self.get_h_deltas()
        # dh,dz
        box_dim = gt_height_deltas.size(1)
        device = self.pred_proposal_deltas.device

        bg_class_ind = self.pred_class_logits.shape[1] - 1

        fg_inds = torch.nonzero((self.gt_classes >= 0)
                                & (self.gt_classes < bg_class_ind)).squeeze(1)
        fg_gt_classes = self.gt_classes[fg_inds]
        # 2 columns
        gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
            box_dim, device=device)
        loss_box_reg = smooth_l1_loss(
            self.height_logits[fg_inds[:, None], gt_class_cols],
            gt_height_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction="sum",
        )
        # The loss is normalized as in box delta regression task
        loss_box_reg = loss_box_reg / self.gt_classes.numel()
        return loss_box_reg
Example #5
    def smooth_l1_loss(self, gt_classes, gt_anchors_deltas,
                       pred_anchor_deltas):
        """
        Compute the smooth L1 loss for box regression.

        Returns:
            scalar Tensor
        """
        box_delta_flattened = [
            permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas
        ]
        pred_anchor_deltas = cat(box_delta_flattened, dim=1).reshape(-1, 4)
        # shapes: (N x R, 4)

        gt_classes = gt_classes.flatten()
        gt_anchors_deltas = gt_anchors_deltas.view(-1, 4)
        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
        num_foreground = foreground_idxs.sum()

        loss_box_reg = smooth_l1_loss(
            pred_anchor_deltas[foreground_idxs],
            gt_anchors_deltas[foreground_idxs],
            beta=self.smooth_l1_loss_beta,
            reduction="sum",
        ) / max(1, num_foreground)
        return loss_box_reg
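Example #5 relies on a permute_to_N_HWA_K helper to flatten per-level predictions. A sketch of what such a helper does, modeled on detectron2's utility of the same name (treat the exact placement as an assumption):

import torch

def permute_to_N_HWA_K(tensor: torch.Tensor, K: int) -> torch.Tensor:
    """Reshape a (N, A*K, H, W) conv feature map to (N, H*W*A, K)."""
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)    # (N, A, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)  # (N, H, W, A, K)
    return tensor.reshape(N, -1, K)         # (N, H*W*A, K)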
Example #6
    def losses(
            self,
            anchors,
            pred_objectness_logits: List[torch.Tensor],
            gt_labels: List[torch.Tensor],
            pred_anchor_deltas: List[torch.Tensor],
            gt_boxes,
    ):
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))
        anchors = type(anchors[0]).cat(anchors).tensor  # Ax4
        gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4)

        pos_mask = gt_labels == 1

        localization_loss = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            self.smooth_l1_beta,
            reduction="sum",
        )
        valid_mask = gt_labels >= 0
        objectness_loss = F.binary_cross_entropy_with_logits(
            cat(pred_objectness_logits, dim=1)[valid_mask],
            gt_labels[valid_mask].to(torch.float32),
            reduction="sum",
        )
        normalizer = self.batch_size_per_image * num_images
        return {
            "loss_rpn_cls": objectness_loss / normalizer,
            "loss_rpn_loc": localization_loss / normalizer,
        }
Example #7
def rpn_losses(gt_labels, gt_anchor_deltas, pred_objectness_logits,
               pred_anchor_deltas, smooth_l1_beta):
    """
    Args:
        gt_labels (Tensor): shape (N,), each element in {-1, 0, 1} representing
            ground-truth objectness labels with: -1 = ignore; 0 = not object; 1 = object.
        gt_anchor_deltas (Tensor): shape (N, box_dim), row i represents ground-truth
            box2box transform targets (dx, dy, dw, dh) or (dx, dy, dw, dh, da) that map anchor i to
            its matched ground-truth box.
        pred_objectness_logits (Tensor): shape (N,), each element is a predicted objectness
            logit.
        pred_anchor_deltas (Tensor): shape (N, box_dim), each row is a predicted box2box
            transform (dx, dy, dw, dh) or (dx, dy, dw, dh, da)
        smooth_l1_beta (float): The transition point between L1 and L2 loss in
            the smooth L1 loss function. When set to 0, the loss becomes L1. When
            set to +inf, the loss becomes constant 0.

    Returns:
        objectness_loss, localization_loss, both unnormalized (summed over samples).
    """
    pos_masks = gt_labels == 1
    localization_loss = smooth_l1_loss(pred_anchor_deltas[pos_masks],
                                       gt_anchor_deltas[pos_masks],
                                       smooth_l1_beta,
                                       reduction="sum")

    valid_masks = gt_labels >= 0
    objectness_loss = F.binary_cross_entropy_with_logits(
        pred_objectness_logits[valid_masks],
        gt_labels[valid_masks].to(torch.float32),
        reduction="sum",
    )
    return objectness_loss, localization_loss
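A hypothetical smoke test for the rpn_losses function above; the shapes and label values are made up:

import torch

N = 8
gt_labels = torch.tensor([1, 0, -1, 1, 0, 0, -1, 1])  # -1 = ignore, 0 = negative, 1 = positive
gt_anchor_deltas = torch.randn(N, 4)
pred_objectness_logits = torch.randn(N)
pred_anchor_deltas = torch.randn(N, 4)

objectness_loss, localization_loss = rpn_losses(
    gt_labels, gt_anchor_deltas, pred_objectness_logits,
    pred_anchor_deltas, smooth_l1_beta=0.0)
# Both are unnormalized sums; callers typically divide by
# batch_size_per_image * num_images, as the losses() methods on this page do.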
Example #8
    def losses(self, strides):
        # (N, X)
        pred_objectness_logits = torch.cat(
            [p.view(p.size(0), -1) for p in self.pred_objectness_logits], dim=1
        )
        # (N, 4, X)
        pred_bboxes = torch.cat([p.view(p.size(0), 4, -1) for p in self.pred_bboxes], dim=2)

        pos_masks = self.gt_labels > 0
        pos_count = pos_masks.sum()
        neg_masks = ~pos_masks
        neg_count = torch.min(neg_masks.sum(), pos_count * 3).item()

        cls_loss = sigmoid_focal_loss_jit(
            pred_objectness_logits,
            self.gt_labels,
            alpha=0.25,
            reduction="none"
        )
        neg_cls_loss, _ = cls_loss[neg_masks].topk(neg_count)
        cls_loss = cls_loss[pos_masks].mean() + neg_cls_loss.mean()
        # (N, X, 4)
        pred_bboxes = pred_bboxes.permute(0, 2, 1) / strides[None, :, None] / 4
        gt_bboxes = self.gt_boxes / strides[None, :, None] / 4
        localization_loss = smooth_l1_loss(
            pred_bboxes[pos_masks],
            gt_bboxes[pos_masks],
            0.11,
            reduction="mean"
        )
        return {
            "cls_loss": cls_loss,
            "localization_loss": localization_loss
        }
Example #9
    def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
        """
        Args:
            All boxes are tensors with the same shape Rx(4 or 5).
            gt_classes is a long tensor of shape R, the gt class label of each proposal.
            R shall be the number of proposals.
        """
        box_dim = proposal_boxes.shape[1]  # 4 or 5
        # Regression loss is only computed for foreground proposals (those matched to a GT)
        fg_inds = nonzero_tuple((gt_classes >= 0)
                                & (gt_classes < self.num_classes))[0]
        if pred_deltas.shape[1] == box_dim:  # cls-agnostic regression
            fg_pred_deltas = pred_deltas[fg_inds]
        else:
            fg_pred_deltas = pred_deltas.view(-1, self.num_classes,
                                              box_dim)[fg_inds,
                                                       gt_classes[fg_inds]]

        if self.box_reg_loss_type == "smooth_l1":
            gt_pred_deltas = self.box2box_transform.get_deltas(
                proposal_boxes[fg_inds],
                gt_boxes[fg_inds],
            )
            loss_box_reg = smooth_l1_loss(fg_pred_deltas,
                                          gt_pred_deltas,
                                          self.smooth_l1_beta,
                                          reduction="sum")
        elif self.box_reg_loss_type == "giou":
            fg_pred_boxes = self.box2box_transform.apply_deltas(
                fg_pred_deltas, proposal_boxes[fg_inds])
            loss_box_reg = giou_loss(fg_pred_boxes,
                                     gt_boxes[fg_inds],
                                     reduction="sum")
        elif self.box_reg_loss_type == "diou":
            fg_pred_boxes = self.box2box_transform.apply_deltas(
                fg_pred_deltas, proposal_boxes[fg_inds])
            loss_box_reg = diou_loss(fg_pred_boxes,
                                     gt_boxes[fg_inds],
                                     reduction="sum")
        elif self.box_reg_loss_type == "ciou":
            fg_pred_boxes = self.box2box_transform.apply_deltas(
                fg_pred_deltas, proposal_boxes[fg_inds])
            loss_box_reg = ciou_loss(fg_pred_boxes,
                                     gt_boxes[fg_inds],
                                     reduction="sum")
        else:
            raise ValueError(
                f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")
        # The reg loss is normalized using the total number of regions (R), not the number
        # of foreground regions even though the box regression loss is only defined on
        # foreground regions. Why? Because doing so gives equal training influence to
        # each foreground example. To see how, consider two different minibatches:
        #  (1) Contains a single foreground region
        #  (2) Contains 100 foreground regions
        # If we normalize by the number of foreground regions, the single example in
        # minibatch (1) will be given 100 times as much influence as each foreground
        # example in minibatch (2). Normalizing by the total number of regions, R,
        # means that the single example in minibatch (1) and each of the 100 examples
        # in minibatch (2) are given equal influence.
        return loss_box_reg / max(gt_classes.numel(), 1.0)  # return 0 if empty
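A toy calculation of the normalization argument in the comment above (all numbers made up):

R = 512                      # total regions in each minibatch
unit_loss = 1.0              # pretend every foreground region incurs unit loss
# Normalizing by the foreground count gives the lone example in minibatch (1)
# 100x the influence of each example in minibatch (2):
w_batch1, w_batch2 = unit_loss / 1, unit_loss / 100   # 1.0 vs 0.01
# Normalizing by R weighs every foreground example equally:
w_batch1_R = w_batch2_R = unit_loss / R               # 1/512 each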
Example #10
    def losses(self,
               anchors,
               pred_objectness_logits,
               gt_labels,
               pred_anchor_deltas,
               gt_boxes,
               loss_weights=None):
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))

        # Log the number of positive/negative anchors per-image that's used in training
        pos_mask = gt_labels == 1
        num_pos_anchors = pos_mask.sum().item()
        num_neg_anchors = (gt_labels == 0).sum().item()
        storage = get_event_storage()
        storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
        storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)
        reduction = "sum" if loss_weights is None else "none"
        if self.box_reg_loss_type == "smooth_l1":
            anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
            gt_anchor_deltas = [
                self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes
            ]
            gt_anchor_deltas = torch.stack(
                gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
            localization_loss = smooth_l1_loss(
                cat(pred_anchor_deltas, dim=1)[pos_mask],
                gt_anchor_deltas[pos_mask],
                self.smooth_l1_beta,
                reduction=reduction,
            )
        elif self.box_reg_loss_type == "giou":
            pred_proposals = self._decode_proposals(anchors,
                                                    pred_anchor_deltas)
            pred_proposals = cat(pred_proposals, dim=1)
            pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
            pos_mask = pos_mask.view(-1)
            localization_loss = giou_loss(pred_proposals[pos_mask],
                                          cat(gt_boxes)[pos_mask],
                                          reduction=reduction)
        else:
            raise ValueError(
                f"Invalid rpn box reg loss type '{self.box_reg_loss_type}'")

        valid_mask = gt_labels >= 0
        objectness_loss = F.binary_cross_entropy_with_logits(
            cat(pred_objectness_logits, dim=1)[valid_mask],
            gt_labels[valid_mask].to(torch.float32),
            reduction=reduction,
        )
        normalizer = self.batch_size_per_image * num_images
        losses = {
            "loss_rpn_cls": objectness_loss / normalizer,
            "loss_rpn_loc": localization_loss / normalizer,
        }
        losses = {
            k: v * self.loss_weight.get(k, 1.0)
            for k, v in losses.items()
        }
        return losses
Example #11
    def losses(
        self,
        anchors,
        pred_objectness_logits: List[torch.Tensor],
        gt_labels: List[torch.Tensor],
        pred_anchor_deltas: List[torch.Tensor],
        gt_boxes,
    ):
        """
        Return the losses from a set of RPN predictions and their associated ground-truth.

        Args:
            anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each
                has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
            pred_objectness_logits (list[Tensor]): A list of L elements.
                Element i is a tensor of shape (N, Hi*Wi*A) representing
                the predicted objectness logits for all anchors.
            gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
            pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape
                (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors
                to proposals.
            gt_boxes (list[Boxes or RotatedBoxes]): Output of :meth:`label_and_sample_anchors`.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
                Loss names are: `loss_rpn_cls` for objectness classification and
                `loss_rpn_loc` for proposal localization.
        """
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))
        anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
        gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)

        # Log the number of positive/negative anchors per-image that's used in training
        pos_mask = gt_labels == 1
        num_pos_anchors = pos_mask.sum().item()
        num_neg_anchors = (gt_labels == 0).sum().item()
        storage = get_event_storage()
        storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
        storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)

        localization_loss = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            self.smooth_l1_beta,
            reduction="sum",
        )
        valid_mask = gt_labels >= 0
        objectness_loss = F.binary_cross_entropy_with_logits(
            cat(pred_objectness_logits, dim=1)[valid_mask],
            gt_labels[valid_mask].to(torch.float32),
            reduction="sum",
        )
        normalizer = self.batch_size_per_image * num_images
        return {
            "loss_rpn_cls": objectness_loss / normalizer,
            "loss_rpn_loc": localization_loss / normalizer,
        }
Example #12
    def smooth_l1_loss(self):
        """
        Compute the smooth L1 loss for box regression.

        Returns:
            scalar Tensor
        """

        if self._no_instances:
            return 0.0 * self.pred_proposal_deltas.sum()
        gt_proposal_deltas = self.box2box_transform.get_deltas(
            self.proposals.tensor, self.gt_boxes.tensor
        )

        box_dim = gt_proposal_deltas.size(1)  # 4 or 5
        cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
        device = self.pred_proposal_deltas.device

        bg_class_ind = self.pred_class_logits.shape[1] - 1

        # Box delta loss is only computed between the prediction for the gt class k
        # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
        # for non-gt classes and background.
        # Empty fg_inds produces a valid loss of zero as long as the size_average
        # arg to smooth_l1_loss is False (otherwise it uses torch.mean internally
        # and would produce a nan loss).
        fg_inds = torch.nonzero(
            (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind), as_tuple=True
        )[0]
        if cls_agnostic_bbox_reg:
            # pred_proposal_deltas only corresponds to foreground class for agnostic
            gt_class_cols = torch.arange(box_dim, device=device)
        else:
            fg_gt_classes = self.gt_classes[fg_inds]
            # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
            # where b is the dimension of box representation (4 or 5)
            # Note that compared to Detectron1,
            # we do not perform bounding box regression for background classes.
            gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(box_dim, device=device)

        loss_box_reg = smooth_l1_loss(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            gt_proposal_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction="sum",
        )
        # The loss is normalized using the total number of regions (R), not the number
        # of foreground regions even though the box regression loss is only defined on
        # foreground regions. Why? Because doing so gives equal training influence to
        # each foreground example. To see how, consider two different minibatches:
        #  (1) Contains a single foreground region
        #  (2) Contains 100 foreground regions
        # If we normalize by the number of foreground regions, the single example in
        # minibatch (1) will be given 100 times as much influence as each foreground
        # example in minibatch (2). Normalizing by the total number of regions, R,
        # means that the single example in minibatch (1) and each of the 100 examples
        # in minibatch (2) are given equal influence.
        loss_box_reg = loss_box_reg / self.gt_classes.numel()
        return loss_box_reg
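A small sketch of the advanced-indexing trick used above, with made-up sizes: for class-specific regression, the deltas for class k occupy columns [box_dim * k : box_dim * k + box_dim], and broadcasting fg_inds[:, None] against gt_class_cols selects exactly those columns for each foreground proposal:

import torch

R, K, box_dim = 3, 5, 4
pred_proposal_deltas = torch.arange(R * K * box_dim, dtype=torch.float32).view(R, K * box_dim)
fg_inds = torch.tensor([0, 2])                # foreground proposal indices
fg_gt_classes = torch.tensor([1, 4])          # gt class of each foreground proposal
gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(box_dim)
selected = pred_proposal_deltas[fg_inds[:, None], gt_class_cols]
assert selected.shape == (2, box_dim)         # one box per foreground proposal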
Example #13
    def losses(self, pred_logits, pred_init_boxes, gt_init_bboxes,
               gt_cls: torch.Tensor, strides):
        """
        Loss computation.
        Args:
            pred_logits: (N, X, C). Classification prediction, where X is the number
                of positions from all feature levels, C is the number of object classes.
            pred_init_boxes: (N, X, 4). Initial box prediction.
            gt_init_bboxes: (N, X, 4). Ground-truth boxes for the initial prediction.
            gt_cls: (N, X), Long. GT for box classification, -1 indicates ignoring.
            strides: (X,). Scale factor at each position.
        Returns:
            dict[str, Tensor]:
                mapping from a named loss to a scalar tensor
                storing the loss. Used during training only. The dict keys are:
                "loss_cls", "loss_localization_init", and "loss_localization_refine".
        """

        valid_idxs = gt_cls >= 0
        foreground_idxs = valid_idxs.logical_and(gt_cls != self.num_classes)
        num_foreground = foreground_idxs.sum().item() / gt_init_bboxes.shape[0]
        get_event_storage().put_scalar("num_foreground", num_foreground)

        gt_cls_target = torch.zeros_like(pred_logits)
        gt_cls_target[foreground_idxs, gt_cls[foreground_idxs]] = 1

        self.loss_normalizer = (
            self.loss_normalizer_momentum * self.loss_normalizer +
            (1 - self.loss_normalizer_momentum) * num_foreground)

        loss_cls = sigmoid_focal_loss_jit(pred_logits[valid_idxs],
                                          gt_cls_target[valid_idxs],
                                          alpha=self.focal_loss_alpha,
                                          gamma=self.focal_loss_gamma,
                                          reduction="sum") / max(
                                              1, self.loss_normalizer)

        strides = strides[None].repeat(pred_logits.shape[0], 1)
        coords_norm_init = strides[foreground_idxs].unsqueeze(-1) * 4
        loss_localization_init = smooth_l1_loss(
            pred_init_boxes[foreground_idxs] / coords_norm_init,
            gt_init_bboxes[foreground_idxs] / coords_norm_init,
            0.11,
            reduction='sum') / max(1, self.loss_normalizer)


        return {
            "loss_cls": loss_cls,
            "loss_localization": loss_localization_init
        }
Example #14
    def test_empty_inputs(self) -> None:
        inputs = torch.empty([0, 10], dtype=torch.float32).requires_grad_()
        targets = torch.empty([0, 10], dtype=torch.float32)
        loss = smooth_l1_loss(inputs, targets, beta=0.5, reduction="mean")
        loss.backward()

        self.assertEqual(loss.detach().numpy(), 0.0)
        self.assertIsNotNone(inputs.grad)
Example #15
def _dense_box_regression_loss(
    anchors: List[Boxes],
    box2box_transform: Box2BoxTransform,
    pred_anchor_deltas: List[torch.Tensor],
    gt_boxes: List[torch.Tensor],
    fg_mask: torch.Tensor,
    box_reg_loss_type="smooth_l1",
    smooth_l1_beta=0.0,
):
    """
    Compute loss for dense multi-level box regression.
    Loss is accumulated over ``fg_mask``.

    Args:
        anchors: #lvl anchor boxes, each is (HixWixA, 4)
        pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4)
        gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A))
        fg_mask: the foreground boolean mask of shape (N, R) to compute loss on
        box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou",
            "diou", "ciou".
        smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to
            use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
    """
    anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
    if box_reg_loss_type == "smooth_l1":
        gt_anchor_deltas = [box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)
        loss_box_reg = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[fg_mask],
            gt_anchor_deltas[fg_mask],
            beta=smooth_l1_beta,
            reduction="sum",
        )
    elif box_reg_loss_type == "giou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = giou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    elif box_reg_loss_type == "diou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = diou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    elif box_reg_loss_type == "ciou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = ciou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    else:
        raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'")
    return loss_box_reg
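A hypothetical call into _dense_box_regression_loss above, using random boxes; the detectron2 imports follow the library's layout but should be treated as assumptions:

import torch
from detectron2.structures import Boxes
from detectron2.modeling.box_regression import Box2BoxTransform

N, A = 2, 6                                        # images, anchors (one level here)
anchors = [Boxes(torch.rand(A, 4).cumsum(dim=1))]  # cumsum guarantees x2 > x1, y2 > y1
transform = Box2BoxTransform(weights=(1.0, 1.0, 1.0, 1.0))
pred_anchor_deltas = [torch.randn(N, A, 4)]
gt_boxes = [torch.rand(A, 4).cumsum(dim=1) for _ in range(N)]
fg_mask = torch.rand(N, A) > 0.5                   # boolean foreground mask (N, R)

loss = _dense_box_regression_loss(
    anchors, transform, pred_anchor_deltas, gt_boxes, fg_mask,
    box_reg_loss_type="smooth_l1", smooth_l1_beta=0.0)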
Example #16
    def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas,
               gt_boxes):
        """
        Args:
            anchors (list[Boxes]): a list of #feature level Boxes
            gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
                Their shapes are (N, R) and (N, R, 4), respectively, where R is
                the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
            pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
                list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4).
                Where K is the number of classes used in `pred_logits`.

        Returns:
            dict[str, Tensor]:
                mapping from a named loss to a scalar tensor
                storing the loss. Used during training only. The dict keys are:
                "loss_cls" and "loss_box_reg"
        """
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (N, R)
        anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
        gt_anchor_deltas = [
            self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes
        ]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)

        valid_mask = gt_labels >= 0
        pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
        num_pos_anchors = pos_mask.sum().item()
        get_event_storage().put_scalar("num_pos_anchors",
                                       num_pos_anchors / num_images)
        self.loss_normalizer = self.loss_normalizer_momentum * self.loss_normalizer + (
            1 - self.loss_normalizer_momentum) * max(num_pos_anchors, 1)

        # classification and regression loss
        gt_labels_target = F.one_hot(
            gt_labels[valid_mask], num_classes=self.num_classes +
            1)[:, :-1]  # no loss for the last (background) class
        loss_cls = sigmoid_focal_loss_jit(
            cat(pred_logits, dim=1)[valid_mask],
            gt_labels_target.to(pred_logits[0].dtype),
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )

        loss_box_reg = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            beta=self.smooth_l1_loss_beta,
            reduction="sum",
        )
        return {
            "loss_cls": loss_cls / self.loss_normalizer,
            "loss_box_reg": loss_box_reg / self.loss_normalizer,
        }
Example #17
    def overlap_prob_loss(self):
        loss_overlap_prob = smooth_l1_loss(
            self.pred_overlap_prob[:, 0],  # logit --> sigmoid
            self.overlap_iou,
            self.cls_box_beta,
            reduction="sum",
        )

        return loss_overlap_prob / (self.pred_overlap_prob.size(0) + 1e-6)
Example #18
    def box_reg_loss(self):
        """
        Deprecated
        """
        if self._no_instances:
            return 0.0 * self.pred_proposal_deltas.sum()

        box_dim = self.proposals.tensor.size(1)  # 4 or 5
        cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
        device = self.pred_proposal_deltas.device

        bg_class_ind = self.pred_class_logits.shape[1] - 1
        # Box delta loss is only computed between the prediction for the gt class k
        # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
        # for non-gt classes and background.
        # Empty fg_inds should produce a valid loss of zero because reduction=sum.
        fg_inds = nonzero_tuple((self.gt_classes >= 0)
                                & (self.gt_classes < bg_class_ind))[0]

        if cls_agnostic_bbox_reg:
            # pred_proposal_deltas only corresponds to foreground class for agnostic
            gt_class_cols = torch.arange(box_dim, device=device)
        else:
            # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
            # where b is the dimension of box representation (4 or 5)
            # Note that compared to Detectron1,
            # we do not perform bounding box regression for background classes.
            gt_class_cols = box_dim * self.gt_classes[
                fg_inds, None] + torch.arange(box_dim, device=device)

        if self.box_reg_loss_type == "smooth_l1":
            gt_proposal_deltas = self.box2box_transform.get_deltas(
                self.proposals.tensor, self.gt_boxes.tensor)
            loss_box_reg = smooth_l1_loss(
                self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
                gt_proposal_deltas[fg_inds],
                self.smooth_l1_beta,
                reduction="sum",
            )
        elif self.box_reg_loss_type == "giou":
            fg_pred_boxes = self.box2box_transform.apply_deltas(
                self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
                self.proposals.tensor[fg_inds],
            )
            loss_box_reg = giou_loss(
                fg_pred_boxes,
                self.gt_boxes.tensor[fg_inds],
                reduction="sum",
            )
        else:
            raise ValueError(
                f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

        loss_box_reg = loss_box_reg / self.gt_classes.numel()
        return loss_box_reg
Example #19
    def losses(self, gt_classes, gt_anchors_deltas, pred_class_logits,
               pred_anchor_deltas):
        """
        Args:
            For `gt_classes` and `gt_anchors_deltas` parameters, see
                :meth:`RetinaNet.get_ground_truth`.
            Their shapes are (N, R) and (N, R, 4), respectively, where R is
            the total number of anchors across levels, i.e. sum(Hi x Wi x A)
            For `pred_class_logits` and `pred_anchor_deltas`, see
                :meth:`RetinaNetHead.forward`.

        Returns:
            dict[str, Tensor]:
                mapping from a named loss to a scalar tensor
                storing the loss. Used during training only. The dict keys are:
                "loss_cls" and "loss_box_reg"
        """
        pred_class_logits, pred_anchor_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
            pred_class_logits, pred_anchor_deltas, self.num_classes
        )  # Shapes: (N x R, K) and (N x R, 4), respectively.

        gt_classes = gt_classes.flatten()
        gt_anchors_deltas = gt_anchors_deltas.view(-1, 4)

        valid_idxs = gt_classes >= 0
        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
        num_foreground = foreground_idxs.sum().item()
        get_event_storage().put_scalar("num_foreground", num_foreground)
        self.loss_normalizer = (
            self.loss_normalizer_momentum * self.loss_normalizer +
            (1 - self.loss_normalizer_momentum) * num_foreground)

        gt_classes_target = torch.zeros_like(pred_class_logits)
        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

        # logits loss
        loss_cls = sigmoid_focal_loss_jit(
            pred_class_logits[valid_idxs],
            gt_classes_target[valid_idxs],
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        ) / max(1, self.loss_normalizer)

        # regression loss
        loss_box_reg = smooth_l1_loss(
            pred_anchor_deltas[foreground_idxs],
            gt_anchors_deltas[foreground_idxs],
            beta=self.smooth_l1_loss_beta,
            reduction="sum",
        ) / max(1, self.loss_normalizer)

        return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}
Example #20
def reg_loss(output, mask, index, target, loss_type="l1", smooth_l1_beta=0.1):
    pred = gather_feature(output, index, use_transform=True)
    mask = mask.unsqueeze(dim=2).expand_as(pred).float()
    if loss_type == "l1":
        loss = F.l1_loss(pred * mask, target * mask, reduction="sum")
    elif loss_type == "smooth_l1":
        loss = smooth_l1_loss(pred * mask,
                              target * mask,
                              smooth_l1_beta,
                              reduction="sum")
    loss = loss / (mask.sum() + 1e-4)
    return loss
Example #21
def rpn_losses(gt_labels, gt_anchor_deltas, pred_objectness_logits,
               pred_anchor_deltas, smooth_l1_beta, cfg, box2box_transform):
    """
    Args:
        gt_labels (Tensor): shape (N,), each element in {-1, 0, 1} representing
            ground-truth objectness labels with: -1 = ignore; 0 = not object; 1 = object.
        gt_anchor_deltas (Tensor): shape (N, box_dim), row i represents ground-truth
            box2box transform targets (dx, dy, dw, dh) or (dx, dy, dw, dh, da) that map anchor i to
            its matched ground-truth box.
        pred_objectness_logits (Tensor): shape (N,), each element is a predicted objectness
            logit.
        pred_anchor_deltas (Tensor): shape (N, box_dim), each row is a predicted box2box
            transform (dx, dy, dw, dh) or (dx, dy, dw, dh, da)
        smooth_l1_beta (float): The transition point between L1 and L2 loss in
            the smooth L1 loss function. When set to 0, the loss becomes L1. When
            set to +inf, the loss becomes constant 0.

        #Added for DIOU implementation
        cfg (configuration): Hacky way to get which loss to apply for bbox
        box2box_transform: To get predetermined weights and scale_clamp

    Returns:
        objectness_loss, localization_loss, both unnormalized (summed over samples).
    """
    pos_masks = gt_labels == 1

    # Will need to improve the configuration part
    reg_loss = cfg.MODEL.RPN_LOSS_TYPE
    localization_loss = 0

    if reg_loss == "diou":
        localization_loss = compute_diou(pos_masks, gt_anchor_deltas,
                                         pred_anchor_deltas, box2box_transform,
                                         cfg.SOLVER.IMS_PER_BATCH,
                                         cfg.MODEL.RPN_LOSS_BBOX_WEIGHT)
    else:
        localization_loss = smooth_l1_loss(pred_anchor_deltas[pos_masks],
                                           gt_anchor_deltas[pos_masks],
                                           smooth_l1_beta,
                                           reduction="sum")

    valid_masks = gt_labels >= 0
    objectness_loss = F.binary_cross_entropy_with_logits(
        pred_objectness_logits[valid_masks],
        gt_labels[valid_masks].to(torch.float32),
        reduction="sum",
    )
    return objectness_loss, localization_loss
Example #22
    def overlap_smooth_l1_loss(self, fg_inds):
        overlap_deltas = self.box2box_transform.get_deltas(
            self.proposals.tensor[fg_inds], self.overlap_gt_boxes.tensor)
        trained_idx = torch.nonzero(
            self.overlap_iou > self.overlap_iou_threshold).squeeze(1)
        loss_overlap_reg = smooth_l1_loss(
            self.pred_overlap_deltas[trained_idx],
            overlap_deltas[trained_idx],
            self.smooth_l1_beta,
            reduction="sum",
        )
        if self.uniform_reg_divisor:
            return loss_overlap_reg / (self.gt_classes.numel() + 1e-6)
        else:
            return loss_overlap_reg / (trained_idx.size(0) +
                                       1e-6) * self.loss_overlap_reg_coeff
Example #23
    def box_reg_loss(self):
        """
        change _no_instance handling and normalization
        """
        if self._no_instances:
            print('No instance in box reg loss')
            return self.pred_proposal_deltas.sum() * 0.

        box_dim = self.gt_boxes.tensor.size(1)  # 4 or 5
        cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
        device = self.pred_proposal_deltas.device

        bg_class_ind = self.pred_class_logits.shape[1] - 1

        fg_inds = nonzero_tuple((self.gt_classes >= 0)
                                & (self.gt_classes < bg_class_ind))[0]
        if cls_agnostic_bbox_reg:
            gt_class_cols = torch.arange(box_dim, device=device)
        else:
            fg_gt_classes = self.gt_classes[fg_inds]
            gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
                box_dim, device=device)

        if self.box_reg_loss_type == "smooth_l1":
            gt_proposal_deltas = self.box2box_transform.get_deltas(
                self.proposals.tensor, self.gt_boxes.tensor)
            loss_box_reg = smooth_l1_loss(
                self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
                gt_proposal_deltas[fg_inds],
                self.smooth_l1_beta,
                reduction="sum",
            )
        elif self.box_reg_loss_type == "giou":
            loss_box_reg = giou_loss(
                self._predict_boxes()[fg_inds[:, None], gt_class_cols],
                self.gt_boxes.tensor[fg_inds],
                reduction="sum",
            )
        else:
            raise ValueError(
                f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

        if self.fix_norm_reg:
            loss_box_reg = loss_box_reg / self.box_batch_size
        else:
            loss_box_reg = loss_box_reg / self.gt_classes.numel()
        return loss_box_reg
Example #24
def z_rcnn_loss(z_pred, instances, src_boxes, loss_weight=1.0, smooth_l1_beta=0.0):
    """
    Compute the z_pred loss.

    Args:
        z_pred (Tensor): A tensor of shape (B, C) or (B, 1) for class-specific or class-agnostic,
            where B is the total number of foreground regions in all images, C is the number of foreground classes,
        instances (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. The ground-truth labels (class, box, mask,
            ...) associated with each instance are stored in fields.

    Returns:
        loss (Tensor): A scalar tensor containing the loss.
    """
    cls_agnostic_z = z_pred.size(1) == 1
    total_num = z_pred.size(0)

    gt_classes = []
    gt_dz = []
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        if not cls_agnostic_z:
            gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
            gt_classes.append(gt_classes_per_image)

        gt_dz.append(instances_per_image.gt_dz)

    if len(gt_dz) == 0:
        return z_pred.sum() * 0

    gt_dz = cat(gt_dz, dim=0)
    assert gt_dz.numel() > 0
    src_heights = src_boxes[:, 3] - src_boxes[:, 1]
    dz = torch.log(gt_dz / src_heights)

    if cls_agnostic_z:
        z_pred = z_pred[:, 0]
    else:
        indices = torch.arange(total_num)
        gt_classes = cat(gt_classes, dim=0)
        z_pred = z_pred[indices, gt_classes]

    loss_z_reg = smooth_l1_loss(z_pred, dz, smooth_l1_beta, reduction="sum")
    # Normalize by the total number of foreground regions; in the class-agnostic
    # case gt_classes is never populated, so gt_classes.numel() would fail here.
    loss_z_reg = loss_weight * loss_z_reg / total_num
    return loss_z_reg
Example #25
def plane_rcnn_loss(plane_pred,
                    instances,
                    loss_weight=1.0,
                    smooth_l1_beta=0.0,
                    plane_normal_only=False):
    """
    Compute the plane_param loss.
    Args:
        plane_pred (Tensor): A tensor of shape (B, C) or (B, 1) for class-specific or class-agnostic,
            where B is the total number of foreground regions in all images, C is the number of foreground classes,
        instances (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. The ground-truth labels (class, box, mask,
            ...) associated with each instance are stored in fields.
    Returns:
        loss (Tensor): A scalar tensor containing the loss.
    """
    gt_param = []
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        gt_param.append(instances_per_image.gt_planes)

    if len(gt_param) == 0:
        return plane_pred.sum() * 0

    gt_param = cat(gt_param, dim=0)
    if plane_normal_only:
        gt_param = F.normalize(gt_param, p=2, dim=1)
    assert len(plane_pred) > 0

    loss_plane_reg = smooth_l1_loss(plane_pred,
                                    gt_param,
                                    smooth_l1_beta,
                                    reduction="sum")
    loss_plane_reg = loss_weight * loss_plane_reg / len(plane_pred)
    return loss_plane_reg
Example #26
    def losses(self, init_gt_classes, init_reg_targets, refine_gt_classes, refine_reg_targets, \
               pred_class_logits, pred_box_reg_init, pred_box_reg, pred_center_score, strides, pred_ratio):

        strides = strides.repeat(pred_class_logits[0].shape[0])  # [N*X]
        pred_class_logits, pred_box_reg_init, pred_box_reg, pred_center_score, pred_ratio = \
            permute_and_concat(pred_class_logits, pred_box_reg_init, pred_box_reg, pred_center_score, pred_ratio, self.num_classes)
        # Shapes: (N x R, C), (N x R, 4), (N x R, 4), (N x R) and (N x R), respectively.

        init_gt_classes = init_gt_classes.flatten()
        init_reg_targets = init_reg_targets.view(-1, 4)

        init_foreground_idxs = (init_gt_classes >= 0) & (init_gt_classes != self.num_classes)
        init_pos_inds = torch.nonzero(init_foreground_idxs).squeeze(1)

        num_gpus = get_num_gpus()
        # sync num_pos from all gpus
        init_total_num_pos = reduce_sum(init_pos_inds.new_tensor([init_pos_inds.numel()])).item()
        init_num_pos_avg_per_gpu = max(init_total_num_pos / float(num_gpus), 1.0)

        refine_gt_classes = refine_gt_classes.flatten()
        refine_reg_targets = refine_reg_targets.view(-1, 4)

        refine_foreground_idxs = (refine_gt_classes >= 0) & (refine_gt_classes != self.num_classes)
        refine_pos_inds = torch.nonzero(refine_foreground_idxs).squeeze(1)

        # sync num_pos from all gpus
        refine_total_num_pos = reduce_sum(refine_pos_inds.new_tensor([refine_pos_inds.numel()])).item()
        refine_num_pos_avg_per_gpu = max(refine_total_num_pos / float(num_gpus), 1.0)

        gt_classes_target = torch.zeros_like(pred_class_logits)
        gt_classes_target[refine_foreground_idxs, refine_gt_classes[refine_foreground_idxs]] = 1

        # logits loss
        cls_loss = sigmoid_focal_loss_jit(
            pred_class_logits, gt_classes_target,
            alpha=self.focal_loss_alpha, gamma=self.focal_loss_gamma, reduction="sum",
        ) / refine_num_pos_avg_per_gpu
        
        init_foreground_targets = init_reg_targets[init_foreground_idxs]
        gt_ratio_1 = (init_foreground_targets[:,0] + init_foreground_targets[:,2]) \
            / (init_foreground_targets[:,1] + init_foreground_targets[:,3])
        gt_ratio_2 = 1 / gt_ratio_1
        gt_ratios = torch.stack((gt_ratio_1, gt_ratio_2), dim=1)
        gt_ratio = gt_ratios.min(dim=1)[0]
        gt_center_score = compute_centerness_targets(init_reg_targets[init_foreground_idxs], gt_ratio)
        
        # average sum_centerness_targets from all gpus,
        # which is used to normalize centerness-weighed reg loss
        sum_centerness_targets_avg_per_gpu = \
            reduce_sum(gt_center_score.sum()).item() / float(num_gpus)
        reg_loss_init = iou_loss(
            pred_box_reg_init[init_foreground_idxs], init_reg_targets[init_foreground_idxs], gt_center_score,
            loss_type=self.iou_loss_type
        ) / sum_centerness_targets_avg_per_gpu

        coords_norm_refine = strides[refine_foreground_idxs].unsqueeze(-1) * 4
        reg_loss = smooth_l1_loss(
            pred_box_reg[refine_foreground_idxs] / coords_norm_refine,
            refine_reg_targets[refine_foreground_idxs] / coords_norm_refine,
            0.11, reduction="sum") / max(1, refine_num_pos_avg_per_gpu)
        centerness_loss = F.binary_cross_entropy_with_logits(
            torch.pow(torch.abs(pred_center_score[init_foreground_idxs]), pred_ratio[init_foreground_idxs]), gt_center_score, reduction='sum'
        ) / init_num_pos_avg_per_gpu

        return dict(cls_loss=cls_loss, reg_loss_init=reg_loss_init, reg_loss=reg_loss, centerness_loss=centerness_loss)
Example #27
    def losses(
        self,
        gt_class_info,
        gt_delta_info,
        gt_mask_info,
        num_fg,
        pred_logits,
        pred_deltas,
        pred_masks,
    ):
        """
        Args:
            For `gt_class_info`, `gt_delta_info`, `gt_mask_info` and `num_fg` parameters, see
                :meth:`TensorMask.get_ground_truth`.
            For `pred_logits`, `pred_deltas` and `pred_masks`, see
                :meth:`TensorMaskHead.forward`.

        Returns:
            losses (dict[str: Tensor]): mapping from a named loss to a scalar tensor
                storing the loss. Used during training only. The potential dict keys are:
                "loss_cls", "loss_box_reg" and "loss_mask".
        """
        gt_classes_target, gt_valid_inds = gt_class_info
        gt_deltas, gt_fg_inds = gt_delta_info
        gt_masks, gt_mask_inds = gt_mask_info
        loss_normalizer = torch.tensor(max(1, num_fg), dtype=torch.float32, device=self.device)

        # classification and regression
        pred_logits, pred_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
            pred_logits, pred_deltas, self.num_classes
        )
        loss_cls = (
            sigmoid_focal_loss_star_jit(
                pred_logits[gt_valid_inds],
                gt_classes_target[gt_valid_inds],
                alpha=self.focal_loss_alpha,
                gamma=self.focal_loss_gamma,
                reduction="sum",
            )
            / loss_normalizer
        )

        if num_fg == 0:
            loss_box_reg = pred_deltas.sum() * 0
        else:
            loss_box_reg = (
                smooth_l1_loss(pred_deltas[gt_fg_inds], gt_deltas, beta=0.0, reduction="sum")
                / loss_normalizer
            )
        losses = {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}

        # mask prediction
        if self.mask_on:
            loss_mask = 0
            for lvl in range(self.num_levels):
                cur_level_factor = 2 ** lvl if self.bipyramid_on else 1
                for anc in range(self.num_anchors):
                    cur_gt_mask_inds = gt_mask_inds[lvl][anc]
                    if cur_gt_mask_inds is None:
                        loss_mask += pred_masks[lvl][anc][0, 0, 0, 0] * 0
                    else:
                        cur_mask_size = self.mask_sizes[anc] * cur_level_factor
                        # TODO maybe there are numerical issues when mask sizes are large
                        cur_size_divider = torch.tensor(
                            self.mask_loss_weight / (cur_mask_size ** 2),
                            dtype=torch.float32,
                            device=self.device,
                        )

                        cur_pred_masks = pred_masks[lvl][anc][
                            cur_gt_mask_inds[:, 0],  # N
                            :,  # V x U
                            cur_gt_mask_inds[:, 1],  # H
                            cur_gt_mask_inds[:, 2],  # W
                        ]

                        loss_mask += F.binary_cross_entropy_with_logits(
                            cur_pred_masks.view(-1, cur_mask_size, cur_mask_size),  # V, U
                            gt_masks[lvl][anc].to(dtype=torch.float32),
                            reduction="sum",
                            weight=cur_size_divider,
                            pos_weight=self.mask_pos_weight,
                        )
            losses["loss_mask"] = loss_mask / loss_normalizer
        return losses
Example #28
def fast_rcnn_losses(gt_classes,
                     gt_proposal_deltas,
                     pred_class_logits,
                     pred_proposal_deltas,
                     smooth_l1_beta,
                     gt_light_direction=None,
                     pred_light_direction=None):
    """
    When box dimension is 4:
        Computes the classification and box delta losses defined in the Fast R-CNN paper.
    When box dimension is 5:
        Computes the same losses for Fast R-CNN with rotated boxes.

    Args:
        gt_classes (Tensor): A tensor of shape (R,) storing ground-truth classification
            labels in [0, K], including K fg classes and 1 bg class.
        gt_proposal_deltas (Tensor):
            Shape (R, box_dim), row i represents ground-truth box2box transform targets
            (dx, dy, dw, dh) or (dx, dy, dw, dh, da) that map object instance i to
            its matched ground-truth box.
        pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing predicted classification
            logits for the K+1-way classification problem. Each row corresponds to a predicted
            object instance.
        pred_proposal_deltas (Tensor): shape depends on whether we are doing
            cls-agnostic or cls-specific regression, and the box dimensions.
            When box_dim is 4:
            1. cls-specific: Shape (R, 4 * K), each row stores a list of class-specific
            predicted box2box transform [dx_0, dy_0, dw_0, dh_0, ..., dx_k, dy_k, dw_k, dh_k, ...]
            for each class k in [0, K). (No predictions for the background class.)
            2. cls-agnostic: Shape (R, 4), each row stores the class-agnostic (foreground)
            predicted box2box transform.
            When box_dim is 5:
            1. cls-specific: Shape (R, 5 * K), each row stores a list of class-specific
            predicted rotated box2box transform
            [dx_0, dy_0, dw_0, dh_0, da_0, ..., dx_k, dy_k, dw_k, dh_k, da_k, ...]
            for each class k in [0, K). (No predictions for the background class.)
            2. cls-agnostic: Shape (R, 5), each row stores the class-agnostic (foreground)
            predicted rotated box2box transform.
        smooth_l1_beta (float): The transition point between L1 and L2 loss in
            the smooth L1 loss function. When set to 0, the loss becomes L1. When
            set to +inf, the loss becomes constant 0.

    Returns:
        loss_cls, loss_box_reg (Tensor): Scalar loss values.
    """
    box_dim = gt_proposal_deltas.size(1)
    cls_agnostic_bbox_reg = pred_proposal_deltas.size(1) == box_dim
    device = pred_class_logits.device

    loss_cls = F.cross_entropy(pred_class_logits, gt_classes, reduction="mean")

    bg_class_ind = pred_class_logits.shape[1] - 1

    # Box delta loss is only computed between the prediction for the gt class k
    # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
    # for non-gt classes and background.
    # Empty fg_inds produces a valid loss of zero as long as the size_average
    # arg to smooth_l1_loss is False (otherwise it uses torch.mean internally
    # and would produce a nan loss).
    fg_inds = torch.nonzero((gt_classes >= 0)
                            & (gt_classes < bg_class_ind)).squeeze(1)
    if cls_agnostic_bbox_reg:
        # pred_proposal_deltas only corresponds to foreground class for agnostic
        gt_class_cols = torch.arange(box_dim, device=device)
    else:
        fg_gt_classes = gt_classes[fg_inds]
        # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
        # where b is the dimension of box representation (4 or 5)
        # Note that compared to Detectron1,
        # we do not perform bounding box regression for background classes.
        gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
            box_dim, device=device)

    loss_box_reg = smooth_l1_loss(
        pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
        gt_proposal_deltas[fg_inds],
        smooth_l1_beta,
        reduction="sum",
    )

    if isinstance(pred_light_direction, torch.Tensor):

        pred_light_direction_ = pred_light_direction[fg_inds[:, None],
                                                     gt_class_cols]
        gt_light_direction_ = gt_light_direction[fg_inds]

        x2 = pred_light_direction_[:, 0]
        y2 = pred_light_direction_[:, 1]
        x1 = pred_light_direction_[:, 2]
        y1 = pred_light_direction_[:, 3]

        pred_angle = torch.atan2(y2 - y1, x2 - x1)
        gt_angle = torch.atan2(
            gt_light_direction_[:, 3] - gt_light_direction_[:, 1],
            gt_light_direction_[:, 2] - gt_light_direction_[:, 0])

        loss_light_reg = smooth_l1_loss(
            pred_angle,
            gt_angle,
            smooth_l1_beta,
            reduction='sum',
        )
        loss_light_reg = loss_light_reg / gt_classes.numel()
    else:
        loss_light_reg = None
    # The loss is normalized using the total number of regions (R), not the number
    # of foreground regions even though the box regression loss is only defined on
    # foreground regions. Why? Because doing so gives equal training influence to
    # each foreground example. To see how, consider two different minibatches:
    #  (1) Contains a single foreground region
    #  (2) Contains 100 foreground regions
    # If we normalize by the number of foreground regions, the single example in
    # minibatch (1) will be given 100 times as much influence as each foreground
    # example in minibatch (2). Normalizing by the total number of regions, R,
    # means that the single example in minibatch (1) and each of the 100 examples
    # in minibatch (2) are given equal influence.
    loss_box_reg = loss_box_reg / gt_classes.numel()

    return loss_cls, loss_box_reg, loss_light_reg
Exemple #29
0
    def losses(
        self,
        anchors: List[Boxes],
        pred_objectness_logits: List[torch.Tensor],
        gt_labels: List[torch.Tensor],
        pred_anchor_deltas: List[torch.Tensor],
        gt_boxes: List[torch.Tensor],
    ) -> Dict[str, torch.Tensor]:
        """
        Return the losses from a set of RPN predictions and their associated ground-truth.

        Args:
            anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each
                has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
            pred_objectness_logits (list[Tensor]): A list of L elements.
                Element i is a tensor of shape (N, Hi*Wi*A) representing
                the predicted objectness logits for all anchors.
            gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
            pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape
                (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors
                to proposals.
            gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
                Loss names are: `loss_rpn_cls` for objectness classification and
                `loss_rpn_loc` for proposal localization.
        """
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))

        # Log the number of positive/negative anchors per-image that's used in training
        pos_mask = gt_labels == 1
        num_pos_anchors = pos_mask.sum().item()
        num_neg_anchors = (gt_labels == 0).sum().item()
        storage = get_event_storage()
        storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
        storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)

        if self.box_reg_loss_type == "smooth_l1":
            anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
            gt_anchor_deltas = [
                self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes
            ]
            gt_anchor_deltas = torch.stack(
                gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
            localization_loss = smooth_l1_loss(
                cat(pred_anchor_deltas, dim=1)[pos_mask],
                gt_anchor_deltas[pos_mask],
                self.smooth_l1_beta,
                reduction="sum",
            )
        elif self.box_reg_loss_type == "giou":
            pred_proposals = self._decode_proposals(anchors,
                                                    pred_anchor_deltas)
            pred_proposals = cat(pred_proposals, dim=1)
            pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
            pos_mask = pos_mask.view(-1)
            localization_loss = giou_loss(pred_proposals[pos_mask],
                                          cat(gt_boxes)[pos_mask],
                                          reduction="sum")
        elif self.box_reg_loss_type == "diou":
            anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
            gt_anchor_deltas = [
                self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes
            ]
            gt_anchor_deltas = torch.stack(
                gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
            localization_loss = compute_diou(
                cat(pred_anchor_deltas, dim=1)[pos_mask],
                gt_anchor_deltas[pos_mask], self.box2box_transform.weights,
                self.box2box_transform.scale_clamp)
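            # Note: unlike the "giou" branch, this DIoU variant operates on the
            # raw regression deltas rather than on decoded proposal boxes.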
        elif self.box_reg_loss_type == "diou_mmdet":
            pred_proposals = self._decode_proposals(anchors,
                                                    pred_anchor_deltas)
            pred_proposals = cat(pred_proposals, dim=1)
            pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
            pos_mask = pos_mask.view(-1)
            localization_loss = compute_diou_mmdet(pred_proposals[pos_mask],
                                                   cat(gt_boxes)[pos_mask])
        elif self.box_reg_loss_type == "ciou_mmdet":
            pred_proposals = self._decode_proposals(anchors,
                                                    pred_anchor_deltas)
            pred_proposals = cat(pred_proposals, dim=1)
            pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
            pos_mask = pos_mask.view(-1)
            localization_loss = compute_ciou_mmdet(pred_proposals[pos_mask],
                                                   cat(gt_boxes)[pos_mask])
        else:
            raise ValueError(
                f"Invalid rpn box reg loss type '{self.box_reg_loss_type}'")

        valid_mask = gt_labels >= 0
        objectness_loss = F.binary_cross_entropy_with_logits(
            cat(pred_objectness_logits, dim=1)[valid_mask],
            gt_labels[valid_mask].to(torch.float32),
            reduction="sum",
        )
        normalizer = self.batch_size_per_image * num_images
        losses = {
            "loss_rpn_cls": objectness_loss / normalizer,
            "loss_rpn_loc": localization_loss / normalizer,
        }
        losses = {
            k: v * self.loss_weight.get(k, 1.0)
            for k, v in losses.items()
        }
        return losses
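For the smooth_l1 branch above, the regression targets come from the anchor-to-gt box transform. A small self-contained sketch with made-up boxes, assuming detectron2's Box2BoxTransform and fvcore's smooth_l1_loss:

import torch
from detectron2.modeling.box_regression import Box2BoxTransform
from fvcore.nn import smooth_l1_loss

transform = Box2BoxTransform(weights=(1.0, 1.0, 1.0, 1.0))

anchors = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                        [5.0, 5.0, 15.0, 15.0]])   # XYXY, hypothetical
gt_boxes = torch.tensor([[1.0, 1.0, 11.0, 11.0],
                         [5.0, 5.0, 15.0, 15.0]])

# (dx, dy, dw, dh) targets; the second anchor already matches its gt box.
gt_deltas = transform.get_deltas(anchors, gt_boxes)

# Stand-in for the RPN head output (values are hypothetical).
pred_deltas = torch.zeros_like(gt_deltas)

loss = smooth_l1_loss(pred_deltas, gt_deltas, beta=0.0, reduction="sum")
# In the method above this sum is divided by batch_size_per_image * num_images.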
Exemple #30
0
    def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes):
        """
        Args:
            anchors (list[Boxes]): a list of #feature level Boxes
            gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
                Their shapes are (N, R) and (N, R, 4), respectively, where R is
                the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
            pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
                list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4),
                where K is the number of classes used in `pred_logits`.

        Returns:
            dict[str, Tensor]:
                mapping from a named loss to a scalar tensor
                storing the loss. Used during training only. The dict keys are:
                "loss_cls" and "loss_box_reg"
        """
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (N, R)
        anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
        gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)

        valid_mask = gt_labels >= 0
        pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
        num_pos_anchors = pos_mask.sum().item()
        get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
        self.loss_normalizer = self.loss_normalizer_momentum * self.loss_normalizer + (
            1 - self.loss_normalizer_momentum
        ) * max(num_pos_anchors, 1)

        # classification and regression loss
        """gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
            :, :-1
        ]  # no loss for the last (background) class
        loss_cls = sigmoid_focal_loss_jit(
            cat(pred_logits, dim=1)[valid_mask],
            gt_labels_target.to(pred_logits[0].dtype),
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )"""
        gt_labels_target = gt_labels[valid_mask]
        pred_logits = cat(pred_logits, dim=1)[valid_mask]
        # Count how many valid anchors each class received in this batch.
        unique_labels, count = torch.unique(gt_labels_target, return_counts=True)
        samples_per_cls = torch.zeros(self.num_classes + 1, dtype=torch.int64,
                                      device=pred_logits.device)
        samples_per_cls[unique_labels] = count
        loss_cls = CB_loss(
            gt_labels_target,
            pred_logits,
            samples_per_cls=samples_per_cls,
            no_of_classes=self.num_classes,
            loss_type="focal",
            beta=self.cb_loss_beta,
            gamma=self.focal_loss_gamma
        )

        if self.box_reg_loss_type == "smooth_l1":
            loss_box_reg = smooth_l1_loss(
                cat(pred_anchor_deltas, dim=1)[pos_mask],
                gt_anchor_deltas[pos_mask],
                beta=self.smooth_l1_loss_beta,
                reduction="sum",
            )
        elif self.box_reg_loss_type == "giou":
            pred_boxes = [
                self.box2box_transform.apply_deltas(k, anchors)
                for k in cat(pred_anchor_deltas, dim=1)
            ]
            loss_box_reg = giou_loss(
                torch.stack(pred_boxes)[pos_mask], torch.stack(gt_boxes)[pos_mask], reduction="sum"
            )
        else:
            raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

        return {
            "loss_cls": loss_cls,
            "loss_box_reg": loss_box_reg / self.loss_normalizer,
        }
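The loss_normalizer used above is an exponential moving average of the per-batch foreground-anchor count, so normalization stays stable even when a batch happens to contain few positives. A toy illustration of the update rule (momentum and counts are hypothetical):

momentum = 0.9           # plays the role of loss_normalizer_momentum
normalizer = 100.0       # running value carried over from earlier iterations

for num_pos_anchors in (80, 120, 0):
    normalizer = momentum * normalizer + (1 - momentum) * max(num_pos_anchors, 1)
    print(round(normalizer, 2))  # 98.0, 100.2, 90.28

# max(num_pos_anchors, 1) keeps the normalizer positive even for batches
# with no foreground anchors, so the division above never blows up.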