Example #1
    def losses(
        self,
        anchors,
        pred_objectness_logits: List[torch.Tensor],
        gt_labels: List[torch.Tensor],
        pred_anchor_deltas: List[torch.Tensor],
        gt_boxes,
    ):
        """
        Return the losses from a set of RPN predictions and their associated ground-truth.

        Args:
            anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each
                has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
            pred_objectness_logits (list[Tensor]): A list of L elements.
                Element i is a tensor of shape (N, Hi*Wi*A) representing
                the predicted objectness logits for all anchors.
            gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
            pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape
                (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors
                to proposals.
            gt_boxes (list[Boxes or RotatedBoxes]): Output of :meth:`label_and_sample_anchors`.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
                Loss names are: `loss_rpn_cls` for objectness classification and
                `loss_rpn_loc` for proposal localization.
        """
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))
        anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
        gt_anchor_deltas = [
            self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes
        ]
        gt_anchor_deltas = torch.stack(
            gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)

        # Log the number of positive/negative anchors per-image that's used in training
        pos_mask = (gt_labels == 1) | (gt_labels == -2)
        num_pos_anchors = pos_mask.sum().item()
        num_neg_anchors = (gt_labels == 0).sum().item()
        storage = get_event_storage()
        storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
        storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)

        localization_loss = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            self.smooth_l1_beta,
            reduction="sum",
        )

        valid_mask = gt_labels >= 0
        p = torch.sigmoid(cat(pred_objectness_logits, dim=1)[valid_mask])
        gt_target = gt_labels[valid_mask].to(torch.float32)
        ce_loss = F.binary_cross_entropy_with_logits(
            cat(pred_objectness_logits, dim=1)[valid_mask],
            gt_target,
            reduction="none",
        )
        p_t = p * gt_target + (1 - p) * (1 - gt_target)
        focal_loss = ce_loss * ((1 - p_t)**self.focal_loss_gamma)
        if self.focal_loss_alpha >= 0:
            alpha_t = self.focal_loss_alpha * gt_target + (
                1 - self.focal_loss_alpha) * (1 - gt_target)
            focal_loss = alpha_t * focal_loss
        # Sum over sampled anchors; focal_loss stays unweighted when alpha < 0.
        objectness_loss = focal_loss.sum()

        normalizer = self.batch_size_per_image * num_images
        return {
            "loss_rpn_cls": objectness_loss / normalizer,
            "loss_rpn_loc": localization_loss / normalizer,
        }
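The classification term above is a hand-written sigmoid focal loss. As a standalone reference, here is a minimal sketch of the same computation on dummy logits and binary targets (the function name and the alpha/gamma defaults are illustrative, not taken from this example's config):

import torch
import torch.nn.functional as F

def sigmoid_focal_loss_sketch(logits, targets, alpha=0.25, gamma=2.0):
    # Per-element binary cross entropy, kept unreduced.
    ce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p = torch.sigmoid(logits)
    # p_t is the probability assigned to the ground-truth label.
    p_t = p * targets + (1 - p) * (1 - targets)
    loss = ce_loss * (1 - p_t) ** gamma
    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss
    return loss.sum()

logits = torch.randn(8)
targets = (torch.rand(8) > 0.5).float()
print(sigmoid_focal_loss_sketch(logits, targets))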
Example #2
    def inference_single_image(self, locations, box_cls, center_score,
                               box_reg_init, box_reg, image_size):
        boxes_all = []
        scores_all = []
        class_idxs_all = []

        # Iterate over every feature level
        for box_cls_i, box_reg_i, locs_i, center_score_i in zip(
                box_cls, box_reg, locations, center_score):
            # (HxW, C)
            box_cls_i = box_cls_i.sigmoid_()
            keep_idxs = box_cls_i > self.pre_nms_thresh

            # multiply the classification scores with center scores
            box_cls_i *= center_score_i.sigmoid_()

            box_cls_i = box_cls_i[keep_idxs]
            keep_idxs_nonzero_i = keep_idxs.nonzero()

            box_loc_i = keep_idxs_nonzero_i[:, 0]
            class_i = keep_idxs_nonzero_i[:, 1]

            box_reg_i = box_reg_i[box_loc_i]
            locs_i = locs_i[box_loc_i]

            per_pre_nms_top_n = keep_idxs.sum().clamp(max=self.pre_nms_top_n)
            if keep_idxs.sum().item() > per_pre_nms_top_n.item():
                box_cls_i, topk_idxs = box_cls_i.topk(per_pre_nms_top_n,
                                                      sorted=False)

                class_i = class_i[topk_idxs]
                box_reg_i = box_reg_i[topk_idxs]
                locs_i = locs_i[topk_idxs]

            # predict boxes
            predicted_boxes = torch.stack([
                locs_i[:, 0] - box_reg_i[:, 0],
                locs_i[:, 1] - box_reg_i[:, 1],
                locs_i[:, 0] + box_reg_i[:, 2],
                locs_i[:, 1] + box_reg_i[:, 3],
            ], dim=1)
            box_cls_i = torch.sqrt(box_cls_i)

            boxes_all.append(predicted_boxes)
            scores_all.append(box_cls_i)
            class_idxs_all.append(class_i)

        boxes_all, scores_all, class_idxs_all = [
            cat(x) for x in [boxes_all, scores_all, class_idxs_all]
        ]

        # Apply per-class nms for each image
        keep = batched_nms(boxes_all, scores_all, class_idxs_all,
                           self.nms_threshold)
        keep = keep[:self.max_detections_per_image]

        result = Instances(image_size)
        result.pred_boxes = Boxes(boxes_all[keep])
        result.scores = scores_all[keep]
        result.pred_classes = class_idxs_all[keep]

        return result
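The predicted_boxes construction above decodes FCOS-style (l, t, r, b) distances into absolute XYXY boxes around each sampling location. A small self-contained sketch of that decoding step (tensor names and values are illustrative):

import torch

def decode_ltrb(locations, distances):
    # locations: (N, 2) center points (x, y); distances: (N, 4) as (l, t, r, b)
    x, y = locations[:, 0], locations[:, 1]
    l, t, r, b = distances.unbind(dim=1)
    return torch.stack([x - l, y - t, x + r, y + b], dim=1)

locs = torch.tensor([[10.0, 20.0], [32.0, 32.0]])
dists = torch.tensor([[3.0, 4.0, 5.0, 6.0], [8.0, 8.0, 8.0, 8.0]])
print(decode_ltrb(locs, dists))  # one XYXY box per location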
Example #3
    def forward(self, x, box_lists):
        assert not self.training

        pooler_fmt_boxes = self.c2_preprocess(box_lists)
        num_level_assignments = len(self.level_poolers)

        if num_level_assignments == 1:
            if isinstance(self.level_poolers[0], ROIAlignRotated):
                c2_roi_align = torch.ops._caffe2.RoIAlignRotated
                aligned = True
            else:
                c2_roi_align = torch.ops._caffe2.RoIAlign
                aligned = self.level_poolers[0].aligned

            x0 = x[0]
            if x0.is_quantized:
                x0 = x0.dequantize()

            out = c2_roi_align(
                x0,
                pooler_fmt_boxes,
                order="NCHW",
                spatial_scale=float(self.level_poolers[0].spatial_scale),
                pooled_h=int(self.output_size[0]),
                pooled_w=int(self.output_size[1]),
                sampling_ratio=int(self.level_poolers[0].sampling_ratio),
                aligned=aligned,
            )
            return out

        device = pooler_fmt_boxes.device
        assert (
            self.max_level - self.min_level + 1 == 4
        ), "Currently DistributeFpnProposals only supports 4 levels"
        fpn_outputs = torch.ops._caffe2.DistributeFpnProposals(
            to_device(pooler_fmt_boxes, "cpu"),
            roi_canonical_scale=self.canonical_box_size,
            roi_canonical_level=self.canonical_level,
            roi_max_level=self.max_level,
            roi_min_level=self.min_level,
            legacy_plus_one=False,
        )
        fpn_outputs = [to_device(x, device) for x in fpn_outputs]

        rois_fpn_list = fpn_outputs[:-1]
        rois_idx_restore_int32 = fpn_outputs[-1]

        roi_feat_fpn_list = []
        for roi_fpn, x_level, pooler in zip(rois_fpn_list, x,
                                            self.level_poolers):
            if isinstance(pooler, ROIAlignRotated):
                c2_roi_align = torch.ops._caffe2.RoIAlignRotated
                aligned = True
            else:
                c2_roi_align = torch.ops._caffe2.RoIAlign
                aligned = bool(pooler.aligned)

            if x_level.is_quantized:
                x_level = x_level.dequantize()

            roi_feat_fpn = c2_roi_align(
                x_level,
                roi_fpn,
                order="NCHW",
                spatial_scale=float(pooler.spatial_scale),
                pooled_h=int(self.output_size[0]),
                pooled_w=int(self.output_size[1]),
                sampling_ratio=int(pooler.sampling_ratio),
                aligned=aligned,
            )
            roi_feat_fpn_list.append(roi_feat_fpn)

        roi_feat_shuffled = cat(roi_feat_fpn_list, dim=0)
        assert (
            roi_feat_shuffled.numel() > 0 and rois_idx_restore_int32.numel() > 0
        ), (
            "Caffe2 export requires tracing with a model checkpoint + input that can produce valid"
            " detections. But no detections were obtained with the given checkpoint and input!"
        )
        roi_feat = torch.ops._caffe2.BatchPermutation(roi_feat_shuffled,
                                                      rois_idx_restore_int32)
        return roi_feat
Example #4
def _fmt_box_list(box_tensor, batch_index: int):
    repeated_index = torch.full_like(box_tensor[:, :1],
                                     batch_index,
                                     dtype=box_tensor.dtype,
                                     device=box_tensor.device)
    return cat((repeated_index, box_tensor), dim=1)
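The helper above only prepends the batch index as an extra column, producing the (batch_idx, x1, y1, x2, y2) layout that the Caffe2 RoIAlign ops expect. A tiny standalone demo, using torch.cat directly in place of the module's cat alias:

import torch

boxes = torch.tensor([[10.0, 10.0, 50.0, 60.0],
                      [20.0, 30.0, 40.0, 80.0]])
batch_index = 3
repeated_index = torch.full_like(boxes[:, :1], batch_index)
rois = torch.cat((repeated_index, boxes), dim=1)
print(rois)
# tensor([[ 3., 10., 10., 50., 60.],
#         [ 3., 20., 30., 40., 80.]])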
Example #5
def mask_rcnn_loss(pred_mask_logits, instances, vis_period=0):
    """
    Compute the mask prediction loss defined in the Mask R-CNN paper.

    Args:
        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
            for class-specific or class-agnostic, where B is the total number of predicted masks
            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
            and width of the mask predictions. The values are logits.
        instances (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. These instances are in 1:1
            correspondence with the pred_mask_logits. The ground-truth labels (class, box, mask,
            ...) associated with each instance are stored in fields.
        vis_period (int): the period (in steps) to dump visualization.

    Returns:
        mask_loss (Tensor): A scalar tensor containing the loss.
    """
    cls_agnostic_mask = pred_mask_logits.size(1) == 1
    total_num_masks = pred_mask_logits.size(0)
    mask_side_len = pred_mask_logits.size(2)
    assert pred_mask_logits.size(2) == pred_mask_logits.size(
        3), "Mask prediction must be square!"

    gt_classes = []
    gt_masks = []
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        if not cls_agnostic_mask:
            gt_classes_per_image = instances_per_image.gt_classes.to(
                dtype=torch.int64)
            gt_classes.append(gt_classes_per_image)

        gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
            instances_per_image.proposal_boxes.tensor,
            mask_side_len).to(device=pred_mask_logits.device)
        # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
        gt_masks.append(gt_masks_per_image)

    if len(gt_masks) == 0:
        return pred_mask_logits.sum() * 0

    gt_masks = cat(gt_masks, dim=0)

    if cls_agnostic_mask:
        pred_mask_logits = pred_mask_logits[:, 0]
    else:
        indices = torch.arange(total_num_masks)
        gt_classes = cat(gt_classes, dim=0)
        pred_mask_logits = pred_mask_logits[indices, gt_classes]

    if gt_masks.dtype == torch.bool:
        gt_masks_bool = gt_masks
    else:
        # Here we allow gt_masks to be float as well (depend on the implementation of rasterize())
        gt_masks_bool = gt_masks > 0.5
    gt_masks = gt_masks.to(dtype=torch.float32)

    # Log the training accuracy (using gt classes and 0.5 threshold)
    mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
    mask_accuracy = 1 - (mask_incorrect.sum().item() /
                         max(mask_incorrect.numel(), 1.0))
    num_positive = gt_masks_bool.sum().item()
    false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
        gt_masks_bool.numel() - num_positive, 1.0)
    false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(
        num_positive, 1.0)

    storage = get_event_storage()
    storage.put_scalar("mask_rcnn/accuracy", mask_accuracy)
    storage.put_scalar("mask_rcnn/false_positive", false_positive)
    storage.put_scalar("mask_rcnn/false_negative", false_negative)
    if vis_period > 0 and storage.iter % vis_period == 0:
        pred_masks = pred_mask_logits.sigmoid()
        vis_masks = torch.cat([pred_masks, gt_masks], axis=2)
        name = "Left: mask prediction;   Right: mask GT"
        for idx, vis_mask in enumerate(vis_masks):
            vis_mask = torch.stack([vis_mask] * 3, axis=0)
            storage.put_image(name + f" ({idx})", vis_mask)

    mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits,
                                                   gt_masks,
                                                   reduction="mean")
    return mask_loss
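In the class-specific branch above, pred_mask_logits[indices, gt_classes] keeps, for each predicted instance, only the mask channel of its ground-truth class. A minimal sketch of that gather on dummy shapes (all sizes are illustrative):

import torch

total_num_masks, num_classes, m = 4, 3, 2
pred_mask_logits = torch.randn(total_num_masks, num_classes, m, m)
gt_classes = torch.tensor([2, 0, 1, 2])

indices = torch.arange(total_num_masks)
per_class_logits = pred_mask_logits[indices, gt_classes]
# Row i now holds the (m, m) mask logits of class gt_classes[i].
assert per_class_logits.shape == (total_num_masks, m, m)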
Example #6
    def losses(self):
        """
        Return the losses from a set of FCOS predictions and their associated ground-truth.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
        """

        training_targets = self._get_ground_truth()
        labels, reg_targets, bezier_targets = (
            training_targets["labels"], training_targets["reg_targets"],
            training_targets["bezier_targets"])

        # Collect all logits and regression predictions over feature maps
        # and images to arrive at the same shape as the labels and targets
        # The final ordering is L, N, H, W from slowest to fastest axis.
        logits_pred = cat(
            [
                # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
                x.permute(0, 2, 3, 1).reshape(-1, self.num_classes)
                for x in self.logits_pred
            ],
            dim=0,
        )
        reg_pred = cat(
            [
                # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B)
                x.permute(0, 2, 3, 1).reshape(-1, 4) for x in self.reg_pred
            ],
            dim=0,
        )
        bezier_pred = cat(
            [
                # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B)
                x.permute(0, 2, 3, 1).reshape(-1, 16) for x in self.bezier_pred
            ],
            dim=0,
        )
        ctrness_pred = cat(
            [
                # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
                x.reshape(-1) for x in self.ctrness_pred
            ],
            dim=0,
        )

        labels = cat(
            [
                # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
                x.reshape(-1) for x in labels
            ],
            dim=0,
        )

        bezier_targets = cat(
            [
                # Reshape: (N, Hi, Wi, 16) -> (N*Hi*Wi, 16)
                x.reshape(-1, 16) for x in bezier_targets
            ],
            dim=0,
        )

        reg_targets = cat(
            [
                # Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4)
                x.reshape(-1, 4) for x in reg_targets
            ],
            dim=0,
        )

        return fcos_losses(
            labels,
            reg_targets,
            bezier_targets,
            logits_pred,
            reg_pred,
            bezier_pred,
            ctrness_pred,
            self.focal_loss_alpha,
            self.focal_loss_gamma,
            self.iou_loss,
        )
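Every cat([...]) call above relies on the same flattening pattern: move the channel axis last, then collapse the batch and spatial axes so each location becomes one row. A short sketch of that reshape for an assumed (N, C, H, W) prediction map:

import torch

N, C, H, W = 2, 5, 3, 4
x = torch.randn(N, C, H, W)

# (N, C, H, W) -> (N, H, W, C) -> (N*H*W, C); rows are ordered N, H, W
flat = x.permute(0, 2, 3, 1).reshape(-1, C)
assert flat.shape == (N * H * W, C)

# Sanity check: the first row is image 0, location (0, 0)
assert torch.equal(flat[0], x[0, :, 0, 0])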
Example #7
def find_top_rpn_proposals(
    proposals: List[torch.Tensor],
    pred_objectness_logits: List[torch.Tensor],
    image_sizes: List[Tuple[int, int]],
    nms_thresh: float,
    pre_nms_topk: int,
    post_nms_topk: int,
    min_box_size: float,
    training: bool,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps for each image.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
            All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
        image_sizes (list[tuple]): sizes (h, w) for each image
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is per
            feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is total,
            over all feature maps.
        min_box_size (float): minimum proposal box side length in pixels (absolute units
            wrt input images).
        training (bool): True if proposals are to be used in training, otherwise False.
            This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
            comment.

    Returns:
        list[Instances]: list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i, sorted by their
            objectness score in descending order.
    """
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select top-k anchor for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, (proposals_i, logits_i) in enumerate(
            zip(proposals, pred_objectness_logits)):
        Hi_Wi_A = logits_i.shape[1]
        if isinstance(Hi_Wi_A, torch.Tensor):  # it's a tensor in tracing
            num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk)
        else:
            num_proposals_i = min(Hi_Wi_A, pre_nms_topk)

        # sort is faster than topk: https://github.com/pytorch/pytorch/issues/22812
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i.narrow(1, 0, num_proposals_i)
        topk_idx = idx.narrow(1, 0, num_proposals_i)

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None],
                                       topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i, ),
                       level_id,
                       dtype=torch.int64,
                       device=device))

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results: List[Instances] = []
    for n, image_size in enumerate(image_sizes):
        boxes = Boxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        lvl = level_ids

        valid_mask = torch.isfinite(
            boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
        if not valid_mask.all():
            if training:
                raise FloatingPointError(
                    "Predicted boxes or scores contain Inf/NaN. Training has diverged."
                )
            boxes = boxes[valid_mask]
            scores_per_img = scores_per_img[valid_mask]
            lvl = lvl[valid_mask]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_size)
        if _is_tracing() or keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep]

        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up depending on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]  # keep is already sorted

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
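Step 1 above keeps the top pre_nms_topk proposals per level with sort followed by narrow (noted in the comment as faster than topk). A stripped-down sketch of that selection for a single level, with made-up shapes:

import torch

N, num_anchors, pre_nms_topk = 2, 10, 4
logits = torch.randn(N, num_anchors)
proposals = torch.randn(N, num_anchors, 4)

num_proposals = min(num_anchors, pre_nms_topk)
sorted_logits, idx = logits.sort(descending=True, dim=1)
topk_scores = sorted_logits.narrow(1, 0, num_proposals)   # (N, topk)
topk_idx = idx.narrow(1, 0, num_proposals)                # (N, topk)

batch_idx = torch.arange(N)
topk_proposals = proposals[batch_idx[:, None], topk_idx]  # (N, topk, 4)
print(topk_scores.shape, topk_proposals.shape)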
Example #8
    def forward(self, images, features, proposals, targets=None):
        """
        see detectron2.modeling.ROIHeads
        """
        del images
        features = [features[f] for f in self.in_features]

        if self.coordconv:
            mask_features = []
            for i in range(len(features)):
                mask_feat = self.mask_head(features[i])
                all_feat = mask_feat + features[i]
                mask_features.append(all_feat)
            features = mask_features

        if self.training:
            beziers = [p.beziers for p in targets]
            if not self.aet:
                targets = torch.cat([x.text for x in targets], dim=0)
            else:
                beziers2 = [p.top_feat for p in proposals]
                for k in range(len(targets)):
                    rec_assign = [
                        int(
                            torch.argmin(
                                torch.abs(beziers[k] -
                                          beziers2[k][i]).sum(dim=1)))
                        for i in range(len(beziers2[k]))
                    ]
                    targets[k] = torch.cat(
                        [targets[k].text, targets[k].text[rec_assign]], dim=0)
                targets = torch.cat([x for x in targets], dim=0)
                cat_beziers = []
                for ix in range(len(beziers)):
                    cat_beziers.append(cat((beziers[ix], beziers2[ix]), dim=0))
                beziers = cat_beziers
        else:
            beziers = [p.top_feat for p in proposals]
        bezier_features = self.pooler(features, beziers)
        bezier_features = self.tower(bezier_features)

        # TODO: move this part to recognizer
        if self.training:
            preds, rec_loss = self.recognizer(bezier_features, targets)
            rec_loss *= 0.05
            losses = {'rec_loss': rec_loss}
            return None, losses
        else:
            if bezier_features.size(0) == 0:
                for box in proposals:
                    box.beziers = box.top_feat
                    box.recs = box.top_feat
                return proposals, {}
            preds, _ = self.recognizer(bezier_features, targets)
            start_ind = 0
            for proposals_per_im in proposals:
                end_ind = start_ind + len(proposals_per_im)
                proposals_per_im.recs = preds[start_ind:end_ind]
                proposals_per_im.beziers = proposals_per_im.top_feat
                start_ind = end_ind
            return proposals, {}
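The rec_assign list comprehension above matches each proposal's top_feat control points to the closest ground-truth bezier by summed absolute difference. The same assignment can be sketched in vectorized form (tensor names and sizes are illustrative):

import torch

gt_beziers = torch.randn(5, 16)    # 5 ground-truth curves, 16 control-point coordinates
pred_beziers = torch.randn(3, 16)  # 3 proposals

# L1 distance between every proposal and every ground truth: (3, 5)
dists = (pred_beziers[:, None, :] - gt_beziers[None, :, :]).abs().sum(dim=-1)
rec_assign = dists.argmin(dim=1)   # index of the closest ground truth per proposal
print(rec_assign)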
Example #9
    def forward(self, indices, gt_instances, anchors, pred_class_logits,
                pred_anchor_deltas):
        pred_class_logits = cat(pred_class_logits,
                                dim=1).view(-1, self.num_classes)
        pred_anchor_deltas = cat(pred_anchor_deltas, dim=1).view(-1, 4)

        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
        N = len(anchors)
        # list[Tensor(R, 4)], one for each image
        all_anchors = Boxes.cat(anchors).tensor
        # Boxes(Tensor(N*R, 4))
        predicted_boxes = self.box2box_transform.apply_deltas(
            pred_anchor_deltas, all_anchors)
        predicted_boxes = predicted_boxes.reshape(N, -1, 4)

        ious = []
        pos_ious = []
        for i in range(N):
            src_idx, tgt_idx = indices[i]
            iou = box_iou(predicted_boxes[i, ...],
                          gt_instances[i].gt_boxes.tensor)
            if iou.numel() == 0:
                max_iou = iou.new_full((iou.size(0), ), 0)
            else:
                max_iou = iou.max(dim=1)[0]
            a_iou = box_iou(anchors[i].tensor, gt_instances[i].gt_boxes.tensor)
            if a_iou.numel() == 0:
                pos_iou = a_iou.new_full((0, ), 0)
            else:
                pos_iou = a_iou[src_idx, tgt_idx]
            ious.append(max_iou)
            pos_ious.append(pos_iou)
        ious = torch.cat(ious)
        ignore_idx = ious > self.neg_ignore_thresh
        pos_ious = torch.cat(pos_ious)
        pos_ignore_idx = pos_ious < self.pos_ignore_thresh

        src_idx = torch.cat([
            src + idx * anchors[0].tensor.shape[0]
            for idx, (src, _) in enumerate(indices)
        ])
        gt_classes = torch.full(pred_class_logits.shape[:1],
                                self.num_classes,
                                dtype=torch.int64,
                                device=pred_class_logits.device)
        gt_classes[ignore_idx] = -1
        target_classes_o = torch.cat(
            [t.gt_classes[J] for t, (_, J) in zip(gt_instances, indices)])
        target_classes_o[pos_ignore_idx] = -1
        gt_classes[src_idx] = target_classes_o

        valid_idxs = gt_classes >= 0
        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
        num_foreground = foreground_idxs.sum()

        gt_classes_target = torch.zeros_like(pred_class_logits)
        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

        if comm.get_world_size() > 1:
            dist.all_reduce(num_foreground)
        num_foreground = num_foreground * 1.0 / comm.get_world_size()

        # cls loss
        loss_cls = sigmoid_focal_loss_jit(
            pred_class_logits[valid_idxs],
            gt_classes_target[valid_idxs],
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )
        # reg loss
        target_boxes = torch.cat(
            [t.gt_boxes.tensor[i] for t, (_, i) in zip(gt_instances, indices)],
            dim=0)
        target_boxes = target_boxes[~pos_ignore_idx]
        matched_predicted_boxes = predicted_boxes.reshape(
            -1, 4)[src_idx[~pos_ignore_idx]]
        loss_box_reg = giou_loss(matched_predicted_boxes,
                                 target_boxes,
                                 reduction="sum")

        return {
            "loss_cls": loss_cls / max(1, num_foreground),
            "loss_box_reg": loss_box_reg / max(1, num_foreground),
        }
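The src_idx construction above turns per-image match indices into indices over the flattened (N*R) prediction tensor by offsetting each image's indices with image_index * R. A tiny sketch of that bookkeeping, assuming a constant R anchors per image:

import torch

R = 100  # anchors per image (assumed constant across the batch)
# (src, tgt) index pairs per image, as a matcher would produce them
indices = [(torch.tensor([3, 7]), torch.tensor([0, 1])),
           (torch.tensor([5]), torch.tensor([0]))]

src_idx = torch.cat([
    src + img_id * R for img_id, (src, _) in enumerate(indices)
])
print(src_idx)  # tensor([  3,   7, 105])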
Example #10
    def losses(self):
        """
        Return the losses from a set of RPN predictions and their associated ground-truth.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
                Loss names are: `loss_rpn_cls` for objectness classification and
                `loss_rpn_loc` for proposal localization.
        """

        def resample(label):
            """
            Randomly sample a subset of positive and negative examples by overwriting
            the label vector to the ignore value (-1) for all elements that are not
            included in the sample.
            """
            pos_idx, neg_idx = subsample_labels(
                label, self.batch_size_per_image, self.positive_fraction, 0
            )
            # Fill with the ignore label (-1), then set positive and negative labels
            label.fill_(-1)
            label.scatter_(0, pos_idx, 1)
            label.scatter_(0, neg_idx, 0)
            return label

        gt_objectness_logits, gt_anchor_deltas = self._get_ground_truth()
        """
        gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is the
            total number of anchors in image i (i.e., len(anchors[i]))
        gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), B),
            where B is the box dimension
        """
        # Collect all objectness labels and delta targets over feature maps and images
        # The final ordering is L, N, H, W, A from slowest to fastest axis.
        num_anchors_per_map = [np.prod(x.shape[1:]) for x in self.pred_objectness_logits]
        num_anchors_per_image = sum(num_anchors_per_map)

        # Stack to: (N, num_anchors_per_image)
        gt_objectness_logits = torch.stack(
            [resample(label) for label in gt_objectness_logits], dim=0
        )

        # Log the number of positive/negative anchors per-image that's used in training
        num_pos_anchors = (gt_objectness_logits == 1).sum().item()
        num_neg_anchors = (gt_objectness_logits == 0).sum().item()
        storage = get_event_storage()
        storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / self.num_images)
        storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / self.num_images)

        assert gt_objectness_logits.shape[1] == num_anchors_per_image
        # Split to tuple of L tensors, each with shape (N, num_anchors_per_map)
        gt_objectness_logits = torch.split(gt_objectness_logits, num_anchors_per_map, dim=1)
        # Concat from all feature maps
        gt_objectness_logits = cat([x.flatten() for x in gt_objectness_logits], dim=0)

        # Stack to: (N, num_anchors_per_image, B)
        gt_anchor_deltas = torch.stack(gt_anchor_deltas, dim=0)
        assert gt_anchor_deltas.shape[1] == num_anchors_per_image
        B = gt_anchor_deltas.shape[2]  # box dimension (4 or 5)

        # Split to tuple of L tensors, each with shape (N, num_anchors_per_image)
        gt_anchor_deltas = torch.split(gt_anchor_deltas, num_anchors_per_map, dim=1)
        # Concat from all feature maps
        gt_anchor_deltas = cat([x.reshape(-1, B) for x in gt_anchor_deltas], dim=0)

        # Collect all objectness logits and delta predictions over feature maps
        # and images to arrive at the same shape as the labels and targets
        # The final ordering is L, N, H, W, A from slowest to fastest axis.
        pred_objectness_logits = cat(
            [
                # Reshape: (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N*Hi*Wi*A, )
                x.permute(0, 2, 3, 1).flatten()
                for x in self.pred_objectness_logits
            ],
            dim=0,
        )
        pred_anchor_deltas = cat(
            [
                # Reshape: (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B)
                #          -> (N*Hi*Wi*A, B)
                x.view(x.shape[0], -1, B, x.shape[-2], x.shape[-1])
                .permute(0, 3, 4, 1, 2)
                .reshape(-1, B)
                for x in self.pred_anchor_deltas
            ],
            dim=0,
        )

        objectness_loss, localization_loss = rpn_losses(
            gt_objectness_logits,
            gt_anchor_deltas,
            pred_objectness_logits,
            pred_anchor_deltas,
            self.smooth_l1_beta,
        )
        normalizer = 1.0 / (self.batch_size_per_image * self.num_images)
        loss_cls = objectness_loss * normalizer  # cls: classification loss
        loss_loc = localization_loss * normalizer  # loc: localization loss
        losses = {"loss_rpn_cls": loss_cls, "loss_rpn_loc": loss_loc}

        return losses
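The resample helper above keeps a random subset of positives and negatives and marks everything else as ignore (-1). A minimal sketch of that subsampling logic written without detectron2's subsample_labels (the batch size and positive fraction below are illustrative defaults):

import torch

def resample_sketch(label, batch_size_per_image=8, positive_fraction=0.5):
    pos = torch.nonzero(label == 1).squeeze(1)
    neg = torch.nonzero(label == 0).squeeze(1)
    num_pos = min(pos.numel(), int(batch_size_per_image * positive_fraction))
    num_neg = min(neg.numel(), batch_size_per_image - num_pos)
    pos_idx = pos[torch.randperm(pos.numel())[:num_pos]]
    neg_idx = neg[torch.randperm(neg.numel())[:num_neg]]
    out = torch.full_like(label, -1)  # everything not sampled is ignored
    out[pos_idx] = 1
    out[neg_idx] = 0
    return out

labels = torch.randint(0, 2, (30,))
print(resample_sketch(labels))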
Example #11
    def forward(self, features, pred_instances=None, targets=None):
        if self.training:
            training_targets = self.compute_targets_for_polys(targets)
            locations, reg_targets, scales, image_idx = (
                training_targets["octagon_locs"],
                training_targets["octagon_targets"],
                training_targets["scales"],
                training_targets["image_idx"],
            )
            init_locations, init_targets = (
                training_targets["quadrangle_locs"],
                training_targets["quadrangle_targets"],
            )

        else:
            assert pred_instances is not None
            init_locations, image_idx = self.sample_quadrangles_fast(
                pred_instances)
            if len(init_locations) == 0:
                return pred_instances, {}

        # enhance bottom features TODO: maybe reduce later
        for i in range(self.num_convs):
            features = self.bottom_out[i](features)

        pred_exts = self.init(self.init_snake, features, init_locations,
                              image_idx)

        if not self.training:
            h = features.shape[2] * 4
            w = features.shape[3] * 4

            poly_sample_locations = []
            for i, instance_per_im in enumerate(pred_instances):
                pred_exts_per_im = pred_exts[image_idx == i]  # N x 4 x 2
                pred_exts_per_im[..., 0] = torch.clamp(
                    pred_exts_per_im[..., 0], min=0, max=w - 1)
                pred_exts_per_im[..., 1] = torch.clamp(
                    pred_exts_per_im[..., 1], min=0, max=h - 1)
                if not instance_per_im.has("ext_points"):
                    instance_per_im.ext_points = ExtremePoints(
                        pred_exts_per_im)
                    poly_sample_locations.append(
                        self.get_octagon(pred_exts_per_im, self.num_sampling))
                else:  # NOTE: For GT Input testing
                    # print('Using GT EX')
                    poly_sample_locations.append(
                        self.get_octagon(instance_per_im.ext_points.tensor,
                                         self.num_sampling))
            locations = cat(poly_sample_locations, dim=0)

        location_preds = []

        for i in range(len(self.num_iter)):
            deformer = self.__getattr__("deformer" + str(i))
            if i == 0:
                pred_location = self.evolve(deformer, features, locations,
                                            image_idx)
            else:
                pred_location = self.evolve(deformer, features, pred_location,
                                            image_idx)
            location_preds.append(pred_location)

        if self.training:
            evolve_loss = 0
            for pred in location_preds:
                evolve_loss += (self.loss_reg(
                    pred / scales[:, None, None],
                    reg_targets / scales[:, None, None],
                ) / 3)

            init_loss = self.loss_reg(pred_exts / scales[:, None, None],
                                      init_targets / scales[:, None, None])
            losses = {
                "loss_evolve": evolve_loss * self.refine_loss_weight,
                "loss_init": init_loss * self.refine_loss_weight,
            }
            return [], losses
        else:
            new_instances = self.predict_postprocess(pred_instances, locations,
                                                     location_preds, image_idx)
            return new_instances, {}
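The two torch.clamp calls in the inference branch above keep predicted extreme points inside the image. The same step in isolation, on an illustrative (num_instances, 4, 2) tensor of (x, y) points:

import torch

h, w = 480, 640
pred_exts = torch.tensor([[[-5.0, 10.0], [700.0, 500.0], [100.0, 200.0], [320.0, 479.0]]])

clamped = torch.stack([
    pred_exts[..., 0].clamp(min=0, max=w - 1),  # x into [0, w-1]
    pred_exts[..., 1].clamp(min=0, max=h - 1),  # y into [0, h-1]
], dim=-1)
print(clamped)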
Example #12
def mask_loss_ps(pred_mask_logits, instances, vis_period=0, training_mode=''):
    """
    Copyright (c) Facebook, Inc. and its affiliates.
    Adapted Detectron2 function.

    :param pred_mask_logits: Predicted logits by the model
    :param instances: Detectron2 instances
    :param vis_period: Determines the iterations where predictions are visualized
    :param training_mode: Determines the supervised subset of mask labels ('voc', 'nvoc',
                          '40_classes_inc' or ''). `''` signifies training on all classes.
                          All instances not of the supervised set of classes are masked.
                          This way the model does not `see` the respective ground-truth
                          labels and no information of the novel classes is backpropagated.
    :return: Pixelwise binary cross entropy loss reduced with mean.
    """
    mask_side_len = pred_mask_logits.size(2)
    assert pred_mask_logits.size(2) == pred_mask_logits.size(
        3), "Mask prediction must be square!"
    assert training_mode in PARTIALLY_SUPERVISED + [''], \
        "partially_supervised is not 'voc', 'nvoc', '40_classes_inc' or '' (empty)"

    gt_masks = []
    global_ps_mask = []
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue

        gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
            instances_per_image.proposal_boxes.tensor,
            mask_side_len).to(device=pred_mask_logits.device)

        if training_mode in PARTIALLY_SUPERVISED:
            gt_classes_per_image = instances_per_image.gt_classes.to(
                dtype=torch.int64)

            ps_mask = get_ps_mask(gt_classes_per_image, training_mode)
            global_ps_mask.extend(ps_mask)

        # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
        gt_masks.append(gt_masks_per_image)

    gt_masks = cat(gt_masks, dim=0)
    if training_mode in PARTIALLY_SUPERVISED:
        pred_mask_logits = pred_mask_logits[global_ps_mask]
        gt_masks = gt_masks[global_ps_mask]

    if len(gt_masks) == 0 or len(pred_mask_logits) == 0:
        return pred_mask_logits.sum() * 0

    pred_mask_logits = pred_mask_logits[:, 0]

    if gt_masks.dtype == torch.bool:
        gt_masks_bool = gt_masks
    else:
        # Here we allow gt_masks to be float as well (depend on the implementation of rasterize())
        gt_masks_bool = gt_masks > 0.5
    gt_masks = gt_masks.to(dtype=torch.float32)

    # Log the training accuracy (using gt classes and 0.5 threshold)
    mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
    mask_accuracy = 1 - (mask_incorrect.sum().item() /
                         max(mask_incorrect.numel(), 1.0))
    num_positive = gt_masks_bool.sum().item()
    false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
        gt_masks_bool.numel() - num_positive, 1.0)
    false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(
        num_positive, 1.0)

    storage = get_event_storage()
    storage.put_scalar("mask/accuracy", mask_accuracy)
    storage.put_scalar("mask/false_positive", false_positive)
    storage.put_scalar("mask/false_negative", false_negative)
    if vis_period > 0 and storage.iter % vis_period == 0:
        pred_masks = pred_mask_logits.sigmoid()
        vis_masks = torch.cat([pred_masks, gt_masks], dim=2)
        name = "Left: mask prediction;   Right: mask GT"
        for idx, vis_mask in enumerate(vis_masks):
            vis_mask = torch.stack([vis_mask] * 3, dim=0)
            storage.put_image(name + f" ({idx})", vis_mask)

    mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits,
                                                   gt_masks,
                                                   reduction="mean")
    return mask_loss
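get_ps_mask is not shown in this example; conceptually it flags which instances belong to the supervised subset of classes so that only those contribute to the loss. A hedged sketch of one way such a mask could be built from a set of supervised class ids (the helper name and the ids below are hypothetical, not the actual VOC split):

import torch

def build_supervised_mask(gt_classes, supervised_class_ids):
    # True where the instance's class is in the supervised subset (hypothetical helper).
    supervised = torch.as_tensor(supervised_class_ids, device=gt_classes.device)
    return (gt_classes[:, None] == supervised[None, :]).any(dim=1)

gt_classes = torch.tensor([0, 3, 7, 3, 12])
print(build_supervised_mask(gt_classes, [0, 3, 5]))
# tensor([ True,  True, False,  True, False])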
Example #13
def fmt_box_list(box_tensor, batch_index):
    repeated_index = torch.full((len(box_tensor), 1),
                                batch_index,
                                dtype=box_tensor.dtype,
                                device=box_tensor.device)
    return cat((repeated_index, box_tensor), dim=1)
Example #14
    def losses(self, indices, gt_instances, anchors, pred_class_logits,
               pred_anchor_deltas):
        pred_class_logits = cat(pred_class_logits,
                                dim=1).view(-1, self.num_classes)
        pred_anchor_deltas = cat(pred_anchor_deltas, dim=1).view(-1, 4)

        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
        N = len(anchors)
        # list[Tensor(R, 4)], one for each image
        all_anchors = Boxes.cat(anchors).tensor
        # Boxes(Tensor(N*R, 4))
        predicted_boxes = self.box2box_transform.apply_deltas(
            pred_anchor_deltas, all_anchors)
        predicted_boxes = predicted_boxes.reshape(N, -1, 4)

        # We obtain positive anchors by choosing gt boxes' k nearest anchors
        # and leave the rest to be negative anchors. However, there may
        # exist negative anchors that have similar distances with the chosen
        # positives. These negatives may cause ambiguity for model training
        # if we just set them as negatives. Given that we want the model's
        # predict boxes on negative anchors to have low IoU with gt boxes,
        # we set a threshold on the IoU between predicted boxes and gt boxes
        # instead of the IoU between anchor boxes and gt boxes.
        ious = []
        pos_ious = []
        for i in range(N):
            src_idx, tgt_idx = indices[i]
            iou = box_iou(predicted_boxes[i, ...],
                          gt_instances[i].gt_boxes.tensor)
            if iou.numel() == 0:
                max_iou = iou.new_full((iou.size(0), ), 0)
            else:
                max_iou = iou.max(dim=1)[0]
            a_iou = box_iou(anchors[i].tensor, gt_instances[i].gt_boxes.tensor)
            if a_iou.numel() == 0:
                pos_iou = a_iou.new_full((0, ), 0)
            else:
                pos_iou = a_iou[src_idx, tgt_idx]
            ious.append(max_iou)
            pos_ious.append(pos_iou)
        ious = torch.cat(ious)
        ignore_idx = ious > self.neg_ignore_thresh
        pos_ious = torch.cat(pos_ious)
        pos_ignore_idx = pos_ious < self.pos_ignore_thresh

        src_idx = torch.cat([
            src + idx * anchors[0].tensor.shape[0]
            for idx, (src, _) in enumerate(indices)
        ])
        gt_classes = torch.full(pred_class_logits.shape[:1],
                                self.num_classes,
                                dtype=torch.int64,
                                device=pred_class_logits.device)
        gt_classes[ignore_idx] = -1
        target_classes_o = torch.cat(
            [t.gt_classes[J] for t, (_, J) in zip(gt_instances, indices)])
        target_classes_o[pos_ignore_idx] = -1
        gt_classes[src_idx] = target_classes_o

        valid_idxs = gt_classes >= 0
        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
        num_foreground = foreground_idxs.sum()

        gt_classes_target = torch.zeros_like(pred_class_logits)
        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

        if comm.get_world_size() > 1:
            dist.all_reduce(num_foreground)
        num_foreground = num_foreground * 1.0 / comm.get_world_size()

        # cls loss
        loss_cls = sigmoid_focal_loss_jit(
            pred_class_logits[valid_idxs],
            gt_classes_target[valid_idxs],
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )
        # reg loss
        target_boxes = torch.cat(
            [t.gt_boxes.tensor[i] for t, (_, i) in zip(gt_instances, indices)],
            dim=0)
        target_boxes = target_boxes[~pos_ignore_idx]
        matched_predicted_boxes = predicted_boxes.reshape(
            -1, 4)[src_idx[~pos_ignore_idx]]
        loss_box_reg = giou_loss(matched_predicted_boxes,
                                 target_boxes,
                                 reduction="sum")

        return {
            "loss_cls": loss_cls / max(1, num_foreground),
            "loss_box_reg": loss_box_reg / max(1, num_foreground),
        }
Example #15
    def __init__(
        self,
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta=0.0,
        box_reg_loss_type="smooth_l1",
    ):
        """
        Args:
            box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
                box2box transform instance for proposal-to-detection transformations.
            pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
                logits for all R predicted object instances.
                Each row corresponds to a predicted object instance.
            pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
                class-specific or class-agnostic regression. It stores the predicted deltas that
                transform proposals into final box detections.
                B is the box dimension (4 or 5).
                When B is 4, each row is [dx, dy, dw, dh (, ....)].
                When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
            proposals (list[Instances]): A list of N Instances, where Instances i stores the
                proposals for image i, in the field "proposal_boxes".
                When training, each Instances must have ground-truth labels
                stored in the field "gt_classes" and "gt_boxes".
                The total number of all instances must be equal to R.
            smooth_l1_beta (float): The transition point between L1 and L2 loss in
                the smooth L1 loss function. When set to 0, the loss becomes L1. When
                set to +inf, the loss becomes constant 0.
            box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou"
        """
        self.box2box_transform = box2box_transform
        self.num_preds_per_image = [len(p) for p in proposals]
        self.pred_class_logits = pred_class_logits
        self.pred_proposal_deltas = pred_proposal_deltas
        self.smooth_l1_beta = smooth_l1_beta
        self.box_reg_loss_type = box_reg_loss_type

        self.image_shapes = [x.image_size for x in proposals]

        if len(proposals):
            box_type = type(proposals[0].proposal_boxes)
            # cat(..., dim=0) concatenates over all images in the batch
            self.proposals = box_type.cat(
                [p.proposal_boxes for p in proposals])
            assert (not self.proposals.tensor.requires_grad
                    ), "Proposals should not require gradients!"

            # "gt_classes" exists if and only if training. But other gt fields may
            # not necessarily exist in training for images that have no groundtruth.
            if proposals[0].has("gt_classes"):
                self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)

                # If "gt_boxes" does not exist, the proposals must be all negative and
                # should not be included in regression loss computation.
                # Here we just use proposal_boxes as an arbitrary placeholder because its
                # value won't be used in self.box_reg_loss().
                gt_boxes = [
                    p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes
                    for p in proposals
                ]
                self.gt_boxes = box_type.cat(gt_boxes)
        else:
            self.proposals = Boxes(
                torch.zeros(0, 4, device=self.pred_proposal_deltas.device))
        self._no_instances = len(self.proposals) == 0  # no instances found
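The num_preds_per_image list recorded above is what allows batch-level prediction tensors to be split back into per-image chunks later on (e.g. at inference time). A small sketch of that round trip with illustrative counts:

import torch

num_preds_per_image = [3, 5, 2]           # proposals per image
R = sum(num_preds_per_image)
pred_class_logits = torch.randn(R, 81)    # (R, K + 1) scores for the whole batch

per_image_logits = pred_class_logits.split(num_preds_per_image, dim=0)
for i, logits in enumerate(per_image_logits):
    print(i, logits.shape)  # image i gets back its (num_preds_per_image[i], 81) slice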
Example #16
def keypoint_rcnn_inter_part_loss(pred_keypoint_logits, pred_inter_keypoint_logits, instances, normalizer):
    """
    Arguments:
        pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number
            of instances in the batch, K is the number of keypoints, and S is the side length
            of the keypoint heatmap. The values are spatial logits.
        pred_inter_keypoint_logits (Tensor): Predicted inter-part heatmaps, compared against
            the inter-part targets derived from `gt_keypoints` with binary cross entropy.
        instances (list[Instances]): A list of M Instances, where M is the batch size.
            These instances are predictions from the model
            that are in 1:1 correspondence with pred_keypoint_logits.
            Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoint`
            instance.
        normalizer (float): Normalize the loss by this amount.
            If not specified, we normalize by the number of visible keypoints in the minibatch.

    Returns a tuple of (keypoint_loss, inter_part_loss), each a scalar tensor.
    """
    heatmaps = []
    inter_heatmaps = []
    valid = []

    keypoint_side_len = pred_keypoint_logits.shape[2]
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        keypoints = instances_per_image.gt_keypoints
        inter_heatmaps_per_image, _ = keypoints.to_inter_heatmap(
            instances_per_image.proposal_boxes.tensor, keypoint_side_len
        )
        tmp_inter_maps = inter_heatmaps_per_image[:, 1:, :, :].reshape(
            (inter_heatmaps_per_image.size(0), 8, 2, keypoint_side_len, keypoint_side_len))
        tmp_inter_maps = torch.sum(tmp_inter_maps, 2)
        inter_heatmaps_per_image = torch.cat(
            [inter_heatmaps_per_image[:, 0].unsqueeze(1), tmp_inter_maps], 1)
        inter_heatmaps_per_image = inter_heatmaps_per_image.clamp(min=0, max=1)
        inter_heatmaps.append(inter_heatmaps_per_image)

        heatmaps_per_image, valid_per_image = keypoints.to_heatmap(
            instances_per_image.proposal_boxes.tensor, keypoint_side_len
        )
        heatmaps.append(heatmaps_per_image.view(-1))
        valid.append(valid_per_image.view(-1))

    inter_keypoint_targets = cat(inter_heatmaps, dim=0).to(dtype=torch.float32).to("cuda")

    inter_part_loss = F.binary_cross_entropy(pred_inter_keypoint_logits, inter_keypoint_targets)

    if len(heatmaps):
        keypoint_targets = cat(heatmaps, dim=0)
        valid = cat(valid, dim=0).to(dtype=torch.uint8)
        valid = torch.nonzero(valid).squeeze(1)

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if len(heatmaps) == 0 or valid.numel() == 0:
        global _TOTAL_SKIPPED
        _TOTAL_SKIPPED += 1
        storage = get_event_storage()
        storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False)
        return pred_keypoint_logits.sum() * 0

    N, K, H, W = pred_keypoint_logits.shape
    pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W)

    keypoint_loss = F.cross_entropy(
        pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum"
    )

    # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch
    if normalizer is None:
        normalizer = valid.numel()
    keypoint_loss /= normalizer

    return keypoint_loss, inter_part_loss
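The keypoint term above treats each keypoint heatmap as an (H*W)-way classification: the target is the flattened index of the ground-truth pixel, and rows for invisible keypoints are dropped via valid. A compact sketch of that loss on dummy tensors (all sizes are illustrative):

import torch
import torch.nn.functional as F

N, K, S = 2, 4, 8                            # instances, keypoints, heatmap side length
logits = torch.randn(N, K, S, S)
targets = torch.randint(0, S * S, (N * K,))  # one flattened pixel index per keypoint
valid = torch.rand(N * K) > 0.3              # visibility mask

flat_logits = logits.view(N * K, S * S)
keep = torch.nonzero(valid).squeeze(1)
loss = F.cross_entropy(flat_logits[keep], targets[keep], reduction="sum")
print(loss / max(keep.numel(), 1))           # normalized by the number of visible keypoints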
Example #17
    def test_caffe2_pytorch_eq(self):
        ims_per_batch = 8
        post_nms_topk = 100
        detections_per_im = 10
        num_class = 80
        score_thresh = 0.05
        nms_thresh = 0.5

        image_shapes = [torch.Size([800, 800])] * ims_per_batch
        batch_splits = [post_nms_topk] * ims_per_batch

        # NOTE: There are still some minor implementation differences
        # (e.g. ordering when scores are equal across classes) that cause
        # some seeds to fail this test.
        # Thus set a fixed seed to make sure this test passes consistently.
        rng = torch.Generator()
        rng.manual_seed(42)
        boxes = []
        for n in batch_splits:
            box = 1000.0 * 0.5 * torch.rand(n, num_class, 4,
                                            generator=rng) + 0.001
            box[:, :, -2:] += box[:, :, :2]
            box = box.view(n, num_class * 4)
            boxes.append(box)
        scores = [
            torch.rand(n, num_class + 1, generator=rng) for n in batch_splits
        ]

        ref_results, ref_kept_indices = fast_rcnn_inference(
            boxes,
            scores,
            image_shapes,
            score_thresh=score_thresh,
            nms_thresh=nms_thresh,
            topk_per_image=detections_per_im)
        for result, kept_index, score in zip(ref_results, ref_kept_indices,
                                             scores):
            torch.testing.assert_allclose(
                score[kept_index, result.pred_classes],
                result.scores,
            )

        # clip is done in BBoxTransformOp
        c2_boxes = []
        for box, image_shape in zip(boxes, image_shapes):
            num_bbox_reg_classes = box.shape[1] // 4
            clipped_box = Boxes(box.reshape(-1, 4))
            clipped_box.clip(image_shape)
            clipped_box = clipped_box.tensor.view(-1, num_bbox_reg_classes * 4)
            c2_boxes.append(clipped_box)

        c2_boxes = cat(c2_boxes)
        c2_scores = cat(scores)
        c2_batch_splits = torch.Tensor(batch_splits)

        nms_outputs = torch.ops._caffe2.BoxWithNMSLimit(
            c2_scores,
            c2_boxes,
            c2_batch_splits,
            score_thresh=float(score_thresh),
            nms=float(nms_thresh),
            detections_per_im=int(detections_per_im),
            soft_nms_enabled=False,
            soft_nms_method="linear",
            soft_nms_sigma=0.5,
            soft_nms_min_score_thres=0.001,
            rotated=False,
            cls_agnostic_bbox_reg=False,
            input_boxes_include_bg_cls=False,
            output_classes_include_bg_cls=False,
            legacy_plus_one=False,
        )
        roi_score_nms, roi_bbox_nms, roi_class_nms, roi_batch_splits_nms, roi_keeps_nms, roi_keeps_size_nms = nms_outputs  # noqa

        roi_score_nms = roi_score_nms.split(
            roi_batch_splits_nms.int().tolist())
        roi_bbox_nms = roi_bbox_nms.split(roi_batch_splits_nms.int().tolist())
        roi_class_nms = roi_class_nms.split(
            roi_batch_splits_nms.int().tolist())
        roi_keeps_nms = roi_keeps_nms.split(
            roi_batch_splits_nms.int().tolist())

        for _score_nms, _class_nms, _keeps_nms, _score in zip(
                roi_score_nms, roi_class_nms, roi_keeps_nms, scores):
            torch.testing.assert_allclose(
                _score[_keeps_nms.to(torch.int64),
                       _class_nms.to(torch.int64)],
                _score_nms,
            )

        for ref, s, b, c in zip(ref_results, roi_score_nms, roi_bbox_nms,
                                roi_class_nms):
            s1, i1 = s.sort()
            s2, i2 = ref.scores.sort()
            torch.testing.assert_allclose(s1, s2)
            torch.testing.assert_allclose(b[i1], ref.pred_boxes.tensor[i2])
            torch.testing.assert_allclose(
                c.to(torch.int64)[i1], ref.pred_classes[i2])

        for ref, k in zip(ref_kept_indices, roi_keeps_nms):
            # NOTE: order might be different due to implementation
            ref_set = set(ref.tolist())
            k_set = set(k.tolist())
            self.assertEqual(ref_set, k_set)
Example #18
def _assignment_rule(
    gt_boxes,
    anchor_boxes,
    unit_lengths,
    min_anchor_size,
    scale_thresh=2.0,
    spatial_thresh=1.0,
    uniqueness_on=True,
):
    """
    Given two lists of boxes of N ground truth boxes and M anchor boxes,
    compute the assignment between the two, following the assignment rules in
    https://arxiv.org/abs/1903.12174.
    The box order must be (xmin, ymin, xmax, ymax), so please make sure to convert
    to BoxMode.XYXY_ABS before calling this function.

    Args:
        gt_boxes, anchor_boxes (Boxes): two Boxes. Contains N & M boxes/anchors, respectively.
        unit_lengths (Tensor): Contains the unit lengths of M anchor boxes.
        min_anchor_size (float): Minimum size of the anchor, in pixels
        scale_thresh (float): The `scale` threshold: the maximum size of the anchor
                              should not be greater than scale_thresh x max(h, w) of
                              the ground truth box.
        spatial_thresh (float): The `spatial` threshold: the l2 distance between the
                              center of the anchor and the ground truth box should not
                              be greater than spatial_thresh x u where u is the unit length.
        uniqueness_on (bool): If True, anchors assigned to multiple ground-truth boxes are
                              labeled as negatives; otherwise they are ignored.

    Returns:
        matches (Tensor[int64]): a vector of length M, where matches[i] is a matched
                ground-truth index in [0, N)
        match_labels (Tensor[int8]): a vector of length M, where match_labels[i] indicates
            whether a prediction is a true or false positive or ignored
    """
    gt_boxes, anchor_boxes = gt_boxes.tensor, anchor_boxes.tensor
    N = gt_boxes.shape[0]
    M = anchor_boxes.shape[0]
    if N == 0 or M == 0:
        return (
            gt_boxes.new_full((M, ), 0, dtype=torch.int64),
            gt_boxes.new_full((M, ), -1, dtype=torch.int8),
        )

    # Containment rule
    lt = torch.min(gt_boxes[:, None, :2], anchor_boxes[:, :2])  # [N,M,2]
    rb = torch.max(gt_boxes[:, None, 2:], anchor_boxes[:, 2:])  # [N,M,2]
    union = cat([lt, rb], dim=2)  # [N,M,4]

    dummy_gt_boxes = torch.zeros_like(gt_boxes)
    anchor = dummy_gt_boxes[:, None, :] + anchor_boxes[:, :]  # [N,M,4]

    contain_matrix = torch.all(union == anchor, dim=2)  # [N,M]

    # Centrality rule, scale
    gt_size_lower = torch.max(gt_boxes[:, 2:] - gt_boxes[:, :2],
                              dim=1)[0]  # [N]
    gt_size_upper = gt_size_lower * scale_thresh  # [N]
    # Fall back for small objects
    gt_size_upper[gt_size_upper < min_anchor_size] = min_anchor_size
    # Due to sampling of locations, the anchor sizes are reduced by the sampling strides
    anchor_size = (
        torch.max(anchor_boxes[:, 2:] - anchor_boxes[:, :2], dim=1)[0] -
        unit_lengths)  # [M]

    size_diff_upper = gt_size_upper[:, None] - anchor_size  # [N,M]
    scale_matrix = size_diff_upper >= 0  # [N,M]

    # Centrality rule, spatial
    gt_center = (gt_boxes[:, 2:] + gt_boxes[:, :2]) / 2  # [N,2]
    anchor_center = (anchor_boxes[:, 2:] + anchor_boxes[:, :2]) / 2  # [M,2]
    offset_center = gt_center[:, None, :] - anchor_center[:, :]  # [N,M,2]
    offset_center /= unit_lengths[:, None]  # [N,M,2]
    spatial_square = spatial_thresh * spatial_thresh
    spatial_matrix = torch.sum(offset_center * offset_center,
                               dim=2) <= spatial_square

    assign_matrix = (contain_matrix & scale_matrix & spatial_matrix).int()

    # assign_matrix is N (gt) x M (predicted)
    # Max over gt elements (dim 0) to find best gt candidate for each prediction
    matched_vals, matches = assign_matrix.max(dim=0)
    match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)

    match_labels[matched_vals == 0] = 0
    match_labels[matched_vals == 1] = 1

    # find all the elements that match to ground truths multiple times
    not_unique_idxs = assign_matrix.sum(dim=0) > 1
    if uniqueness_on:
        match_labels[not_unique_idxs] = 0
    else:
        match_labels[not_unique_idxs] = -1

    return matches, match_labels
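A minimal usage sketch of the rule above, assuming `_assignment_rule` and detectron2's `Boxes` are importable from this module; the boxes, unit lengths and thresholds are illustrative toy values.

import torch
from detectron2.structures import Boxes

# One ground-truth box and two anchors; anchor 0 fully contains the gt box,
# anchor 1 does not, so only anchor 0 can be assigned.
gt = Boxes(torch.tensor([[10.0, 10.0, 50.0, 50.0]]))
anchors = Boxes(torch.tensor([[0.0, 0.0, 64.0, 64.0],
                              [40.0, 40.0, 48.0, 48.0]]))
unit_lengths = torch.tensor([8.0, 8.0])  # per-anchor sampling stride

matches, match_labels = _assignment_rule(
    gt, anchors, unit_lengths, min_anchor_size=16.0)
# matches      -> tensor([0, 0])  (best ground-truth index for each anchor)
# match_labels -> tensor([1, 0])  (anchor 0 is positive, anchor 1 negative)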
Example #19
0
    def _forward_mask_point(self, features, mask_coarse_logits, instances):
        """
        Forward logic of the mask point head.
        """
        if not self.mask_point_on:
            return {} if self.training else mask_coarse_logits

        mask_features_list = [features[k] for k in self.mask_point_in_features]
        features_scales = [
            self._feature_scales[k] for k in self.mask_point_in_features
        ]

        if self.training:
            proposal_boxes = [x.proposal_boxes for x in instances]
            gt_classes = cat([x.gt_classes for x in instances])
            with torch.no_grad():
                point_coords = get_uncertain_point_coords_with_randomness(
                    mask_coarse_logits,
                    lambda logits: calculate_uncertainty(logits, gt_classes),
                    self.mask_point_train_num_points,
                    self.mask_point_oversample_ratio,
                    self.mask_point_importance_sample_ratio,
                )

            fine_grained_features, point_coords_wrt_image = point_sample_fine_grained_features(
                mask_features_list, features_scales, proposal_boxes,
                point_coords)
            coarse_features = point_sample(mask_coarse_logits,
                                           point_coords,
                                           align_corners=False)
            point_logits = self.point_head(fine_grained_features,
                                           coarse_features)
            return {
                "loss_mask_point":
                roi_mask_point_loss(point_logits, instances,
                                    point_coords_wrt_image)
            }
        else:
            pred_boxes = [x.pred_boxes for x in instances]
            pred_classes = cat([x.pred_classes for x in instances])
            # The subdivision code will fail with the empty list of boxes
            if len(pred_classes) == 0:
                return mask_coarse_logits

            mask_logits = None
            # +1 here to include an initial step to generate the coarsest mask
            # prediction with init_resolution, when mask_logits is None.
            # We compute initial mask by sampling on a regular grid. coarse_mask
            # can be used as initial mask as well, but it's typically very low-res
            # so it will be completely overwritten during subdivision anyway.
            for _ in range(self.mask_point_subdivision_steps + 1):
                if mask_logits is None:
                    point_coords = generate_regular_grid_point_coords(
                        pred_classes.size(0),
                        self.mask_point_subdivision_init_resolution,
                        pred_boxes[0].device,
                    )
                else:
                    mask_logits = interpolate(mask_logits,
                                              scale_factor=2,
                                              mode="bilinear",
                                              align_corners=False)
                    uncertainty_map = calculate_uncertainty(
                        mask_logits, pred_classes)
                    point_indices, point_coords = get_uncertain_point_coords_on_grid(
                        uncertainty_map,
                        self.mask_point_subdivision_num_points)

                # Run the point head for every point in point_coords
                fine_grained_features, _ = point_sample_fine_grained_features(
                    mask_features_list, features_scales, pred_boxes,
                    point_coords)
                coarse_features = point_sample(mask_coarse_logits,
                                               point_coords,
                                               align_corners=False)
                point_logits = self.point_head(fine_grained_features,
                                               coarse_features)

                if mask_logits is None:
                    # Create initial mask_logits using point_logits on this regular grid
                    R, C, _ = point_logits.shape
                    mask_logits = point_logits.reshape(
                        R,
                        C,
                        self.mask_point_subdivision_init_resolution,
                        self.mask_point_subdivision_init_resolution,
                    )
                else:
                    # Put point predictions to the right places on the upsampled grid.
                    R, C, H, W = mask_logits.shape
                    point_indices = point_indices.unsqueeze(1).expand(
                        -1, C, -1)
                    mask_logits = (mask_logits.reshape(R, C, H * W).scatter_(
                        2, point_indices, point_logits).view(R, C, H, W))
            return mask_logits
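A standalone toy version of the scatter step in the subdivision loop above: per-point logits are written back into the flattened H*W grid of the upsampled mask. Shapes and values here are illustrative only.

import torch

R, C, H, W = 2, 1, 4, 4                     # masks, classes, grid size
mask_logits = torch.zeros(R, C, H, W)
point_indices = torch.tensor([[0, 5, 10],
                              [3, 7, 15]])  # (R, P) flat indices into H*W
point_logits = torch.randn(R, C, 3)         # (R, C, P) refined point predictions

point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)  # (R, C, P)
mask_logits = (mask_logits.reshape(R, C, H * W).scatter_(
    2, point_indices, point_logits).view(R, C, H, W))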
    def get_ground_truth(self, anchors, unit_lengths, indexes, targets):
        """
        Args:
            anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
                list of #feature level Boxes. The Boxes contains anchors of
                this image on the specific feature level.
            unit_lengths (list[list[Tensor]]): a list of N=#image elements. Each is a
                list of #feature level Tensor. The tensor contains unit lengths for anchors of
                this image on the specific feature level.
            indexes (list[list[Tensor]]): a list of N=#image elements. Each is a
                list of #feature level Tensor. The tensor contains the 5D index of
                each anchor, the second dimension means (L, N, H, W, A), where L
                is level, N is image, H is height, W is width, and A is anchor.
            targets (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.  Specify `targets` during training only.

        Returns:
            gt_class_info (Tensor, Tensor): A pair of two tensors for classification.
                The first one is an integer tensor of shape (R, #classes) storing ground-truth
                labels for each anchor. R is the total number of anchors in the batch.
                The second one is an integer tensor of shape (R,), indicating which
                anchors are valid for loss computation and which are not.
            gt_delta_info (Tensor, Tensor): A pair of two tensors for boxes.
                The first one, of shape (F, 4). F=#foreground anchors.
                The last dimension represents ground-truth box2box transform
                targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
                Only foreground anchors have values in this tensor. Could be `None` if F=0.
                The second one, of shape (R,), is an integer tensor indicating which anchors
                are foreground ones used for box regression. Could be `None` if F=0.
            gt_mask_info (list[list[Tensor]], list[list[Tensor]]): A pair of two lists for masks.
                The first one is a list of P=#feature level elements. Each is a
                list of A=#anchor tensors. Each tensor contains the ground truth
                masks of the same size and for the same feature level. Could be `None`.
                The second one is a list of P=#feature level elements. Each is a
                list of A=#anchor tensors. Each tensor contains the location of the ground truth
                masks of the same size and for the same feature level. The second dimension means
                (N, H, W), where N is image, H is height, and W is width. Could be `None`.
            num_fg (int): F=#foreground anchors, used later for loss normalization.
        """
        gt_classes = []
        gt_deltas = []
        gt_masks = [[[] for _ in range(self.num_anchors)]
                    for _ in range(self.num_levels)]
        gt_mask_inds = [[[] for _ in range(self.num_anchors)]
                        for _ in range(self.num_levels)]

        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
        unit_lengths = [cat(unit_lengths_i) for unit_lengths_i in unit_lengths]
        indexes = [cat(indexes_i) for indexes_i in indexes]

        num_fg = 0
        for i, (anchors_im, unit_lengths_im, indexes_im,
                targets_im) in enumerate(
                    zip(anchors, unit_lengths, indexes, targets)):
            # Initialize all
            gt_classes_i = torch.full_like(unit_lengths_im,
                                           self.num_classes,
                                           dtype=torch.int64,
                                           device=self.device)
            # Ground truth classes
            has_gt = len(targets_im) > 0
            if has_gt:
                # Compute the pairwise matrix
                gt_matched_inds, anchor_labels = _assignment_rule(
                    targets_im.gt_boxes, anchors_im, unit_lengths_im,
                    self.min_anchor_size)
                # Find the foreground instances
                fg_inds = anchor_labels == 1
                fg_anchors = anchors_im[fg_inds]
                num_fg += len(fg_anchors)
                # Find the ground truths for foreground instances
                gt_fg_matched_inds = gt_matched_inds[fg_inds]
                # Assign labels for foreground instances
                gt_classes_i[fg_inds] = targets_im.gt_classes[
                    gt_fg_matched_inds]
                # Anchors with label -1 are ignored, others are left as negative
                gt_classes_i[anchor_labels == -1] = -1

                # Boxes
                # Ground truth box regression, only for foregrounds
                matched_gt_boxes = targets_im[gt_fg_matched_inds].gt_boxes
                # Compute box regression offsets for foregrounds only
                gt_deltas_i = self.box2box_transform.get_deltas(
                    fg_anchors.tensor, matched_gt_boxes.tensor)
                gt_deltas.append(gt_deltas_i)

                # Masks
                if self.mask_on:
                    # Compute masks for each level and each anchor
                    matched_indexes = indexes_im[fg_inds, :]
                    for lvl in range(self.num_levels):
                        ids_lvl = matched_indexes[:, 0] == lvl
                        if torch.any(ids_lvl):
                            cur_level_factor = 2**lvl if self.bipyramid_on else 1
                            for anc in range(self.num_anchors):
                                ids_lvl_anchor = ids_lvl & (
                                    matched_indexes[:, 4] == anc)
                                if torch.any(ids_lvl_anchor):
                                    gt_masks[lvl][anc].append(
                                        targets_im[
                                            gt_fg_matched_inds[ids_lvl_anchor]]
                                        .gt_masks.crop_and_resize(
                                            fg_anchors[ids_lvl_anchor].tensor,
                                            self.mask_sizes[anc] *
                                            cur_level_factor,
                                        ))
                                    # Select (N, H, W) dimensions
                                    gt_mask_inds_lvl_anc = matched_indexes[
                                        ids_lvl_anchor, 1:4]
                                    # Set the image index to the current image
                                    gt_mask_inds_lvl_anc[:, 0] = i
                                    gt_mask_inds[lvl][anc].append(
                                        gt_mask_inds_lvl_anc)
            gt_classes.append(gt_classes_i)

        # Classes and boxes
        gt_classes = cat(gt_classes)
        gt_valid_inds = gt_classes >= 0
        gt_fg_inds = gt_valid_inds & (gt_classes < self.num_classes)
        gt_classes_target = torch.zeros(
            (gt_classes.shape[0], self.num_classes),
            dtype=torch.float32,
            device=self.device)
        gt_classes_target[gt_fg_inds, gt_classes[gt_fg_inds]] = 1
        gt_deltas = cat(gt_deltas) if gt_deltas else None

        # Masks
        gt_masks = [[cat(mla) if mla else None for mla in ml]
                    for ml in gt_masks]
        gt_mask_inds = [[cat(ila) if ila else None for ila in il]
                        for il in gt_mask_inds]
        return (
            (gt_classes_target, gt_valid_inds),
            (gt_deltas, gt_fg_inds),
            (gt_masks, gt_mask_inds),
            num_fg,
        )
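A small self-contained check of the one-hot class-target construction at the end of get_ground_truth: background anchors carry the label num_classes and keep an all-zero row, while ignored anchors (label -1) are excluded later via gt_valid_inds. The numbers are made up.

import torch

num_classes = 3
gt_classes = torch.tensor([0, 2, num_classes, -1])  # fg, fg, background, ignored
gt_valid_inds = gt_classes >= 0
gt_fg_inds = gt_valid_inds & (gt_classes < num_classes)

gt_classes_target = torch.zeros((gt_classes.shape[0], num_classes), dtype=torch.float32)
gt_classes_target[gt_fg_inds, gt_classes[gt_fg_inds]] = 1
# gt_classes_target -> [[1,0,0], [0,0,1], [0,0,0], [0,0,0]]
# gt_valid_inds     -> [True, True, True, False] (the ignored anchor is masked out)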
Example #21
0
    def losses(
        self,
        anchors: List[Boxes],
        pred_objectness_logits: List[torch.Tensor],
        gt_labels: List[torch.Tensor],
        pred_anchor_deltas: List[torch.Tensor],
        gt_boxes: List[torch.Tensor],
    ) -> Dict[str, torch.Tensor]:
        """
        Return the losses from a set of RPN predictions and their associated ground-truth.

        Args:
            anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each
                has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
            pred_objectness_logits (list[Tensor]): A list of L elements.
                Element i is a tensor of shape (N, Hi*Wi*A) representing
                the predicted objectness logits for all anchors.
            gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
            pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape
                (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors
                to proposals.
            gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
                Loss names are: `loss_rpn_cls` for objectness classification and
                `loss_rpn_loc` for proposal localization.
        """
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))

        # Log the number of positive/negative anchors per-image that's used in training
        pos_mask = gt_labels == 1
        num_pos_anchors = pos_mask.sum().item()
        num_neg_anchors = (gt_labels == 0).sum().item()
        storage = get_event_storage()
        storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
        storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)

        if self.box_reg_loss_type == "smooth_l1":
            anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
            gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
            gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
            localization_loss = smooth_l1_loss(
                cat(pred_anchor_deltas, dim=1)[pos_mask],
                gt_anchor_deltas[pos_mask],
                self.smooth_l1_beta,
                reduction="sum",
            )
        elif self.box_reg_loss_type == "giou":
            pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
            pred_proposals = cat(pred_proposals, dim=1)
            pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
            pos_mask = pos_mask.view(-1)
            localization_loss = giou_loss(
                pred_proposals[pos_mask], cat(gt_boxes)[pos_mask], reduction="sum"
            )
        else:
            raise ValueError(f"Invalid rpn box reg loss type '{self.box_reg_loss_type}'")

        valid_mask = gt_labels >= 0
        objectness_loss = F.binary_cross_entropy_with_logits(
            cat(pred_objectness_logits, dim=1)[valid_mask],
            gt_labels[valid_mask].to(torch.float32),
            reduction="sum",
        )
        normalizer = self.batch_size_per_image * num_images
        losses = {
            "loss_rpn_cls": objectness_loss / normalizer,
            "loss_rpn_loc": localization_loss / normalizer,
        }
        losses = {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
        return losses
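A toy illustration of the two box-regression branches above, assuming the smooth_l1_loss and giou_loss helpers come from fvcore.nn (as detectron2 uses); shapes and values are arbitrary.

import torch
from fvcore.nn import giou_loss, smooth_l1_loss

# "smooth_l1" branch: compare predicted vs. ground-truth anchor deltas
pred_deltas = torch.tensor([[0.10, -0.20, 0.00, 0.30]])
gt_deltas = torch.zeros_like(pred_deltas)
loc_l1 = smooth_l1_loss(pred_deltas, gt_deltas, beta=0.0, reduction="sum")  # beta=0 -> pure L1

# "giou" branch: compare decoded proposals vs. ground-truth boxes in XYXY form
pred_boxes = torch.tensor([[10.0, 10.0, 50.0, 50.0]])
gt_boxes = torch.tensor([[12.0, 8.0, 48.0, 52.0]])
loc_giou = giou_loss(pred_boxes, gt_boxes, reduction="sum")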
Example #22
0
def roi_mask_point_loss(mask_logits, instances, points_coord):
    """
    Compute the point-based loss for instance segmentation mask predictions.

    Args:
        mask_logits (Tensor): A tensor of shape (R, C, P) or (R, 1, P) for class-specific or
            class-agnostic, where R is the total number of predicted masks in all images, C is the
            number of foreground classes, and P is the number of points sampled for each mask.
            The values are logits.
        instances (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. These instances are in 1:1 correspondence with the `mask_logits`. So, the i-th
            element of the list contains R_i objects and R_1 + ... + R_N is equal to R.
            The ground-truth labels (class, box, mask, ...) associated with each instance are stored
            in fields.
        points_coord (Tensor): A tensor of shape (R, P, 2), where R is the total number of
            predicted masks and P is the number of points for each mask. The coordinates are in
            the image pixel coordinate space, i.e. [0, H] x [0, W].
    Returns:
        point_loss (Tensor): A scalar tensor containing the loss.
    """
    with torch.no_grad():
        cls_agnostic_mask = mask_logits.size(1) == 1
        total_num_masks = mask_logits.size(0)

        gt_classes = []
        gt_mask_logits = []
        idx = 0
        for instances_per_image in instances:
            if len(instances_per_image) == 0:
                continue
            assert isinstance(
                instances_per_image.gt_masks, BitMasks
            ), "Point head works with GT in 'bitmask' format. Set INPUT.MASK_FORMAT to 'bitmask'."

            if not cls_agnostic_mask:
                gt_classes_per_image = instances_per_image.gt_classes.to(
                    dtype=torch.int64)
                gt_classes.append(gt_classes_per_image)

            gt_bit_masks = instances_per_image.gt_masks.tensor
            h, w = instances_per_image.gt_masks.image_size
            scale = torch.tensor([w, h],
                                 dtype=torch.float,
                                 device=gt_bit_masks.device)
            points_coord_grid_sample_format = (
                points_coord[idx:idx + len(instances_per_image)] / scale)
            idx += len(instances_per_image)
            gt_mask_logits.append(
                point_sample(
                    gt_bit_masks.to(torch.float32).unsqueeze(1),
                    points_coord_grid_sample_format,
                    align_corners=False,
                ).squeeze(1))

    if len(gt_mask_logits) == 0:
        return mask_logits.sum() * 0

    gt_mask_logits = cat(gt_mask_logits)
    assert gt_mask_logits.numel() > 0, gt_mask_logits.shape

    if cls_agnostic_mask:
        mask_logits = mask_logits[:, 0]
    else:
        indices = torch.arange(total_num_masks)
        gt_classes = cat(gt_classes, dim=0)
        mask_logits = mask_logits[indices, gt_classes]

    # Log the training accuracy (using gt classes and 0.0 threshold for the logits)
    mask_accurate = (mask_logits > 0.0) == gt_mask_logits.to(dtype=torch.uint8)
    mask_accuracy = mask_accurate.nonzero().size(0) / mask_accurate.numel()
    get_event_storage().put_scalar("point_rend/accuracy", mask_accuracy)

    point_loss = F.binary_cross_entropy_with_logits(
        mask_logits, gt_mask_logits.to(dtype=torch.float32), reduction="mean")
    return point_loss
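A stripped-down sketch of the GT sampling above, under the assumption that point_sample is essentially F.grid_sample with point coordinates given in [0, 1] x [0, 1]; mask sizes and point counts are toy values.

import torch
import torch.nn.functional as F

gt_bit_masks = torch.zeros(3, 28, 28)  # (R, H, W) ground-truth bit masks
points = torch.rand(3, 14, 2)          # (R, P, 2) normalized point coordinates in [0, 1]

grid = 2.0 * points.unsqueeze(2) - 1.0  # (R, P, 1, 2), rescaled to [-1, 1] for grid_sample
gt_at_points = F.grid_sample(
    gt_bit_masks.to(torch.float32).unsqueeze(1),  # (R, 1, H, W)
    grid,
    align_corners=False,
).squeeze(3).squeeze(1)                 # (R, P) soft GT labels at the sampled points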
Example #23
0
    def inference_single_image(self, box_cls, box_delta, anchors, image_size):
        """
        Single-image inference. Return bounding-box detection results by thresholding
        on scores and applying non-maximum suppression (NMS).

        Arguments:
            box_cls (list[Tensor]): list of #feature levels. Each entry contains
                tensor of size (H x W x A, K)
            box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
            anchors (list[Boxes]): list of #feature levels. Each entry contains
                a Boxes object, which contains all the anchors for that
                image in that feature level.
            image_size (tuple(H, W)): a tuple of the image height and width.

        Returns:
            Same as `inference`, but for only one image.
        """
        boxes_all = []
        scores_all = []
        class_idxs_all = []

        # Iterate over every feature level
        for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta,
                                                   anchors):
            # (HxWxAxK,)
            box_cls_i = box_cls_i.flatten().sigmoid_()

            # Keep top k top scoring indices only.
            num_topk = min(self.topk_candidates, box_reg_i.size(0))
            # torch.sort is actually faster than .topk (at least on GPUs)
            predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
            predicted_prob = predicted_prob[:num_topk]
            topk_idxs = topk_idxs[:num_topk]

            # filter out the proposals with low confidence score
            keep_idxs = predicted_prob > self.score_threshold
            predicted_prob = predicted_prob[keep_idxs]
            topk_idxs = topk_idxs[keep_idxs]

            anchor_idxs = topk_idxs // self.num_classes
            classes_idxs = topk_idxs % self.num_classes

            box_reg_i = box_reg_i[anchor_idxs]
            anchors_i = anchors_i[anchor_idxs]
            # predict boxes
            predicted_boxes = self.box2box_transform.apply_deltas(
                box_reg_i, anchors_i.tensor)

            boxes_all.append(predicted_boxes)
            scores_all.append(predicted_prob)
            class_idxs_all.append(classes_idxs)

        boxes_all, scores_all, class_idxs_all = [
            cat(x) for x in [boxes_all, scores_all, class_idxs_all]
        ]
        keep = batched_nms(boxes_all, scores_all, class_idxs_all,
                           self.nms_threshold)
        keep = keep[:self.max_detections_per_image]

        result = Instances(image_size)
        result.pred_boxes = Boxes(boxes_all[keep])
        result.scores = scores_all[keep]
        result.pred_classes = class_idxs_all[keep]
        return result
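A tiny self-contained check of how the flattened (anchor, class) indices above are decoded; num_classes and the scores are made up.

import torch

num_classes = 3
flat_scores = torch.tensor([0.1, 0.9, 0.2,    # anchor 0, classes 0..2
                            0.8, 0.05, 0.4])  # anchor 1, classes 0..2
topk_idxs = flat_scores.topk(2).indices       # tensor([1, 3])
anchor_idxs = topk_idxs // num_classes        # tensor([0, 1])
classes_idxs = topk_idxs % num_classes        # tensor([1, 0])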
Example #24
0
def find_top_rrpn_proposals(
    proposals,
    pred_objectness_logits,
    images,
    nms_thresh,
    pre_nms_topk,
    post_nms_topk,
    min_box_side_len,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps if `training` is True,
    otherwise, return the highest `post_nms_topk` scoring proposals for each
    feature map.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5).
            All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
        images (ImageList): Input images as an :class:`ImageList`.
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
            When RRPN is run on multiple feature maps (as in FPN) this number is per
            feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
            When RRPN is run on multiple feature maps (as in FPN) this number is total,
            over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels (absolute units
            wrt input images).
        training (bool): True if proposals are to be used in training, otherwise False.
            This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
            comment.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i.
    """
    image_sizes = images.image_sizes  # in (h, w) order
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select top-k anchor for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, proposals_i, logits_i in zip(itertools.count(), proposals,
                                               pred_objectness_logits):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None],
                                       topk_idx]  # N x topk x 5

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i, ),
                       level_id,
                       dtype=torch.int64,
                       device=device))

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = RotatedBoxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_side_len)
        lvl = level_ids
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = (boxes[keep], scores_per_img[keep],
                                          level_ids[keep])

        keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl,
                                   nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
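A minimal illustration of the per-image top-k gather in step 1 above, using plain torch with toy shapes only.

import torch

N, L, B = 2, 5, 4                 # images, anchors on one level, box dimension
proposals_i = torch.randn(N, L, B)
logits_i = torch.randn(N, L)
num_proposals_i = 3

logits_sorted, idx = logits_i.sort(descending=True, dim=1)
batch_idx = torch.arange(N)
topk_scores_i = logits_sorted[batch_idx, :num_proposals_i]    # (N, topk)
topk_idx = idx[batch_idx, :num_proposals_i]                   # (N, topk)
topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # (N, topk, B)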
Example #25
0
    def __init__(self,
                 box2box_transform,
                 pred_class_logits,
                 pred_proposal_deltas,
                 proposals,
                 smooth_l1_beta,
                 vp_bins=None,
                 viewpoint_logits=None,
                 viewpoint_res_logits=None,
                 rotated_box_training=False,
                 height_logits=None,
                 weights_height=None):
        """
        Args:
            box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
                box2box transform instance for proposal-to-detection transformations.
            pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
                logits for all R predicted object instances.
                Each row corresponds to a predicted object instance.
            pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
                class-specific or class-agnostic regression. It stores the predicted deltas that
                transform proposals into final box detections.
                B is the box dimension (4 or 5).
                When B is 4, each row is [dx, dy, dw, dh (, ....)].
                When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
            proposals (list[Instances]): A list of N Instances, where Instances i stores the
                proposals for image i, in the field "proposal_boxes".
                When training, each Instances must have ground-truth labels
                stored in the field "gt_classes" and "gt_boxes".
            smooth_l1_beta (float): The transition point between L1 and L2 loss in
                the smooth L1 loss function. When set to 0, the loss becomes L1. When
                set to +inf, the loss becomes constant 0.
        """
        self.box2box_transform = box2box_transform
        self.num_preds_per_image = [len(p) for p in proposals]
        self.pred_class_logits = pred_class_logits
        self.pred_proposal_deltas = pred_proposal_deltas
        self.smooth_l1_beta = smooth_l1_beta
        self.viewpoint_logits = viewpoint_logits
        self.viewpoint = True if viewpoint_logits is not None else False
        self.vp_bins = vp_bins
        self.viewpoint_res = True if viewpoint_res_logits is not None else False
        self.viewpoint_res_logits = viewpoint_res_logits
        self.rotated_box_training = rotated_box_training
        self.height_logits = height_logits
        self.height_training = True if height_logits is not None else False
        self.weights_height = weights_height

        box_type = type(proposals[0].proposal_boxes)
        # cat(..., dim=0) concatenates over all images in the batch
        self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
        assert not self.proposals.tensor.requires_grad, "Proposals should not require gradients!"
        self.image_shapes = [x.image_size for x in proposals]

        # The following fields should exist only when training.
        if proposals[0].has("gt_boxes"):
            if self.rotated_box_training:
                self.gt_boxes = cat([p.gt_bbox3D for p in proposals])
            else:
                self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals])
            assert proposals[0].has("gt_classes")
            self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)
            if proposals[0].has("gt_viewpoint") and self.viewpoint:
                self.gt_viewpoint = cat([p.gt_viewpoint for p in proposals],
                                        dim=0)
                if proposals[0].has(
                        "gt_viewpoint_rads") and self.viewpoint_res:
                    self.gt_viewpoint_rads = cat(
                        [p.gt_viewpoint_rads for p in proposals], dim=0)
            if proposals[0].has("gt_height") and self.height_training:
                self.gt_height = cat([p.gt_height for p in proposals], dim=0)
    def inference_single_image(self, logits, init_boxes, refine_boxes,
                               centerness, image_size):
        boxes_all = []
        init_boxes_all = []
        class_idxs_all = []
        scores_all = []
        for logit, init_box, refine_box, ctr_score in zip(
                logits, init_boxes, refine_boxes, centerness):
            #for logit, init_box, refine_box in zip(logits, init_boxes, refine_boxes):
            logit_score = logit.sigmoid() * ctr_score.sigmoid()
            scores, cls = logit_score.max(0)
            #scores, cls = logit.sigmoid().max(0)
            cls = cls.view(-1)
            scores = scores.view(-1)
            init_box = init_box.view(4, -1).permute(1, 0)
            refine_box = refine_box.view(4, -1).permute(1, 0)

            predicted_prob, topk_idxs = scores.sort(descending=True)
            num_topk = min(self.topk_candidates, cls.size(0))
            predicted_prob = predicted_prob[:num_topk]
            topk_idxs = topk_idxs[:num_topk]

            # filter out the proposals with low confidence score
            keep_idxs = predicted_prob > self.score_threshold
            predicted_prob = predicted_prob[keep_idxs]
            topk_idxs = topk_idxs[keep_idxs]
            init_box_topk = init_box[topk_idxs]
            refine_box_topk = refine_box[topk_idxs]
            cls_topk = cls[topk_idxs]
            score_topk = scores[topk_idxs]

            boxes_all.append(refine_box_topk)
            init_boxes_all.append(init_box_topk)
            class_idxs_all.append(cls_topk)
            scores_all.append(score_topk)
            # The following code is the decoding procedure of RetinaNet in D2.
            # However, it fails to handle the predictions here, though I expected it to.
            """
            cls = logit.flatten().sigmoid()

            # pre nms
            num_topk = min(self.topk_candidates, cls.size(0))

            predicted_prob, topk_idxs = cls.sort(descending=True)
            predicted_prob = predicted_prob[:num_topk]
            topk_idxs = topk_idxs[:num_topk]

            # filter out the proposals with low confidence score
            keep_idxs = predicted_prob > self.score_threshold
            predicted_prob = predicted_prob[keep_idxs]
            topk_idxs = topk_idxs[keep_idxs]

            points_idxs = topk_idxs // self.num_classes
            classes_idxs = topk_idxs % self.num_classes

            init_box = init_box.reshape(4, -1).clone()
            refine_box = refine_box.reshape(4, -1).clone()
            init_box = init_box[:, points_idxs].permute(1, 0)
            refine_box_topk = refine_box[:, points_idxs].permute(1, 0)

            boxes_all.append(refine_box_topk)
            init_boxes_all.append(init_box)
            class_idxs_all.append(classes_idxs)
            scores_all.append(predicted_prob)
            """

        boxes_all, scores_all, class_idxs_all, init_boxes_all = [
            cat(x)
            for x in [boxes_all, scores_all, class_idxs_all, init_boxes_all]
        ]
        keep = batched_nms(boxes_all, scores_all, class_idxs_all,
                           self.nms_threshold)
        keep = keep[:self.max_detections_per_image]

        result = Instances(image_size)
        #result.pred_boxes = Boxes(init_boxes_all[keep])
        result.pred_boxes = Boxes(boxes_all[keep])
        result.scores = scores_all[keep]
        result.pred_classes = class_idxs_all[keep]
        result.init_boxes = init_boxes_all[keep]
        return result
Example #27
0
    def __init__(
            self,
            box2box_transform,
            pred_class_logits,
            pred_proposal_deltas,
            proposals,
            smooth_l1_beta,
            pred_overlap_deltas=None,
            pred_overlap_prob=None,
            overlap_configs=dict(),
            giou=False,
            allow_oob=False,
    ):
        """
        Args:
            box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
                box2box transform instance for proposal-to-detection transformations.
            pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
                logits for all R predicted object instances.
                Each row corresponds to a predicted object instance.
            pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
                class-specific or class-agnostic regression. It stores the predicted deltas that
                transform proposals into final box detections.
                B is the box dimension (4 or 5).
                When B is 4, each row is [dx, dy, dw, dh (, ....)].
                When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
            proposals (list[Instances]): A list of N Instances, where Instances i stores the
                proposals for image i, in the field "proposal_boxes".
                When training, each Instances must have ground-truth labels
                stored in the field "gt_classes" and "gt_boxes".
            smooth_l1_beta (float): The transition point between L1 and L2 loss in
                the smooth L1 loss function. When set to 0, the loss becomes L1. When
                set to +inf, the loss becomes constant 0.
        """
        self.box2box_transform = box2box_transform
        self.num_preds_per_image = [len(p) for p in proposals]
        self.pred_class_logits = pred_class_logits
        self.pred_proposal_deltas = pred_proposal_deltas
        self.smooth_l1_beta = smooth_l1_beta

        self.pred_overlap_deltas = pred_overlap_deltas
        self.pred_overlap_prob = pred_overlap_prob

        assert isinstance(overlap_configs,
                          dict), "overlap configs must be dict, {}".format(
                              type(overlap_configs))
        self.overlap_iou_threshold = overlap_configs.get(
            "overlap_iou_threshold", 0.3)
        self.loss_overlap_reg_coeff = overlap_configs.get(
            "loss_overlap_reg_coeff", 0.1)
        self.uniform_reg_divisor = overlap_configs.get("uniform_reg_divisor",
                                                       False)
        self.cls_box_beta = overlap_configs.get("cls_box_beta", 0.1)

        self.giou = giou
        self.allow_oob = allow_oob

        box_type = type(proposals[0].proposal_boxes)
        # cat(..., dim=0) concatenates over all images in the batch
        self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
        assert not self.proposals.tensor.requires_grad, "Proposals should not require gradients!"
        self.image_shapes = [x.image_size for x in proposals]

        # The following fields should exist only when training.
        if proposals[0].has("gt_boxes"):
            self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals])
            assert proposals[0].has("gt_classes")
            self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)

        if proposals[0].has("overlap_iou"):
            self.overlap_iou = cat([p.overlap_iou for p in proposals], dim=0)
            self.overlap_gt_boxes = box_type.cat(
                [p.overlap_gt_boxes for p in proposals])
Example #28
0
    def losses(self):
        """
        Return the losses from a set of SMInst predictions and their associated ground-truth.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
        """

        training_targets = self._get_ground_truth()
        labels, reg_targets, mask_targets = training_targets["labels"], training_targets["reg_targets"], \
                                            training_targets["mask_targets"]

        # Collect all logits and regression predictions over feature maps
        # and images to arrive at the same shape as the labels and targets
        # The final ordering is L, N, H, W from slowest to fastest axis.
        logits_pred = cat(
            [
                # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
                x.permute(0, 2, 3, 1).reshape(-1, self.num_classes)
                for x in self.logits_pred
            ],
            dim=0,
        )
        reg_pred = cat(
            [
                # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B)
                x.permute(0, 2, 3, 1).reshape(-1, 4) for x in self.reg_pred
            ],
            dim=0,
        )
        ctrness_pred = cat(
            [
                # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
                x.reshape(-1) for x in self.ctrness_pred
            ],
            dim=0,
        )

        labels = cat(
            [
                # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
                x.reshape(-1) for x in labels
            ],
            dim=0,
        )

        reg_targets = cat(
            [
                # Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4)
                x.reshape(-1, 4) for x in reg_targets
            ],
            dim=0,
        )

        mask_pred = cat(
            [
                # Reshape: (N, D, Hi, Wi) -> (N, Hi, Wi, D) -> (N*Hi*Wi, D)
                x.permute(0, 2, 3, 1).reshape(-1, self.num_codes)
                for x in self.mask_regression
            ],
            dim=0,
        )

        # mask_activation_pred = cat(
        #     [
        #         # Reshape: (N, D, Hi, Wi) -> (N, Hi, Wi, D) -> (N*Hi*Wi, D)
        #         x.permute(0, 2, 3, 1).reshape(-1, self.num_codes)
        #         for x in self.mask_activation
        #     ], dim=0, )

        num_levels = len(self.mask_prediction)
        num_outputs = len(self.mask_prediction[0])
        mask_prediction_list = []
        for m in range(num_outputs):
            temp_ = []
            for n in range(num_levels):
                temp_.append(self.mask_prediction[n][m])

            mask_prediction = cat(
                [
                    # Reshape: (N, D, Hi, Wi) -> (N, Hi, Wi, D) -> (N*Hi*Wi, D)
                    x.permute(0, 2, 3, 1).reshape(-1, self.mask_size**2)
                    for x in temp_
                ],
                dim=0,
            )
            mask_prediction_list.append(mask_prediction)

        # mask_tower_interm_outputs = []
        # for _outputs in self.mask_tower_interm_outputs:
        #     mask_tower_interm_output = cat(
        #         [
        #             x.permute(0, 2, 3, 1).reshape(-1, x.shape[1]) for x in _outputs
        #         ], dim=0
        #     )
        #     mask_tower_interm_outputs.append(mask_tower_interm_output)
        #     # print('interm: ', mask_tower_interm_output.size())
        # mask_tower_interm_outputs = cat(mask_tower_interm_outputs, dim=0)

        mask_targets = cat(
            [
                # Reshape: (N, Hi, Wi, mask_size^2) -> (N*Hi*Wi, mask_size^2)
                x.reshape(-1, self.mask_size**2) for x in mask_targets
            ],
            dim=0,
        )

        return self.SMInst_losses(labels, reg_targets, logits_pred, reg_pred,
                                  ctrness_pred, mask_pred,
                                  mask_prediction_list, mask_targets)
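A tiny check of the (N, C, Hi, Wi) -> (N*Hi*Wi, C) flattening used repeatedly above; within one feature map the resulting row order is image-major, then H, then W.

import torch

N, C, H, W = 2, 80, 4, 5
x = torch.randn(N, C, H, W)
flat = x.permute(0, 2, 3, 1).reshape(-1, C)   # (N*H*W, C)
assert flat.shape == (N * H * W, C)
assert torch.equal(flat[0], x[0, :, 0, 0])    # first row is image 0, pixel (0, 0)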
Example #29
0
    def __call__(self, box_predictor, predictions, proposals):
        """equivalent to FastRCNNOutputLayers.inference"""
        num_classes = box_predictor.num_classes
        score_thresh = box_predictor.test_score_thresh
        nms_thresh = box_predictor.test_nms_thresh
        topk_per_image = box_predictor.test_topk_per_image
        is_rotated = len(box_predictor.box2box_transform.weights) == 5

        if is_rotated:
            box_dim = 5
            assert box_predictor.box2box_transform.weights[4] == 1, (
                "The weights for Rotated BBoxTransform in C2 have only 4 dimensions,"
                + " thus enforcing the angle weight to be 1 for now")
            box2box_transform_weights = box_predictor.box2box_transform.weights[:4]
        else:
            box_dim = 4
            box2box_transform_weights = box_predictor.box2box_transform.weights

        class_logits, box_regression = predictions
        if num_classes + 1 == class_logits.shape[1]:
            class_prob = F.softmax(class_logits, -1)
        else:
            assert num_classes == class_logits.shape[1]
            class_prob = F.sigmoid(class_logits)
            # BoxWithNMSLimit will infer num_classes from the shape of the class_prob
            # So append a zero column as placeholder for the background class
            class_prob = torch.cat(
                (class_prob, torch.zeros(class_prob.shape[0], 1)), dim=1)

        assert box_regression.shape[1] % box_dim == 0
        cls_agnostic_bbox_reg = box_regression.shape[1] // box_dim == 1

        input_tensor_mode = proposals[0].proposal_boxes.tensor.shape[1] == box_dim + 1

        rois = type(proposals[0].proposal_boxes).cat(
            [p.proposal_boxes for p in proposals])
        device, dtype = rois.tensor.device, rois.tensor.dtype
        if input_tensor_mode:
            im_info = proposals[0].image_size
            rois = rois.tensor
        else:
            im_info = torch.tensor(
                [[sz[0], sz[1], 1.0]
                 for sz in [x.image_size for x in proposals]])
            batch_ids = cat(
                [
                    torch.full((b, 1), i, dtype=dtype, device=device)
                    for i, b in enumerate(len(p) for p in proposals)
                ],
                dim=0,
            )
            rois = torch.cat([batch_ids, rois.tensor], dim=1)

        roi_pred_bbox, roi_batch_splits = torch.ops._caffe2.BBoxTransform(
            to_device(rois, "cpu"),
            to_device(box_regression, "cpu"),
            to_device(im_info, "cpu"),
            weights=box2box_transform_weights,
            apply_scale=True,
            rotated=is_rotated,
            angle_bound_on=True,
            angle_bound_lo=-180,
            angle_bound_hi=180,
            clip_angle_thresh=1.0,
            legacy_plus_one=False,
        )
        roi_pred_bbox = to_device(roi_pred_bbox, device)
        roi_batch_splits = to_device(roi_batch_splits, device)

        nms_outputs = torch.ops._caffe2.BoxWithNMSLimit(
            to_device(class_prob, "cpu"),
            to_device(roi_pred_bbox, "cpu"),
            to_device(roi_batch_splits, "cpu"),
            score_thresh=float(score_thresh),
            nms=float(nms_thresh),
            detections_per_im=int(topk_per_image),
            soft_nms_enabled=False,
            soft_nms_method="linear",
            soft_nms_sigma=0.5,
            soft_nms_min_score_thres=0.001,
            rotated=is_rotated,
            cls_agnostic_bbox_reg=cls_agnostic_bbox_reg,
            input_boxes_include_bg_cls=False,
            output_classes_include_bg_cls=False,
            legacy_plus_one=False,
        )
        roi_score_nms = to_device(nms_outputs[0], device)
        roi_bbox_nms = to_device(nms_outputs[1], device)
        roi_class_nms = to_device(nms_outputs[2], device)
        roi_batch_splits_nms = to_device(nms_outputs[3], device)
        roi_keeps_nms = to_device(nms_outputs[4], device)
        roi_keeps_size_nms = to_device(nms_outputs[5], device)
        if not self.tensor_mode:
            roi_class_nms = roi_class_nms.to(torch.int64)

        roi_batch_ids = cat(
            [
                torch.full((b, 1), i, dtype=dtype, device=device)
                for i, b in enumerate(
                    int(x.item()) for x in roi_batch_splits_nms)
            ],
            dim=0,
        )

        roi_class_nms = alias(roi_class_nms, "class_nms")
        roi_score_nms = alias(roi_score_nms, "score_nms")
        roi_bbox_nms = alias(roi_bbox_nms, "bbox_nms")
        roi_batch_splits_nms = alias(roi_batch_splits_nms, "batch_splits_nms")
        roi_keeps_nms = alias(roi_keeps_nms, "keeps_nms")
        roi_keeps_size_nms = alias(roi_keeps_size_nms, "keeps_size_nms")

        results = InstancesList(
            im_info=im_info,
            indices=roi_batch_ids[:, 0],
            extra_fields={
                "pred_boxes": Caffe2Boxes(roi_bbox_nms),
                "scores": roi_score_nms,
                "pred_classes": roi_class_nms,
            },
        )

        if not self.tensor_mode:
            results = InstancesList.to_d2_instances_list(results)
            batch_splits = roi_batch_splits_nms.int().tolist()
            kept_indices = list(
                roi_keeps_nms.to(torch.int64).split(batch_splits))
        else:
            results = [results]
            kept_indices = [roi_keeps_nms]

        return results, kept_indices
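How the batch-index column prepended to the RoIs above is built, shown with a toy number of proposals per image (plain torch only).

import torch

counts = [3, 2]  # proposals per image in the batch
batch_ids = torch.cat(
    [torch.full((b, 1), i, dtype=torch.float32) for i, b in enumerate(counts)],
    dim=0,
)                                            # [[0], [0], [0], [1], [1]]
boxes = torch.randn(sum(counts), 4)
rois = torch.cat([batch_ids, boxes], dim=1)  # (R, 5): (batch_idx, x1, y1, x2, y2)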
Example #30
0
    def inference_single_image(self, cls_logits, pts_refine, pts_strides,
                               points, image_size):
        """
        Single-image inference. Return bounding-box detection results by
        thresholding on scores and applying non-maximum suppression (NMS).
        Arguments:
            cls_logits (list[Tensor]): list of #feature levels. Each entry
                contains tensor of size (H x W, K)
            pts_refine (list[Tensor]): Same shape as 'cls_logits' except that K
                becomes 2 * num_points.
            pts_strides (list(Tensor)): list of #feature levels. Each entry
                contains tensor of size (H x W, )
            points (list[Tensor]): list of #feature levels. Each entry contains
                a tensor, which contains all the points for that
                image in that feature level.
            image_size (tuple(H, W)): a tuple of the image height and width.
        Returns:
            Same as `inference`, but only for one image
        """
        assert len(cls_logits) == len(pts_refine) == len(pts_strides)
        boxes_all = []
        scores_all = []
        class_idxs_all = []

        # Iterate over every feature level
        for cls_logits_i, pts_refine_i, points_i, pts_strides_i in zip(
                cls_logits, pts_refine, points, pts_strides):
            bbox_pos_center = torch.cat([points_i, points_i], dim=1)
            bbox_pred = self.pts_to_bbox(pts_refine_i)
            bbox_pred = bbox_pred * pts_strides_i.reshape(-1, 1) + bbox_pos_center
            bbox_pred[:, 0].clamp_(min=0, max=image_size[1])
            bbox_pred[:, 1].clamp_(min=0, max=image_size[0])
            bbox_pred[:, 2].clamp_(min=0, max=image_size[1])
            bbox_pred[:, 3].clamp_(min=0, max=image_size[0])

            # (HxWxK, )
            point_cls_i = cls_logits_i.flatten().sigmoid_()

            # keep top k scoring indices only
            num_topk = min(self.topk_candidates, point_cls_i.size(0))
            # torch.sort is actually faster than .topk (at least on GPUs)
            predicted_prob, topk_idxs = point_cls_i.sort(descending=True)
            predicted_prob = predicted_prob[:num_topk]
            topk_idxs = topk_idxs[:num_topk]

            # filter out the proposals with low confidence score
            keep_idxs = predicted_prob > self.score_threshold
            predicted_prob = predicted_prob[keep_idxs]
            topk_idxs = topk_idxs[keep_idxs]

            point_idxs = topk_idxs // self.num_classes
            classes_idxs = topk_idxs % self.num_classes

            predicted_boxes = bbox_pred[point_idxs]

            boxes_all.append(predicted_boxes)
            scores_all.append(predicted_prob)
            class_idxs_all.append(classes_idxs)

        boxes_all, scores_all, class_idxs_all = [
            cat(x) for x in [boxes_all, scores_all, class_idxs_all]
        ]

        keep = batched_nms(boxes_all, scores_all, class_idxs_all,
                           self.nms_threshold)
        keep = keep[:self.max_detections_per_image]

        result = Instances(image_size)
        result.pred_boxes = Boxes(boxes_all[keep])
        result.scores = scores_all[keep]
        result.pred_classes = class_idxs_all[keep]

        return result
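A toy version of the per-level decoding and clamping above, under the assumption that pts_to_bbox returns (x1, y1, x2, y2) offsets relative to the point center, in stride units; all numbers are made up.

import torch

image_size = (480, 640)                                    # (H, W)
points_i = torch.tensor([[100.0, 200.0], [300.0, 50.0]])   # (P, 2) point centers (x, y)
pts_strides_i = torch.tensor([8.0, 16.0])
offsets = torch.tensor([[-2.0, -3.0, 2.0, 3.0],            # what pts_to_bbox is assumed to return
                        [-1.0, -1.0, 1.0, 1.0]])

bbox_pos_center = torch.cat([points_i, points_i], dim=1)   # (P, 4)
bbox_pred = offsets * pts_strides_i.reshape(-1, 1) + bbox_pos_center
bbox_pred[:, 0].clamp_(min=0, max=image_size[1])  # x1 within [0, W]
bbox_pred[:, 1].clamp_(min=0, max=image_size[0])  # y1 within [0, H]
bbox_pred[:, 2].clamp_(min=0, max=image_size[1])  # x2 within [0, W]
bbox_pred[:, 3].clamp_(min=0, max=image_size[0])  # y2 within [0, H]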