Example #1
def permute_all_cls_and_box_to_N_HWA_K_and_concat(box_cls,
                                                  box_delta,
                                                  box_center,
                                                  border_cls,
                                                  border_delta,
                                                  num_classes=80):
    """
    Rearrange the tensor layout from the network output, i.e.:
    list[Tensor]: #lvl tensors of shape (N, A x K, Hi, Wi)
    to per-image predictions, i.e.:
    Tensor: of shape (N x sum(Hi x Wi x A), K)
    """
    # for each feature level, permute the outputs to make them be in the
    # same format as the labels. Note that the labels are computed for
    # all feature levels concatenated, so we keep the same representation
    # for the objectness, the box_delta and the centerness
    box_cls_flattened = [permute_to_N_HWA_K(x, num_classes) for x in box_cls]
    box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
    box_center_flattened = [permute_to_N_HWA_K(x, 1) for x in box_center]

    border_cls_flattened = [
        permute_to_N_HWA_K(x, num_classes) for x in border_cls
    ]
    border_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in border_delta]
    # concatenate on the first dimension (representing the feature levels), to
    # take into account the way the labels were generated (with all feature maps
    # being concatenated as well)
    box_cls = cat(box_cls_flattened, dim=1).view(-1, num_classes)
    box_delta = cat(box_delta_flattened, dim=1).view(-1, 4)
    box_center = cat(box_center_flattened, dim=1).view(-1, 1)

    border_cls = cat(border_cls_flattened, dim=1).view(-1, num_classes)
    border_delta = cat(border_delta_flattened, dim=1).view(-1, 4)
    return box_cls, box_delta, box_center, border_cls, border_delta
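To make the layout change concrete, here is a small self-contained PyTorch sketch of the same pattern. The body of permute_to_N_HWA_K is a plausible reconstruction (the snippet above only calls it), and the shapes are dummy values:

import torch

def permute_to_N_HWA_K(tensor, K):
    # Assumed implementation: (N, A*K, Hi, Wi) -> (N, Hi*Wi*A, K)
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)     # (N, A, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)   # (N, H, W, A, K)
    return tensor.reshape(N, -1, K)

# Two dummy feature levels, batch N=2, A=3 anchors, K=80 classes.
box_cls = [torch.randn(2, 3 * 80, 32, 32), torch.randn(2, 3 * 80, 16, 16)]
flattened = [permute_to_N_HWA_K(x, 80) for x in box_cls]
box_cls_cat = torch.cat(flattened, dim=1).view(-1, 80)   # (N * sum(Hi*Wi*A), 80)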
Example #2
def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer):
    """
    Arguments:
        pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number
            of instances in the batch, K is the number of keypoints, and S is the side length
            of the keypoint heatmap. The values are spatial logits.
        instances (list[Instances]): A list of M Instances, where M is the batch size.
            These instances are predictions from the model
            that are in 1:1 correspondence with pred_keypoint_logits.
        Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoints`
            instance.
        normalizer (float): Normalize the loss by this amount.
            If not specified, we normalize by the number of visible keypoints in the minibatch.

    Returns a scalar tensor containing the loss.
    """
    heatmaps = []
    valid = []

    keypoint_side_len = pred_keypoint_logits.shape[2]
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        keypoints = instances_per_image.gt_keypoints
        heatmaps_per_image, valid_per_image = keypoints.to_heatmap(
            instances_per_image.proposal_boxes.tensor, keypoint_side_len)
        heatmaps.append(heatmaps_per_image.view(-1))
        valid.append(valid_per_image.view(-1))

    if len(heatmaps):
        keypoint_targets = cat(heatmaps, dim=0)
        valid = cat(valid, dim=0).to(dtype=torch.uint8)
        valid = torch.nonzero(valid, as_tuple=False).squeeze(1)

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if len(heatmaps) == 0 or valid.numel() == 0:
        global _TOTAL_SKIPPED
        _TOTAL_SKIPPED += 1
        storage = get_event_storage()
        storage.put_scalar("kpts_num_skipped_batches",
                           _TOTAL_SKIPPED,
                           smoothing_hint=False)
        return pred_keypoint_logits.sum() * 0

    N, K, H, W = pred_keypoint_logits.shape
    pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W)

    keypoint_loss = F.cross_entropy(pred_keypoint_logits[valid],
                                    keypoint_targets[valid],
                                    reduction="sum")

    # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch
    if normalizer is None:
        normalizer = valid.numel()
    keypoint_loss /= normalizer

    return keypoint_loss
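Stripped of the detectron2 bookkeeping, the loss is a softmax cross-entropy over the flattened S x S heatmap of each visible keypoint. A rough standalone sketch with made-up shapes and random targets (the real targets come from keypoints.to_heatmap above):

import torch
import torch.nn.functional as F

N, K, S = 4, 17, 56
pred_keypoint_logits = torch.randn(N, K, S, S)
keypoint_targets = torch.randint(0, S * S, (N * K,))      # GT heatmap bin per keypoint
visible = torch.rand(N * K) > 0.3                         # dummy visibility flags
valid = torch.nonzero(visible, as_tuple=False).squeeze(1)

logits = pred_keypoint_logits.view(N * K, S * S)
loss = F.cross_entropy(logits[valid], keypoint_targets[valid], reduction="sum")
loss = loss / max(valid.numel(), 1)                       # normalize by visible keypoints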
Example #3
    def cat(instance_lists: List["Instances"]) -> "Instances":
        """
        Args:
            instance_lists (list[Instances])

        Returns:
            Instances
        """
        assert all(isinstance(i, Instances) for i in instance_lists)
        assert len(instance_lists) > 0
        if len(instance_lists) == 1:
            return instance_lists[0]

        image_size = instance_lists[0].image_size
        for i in instance_lists[1:]:
            assert i.image_size == image_size
        ret = Instances(image_size)
        for k in instance_lists[0]._fields.keys():
            values = [i.get(k) for i in instance_lists]
            v0 = values[0]
            if isinstance(v0, torch.Tensor):
                values = cat(values, dim=0)
            elif isinstance(v0, list):
                values = list(itertools.chain(*values))
            elif hasattr(type(v0), "cat"):
                values = type(v0).cat(values)
            else:
                raise ValueError(
                    "Unsupported type {} for concatenation".format(type(v0)))
            ret.set(k, values)
        return ret
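A short usage sketch, assuming detectron2 is installed; the field name `scores` is arbitrary. Tensor fields of the per-image Instances end up concatenated along dim 0:

import torch
from detectron2.structures import Instances

a = Instances((480, 640))
a.scores = torch.tensor([0.9, 0.8])
b = Instances((480, 640))
b.scores = torch.tensor([0.7])

merged = Instances.cat([a, b])
print(len(merged), merged.scores)   # 3 tensor([0.9000, 0.8000, 0.7000])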
Example #4
def convert_boxes_to_pooler_format(box_lists):
    """
    Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops
    (see description under Returns).

    Args:
        box_lists (list[Boxes] | list[RotatedBoxes]):
            A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.

    Returns:
        When input is list[Boxes]:
            A tensor of shape (M, 5), where M is the total number of boxes aggregated over all
            N batch images.
            The 5 columns are (batch index, x0, y0, x1, y1), where batch index
            is the index in [0, N) identifying which batch image the box with corners at
            (x0, y0, x1, y1) comes from.
        When input is list[RotatedBoxes]:
            A tensor of shape (M, 6), where M is the total number of boxes aggregated over all
            N batch images.
            The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees),
            where batch index is the index in [0, N) identifying which batch image the
            rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from.
    """

    def fmt_box_list(box_tensor, batch_index):
        repeated_index = torch.full(
            (len(box_tensor), 1), batch_index, dtype=box_tensor.dtype, device=box_tensor.device
        )
        return cat((repeated_index, box_tensor), dim=1)

    pooler_fmt_boxes = cat(
        [fmt_box_list(box_list.tensor, i) for i, box_list in enumerate(box_lists)], dim=0
    )

    return pooler_fmt_boxes
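The conversion amounts to prepending a batch-index column to each image's box tensor and concatenating; a minimal pure-PyTorch version with arbitrary box values:

import torch

def to_pooler_format(box_tensors):
    rows = []
    for i, boxes in enumerate(box_tensors):           # boxes: (Ri, 4) as (x0, y0, x1, y1)
        idx = torch.full((len(boxes), 1), i, dtype=boxes.dtype, device=boxes.device)
        rows.append(torch.cat((idx, boxes), dim=1))   # (Ri, 5)
    return torch.cat(rows, dim=0)                     # (M, 5)

boxes_img0 = torch.tensor([[0., 0., 10., 10.], [5., 5., 20., 20.]])
boxes_img1 = torch.tensor([[2., 3., 8., 9.]])
print(to_pooler_format([boxes_img0, boxes_img1]).shape)   # torch.Size([3, 5])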
Example #5
def assign_boxes_to_levels(box_lists, min_level, max_level, canonical_box_size, canonical_level):
    """
    Map each box in `box_lists` to a feature map level index and return the assignment
    vector.

    Args:
        box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes,
            where N is the number of images in the batch.
        min_level (int): Smallest feature map level index. The input is considered index 0,
            the output of stage 1 is index 1, and so on.
        max_level (int): Largest feature map level index.
        canonical_box_size (int): A canonical box size in pixels (sqrt(box area)).
        canonical_level (int): The feature map level index on which a canonically-sized box
            should be placed.

    Returns:
        A tensor of length M, where M is the total number of boxes aggregated over all
            N batch images. The memory layout corresponds to the concatenation of boxes
            from all images. Each element is the feature map index, as an offset from
            `self.min_level`, for the corresponding box (so value i means the box is at
            `self.min_level + i`).
    """
    eps = sys.float_info.epsilon
    box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists]))
    # Eqn.(1) in FPN paper
    level_assignments = torch.floor(
        canonical_level + torch.log2(box_sizes / canonical_box_size + eps)
    )
    # clamp level to (min, max), in case the box size is too large or too small
    # for the available feature maps
    level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level)
    return level_assignments.to(torch.int64) - min_level
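A worked example of the level-assignment formula on plain tensors. The canonical_box_size=224 / canonical_level=4 values below are detectron2's usual defaults, used here only to make the numbers concrete:

import sys
import torch

min_level, max_level = 2, 5
canonical_box_size, canonical_level = 224, 4
eps = sys.float_info.epsilon

areas = torch.tensor([32.0 ** 2, 224.0 ** 2, 1024.0 ** 2])   # tiny, canonical, huge boxes
box_sizes = torch.sqrt(areas)
levels = torch.floor(canonical_level + torch.log2(box_sizes / canonical_box_size + eps))
levels = torch.clamp(levels, min=min_level, max=max_level)
print(levels.to(torch.int64) - min_level)   # tensor([0, 2, 3]) -> P2, P4, P5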
Example #6
    def aux_losses(self, gt_classes, pred_class_logits):
        pred_class_logits = cat([
            permute_to_N_HWA_K(x, self.num_classes) for x in pred_class_logits
        ],
                                dim=1).view(-1, self.num_classes)

        gt_classes = gt_classes.flatten()

        valid_idxs = gt_classes >= 0
        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
        num_foreground = foreground_idxs.sum()

        gt_classes_target = torch.zeros_like(pred_class_logits)
        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

        num_foreground = comm.all_reduce(num_foreground) / float(
            comm.get_world_size())

        # logits loss
        loss_cls_aux = sigmoid_focal_loss_jit(
            pred_class_logits[valid_idxs],
            gt_classes_target[valid_idxs],
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        ) / max(1.0, num_foreground)

        return {"loss_cls_aux": loss_cls_aux}
Example #7
def keypoint_rcnn_inference(pred_keypoint_logits, pred_instances):
    """
    Post process each predicted keypoint heatmap in `pred_keypoint_logits` into (x, y, score)
        and add it to the `pred_instances` as a `pred_keypoints` field.

    Args:
        pred_keypoint_logits (Tensor): A tensor of shape (R, K, S, S) where R is the total number
           of instances in the batch, K is the number of keypoints, and S is the side length of
           the keypoint heatmap. The values are spatial logits.
        pred_instances (list[Instances]): A list of N Instances, where N is the number of images.

    Returns:
        None. Each element in pred_instances will contain an extra "pred_keypoints" field.
            The field is a tensor of shape (#instance, K, 3) where the last
            dimension corresponds to (x, y, score).
            The scores are larger than 0.
    """
    # flatten all bboxes from all images together (list[Boxes] -> Rx4 tensor)
    bboxes_flat = cat([b.pred_boxes.tensor for b in pred_instances], dim=0)

    keypoint_results = heatmaps_to_keypoints(pred_keypoint_logits.detach(), bboxes_flat.detach())
    num_instances_per_image = [len(i) for i in pred_instances]
    keypoint_results = keypoint_results[:, :, [0, 1, 3]].split(num_instances_per_image, dim=0)

    for keypoint_results_per_image, instances_per_image in zip(keypoint_results, pred_instances):
        # keypoint_results_per_image is (num instances)x(num keypoints)x(x, y, score)
        instances_per_image.pred_keypoints = keypoint_results_per_image
Example #8
    def forward(self, fine_grained_features, coarse_features):
        x = torch.cat((fine_grained_features, coarse_features), dim=1)
        for layer in self.fc_layers:
            x = F.relu(layer(x))
            if self.coarse_pred_each_layer:
                x = cat((x, coarse_features), dim=1)
        return self.predictor(x)
Example #9
def point_sample_fine_grained_features(features_list, feature_scales, boxes,
                                       point_coords):
    """
    Get features from feature maps in `features_list` that correspond to specific point coordinates
        inside each bounding box from `boxes`.

    Args:
        features_list (list[Tensor]): A list of feature map tensors to get features from.
        feature_scales (list[float]): A list of scales for tensors in `features_list`.
        boxes (list[Boxes]): A list of I Boxes objects that contain R_1 + ... + R_I = R boxes all
            together.
        point_coords (Tensor): A tensor of shape (R, P, 2) that contains
            [0, 1] x [0, 1] box-normalized coordinates of the P sampled points.

    Returns:
        point_features (Tensor): A tensor of shape (R, C, P) that contains features sampled
            from all features maps in feature_list for P sampled points for all R boxes in `boxes`.
        point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains image-level
            coordinates of P points.
    """
    cat_boxes = Boxes.cat(boxes)
    num_boxes = [len(b) for b in boxes]

    point_coords_wrt_image = get_point_coords_wrt_image(
        cat_boxes.tensor, point_coords)
    split_point_coords_wrt_image = torch.split(point_coords_wrt_image,
                                               num_boxes)

    point_features = []
    for idx_img, point_coords_wrt_image_per_image in enumerate(
            split_point_coords_wrt_image):
        point_features_per_image = []
        for idx_feature, feature_map in enumerate(features_list):
            h, w = feature_map.shape[-2:]
            scale = torch.tensor(
                [w, h],
                device=feature_map.device) / feature_scales[idx_feature]
            point_coords_scaled = point_coords_wrt_image_per_image / scale
            point_features_per_image.append(
                point_sample(
                    feature_map[idx_img].unsqueeze(0),
                    point_coords_scaled.unsqueeze(0),
                    align_corners=False,
                ).squeeze(0).transpose(1, 0))
        point_features.append(cat(point_features_per_image, dim=1))

    return cat(point_features, dim=0), point_coords_wrt_image
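Under the hood, point_sample is essentially F.grid_sample on [0, 1]-normalized point coordinates mapped to the [-1, 1] range that grid_sample expects. A minimal illustration of that step (the 2*coords - 1 mapping reflects my understanding of the helper, not code copied from it):

import torch
import torch.nn.functional as F

feature_map = torch.randn(1, 256, 32, 32)      # (1, C, H, W)
point_coords = torch.rand(1, 100, 2)           # (1, P, 2), normalized to [0, 1]

grid = 2.0 * point_coords.unsqueeze(2) - 1.0                       # (1, P, 1, 2) in [-1, 1]
sampled = F.grid_sample(feature_map, grid, align_corners=False)    # (1, C, P, 1)
point_features = sampled.squeeze(3)                                # (1, C, P)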
Example #10
    def losses(self, gt_conf, gt_default_boxes_deltas, conf_pred, loc_pred):
        """
        SSD Weighted Loss Function:
            L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
            Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss
            weighted by α, which is set to 1 by cross val.
            where:
                c: class confidences,
                l: predicted boxes,
                g: ground truth boxes
                N: number of matched default boxes
            See: https://arxiv.org/pdf/1512.02325.pdf for more details.

        Args:
            For the `gt_conf` and `gt_default_boxes_deltas` parameters, see
                :meth:`get_ground_truth`.
                Their concatenated shapes are [N, R] and [N, R, 4] respectively, where R
                is the total number of default boxes, i.e. sum(Hi x Wi x D) over all levels,
                C is the total number of classes, and D is the number of default boxes at each location.
            For `conf_pred` and `loc_pred`, see :meth:`SSDHead.forward`.
                Their shapes are [N, R, C] and [N, R, 4] respectively.

        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a scalar tensor storing the loss.
                Used during training only. The dict keys are: "loss_conf" and "loss_loc".
        """
        # shape=[#batch_size, #default_boxes, #num_classes] and [#batch_size, #default_boxes, 4]
        conf_pred = cat(conf_pred, dim=1)
        loc_pred = cat(loc_pred, dim=1)

        # filter out the negative samples
        positive_mask = gt_conf < 80

        # the number of matched default box
        num_pos_samples = positive_mask.sum()

        loss_conf, loss_loc = multi_apply(
            self.loss_single,
            conf_pred,
            loc_pred,
            gt_conf,
            gt_default_boxes_deltas,
            num_total_samples=num_pos_samples
        )
        return {"loss_conf": sum(loss_conf), "loss_loc": sum(loss_loc)}
Example #11
def _paste_mask_lists_in_image(masks, boxes, image_shape, threshold=0.5):
    """
    Paste a list of masks that are of various resolutions (e.g., 28 x 28) into an image.
    The location, height, and width for pasting each mask is determined by their
    corresponding bounding boxes in boxes.
    Args:
        masks (list(Tensor)): A list of Tensor of shape (1, Hmask_i, Wmask_i).
                            Values are in [0, 1]. The list length, Bimg, is the
                            number of detected object instances in the image.
        boxes (Boxes): A Boxes of length Bimg. boxes.tensor[i] and masks[i] correspond
                            to the same object instance.
        image_shape (tuple): height, width
        threshold (float): A threshold in [0, 1] for converting the (soft) masks to
            binary masks.
    Returns:
        img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
        number of detected object instances and Himage, Wimage are the image height
        and width. img_masks[i] is a binary mask for object instance i.
    """
    if len(masks) == 0:
        return torch.empty((0, 1) + image_shape, dtype=torch.uint8)

    # Loop over masks groups. Each group has the same mask prediction size.
    img_masks = []
    ind_masks = []
    mask_sizes = torch.tensor([m.shape[-1] for m in masks])
    unique_sizes = torch.unique(mask_sizes)
    for msize in unique_sizes.tolist():
        cur_ind = torch.where(mask_sizes == msize)[0]
        ind_masks.append(cur_ind)

        cur_masks = cat([masks[i] for i in cur_ind])
        cur_boxes = boxes[cur_ind]
        img_masks.append(
            paste_masks_in_image(cur_masks, cur_boxes, image_shape, threshold)
        )

    img_masks = cat(img_masks)
    ind_masks = cat(ind_masks)

    img_masks_out = torch.empty_like(img_masks)
    img_masks_out[ind_masks, :, :] = img_masks

    return img_masks_out
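The restore step at the end relies on index assignment to undo the size-based grouping; in isolation, the pattern looks like this (dummy values):

import torch

grouped_results = torch.tensor([10., 11., 12., 13.])    # computed group by group
original_idx = torch.tensor([2, 0, 3, 1])               # original position of each grouped item

restored = torch.empty_like(grouped_results)
restored[original_idx] = grouped_results                # undo the grouping permutation
print(restored)                                         # tensor([11., 13., 10., 12.])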
Example #12
    def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes,
                  images):
        """
        Arguments:
            pred_logits, pred_deltas, pred_masks: Same as the output of
                :meth:`TensorMaskHead.forward`
            anchors, indexes: Same as the input of :meth:`TensorMask.get_ground_truth`
            images (ImageList): the input images
        Returns:
            results (List[Instances]): a list of #images elements.
        """
        assert len(anchors) == len(images)
        results = []

        pred_logits = [
            permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits
        ]
        pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas]

        pred_logits = cat(pred_logits, dim=1)
        pred_deltas = cat(pred_deltas, dim=1)

        for img_idx, (anchors_im,
                      indexes_im) in enumerate(zip(anchors, indexes)):
            # Get the size of the current image
            image_size = images.image_sizes[img_idx]

            logits_im = pred_logits[img_idx]
            deltas_im = pred_deltas[img_idx]

            if self.mask_on:
                masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks]
            else:
                masks_im = [None] * self.num_levels
            results_im = self.inference_single_image(
                logits_im,
                deltas_im,
                masks_im,
                Boxes.cat(anchors_im),
                cat(indexes_im),
                tuple(image_size),
            )
            results.append(results_im)
        return results
Example #13
    def forward(self, x, box_lists):
        assert not self.training

        pooler_fmt_boxes = self.c2_preprocess(box_lists)
        num_level_assignments = len(self.level_poolers)

        if num_level_assignments == 1:
            out = torch.ops._caffe2.RoIAlign(
                x[0],
                pooler_fmt_boxes,
                order="NCHW",
                spatial_scale=float(self.level_poolers[0].spatial_scale),
                pooled_h=int(self.output_size[0]),
                pooled_w=int(self.output_size[1]),
                sampling_ratio=int(self.level_poolers[0].sampling_ratio),
                aligned=bool(self.level_poolers[0].aligned),
            )
            return out

        device = pooler_fmt_boxes.device
        assert (
            self.max_level - self.min_level +
            1 == 4), "Currently DistributeFpnProposals only support 4 levels"
        fpn_outputs = torch.ops._caffe2.DistributeFpnProposals(
            to_device(pooler_fmt_boxes, "cpu"),
            roi_canonical_scale=self.canonical_box_size,
            roi_canonical_level=self.canonical_level,
            roi_max_level=self.max_level,
            roi_min_level=self.min_level,
            legacy_plus_one=False,
        )
        fpn_outputs = [to_device(x, device) for x in fpn_outputs]

        rois_fpn_list = fpn_outputs[:-1]
        rois_idx_restore_int32 = fpn_outputs[-1]

        roi_feat_fpn_list = []
        for roi_fpn, x_level, pooler in zip(rois_fpn_list, x,
                                            self.level_poolers):
            roi_feat_fpn = torch.ops._caffe2.RoIAlign(
                x_level,
                roi_fpn,
                order="NCHW",
                spatial_scale=float(pooler.spatial_scale),
                pooled_h=int(self.output_size[0]),
                pooled_w=int(self.output_size[1]),
                sampling_ratio=int(pooler.sampling_ratio),
                aligned=bool(pooler.aligned),
            )
            roi_feat_fpn_list.append(roi_feat_fpn)

        roi_feat_shuffled = cat(roi_feat_fpn_list, dim=0)
        roi_feat = torch.ops._caffe2.BatchPermutation(roi_feat_shuffled,
                                                      rois_idx_restore_int32)
        return roi_feat
Example #14
    def get_ground_truth(self, anchors, bbox_preds, targets):
        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
        N = len(anchors)
        # list[Tensor(R, 4)], one for each image
        all_anchors = Boxes.cat(anchors).tensor.reshape(N, -1, 4)
        # Boxes(Tensor(N*R, 4))
        box_delta = cat(bbox_preds, dim=1)
        # box_pred: xyxy; targets: xyxy
        box_pred = self.box2box_transform.apply_deltas(box_delta, all_anchors)
        indices = self.matcher(box_pred, all_anchors, targets)
        return indices
Example #15
def permute_all_to_N_HWA_K_and_concat(
    box_cls,
    box_delta,
    box_center,
    box_parmas,
    param_count,
    num_classes=80,
):
    box_cls_flattened = [permute_to_N_HWA_K(x, num_classes) for x in box_cls]
    box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
    box_center_flattened = [permute_to_N_HWA_K(x, 1) for x in box_center]
    box_parmas_flattened = [
        permute_to_N_HWA_K(x, param_count) for x in box_parmas
    ]

    box_cls = cat(box_cls_flattened, dim=1).reshape(-1, num_classes)
    box_delta = cat(box_delta_flattened, dim=1).reshape(-1, 4)
    box_center = cat(box_center_flattened, dim=1).reshape(-1, 1)
    box_parmas = cat(box_parmas_flattened, dim=1).reshape(-1, param_count)
    return box_cls, box_delta, box_center, box_parmas
Example #16
def get_uncertain_point_coords_with_randomness(
    coarse_logits, uncertainty_func, num_points, oversample_ratio, importance_sample_ratio
):
    """
    Sample points in [0, 1] x [0, 1] coordinate space based on their uncertainty. The uncertainties
        are calculated for each point using 'uncertainty_func' function that takes point's logit
        prediction as input.
    See PointRend paper for details.

    Args:
        coarse_logits (Tensor): A tensor of shape (N, C, Hmask, Wmask) or (N, 1, Hmask, Wmask) for
            class-specific or class-agnostic prediction.
        uncertainty_func: A function that takes a Tensor of shape (N, C, P) or (N, 1, P) that
            contains logit predictions for P points and returns their uncertainties as a Tensor of
            shape (N, 1, P).
        num_points (int): The number of points P to sample.
        oversample_ratio (int): Oversampling parameter.
        importance_sample_ratio (float): Ratio of points that are sampled via importance sampling.

    Returns:
        point_coords (Tensor): A tensor of shape (N, P, 2) that contains the coordinates of P
            sampled points.
    """
    assert oversample_ratio >= 1
    assert importance_sample_ratio <= 1 and importance_sample_ratio >= 0
    num_boxes = coarse_logits.shape[0]
    num_sampled = int(num_points * oversample_ratio)
    point_coords = torch.rand(num_boxes, num_sampled, 2, device=coarse_logits.device)
    point_logits = point_sample(coarse_logits, point_coords, align_corners=False)
    # It is crucial to calculate uncertainty based on the sampled prediction value for the points.
    # Calculating uncertainties of the coarse predictions first and sampling them for points leads
    # to incorrect results.
    # To illustrate this: assume uncertainty_func(logits)=-abs(logits), a sampled point between
    # two coarse predictions with -1 and 1 logits has 0 logits, and therefore 0 uncertainty value.
    # However, if we calculate uncertainties for the coarse predictions first,
    # both will have -1 uncertainty, and the sampled point will get -1 uncertainty.
    point_uncertainties = uncertainty_func(point_logits)
    num_uncertain_points = int(importance_sample_ratio * num_points)
    num_random_points = num_points - num_uncertain_points
    idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
    shift = num_sampled * torch.arange(num_boxes, dtype=torch.long, device=coarse_logits.device)
    idx += shift[:, None]
    point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(
        num_boxes, num_uncertain_points, 2
    )
    if num_random_points > 0:
        point_coords = cat(
            [
                point_coords,
                torch.rand(num_boxes, num_random_points, 2, device=coarse_logits.device),
            ],
            dim=1,
        )
    return point_coords
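The idx += shift step converts per-box top-k indices into flat indices over the (num_boxes * num_sampled, 2) view of point_coords; a small standalone version of that trick:

import torch

num_boxes, num_sampled, k = 2, 5, 3
uncertainties = torch.rand(num_boxes, num_sampled)
point_coords = torch.rand(num_boxes, num_sampled, 2)

idx = torch.topk(uncertainties, k=k, dim=1)[1]                      # (num_boxes, k)
shift = num_sampled * torch.arange(num_boxes, dtype=torch.long)     # row offsets
idx = idx + shift[:, None]
picked = point_coords.view(-1, 2)[idx.view(-1), :].view(num_boxes, k, 2)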
Example #17
    def forward(self, x):
        if x.dim() > 2:
            x = torch.flatten(x, start_dim=1)
        scores = self.cls_score(x)

        # only adjust the foreground classification scores
        confidence = self.confidence_layer(x).sigmoid()
        scores_tmp = confidence * (scores[:, :-1] * self.logit_scale +
                                   self.logit_bias)
        scores_tmp = scores_tmp + (1 - confidence) * scores[:, :-1]

        aligned_scores = cat([scores_tmp, scores[:, -1].view(-1, 1)], dim=1)
        proposal_deltas = self.bbox_pred(x)
        return aligned_scores, proposal_deltas
Example #18
    def cat(bitmasks_list: List["BitMasks"]) -> "BitMasks":
        """
        Concatenates a list of BitMasks into a single BitMasks
        Arguments:
            bitmasks_list (list[BitMasks])
        Returns:
            BitMasks: the concatenated BitMasks
        """
        assert isinstance(bitmasks_list, (list, tuple))
        assert len(bitmasks_list) > 0
        assert all(isinstance(bitmask, BitMasks) for bitmask in bitmasks_list)

        cat_bitmasks = type(bitmasks_list[0])(cat([bm.tensor for bm in bitmasks_list], dim=0))
        return cat_bitmasks
Example #19
    def cat(boxes_list: List["RotatedBoxes"]) -> "RotatedBoxes":  # type: ignore
        """
        Concatenates a list of RotatedBoxes into a single RotatedBoxes

        Arguments:
            boxes_list (list[RotatedBoxes])

        Returns:
            RotatedBoxes: the concatenated RotatedBoxes
        """
        assert isinstance(boxes_list, (list, tuple))
        assert len(boxes_list) > 0
        assert all(isinstance(box, RotatedBoxes) for box in boxes_list)

        cat_boxes = type(boxes_list[0])(cat([b.tensor for b in boxes_list], dim=0))
        return cat_boxes
Example #20
    def __init__(self, box2box_transform, pred_class_logits,
                 pred_proposal_deltas, proposals, smooth_l1_beta):
        """
        Args:
            box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
                box2box transform instance for proposal-to-detection transformations.
            pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
                logits for all R predicted object instances.
                Each row corresponds to a predicted object instance.
            pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
                class-specific or class-agnostic regression. It stores the predicted deltas that
                transform proposals into final box detections.
                B is the box dimension (4 or 5).
                When B is 4, each row is [dx, dy, dw, dh (, ....)].
                When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
            proposals (list[Instances]): A list of N Instances, where Instances i stores the
                proposals for image i, in the field "proposal_boxes".
                When training, each Instances must have ground-truth labels
                stored in the field "gt_classes" and "gt_boxes".
            smooth_l1_beta (float): The transition point between L1 and L2 loss in
                the smooth L1 loss function. When set to 0, the loss becomes L1. When
                set to +inf, the loss becomes constant 0.
        """
        self.box2box_transform = box2box_transform
        self.num_preds_per_image = [len(p) for p in proposals]
        self.pred_class_logits = pred_class_logits
        self.pred_proposal_deltas = pred_proposal_deltas
        self.smooth_l1_beta = smooth_l1_beta

        if len(proposals):
            box_type = type(proposals[0].proposal_boxes)
            # cat(..., dim=0) concatenates over all images in the batch
            self.proposals = box_type.cat(
                [p.proposal_boxes for p in proposals])
            assert (not self.proposals.tensor.requires_grad
                    ), "Proposals should not require gradients!"
            self.image_shapes = [x.image_size for x in proposals]

            # The following fields should exist only when training.
            if proposals[0].has("gt_boxes"):
                self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals])
                assert proposals[0].has("gt_classes")
                self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)
        else:
            self.proposals = Boxes(
                torch.zeros(0, 4, device=self.pred_proposal_deltas.device))
        self._no_instances = len(proposals) == 0  # no instances found
Example #21
    def cat(cls, boxes_list: List["Boxes"]) -> "Boxes":
        """
        Concatenates a list of Boxes into a single Boxes

        Arguments:
            boxes_list (list[Boxes])

        Returns:
            Boxes: the concatenated Boxes
        """
        assert isinstance(boxes_list, (list, tuple))
        assert all(isinstance(box, Boxes) for box in boxes_list)

        if len(boxes_list) == 0:
            return cls(torch.empty(0))

        cat_boxes = type(boxes_list[0])(cat([b.tensor for b in boxes_list], dim=0))
        return cat_boxes
Example #22
def mask_rcnn_inference(pred_mask_logits, pred_instances):
    """
    Convert pred_mask_logits to estimated foreground probability masks while also
    extracting only the masks for the predicted classes in pred_instances. For each
    predicted box, the mask of the same class is attached to the instance by adding a
    new "pred_masks" field to pred_instances.

    Args:
        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
            for class-specific or class-agnostic, where B is the total number of predicted masks
            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
            and width of the mask predictions. The values are logits.
        pred_instances (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. Each Instances must have field "pred_classes".

    Returns:
        None. pred_instances will contain an extra "pred_masks" field storing a mask of size (Hmask,
            Wmask) for the predicted class. Note that the masks are returned as soft (non-quantized)
            masks at the resolution predicted by the network; post-processing steps, such as resizing
            the predicted masks to the original image resolution and/or binarizing them, are left
            to the caller.
    """
    cls_agnostic_mask = pred_mask_logits.size(1) == 1

    if cls_agnostic_mask:
        mask_probs_pred = pred_mask_logits.sigmoid()
    else:
        # Select masks corresponding to the predicted classes
        num_masks = pred_mask_logits.shape[0]
        class_pred = cat([i.pred_classes for i in pred_instances])
        indices = torch.arange(num_masks, device=class_pred.device)
        mask_probs_pred = pred_mask_logits[indices, class_pred][:, None].sigmoid()
    # mask_probs_pred.shape: (B, 1, Hmask, Wmask)

    num_boxes_per_image = [len(i) for i in pred_instances]
    mask_probs_pred = mask_probs_pred.split(num_boxes_per_image, dim=0)

    for prob, instances in zip(mask_probs_pred, pred_instances):
        instances.pred_masks = prob  # (1, Hmask, Wmask)
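The class-specific branch picks, for each predicted box, the mask channel of its predicted class via advanced indexing; that pattern on its own, with dummy shapes:

import torch

B, C, H, W = 5, 80, 28, 28
pred_mask_logits = torch.randn(B, C, H, W)
pred_classes = torch.randint(0, C, (B,))

indices = torch.arange(B)
mask_probs_pred = pred_mask_logits[indices, pred_classes][:, None].sigmoid()   # (B, 1, H, W)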
Example #23
    def inference_single_image(self, box_cls, box_center, border_cls,
                               border_delta, bd_based_box, image_size):
        """
        Single-image inference. Return bounding-box detection results by thresholding
        on scores and applying non-maximum suppression (NMS).

        Arguments:
            box_cls (list[Tensor]): list of #feature levels. Each entry contains
                tensor of size (H x W, K)
            box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
            box_center (list[Tensor]): Same shape as 'box_cls' except that K becomes 1.
            shifts (list[Tensor]): list of #feature levels. Each entry contains
                a tensor, which contains all the shifts for that
                image in that feature level.
            image_size (tuple(H, W)): a tuple of the image height and width.

        Returns:
            Same as `inference`, but for only one image.
        """
        boxes_all = []
        scores_all = []
        class_idxs_all = []

        border_bbox_std = bd_based_box[0].new_tensor(self.border_bbox_std)

        # Iterate over every feature level
        for box_cls_i, box_ctr_i, bd_box_cls_i, bd_box_reg_i, bd_based_box_i in zip(
                box_cls, box_center, border_cls, border_delta, bd_based_box):
            # (HxWxK,)
            box_cls_i = box_cls_i.sigmoid_()
            box_ctr_i = box_ctr_i.sigmoid_()
            bd_box_cls_i = bd_box_cls_i.sigmoid_()

            predicted_prob = (box_cls_i * box_ctr_i).sqrt()

            # filter out the proposals with low confidence score
            keep_idxs = predicted_prob > self.score_threshold

            predicted_prob = predicted_prob * bd_box_cls_i

            predicted_prob = predicted_prob[keep_idxs]
            # Keep top k top scoring indices only.
            num_topk = min(self.topk_candidates, predicted_prob.size(0))
            # torch.sort is actually faster than .topk (at least on GPUs)
            predicted_prob, topk_idxs = predicted_prob.sort(descending=True)
            topk_idxs = topk_idxs[:num_topk]

            keep_idxs = keep_idxs.nonzero()
            keep_idxs = keep_idxs[topk_idxs]
            keep_box_idxs = keep_idxs[:, 0]
            classes_idxs = keep_idxs[:, 1]

            predicted_prob = predicted_prob[:num_topk]
            bd_box_reg_i = bd_box_reg_i[keep_box_idxs]
            bd_based_box_i = bd_based_box_i[keep_box_idxs]

            det_wh = (bd_based_box_i[..., 2:4] - bd_based_box_i[..., :2])
            det_wh = torch.cat([det_wh, det_wh], dim=1)
            predicted_boxes = bd_based_box_i + (bd_box_reg_i *
                                                border_bbox_std * det_wh)

            boxes_all.append(predicted_boxes)
            scores_all.append(predicted_prob.sqrt())
            class_idxs_all.append(classes_idxs)

        boxes_all, scores_all, class_idxs_all = [
            cat(x) for x in [boxes_all, scores_all, class_idxs_all]
        ]

        keep = generalized_batched_nms(boxes_all,
                                       scores_all,
                                       class_idxs_all,
                                       self.nms_threshold,
                                       nms_type=self.nms_type)
        boxes_all = boxes_all[keep]
        scores_all = scores_all[keep]
        class_idxs_all = class_idxs_all[keep]

        number_of_detections = len(keep)
        # Limit to max_per_image detections **over all classes**
        if number_of_detections > self.max_detections_per_image > 0:
            image_thresh, _ = torch.kthvalue(
                scores_all,
                number_of_detections - self.max_detections_per_image + 1)
            keep = scores_all >= image_thresh.item()
            keep = torch.nonzero(keep).squeeze(1)
            boxes_all = boxes_all[keep]
            scores_all = scores_all[keep]
            class_idxs_all = class_idxs_all[keep]

        result = Instances(image_size)
        result.pred_boxes = Boxes(boxes_all)
        result.scores = scores_all
        result.pred_classes = class_idxs_all
        return result
Example #24
    def inference_single_image(self, box_cls, box_delta, box_center, shifts,
                               image_size):
        """
        Single-image inference. Return bounding-box detection results by thresholding
        on scores and applying non-maximum suppression (NMS).

        Arguments:
            box_cls (list[Tensor]): list of #feature levels. Each entry contains
                tensor of size (H x W, K)
            box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
            box_center (list[Tensor]): Same shape as 'box_cls' except that K becomes 1.
            shifts (list[Tensor]): list of #feature levels. Each entry contains
                a tensor, which contains all the shifts for that
                image in that feature level.
            image_size (tuple(H, W)): a tuple of the image height and width.

        Returns:
            Same as `inference`, but for only one image.
        """
        boxes_all = []
        scores_all = []
        class_idxs_all = []

        # Iterate over every feature level
        for box_cls_i, box_reg_i, box_ctr_i, shifts_i in zip(
                box_cls, box_delta, box_center, shifts):
            # (HxWxK,)
            box_cls_i = box_cls_i.flatten().sigmoid_()

            # Keep top k top scoring indices only.
            num_topk = min(self.topk_candidates, box_reg_i.size(0))
            # torch.sort is actually faster than .topk (at least on GPUs)
            predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
            predicted_prob = predicted_prob[:num_topk]
            topk_idxs = topk_idxs[:num_topk]

            # filter out the proposals with low confidence score
            keep_idxs = predicted_prob > self.score_threshold
            predicted_prob = predicted_prob[keep_idxs]
            topk_idxs = topk_idxs[keep_idxs]

            shift_idxs = topk_idxs // self.num_classes
            classes_idxs = topk_idxs % self.num_classes

            box_reg_i = box_reg_i[shift_idxs]
            shifts_i = shifts_i[shift_idxs]
            # predict boxes
            predicted_boxes = self.shift2box_transform.apply_deltas(
                box_reg_i, shifts_i)

            box_ctr_i = box_ctr_i.flatten().sigmoid_()[shift_idxs]
            predicted_prob = torch.sqrt(predicted_prob * box_ctr_i)

            boxes_all.append(predicted_boxes)
            scores_all.append(predicted_prob)
            class_idxs_all.append(classes_idxs)

        boxes_all, scores_all, class_idxs_all = [
            cat(x) for x in [boxes_all, scores_all, class_idxs_all]
        ]

        keep = generalized_batched_nms(boxes_all,
                                       scores_all,
                                       class_idxs_all,
                                       self.nms_threshold,
                                       nms_type=self.nms_type)
        keep = keep[:self.max_detections_per_image]

        result = Instances(image_size)
        result.pred_boxes = Boxes(boxes_all[keep])
        result.scores = scores_all[keep]
        result.pred_classes = class_idxs_all[keep]
        return result
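The decoding step flattens the (H*W*A, K) scores and recovers the location and class of each kept candidate with integer division and modulo; a tiny standalone snippet (on recent PyTorch, torch.div with rounding_mode="floor" is the non-deprecated spelling of //):

import torch

num_classes = 80
box_cls_i = torch.randn(100 * num_classes).sigmoid()   # flattened per-level scores

predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
predicted_prob, topk_idxs = predicted_prob[:50], topk_idxs[:50]
keep = predicted_prob > 0.05
predicted_prob, topk_idxs = predicted_prob[keep], topk_idxs[keep]

shift_idxs = torch.div(topk_idxs, num_classes, rounding_mode="floor")   # which location/anchor
classes_idxs = topk_idxs % num_classes                                  # which class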
Example #25
    def get_ground_truth(self, anchors, unit_lengths, indexes, targets):
        """
        Args:
            anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
                list of #feature level Boxes. The Boxes contains anchors of
                this image on the specific feature level.
            unit_lengths (list[list[Tensor]]): a list of N=#image elements. Each is a
                list of #feature level Tensor. The tensor contains unit lengths for anchors of
                this image on the specific feature level.
            indexes (list[list[Tensor]]): a list of N=#image elements. Each is a
                list of #feature level Tensor. The tensor contains the 5D index of
                each anchor, the second dimension means (L, N, H, W, A), where L
                is level, N is image, H is height, W is width, and A is anchor.
            targets (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.  Specify `targets` during training only.
        Returns:
            gt_class_info (Tensor, Tensor): A pair of two tensors for classification.
                The first one is an integer tensor of shape (R, #classes) storing ground-truth
                labels for each anchor. R is the total number of anchors in the batch.
                The second one is an integer tensor of shape (R,), indicating which
                anchors are valid for loss computation and which are not.
            gt_delta_info (Tensor, Tensor): A pair of two tensors for boxes.
                The first one, of shape (F, 4). F=#foreground anchors.
                The last dimension represents ground-truth box2box transform
                targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
                Only foreground anchors have values in this tensor. Could be `None` if F=0.
                The second one, of shape (R,), is an integer tensor indicating which anchors
                are foreground ones used for box regression. Could be `None` if F=0.
            gt_mask_info (list[list[Tensor]], list[list[Tensor]]): A pair of two lists for masks.
                The first one is a list of P=#feature level elements. Each is a
                list of A=#anchor tensors. Each tensor contains the ground truth
                masks of the same size and for the same feature level. Could be `None`.
                The second one is a list of P=#feature level elements. Each is a
                list of A=#anchor tensors. Each tensor contains the location of the ground truth
                masks of the same size and for the same feature level. The second dimension means
                (N, H, W), where N is image, H is height, and W is width. Could be `None`.
            num_fg (int): F=#foreground anchors, used later for loss normalization.
        """
        gt_classes = []
        gt_deltas = []
        gt_masks = [[[] for _ in range(self.num_anchors)]
                    for _ in range(self.num_levels)]
        gt_mask_inds = [[[] for _ in range(self.num_anchors)]
                        for _ in range(self.num_levels)]

        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
        unit_lengths = [cat(unit_lengths_i) for unit_lengths_i in unit_lengths]
        indexes = [cat(indexes_i) for indexes_i in indexes]

        num_fg = 0
        for i, (anchors_im, unit_lengths_im, indexes_im,
                targets_im) in enumerate(
                    zip(anchors, unit_lengths, indexes, targets)):
            # Initialize all
            gt_classes_i = torch.full_like(unit_lengths_im,
                                           self.num_classes,
                                           dtype=torch.int64,
                                           device=self.device)
            # Ground truth classes
            has_gt = len(targets_im) > 0
            if has_gt:
                # Compute the pairwise matrix
                gt_matched_inds, anchor_labels = _assignment_rule(
                    targets_im.gt_boxes,
                    anchors_im,
                    unit_lengths_im,
                    self.min_anchor_size,
                )
                # Find the foreground instances
                fg_inds = anchor_labels == 1
                fg_anchors = anchors_im[fg_inds]
                num_fg += len(fg_anchors)
                # Find the ground truths for foreground instances
                gt_fg_matched_inds = gt_matched_inds[fg_inds]
                # Assign labels for foreground instances
                gt_classes_i[fg_inds] = targets_im.gt_classes[
                    gt_fg_matched_inds]
                # Anchors with label -1 are ignored, others are left as negative
                gt_classes_i[anchor_labels == -1] = -1

                # Boxes
                # Ground truth box regression, only for foregrounds
                matched_gt_boxes = targets_im[gt_fg_matched_inds].gt_boxes
                # Compute box regression offsets for foregrounds only
                gt_deltas_i = self.box2box_transform.get_deltas(
                    fg_anchors.tensor, matched_gt_boxes.tensor)
                gt_deltas.append(gt_deltas_i)

                # Masks
                if self.mask_on:
                    # Compute masks for each level and each anchor
                    matched_indexes = indexes_im[fg_inds, :]
                    for lvl in range(self.num_levels):
                        ids_lvl = matched_indexes[:, 0] == lvl
                        if torch.any(ids_lvl):
                            cur_level_factor = 2**lvl if self.bipyramid_on else 1
                            for anc in range(self.num_anchors):
                                ids_lvl_anchor = ids_lvl & (
                                    matched_indexes[:, 4] == anc)
                                if torch.any(ids_lvl_anchor):
                                    gt_masks[lvl][anc].append(
                                        targets_im[
                                            gt_fg_matched_inds[ids_lvl_anchor]]
                                        .gt_masks.crop_and_resize(
                                            fg_anchors[ids_lvl_anchor].tensor,
                                            self.mask_sizes[anc] *
                                            cur_level_factor,
                                        ))
                                    # Select (N, H, W) dimensions
                                    gt_mask_inds_lvl_anc = matched_indexes[
                                        ids_lvl_anchor, 1:4]
                                    # Set the image index to the current image
                                    gt_mask_inds_lvl_anc[:, 0] = i
                                    gt_mask_inds[lvl][anc].append(
                                        gt_mask_inds_lvl_anc)
            gt_classes.append(gt_classes_i)

        # Classes and boxes
        gt_classes = cat(gt_classes)
        gt_valid_inds = gt_classes >= 0
        gt_fg_inds = gt_valid_inds & (gt_classes < self.num_classes)
        gt_classes_target = torch.zeros(
            (gt_classes.shape[0], self.num_classes),
            dtype=torch.float32,
            device=self.device,
        )
        gt_classes_target[gt_fg_inds, gt_classes[gt_fg_inds]] = 1
        gt_deltas = cat(gt_deltas) if gt_deltas else None

        # Masks
        gt_masks = [[cat(mla) if mla else None for mla in ml]
                    for ml in gt_masks]
        gt_mask_inds = [[cat(ila) if ila else None for ila in il]
                        for il in gt_mask_inds]
        return (
            (gt_classes_target, gt_valid_inds),
            (gt_deltas, gt_fg_inds),
            (gt_masks, gt_mask_inds),
            num_fg,
        )
Example #26
def _assignment_rule(
    gt_boxes,
    anchor_boxes,
    unit_lengths,
    min_anchor_size,
    scale_thresh=2.0,
    spatial_thresh=1.0,
    uniqueness_on=True,
):
    """
    Given two lists of boxes of N ground truth boxes and M anchor boxes,
    compute the assignment between the two, following the assignment rules in
    https://arxiv.org/abs/1903.12174.
    The box order must be (xmin, ymin, xmax, ymax), so please make sure to
    convert to BoxMode.XYXY_ABS before calling this function.
    Args:
        gt_boxes, anchor_boxes (Boxes): two Boxes, containing N and M boxes/anchors,
            respectively.
        unit_lengths (Tensor): Contains the unit lengths of the M anchor boxes.
        min_anchor_size (float): Minimum size of the anchor, in pixels
        scale_thresh (float): The `scale` threshold: the maximum size of the anchor
                              should not be greater than scale_thresh x max(h, w) of
                              the ground truth box.
        spatial_thresh (float): The `spatial` threshold: the l2 distance between the
                              center of the anchor and the ground truth box should not
                              be greater than spatial_thresh x u where u is the unit length.
    Returns:
        matches (Tensor[int64]): a vector of length M, where matches[i] is a matched
                ground-truth index in [0, N)
        match_labels (Tensor[int8]): a vector of length M, where match_labels[i] indicates
            whether a prediction is a true or false positive or ignored
    """
    gt_boxes, anchor_boxes = gt_boxes.tensor, anchor_boxes.tensor
    N = gt_boxes.shape[0]
    M = anchor_boxes.shape[0]
    if N == 0 or M == 0:
        return (
            gt_boxes.new_full((N, ), 0, dtype=torch.int64),
            gt_boxes.new_full((N, ), -1, dtype=torch.int8),
        )

    # Containment rule
    lt = torch.min(gt_boxes[:, None, :2], anchor_boxes[:, :2])  # [N,M,2]
    rb = torch.max(gt_boxes[:, None, 2:], anchor_boxes[:, 2:])  # [N,M,2]
    union = cat([lt, rb], dim=2)  # [N,M,4]

    dummy_gt_boxes = torch.zeros_like(gt_boxes)
    anchor = dummy_gt_boxes[:, None, :] + anchor_boxes[:, :]  # [N,M,4]

    contain_matrix = torch.all(union == anchor, dim=2)  # [N,M]

    # Centrality rule, scale
    gt_size_lower = torch.max(gt_boxes[:, 2:] - gt_boxes[:, :2],
                              dim=1)[0]  # [N]
    gt_size_upper = gt_size_lower * scale_thresh  # [N]
    # Fall back for small objects
    gt_size_upper[gt_size_upper < min_anchor_size] = min_anchor_size
    # Due to sampling of locations, the anchor sizes are deducted with sampling strides
    anchor_size = (
        torch.max(anchor_boxes[:, 2:] - anchor_boxes[:, :2], dim=1)[0] -
        unit_lengths)  # [M]

    size_diff_upper = gt_size_upper[:, None] - anchor_size  # [N,M]
    scale_matrix = size_diff_upper >= 0  # [N,M]

    # Centrality rule, spatial
    gt_center = (gt_boxes[:, 2:] + gt_boxes[:, :2]) / 2  # [N,2]
    anchor_center = (anchor_boxes[:, 2:] + anchor_boxes[:, :2]) / 2  # [M,2]
    offset_center = gt_center[:, None, :] - anchor_center[:, :]  # [N,M,2]
    offset_center /= unit_lengths[:, None]  # [N,M,2]
    spatial_square = spatial_thresh * spatial_thresh
    spatial_matrix = torch.sum(offset_center * offset_center,
                               dim=2) <= spatial_square

    assign_matrix = (contain_matrix & scale_matrix & spatial_matrix).int()

    # assign_matrix is N (gt) x M (predicted)
    # Max over gt elements (dim 0) to find best gt candidate for each prediction
    matched_vals, matches = assign_matrix.max(dim=0)
    match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)

    match_labels[matched_vals == 0] = 0
    match_labels[matched_vals == 1] = 1

    # find all the elements that match to ground truths multiple times
    not_unique_idxs = assign_matrix.sum(dim=0) > 1
    if uniqueness_on:
        match_labels[not_unique_idxs] = 0
    else:
        match_labels[not_unique_idxs] = -1

    return matches, match_labels
Example #27
    def forward(self, features, targets=None):
        coarse_sem_seg_logits = self.coarse_sem_seg_head.layers(features)

        if self.training:
            losses = self.coarse_sem_seg_head.losses(coarse_sem_seg_logits,
                                                     targets)

            with torch.no_grad():
                point_coords = get_uncertain_point_coords_with_randomness(
                    coarse_sem_seg_logits,
                    calculate_uncertainty_sem_seg,
                    self.train_num_points,
                    self.oversample_ratio,
                    self.importance_sample_ratio,
                )
            coarse_features = point_sample(coarse_sem_seg_logits,
                                           point_coords,
                                           align_corners=False)

            fine_grained_features = cat([
                point_sample(features[in_feature],
                             point_coords,
                             align_corners=False)
                for in_feature in self.in_features
            ])
            point_logits = self.point_head(fine_grained_features,
                                           coarse_features)
            point_targets = (point_sample(
                targets.unsqueeze(1).to(torch.float),
                point_coords,
                mode="nearest",
                align_corners=False,
            ).squeeze(1).to(torch.long))
            losses["loss_sem_seg_point"] = F.cross_entropy(
                point_logits,
                point_targets,
                reduction="mean",
                ignore_index=self.ignore_value)
            return None, losses
        else:
            sem_seg_logits = coarse_sem_seg_logits.clone()
            for _ in range(self.subdivision_steps):
                sem_seg_logits = F.interpolate(sem_seg_logits,
                                               scale_factor=2,
                                               mode="bilinear",
                                               align_corners=False)
                uncertainty_map = calculate_uncertainty_sem_seg(sem_seg_logits)
                point_indices, point_coords = get_uncertain_point_coords_on_grid(
                    uncertainty_map, self.subdivision_num_points)
                fine_grained_features = cat([
                    point_sample(features[in_feature],
                                 point_coords,
                                 align_corners=False)
                    for in_feature in self.in_features
                ])
                coarse_features = point_sample(coarse_sem_seg_logits,
                                               point_coords,
                                               align_corners=False)
                point_logits = self.point_head(fine_grained_features,
                                               coarse_features)

                # put sem seg point predictions to the right places on the upsampled grid.
                N, C, H, W = sem_seg_logits.shape
                point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
                sem_seg_logits = (sem_seg_logits.reshape(N, C, H * W).scatter_(
                    2, point_indices, point_logits).view(N, C, H, W))
            return sem_seg_logits, {}
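The scatter step that writes the refined point logits back into the upsampled grid, reduced to a self-contained snippet with toy shapes:

import torch

N, C, H, W = 1, 3, 4, 4
sem_seg_logits = torch.zeros(N, C, H, W)
point_indices = torch.tensor([[0, 5, 15]])      # (N, P) flat positions within H*W
point_logits = torch.ones(N, C, 3)              # (N, C, P) refined predictions

point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)            # (N, C, P)
sem_seg_logits = (sem_seg_logits.reshape(N, C, H * W)
                  .scatter_(2, point_indices, point_logits)
                  .view(N, C, H, W))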
Example #28
    def _forward_mask_point(self, features, mask_coarse_logits, instances):
        """
        Forward logic of the mask point head.
        """
        if not self.mask_point_on:
            return {} if self.training else mask_coarse_logits

        mask_features_list = [features[k] for k in self.mask_point_in_features]
        features_scales = [
            self._feature_scales[k] for k in self.mask_point_in_features
        ]

        if self.training:
            proposal_boxes = [x.proposal_boxes for x in instances]
            gt_classes = cat([x.gt_classes for x in instances])
            with torch.no_grad():
                point_coords = get_uncertain_point_coords_with_randomness(
                    mask_coarse_logits,
                    lambda logits: calculate_uncertainty_ins_seg(
                        logits, gt_classes),
                    self.mask_point_train_num_points,
                    self.mask_point_oversample_ratio,
                    self.mask_point_importance_sample_ratio,
                )

            fine_grained_features, point_coords_wrt_image = point_sample_fine_grained_features(
                mask_features_list, features_scales, proposal_boxes,
                point_coords)
            coarse_features = point_sample(mask_coarse_logits,
                                           point_coords,
                                           align_corners=False)
            point_logits = self.mask_point_head(fine_grained_features,
                                                coarse_features)
            return {
                "loss_mask_point":
                roi_mask_point_loss(point_logits, instances,
                                    point_coords_wrt_image)
            }
        else:
            pred_boxes = [x.pred_boxes for x in instances]
            pred_classes = cat([x.pred_classes for x in instances])
            # The subdivision code will fail with an empty list of boxes
            if len(pred_classes) == 0:
                return mask_coarse_logits

            mask_logits = mask_coarse_logits.clone()
            for subdivision_step in range(self.mask_point_subdivision_steps):
                mask_logits = interpolate(mask_logits,
                                          scale_factor=2,
                                          mode="bilinear",
                                          align_corners=False)
                # If `mask_point_subdivision_num_points` is larger than or equal
                # to the resolution of the next step, we can skip this step.
                H, W = mask_logits.shape[-2:]
                if (self.mask_point_subdivision_num_points >= 4 * H * W
                        and subdivision_step <
                        self.mask_point_subdivision_steps - 1):
                    continue
                uncertainty_map = calculate_uncertainty_ins_seg(
                    mask_logits, pred_classes)
                point_indices, point_coords = get_uncertain_point_coords_on_grid(
                    uncertainty_map, self.mask_point_subdivision_num_points)
                fine_grained_features, _ = point_sample_fine_grained_features(
                    mask_features_list, features_scales, pred_boxes,
                    point_coords)
                coarse_features = point_sample(mask_coarse_logits,
                                               point_coords,
                                               align_corners=False)
                point_logits = self.mask_point_head(fine_grained_features,
                                                    coarse_features)

                # put the mask point predictions in the right places on the upsampled grid.
                R, C, H, W = mask_logits.shape
                point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
                mask_logits = (mask_logits.reshape(R, C, H * W).scatter_(
                    2, point_indices, point_logits).view(R, C, H, W))
            return mask_logits
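
Both point heads above read features at sub-pixel locations through `point_sample`. As used in this code base, `point_sample` is, to the best of my understanding, a thin wrapper around `F.grid_sample` that takes (N, P, 2) coordinates in the [0, 1] x [0, 1] box instead of grid_sample's [-1, 1] convention. A hedged sketch of that behavior (not the library implementation itself):

import torch
import torch.nn.functional as F

def point_sample_sketch(input, point_coords, **kwargs):
    # input: (N, C, H, W); point_coords: (N, P, 2) with x, y in [0, 1]
    add_dim = False
    if point_coords.dim() == 3:
        add_dim = True
        point_coords = point_coords.unsqueeze(2)            # (N, P, 1, 2)
    # grid_sample expects coordinates in [-1, 1]
    output = F.grid_sample(input, 2.0 * point_coords - 1.0, **kwargs)
    if add_dim:
        output = output.squeeze(3)                          # (N, C, P)
    return output

feats = torch.randn(1, 8, 16, 16)
pts = torch.rand(1, 5, 2)
print(point_sample_sketch(feats, pts, align_corners=False).shape)  # torch.Size([1, 8, 5])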
Example #29
0
def roi_mask_point_loss(mask_logits, instances, points_coord):
    """
    Compute the point-based loss for instance segmentation mask predictions.

    Args:
        mask_logits (Tensor): A tensor of shape (R, C, P) or (R, 1, P) for class-specific or
            class-agnostic, where R is the total number of predicted masks in all images, C is the
            number of foreground classes, and P is the number of points sampled for each mask.
            The values are logits.
        instances (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. These instances are in 1:1 correspondence with the `mask_logits`, i.e. the
            i-th element of the list contains R_i objects and R_1 + ... + R_N is equal to R.
            The ground-truth labels (class, box, mask, ...) associated with each instance are stored
            in fields.
        points_coord (Tensor): A tensor of shape (R, P, 2), where R is the total number of
            predicted masks and P is the number of points for each mask. The coordinates are in
            the image pixel coordinate space, i.e. [0, H] x [0, W].
    Returns:
        point_loss (Tensor): A scalar tensor containing the loss.
    """
    with torch.no_grad():
        cls_agnostic_mask = mask_logits.size(1) == 1
        total_num_masks = mask_logits.size(0)

        gt_classes = []
        gt_mask_logits = []
        idx = 0
        for instances_per_image in instances:
            if len(instances_per_image) == 0:
                continue
            assert isinstance(
                instances_per_image.gt_masks, BitMasks
            ), "Point head works with GT in 'bitmask' format. Set INPUT.MASK_FORMAT to 'bitmask'."

            if not cls_agnostic_mask:
                gt_classes_per_image = instances_per_image.gt_classes.to(
                    dtype=torch.int64)
                gt_classes.append(gt_classes_per_image)

            gt_bit_masks = instances_per_image.gt_masks.tensor
            h, w = instances_per_image.gt_masks.image_size
            scale = torch.tensor([w, h],
                                 dtype=torch.float,
                                 device=gt_bit_masks.device)
            points_coord_grid_sample_format = (
                points_coord[idx:idx + len(instances_per_image)] / scale)
            idx += len(instances_per_image)
            gt_mask_logits.append(
                point_sample(
                    gt_bit_masks.to(torch.float32).unsqueeze(1),
                    points_coord_grid_sample_format,
                    align_corners=False,
                ).squeeze(1))

    if len(gt_mask_logits) == 0:
        return mask_logits.sum() * 0

    gt_mask_logits = cat(gt_mask_logits)
    assert gt_mask_logits.numel() > 0, gt_mask_logits.shape

    if cls_agnostic_mask:
        mask_logits = mask_logits[:, 0]
    else:
        indices = torch.arange(total_num_masks)
        gt_classes = cat(gt_classes, dim=0)
        mask_logits = mask_logits[indices, gt_classes]

    # Log the training accuracy (using gt classes and 0.0 threshold for the logits)
    mask_accurate = (mask_logits > 0.0) == gt_mask_logits.to(dtype=torch.uint8)
    mask_accuracy = mask_accurate.nonzero(
        as_tuple=False).size(0) / mask_accurate.numel()
    get_event_storage().put_scalar("point_rend/accuracy", mask_accuracy)

    point_loss = F.binary_cross_entropy_with_logits(
        mask_logits, gt_mask_logits.to(dtype=torch.float32), reduction="mean")
    return point_loss
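
The core of the loss above is picking, for every predicted mask, the logit channel of its ground-truth class and comparing it point-wise against the GT mask sampled at the same coordinates. A tiny synthetic check of that gather-plus-BCE step (all shapes and tensors below are made up for illustration):

import torch
import torch.nn.functional as F

R, C, P = 6, 80, 14                                    # masks, classes, points per mask
mask_logits = torch.randn(R, C, P)
gt_classes = torch.randint(0, C, (R,))
gt_point_labels = torch.randint(0, 2, (R, P)).float()  # GT mask values at the sampled points

per_class_logits = mask_logits[torch.arange(R), gt_classes]   # (R, P)
loss = F.binary_cross_entropy_with_logits(per_class_logits,
                                           gt_point_labels,
                                           reduction="mean")
print(loss.item())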
Example #30
0
def find_top_rpn_proposals(
        proposals,
        pred_objectness_logits,
        images,
        nms_thresh,
        pre_nms_topk,
        post_nms_topk,
        min_box_side_len,
        training,  # pylint: disable=W0613
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    remove small boxes, and apply per-level NMS. Then, for each image, keep the
    `post_nms_topk` highest scoring proposals over all feature maps. In this
    implementation the selection is done per image regardless of `training`
    (see the Detectron1 comment in the per-image loop below).

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
            All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
        images (ImageList): Input images as an :class:`ImageList`.
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is per
            feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is total,
            over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels (absolute units
            wrt input images).
        training (bool): True if proposals are to be used in training, otherwise False.
            This arg is unused here and is kept only for API compatibility; see the
            "In Detectron1, ..." comment inside the per-image loop below.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i.
    """
    image_sizes = images.image_sizes  # in (h, w) order
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select the top-k anchors for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, proposals_i, logits_i in zip(itertools.count(), proposals,
                                               pred_objectness_logits):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None],
                                       topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i, ),
                       level_id,
                       dtype=torch.int64,
                       device=device))

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = Boxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        # boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_side_len)
        lvl = level_ids
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[
                keep], level_ids[keep]

        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up depending on the batch size.
        # This bug is addressed in cvpods to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
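
Step 3 above relies on `batched_nms` to suppress boxes per level with a single NMS call. As far as I know (and as torchvision's `batched_nms` does it), the standard trick is to offset each box by its level id times a value larger than any coordinate, so boxes from different levels can never overlap. A hedged sketch of that idea, not the library implementation used in the example:

import torch
from torchvision.ops import nms

def batched_nms_sketch(boxes, scores, level_ids, iou_threshold):
    # boxes: (M, 4) as (x1, y1, x2, y2); level_ids: (M,) integer group ids
    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
    # shift each level into its own disjoint coordinate range
    offsets = level_ids.to(boxes) * (boxes.max() + 1)
    keep = nms(boxes + offsets[:, None], scores, iou_threshold)
    return keep  # indices sorted by decreasing score

boxes = torch.tensor([[0., 0., 10., 10.], [1., 1., 11., 11.], [0., 0., 10., 10.]])
scores = torch.tensor([0.9, 0.8, 0.7])
levels = torch.tensor([0, 0, 1])
print(batched_nms_sketch(boxes, scores, levels, iou_threshold=0.5))  # tensor([0, 2])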