def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer): """ Arguments: pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number of instances in the batch, K is the number of keypoints, and S is the side length of the keypoint heatmap. The values are spatial logits. instances (list[Instances]): A list of M Instances, where M is the batch size. These instances are predictions from the model that are in 1:1 correspondence with pred_keypoint_logits. Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoint` instance. normalizer (float): Normalize the loss by this amount. If not specified, we normalize by the number of visible keypoints in the minibatch. Returns a scalar tensor containing the loss. """ heatmaps = [] valid = [] keypoint_side_len = pred_keypoint_logits.shape[2] for instances_per_image in instances: if len(instances_per_image) == 0: continue keypoints = instances_per_image.gt_keypoints heatmaps_per_image, valid_per_image = keypoints.to_heatmap( instances_per_image.proposal_boxes.tensor, keypoint_side_len ) heatmaps.append(heatmaps_per_image.view(-1)) valid.append(valid_per_image.view(-1)) if len(heatmaps): keypoint_targets = cat(heatmaps, dim=0) valid = cat(valid, dim=0).to(dtype=torch.uint8) valid = torch.nonzero(valid).squeeze(1) # torch.mean (in binary_cross_entropy_with_logits) doesn't # accept empty tensors, so handle it separately if len(heatmaps) == 0 or valid.numel() == 0: global _TOTAL_SKIPPED _TOTAL_SKIPPED += 1 storage = get_event_storage() storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False) return pred_keypoint_logits.sum() * 0 N, K, H, W = pred_keypoint_logits.shape pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W) keypoint_loss = F.cross_entropy( pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum" ) # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch if normalizer is None: normalizer = valid.numel() keypoint_loss /= normalizer return keypoint_loss
def convert_boxes_to_pooler_format(box_lists: List[Boxes]): """ Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops (see description under Returns). Args: box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch. Returns: When input is list[Boxes]: A tensor of shape (M, 5), where M is the total number of boxes aggregated over all N batch images. The 5 columns are (batch index, x0, y0, x1, y1), where batch index is the index in [0, N) identifying which batch image the box with corners at (x0, y0, x1, y1) comes from. When input is list[RotatedBoxes]: A tensor of shape (M, 6), where M is the total number of boxes aggregated over all N batch images. The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees), where batch index is the index in [0, N) identifying which batch image the rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from. """ pooler_fmt_boxes = cat([ _fmt_box_list(box_list.tensor, i) for i, box_list in enumerate(box_lists) ], dim=0) return pooler_fmt_boxes
def keypoint_rcnn_inference(pred_keypoint_logits: torch.Tensor, pred_instances: List[Instances]): """ Post process each predicted keypoint heatmap in `pred_keypoint_logits` into (x, y, score) and add it to the `pred_instances` as a `pred_keypoints` field. Args: pred_keypoint_logits (Tensor): A tensor of shape (R, K, S, S) where R is the total number of instances in the batch, K is the number of keypoints, and S is the side length of the keypoint heatmap. The values are spatial logits. pred_instances (list[Instances]): A list of N Instances, where N is the number of images. Returns: None. Each element in pred_instances will contain extra "pred_keypoints" and "pred_keypoint_heatmaps" fields. "pred_keypoints" is a tensor of shape (#instance, K, 3) where the last dimension corresponds to (x, y, score). The scores are larger than 0. "pred_keypoint_heatmaps" contains the raw keypoint logits as passed to this function. """ # flatten all bboxes from all images together (list[Boxes] -> Rx4 tensor) bboxes_flat = cat([b.pred_boxes.tensor for b in pred_instances], dim=0) pred_keypoint_logits = pred_keypoint_logits.detach() keypoint_results = heatmaps_to_keypoints(pred_keypoint_logits, bboxes_flat.detach()) num_instances_per_image = [len(i) for i in pred_instances] keypoint_results = keypoint_results[:, :, [0, 1, 3]].split(num_instances_per_image, dim=0) heatmap_results = pred_keypoint_logits.split(num_instances_per_image, dim=0) for keypoint_results_per_image, heatmap_results_per_image, instances_per_image in zip( keypoint_results, heatmap_results, pred_instances ): # keypoint_results_per_image is (num instances)x(num keypoints)x(x, y, score) # heatmap_results_per_image is (num instances)x(num keypoints)x(side)x(side) instances_per_image.pred_keypoints = keypoint_results_per_image instances_per_image.pred_keypoint_heatmaps = heatmap_results_per_image
def predict_boxes(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]): """ Args: predictions: return values of :meth:`forward()`. proposals (list[Instances]): proposals that match the features that were used to compute predictions. The ``proposal_boxes`` field is expected. Returns: list[Tensor]: A list of Tensors of predicted class-specific or class-agnostic boxes for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is the number of proposals for image i and B is the box dimension (4 or 5) """ if not len(proposals): return [] _, proposal_deltas = predictions num_prop_per_image = [len(p) for p in proposals] proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) predict_boxes = self.box2box_transform.apply_deltas( proposal_deltas, proposal_boxes, ) # Nx(KxB) return predict_boxes.split(num_prop_per_image)
def predict_boxes_for_gt_classes(self, predictions, proposals): """ Args: predictions: return values of :meth:`forward()`. proposals (list[Instances]): proposals that match the features that were used to compute predictions. The fields ``proposal_boxes``, ``gt_classes`` are expected. Returns: list[Tensor]: A list of Tensors of predicted boxes for GT classes in case of class-specific box head. Element i of the list has shape (Ri, B), where Ri is the number of proposals for image i and B is the box dimension (4 or 5) """ if not len(proposals): return [] scores, proposal_deltas = predictions proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) N, B = proposal_boxes.shape predict_boxes = self.box2box_transform.apply_deltas( proposal_deltas, proposal_boxes) # Nx(KxB) K = predict_boxes.shape[1] // B if K > 1: gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0) # Some proposals are ignored or have a background class. Their gt_classes # cannot be used as index. gt_classes = gt_classes.clamp_(0, K - 1) predict_boxes = predict_boxes.view(N, K, B)[ torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes] num_prop_per_image = [len(p) for p in proposals] return predict_boxes.split(num_prop_per_image)
def forward(self, fine_grained_features, coarse_features): x = torch.cat((fine_grained_features, coarse_features), dim=1) for layer in self.fc_layers: x = F.relu(layer(x)) if self.coarse_pred_each_layer: x = cat((x, coarse_features), dim=1) return self.predictor(x)
def point_sample_fine_grained_features(features_list, feature_scales, boxes, point_coords): """ Get features from feature maps in `features_list` that correspond to specific point coordinates inside each bounding box from `boxes`. Args: features_list (list[Tensor]): A list of feature map tensors to get features from. feature_scales (list[float]): A list of scales for tensors in `features_list`. boxes (list[Boxes]): A list of I Boxes objects that contain R_1 + ... + R_I = R boxes all together. point_coords (Tensor): A tensor of shape (R, P, 2) that contains [0, 1] x [0, 1] box-normalized coordinates of the P sampled points. Returns: point_features (Tensor): A tensor of shape (R, C, P) that contains features sampled from all features maps in feature_list for P sampled points for all R boxes in `boxes`. point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains image-level coordinates of P points. """ cat_boxes = Boxes.cat(boxes) num_boxes = [b.tensor.size(0) for b in boxes] point_coords_wrt_image = get_point_coords_wrt_image( cat_boxes.tensor, point_coords) split_point_coords_wrt_image = torch.split(point_coords_wrt_image, num_boxes) point_features = [] for idx_img, point_coords_wrt_image_per_image in enumerate( split_point_coords_wrt_image): point_features_per_image = [] for idx_feature, feature_map in enumerate(features_list): h, w = feature_map.shape[-2:] scale = _as_tensor([w, h]) / feature_scales[idx_feature] point_coords_scaled = point_coords_wrt_image_per_image / scale.to( feature_map.device) point_features_per_image.append( point_sample( feature_map[idx_img].unsqueeze(0), point_coords_scaled.unsqueeze(0), align_corners=False, ).squeeze(0).transpose(1, 0)) point_features.append(cat(point_features_per_image, dim=1)) return cat(point_features, dim=0), point_coords_wrt_image
def _dense_box_regression_loss( anchors: List[Boxes], box2box_transform: Box2BoxTransform, pred_anchor_deltas: List[torch.Tensor], gt_boxes: List[torch.Tensor], fg_mask: torch.Tensor, box_reg_loss_type="smooth_l1", smooth_l1_beta=0.0, ): """ Compute loss for dense multi-level box regression. Loss is accumulated over ``fg_mask``. Args: anchors: #lvl anchor boxes, each is (HixWixA, 4) pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4) gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A)) fg_mask: the foreground boolean mask of shape (N, R) to compute loss on box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou". smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1" """ anchors = type(anchors[0]).cat(anchors).tensor # (R, 4) if box_reg_loss_type == "smooth_l1": gt_anchor_deltas = [ box2box_transform.get_deltas(anchors, k) for k in gt_boxes ] gt_anchor_deltas = torch.stack(gt_anchor_deltas) # (N, R, 4) loss_box_reg = smooth_l1_loss( cat(pred_anchor_deltas, dim=1)[fg_mask], gt_anchor_deltas[fg_mask], beta=smooth_l1_beta, reduction="sum", ) elif box_reg_loss_type == "giou": pred_boxes = [ box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1) ] loss_box_reg = giou_loss(torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum") else: raise ValueError( f"Invalid dense box regression loss type '{box_reg_loss_type}'") return loss_box_reg
def _subdivision_inference(self, features, mask_representations, instances): assert not self.training pred_boxes = [x.pred_boxes for x in instances] pred_classes = cat([x.pred_classes for x in instances]) mask_logits = None # +1 here to include an initial step to generate the coarsest mask # prediction with init_resolution, when mask_logits is None. # We compute initial mask by sampling on a regular grid. coarse_mask # can be used as initial mask as well, but it's typically very low-res # so it will be completely overwritten during subdivision anyway. for _ in range(self.mask_point_subdivision_steps + 1): if mask_logits is None: point_coords = generate_regular_grid_point_coords( pred_classes.size(0), self.mask_point_subdivision_init_resolution, pred_boxes[0].device, ) else: mask_logits = interpolate(mask_logits, scale_factor=2, mode="bilinear", align_corners=False) uncertainty_map = calculate_uncertainty( mask_logits, pred_classes) point_indices, point_coords = get_uncertain_point_coords_on_grid( uncertainty_map, self.mask_point_subdivision_num_points) # Run the point head for every point in point_coords fine_grained_features = self._point_pooler(features, pred_boxes, point_coords) point_logits = self._get_point_logits(fine_grained_features, point_coords, mask_representations) if mask_logits is None: # Create initial mask_logits using point_logits on this regular grid R, C, _ = point_logits.shape mask_logits = point_logits.reshape( R, C, self.mask_point_subdivision_init_resolution, self.mask_point_subdivision_init_resolution, ) # The subdivision code will fail with the empty list of boxes if len(pred_classes) == 0: mask_rcnn_inference(mask_logits, instances) return instances else: # Put point predictions to the right places on the upsampled grid. R, C, H, W = mask_logits.shape point_indices = point_indices.unsqueeze(1).expand(-1, C, -1) mask_logits = (mask_logits.reshape(R, C, H * W).scatter_( 2, point_indices, point_logits).view(R, C, H, W)) mask_rcnn_inference(mask_logits, instances) return instances
def roi_mask_point_loss(mask_logits, instances, point_labels): """ Compute the point-based loss for instance segmentation mask predictions given point-wise mask prediction and its corresponding point-wise labels. Args: mask_logits (Tensor): A tensor of shape (R, C, P) or (R, 1, P) for class-specific or class-agnostic, where R is the total number of predicted masks in all images, C is the number of foreground classes, and P is the number of points sampled for each mask. The values are logits. instances (list[Instances]): A list of N Instances, where N is the number of images in the batch. These instances are in 1:1 correspondence with the `mask_logits`. So, i_th elememt of the list contains R_i objects and R_1 + ... + R_N is equal to R. The ground-truth labels (class, box, mask, ...) associated with each instance are stored in fields. point_labels (Tensor): A tensor of shape (R, P), where R is the total number of predicted masks and P is the number of points for each mask. Labels with value of -1 will be ignored. Returns: point_loss (Tensor): A scalar tensor containing the loss. """ with torch.no_grad(): cls_agnostic_mask = mask_logits.size(1) == 1 total_num_masks = mask_logits.size(0) gt_classes = [] for instances_per_image in instances: if len(instances_per_image) == 0: continue if not cls_agnostic_mask: gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64) gt_classes.append(gt_classes_per_image) gt_mask_logits = point_labels point_ignores = point_labels == -1 if gt_mask_logits.shape[0] == 0: return mask_logits.sum() * 0 assert gt_mask_logits.numel() > 0, gt_mask_logits.shape if cls_agnostic_mask: mask_logits = mask_logits[:, 0] else: indices = torch.arange(total_num_masks) gt_classes = cat(gt_classes, dim=0) mask_logits = mask_logits[indices, gt_classes] # Log the training accuracy (using gt classes and 0.0 threshold for the logits) mask_accurate = (mask_logits > 0.0) == gt_mask_logits.to(dtype=torch.uint8) mask_accurate = mask_accurate[~point_ignores] mask_accuracy = mask_accurate.nonzero().size(0) / max(mask_accurate.numel(), 1.0) get_event_storage().put_scalar("point/accuracy", mask_accuracy) point_loss = F.binary_cross_entropy_with_logits( mask_logits, gt_mask_logits.to(dtype=torch.float32), weight=~point_ignores, reduction="mean" ) return point_loss
def losses(self, predictions, proposals): """ Args: predictions: return values of :meth:`forward()`. proposals (list[Instances]): proposals that match the features that were used to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``, ``gt_classes`` are expected. Returns: Dict[str, Tensor]: dict of losses """ scores, proposal_deltas = predictions # parse classification outputs gt_classes = (cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)) _log_classification_stats(scores, gt_classes) # parse box regression outputs if len(proposals): proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) # Nx4 assert not proposal_boxes.requires_grad, "Proposals should not require gradients!" # If "gt_boxes" does not exist, the proposals must be all negative and # should not be included in regression loss computation. # Here we just use proposal_boxes as an arbitrary placeholder because its # value won't be used in self.box_reg_loss(). gt_boxes = cat( [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals], dim=0, ) else: proposal_boxes = gt_boxes = torch.empty( (0, 4), device=proposal_deltas.device) losses = { "loss_cls": cross_entropy(scores, gt_classes, reduction="mean"), "loss_box_reg": self.box_reg_loss(proposal_boxes, gt_boxes, proposal_deltas, gt_classes), } return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
def sample_point_labels(instances, point_coords): """ Sample point labels from ground truth mask given point_coords. Args: instances (list[Instances]): A list of N Instances, where N is the number of images in the batch. So, i_th elememt of the list contains R_i objects and R_1 + ... + R_N is equal to R. The ground-truth gt_masks in each instance will be used to compute labels. points_coords (Tensor): A tensor of shape (R, P, 2), where R is the total number of instances and P is the number of points for each instance. The coordinates are in the absolute image pixel coordinate space, i.e. [0, H] x [0, W]. Returns: Tensor: A tensor of shape (R, P) that contains the labels of P sampled points. """ with torch.no_grad(): gt_mask_logits = [] point_coords_splits = torch.split( point_coords, [len(instances_per_image) for instances_per_image in instances]) for i, instances_per_image in enumerate(instances): if len(instances_per_image) == 0: continue assert isinstance( instances_per_image.gt_masks, BitMasks ), "Point head works with GT in 'bitmask' format. Set INPUT.MASK_FORMAT to 'bitmask'." gt_bit_masks = instances_per_image.gt_masks.tensor h, w = instances_per_image.gt_masks.image_size scale = torch.tensor([w, h], dtype=torch.float, device=gt_bit_masks.device) points_coord_grid_sample_format = point_coords_splits[i] / scale gt_mask_logits.append( point_sample( gt_bit_masks.to(torch.float32).unsqueeze(1), points_coord_grid_sample_format, align_corners=False, ).squeeze(1)) point_labels = cat(gt_mask_logits) return point_labels
def _sample_train_points(self, coarse_mask, instances): assert self.training gt_classes = cat([x.gt_classes for x in instances]) with torch.no_grad(): # sample point_coords point_coords = get_uncertain_point_coords_with_randomness( coarse_mask, lambda logits: calculate_uncertainty(logits, gt_classes), self.mask_point_train_num_points, self.mask_point_oversample_ratio, self.mask_point_importance_sample_ratio, ) # sample point_labels proposal_boxes = [x.proposal_boxes for x in instances] cat_boxes = Boxes.cat(proposal_boxes) point_coords_wrt_image = get_point_coords_wrt_image( cat_boxes.tensor, point_coords) point_labels = sample_point_labels(instances, point_coords_wrt_image) return point_coords, point_labels
def mask_rcnn_inference(pred_mask_logits: torch.Tensor, pred_instances: List[Instances]): """ Convert pred_mask_logits to estimated foreground probability masks while also extracting only the masks for the predicted classes in pred_instances. For each predicted box, the mask of the same class is attached to the instance by adding a new "pred_masks" field to pred_instances. Args: pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask) for class-specific or class-agnostic, where B is the total number of predicted masks in all images, C is the number of foreground classes, and Hmask, Wmask are the height and width of the mask predictions. The values are logits. pred_instances (list[Instances]): A list of N Instances, where N is the number of images in the batch. Each Instances must have field "pred_classes". Returns: None. pred_instances will contain an extra "pred_masks" field storing a mask of size (Hmask, Wmask) for predicted class. Note that the masks are returned as a soft (non-quantized) masks the resolution predicted by the network; post-processing steps, such as resizing the predicted masks to the original image resolution and/or binarizing them, is left to the caller. """ cls_agnostic_mask = pred_mask_logits.size(1) == 1 if cls_agnostic_mask: mask_probs_pred = pred_mask_logits.sigmoid() else: # Select masks corresponding to the predicted classes num_masks = pred_mask_logits.shape[0] class_pred = cat([i.pred_classes for i in pred_instances]) indices = torch.arange(num_masks, device=class_pred.device) mask_probs_pred = pred_mask_logits[indices, class_pred][:, None].sigmoid() # mask_probs_pred.shape: (B, 1, Hmask, Wmask) num_boxes_per_image = [len(i) for i in pred_instances] mask_probs_pred = mask_probs_pred.split(num_boxes_per_image, dim=0) for prob, instances in zip(mask_probs_pred, pred_instances): instances.pred_masks = prob # (1, Hmask, Wmask)
def assign_boxes_to_levels( box_lists: List[Boxes], min_level: int, max_level: int, canonical_box_size: int, canonical_level: int, ): """ Map each box in `box_lists` to a feature map level index and return the assignment vector. Args: box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch. min_level (int): Smallest feature map level index. The input is considered index 0, the output of stage 1 is index 1, and so. max_level (int): Largest feature map level index. canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). canonical_level (int): The feature map level index on which a canonically-sized box should be placed. Returns: A tensor of length M, where M is the total number of boxes aggregated over all N batch images. The memory layout corresponds to the concatenation of boxes from all images. Each element is the feature map index, as an offset from `self.min_level`, for the corresponding box (so value i means the box is at `self.min_level + i`). """ box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists])) # Eqn.(1) in FPN paper level_assignments = torch.floor(canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8)) # clamp level to (min, max), in case the box size is too large or too small # for the available feature maps level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level) return level_assignments.to(torch.int64) - min_level
def __init__( self, box2box_transform, pred_class_logits, pred_proposal_deltas, proposals, smooth_l1_beta=0.0, box_reg_loss_type="smooth_l1", ): """ Args: box2box_transform (Box2BoxTransform/Box2BoxTransformRotated): box2box transform instance for proposal-to-detection transformations. pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class logits for all R predicted object instances. Each row corresponds to a predicted object instance. pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for class-specific or class-agnostic regression. It stores the predicted deltas that transform proposals into final box detections. B is the box dimension (4 or 5). When B is 4, each row is [dx, dy, dw, dh (, ....)]. When B is 5, each row is [dx, dy, dw, dh, da (, ....)]. proposals (list[Instances]): A list of N Instances, where Instances i stores the proposals for image i, in the field "proposal_boxes". When training, each Instances must have ground-truth labels stored in the field "gt_classes" and "gt_boxes". The total number of all instances must be equal to R. smooth_l1_beta (float): The transition point between L1 and L2 loss in the smooth L1 loss function. When set to 0, the loss becomes L1. When set to +inf, the loss becomes constant 0. box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou" """ self.box2box_transform = box2box_transform self.num_preds_per_image = [len(p) for p in proposals] self.pred_class_logits = pred_class_logits self.pred_proposal_deltas = pred_proposal_deltas self.smooth_l1_beta = smooth_l1_beta self.box_reg_loss_type = box_reg_loss_type self.image_shapes = [x.image_size for x in proposals] if len(proposals): box_type = type(proposals[0].proposal_boxes) # cat(..., dim=0) concatenates over all images in the batch self.proposals = box_type.cat( [p.proposal_boxes for p in proposals]) assert (not self.proposals.tensor.requires_grad ), "Proposals should not require gradients!" # "gt_classes" exists if and only if training. But other gt fields may # not necessarily exist in training for images that have no groundtruth. if proposals[0].has("gt_classes"): self.gt_classes = cat([p.gt_classes for p in proposals], dim=0) # If "gt_boxes" does not exist, the proposals must be all negative and # should not be included in regression loss computation. # Here we just use proposal_boxes as an arbitrary placeholder because its # value won't be used in self.box_reg_loss(). gt_boxes = [ p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes for p in proposals ] self.gt_boxes = box_type.cat(gt_boxes) else: self.proposals = Boxes( torch.zeros(0, 4, device=self.pred_proposal_deltas.device)) self._no_instances = len(self.proposals) == 0 # no instances found
def losses( self, anchors: List[Boxes], pred_objectness_logits: List[torch.Tensor], gt_labels: List[torch.Tensor], pred_anchor_deltas: List[torch.Tensor], gt_boxes: List[torch.Tensor], ) -> Dict[str, torch.Tensor]: """ Return the losses from a set of RPN predictions and their associated ground-truth. Args: anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each has shape (Hi*Wi*A, B), where B is box dimension (4 or 5). pred_objectness_logits (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, Hi*Wi*A) representing the predicted objectness logits for all anchors. gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`. pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors to proposals. gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`. Returns: dict[loss name -> loss value]: A dict mapping from loss name to loss value. Loss names are: `loss_rpn_cls` for objectness classification and `loss_rpn_loc` for proposal localization. """ num_images = len(gt_labels) gt_labels = torch.stack(gt_labels) # (N, sum(Hi*Wi*Ai)) # Log the number of positive/negative anchors per-image that's used in training pos_mask = gt_labels == 1 num_pos_anchors = pos_mask.sum().item() num_neg_anchors = (gt_labels == 0).sum().item() storage = get_event_storage() storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images) storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images) localization_loss = _dense_box_regression_loss( anchors, self.box2box_transform, pred_anchor_deltas, gt_boxes, pos_mask, box_reg_loss_type=self.box_reg_loss_type, smooth_l1_beta=self.smooth_l1_beta, ) valid_mask = gt_labels >= 0 objectness_loss = F.binary_cross_entropy_with_logits( cat(pred_objectness_logits, dim=1)[valid_mask], gt_labels[valid_mask].to(torch.float32), reduction="sum", ) normalizer = self.batch_size_per_image * num_images losses = { "loss_rpn_cls": objectness_loss / normalizer, # The original Faster R-CNN paper uses a slightly different normalizer # for loc loss. But it doesn't matter in practice "loss_rpn_loc": localization_loss / normalizer, } losses = { k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items() } return losses
def _fmt_box_list(box_tensor, batch_index: int): repeated_index = torch.full_like(box_tensor[:, :1], batch_index, dtype=box_tensor.dtype, device=box_tensor.device) return cat((repeated_index, box_tensor), dim=1)
def mask_rcnn_loss(pred_mask_logits: torch.Tensor, instances: List[Instances], vis_period: int = 0): """ Compute the mask prediction loss defined in the Mask R-CNN paper. Args: pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask) for class-specific or class-agnostic, where B is the total number of predicted masks in all images, C is the number of foreground classes, and Hmask, Wmask are the height and width of the mask predictions. The values are logits. instances (list[Instances]): A list of N Instances, where N is the number of images in the batch. These instances are in 1:1 correspondence with the pred_mask_logits. The ground-truth labels (class, box, mask, ...) associated with each instance are stored in fields. vis_period (int): the period (in steps) to dump visualization. Returns: mask_loss (Tensor): A scalar tensor containing the loss. """ cls_agnostic_mask = pred_mask_logits.size(1) == 1 total_num_masks = pred_mask_logits.size(0) mask_side_len = pred_mask_logits.size(2) assert pred_mask_logits.size(2) == pred_mask_logits.size( 3), "Mask prediction must be square!" gt_classes = [] gt_masks = [] for instances_per_image in instances: if len(instances_per_image) == 0: continue if not cls_agnostic_mask: gt_classes_per_image = instances_per_image.gt_classes.to( dtype=torch.int64) gt_classes.append(gt_classes_per_image) gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize( instances_per_image.proposal_boxes.tensor, mask_side_len).to(device=pred_mask_logits.device) # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len gt_masks.append(gt_masks_per_image) if len(gt_masks) == 0: return pred_mask_logits.sum() * 0 gt_masks = cat(gt_masks, dim=0) if cls_agnostic_mask: pred_mask_logits = pred_mask_logits[:, 0] else: indices = torch.arange(total_num_masks) gt_classes = cat(gt_classes, dim=0) pred_mask_logits = pred_mask_logits[indices, gt_classes] if gt_masks.dtype == torch.bool: gt_masks_bool = gt_masks else: # Here we allow gt_masks to be float as well (depend on the implementation of rasterize()) gt_masks_bool = gt_masks > 0.5 gt_masks = gt_masks.to(dtype=torch.float32) # Log the training accuracy (using gt classes and 0.5 threshold) mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool mask_accuracy = 1 - (mask_incorrect.sum().item() / max(mask_incorrect.numel(), 1.0)) num_positive = gt_masks_bool.sum().item() false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max( gt_masks_bool.numel() - num_positive, 1.0) false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max( num_positive, 1.0) storage = get_event_storage() storage.put_scalar("mask_rcnn/accuracy", mask_accuracy) storage.put_scalar("mask_rcnn/false_positive", false_positive) storage.put_scalar("mask_rcnn/false_negative", false_negative) if vis_period > 0 and storage.iter % vis_period == 0: pred_masks = pred_mask_logits.sigmoid() vis_masks = torch.cat([pred_masks, gt_masks], axis=2) name = "Left: mask prediction; Right: mask GT" for idx, vis_mask in enumerate(vis_masks): vis_mask = torch.stack([vis_mask] * 3, axis=0) storage.put_image(name + f" ({idx})", vis_mask) mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits, gt_masks, reduction="mean") return mask_loss
def find_top_rpn_proposals( proposals: List[torch.Tensor], pred_objectness_logits: List[torch.Tensor], image_sizes: List[Tuple[int, int]], nms_thresh: float, pre_nms_topk: int, post_nms_topk: int, min_box_size: float, training: bool, ): """ For each feature map, select the `pre_nms_topk` highest scoring proposals, apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk` highest scoring proposals among all the feature maps for each image. Args: proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4). All proposal predictions on the feature maps. pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A). image_sizes (list[tuple]): sizes (h, w) for each image nms_thresh (float): IoU threshold to use for NMS pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is per feature map. post_nms_topk (int): number of top k scoring proposals to keep after applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is total, over all feature maps. min_box_size (float): minimum proposal box side length in pixels (absolute units wrt input images). training (bool): True if proposals are to be used in training, otherwise False. This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..." comment. Returns: list[Instances]: list of N Instances. The i-th Instances stores post_nms_topk object proposals for image i, sorted by their objectness score in descending order. """ num_images = len(image_sizes) device = proposals[0].device # 1. Select top-k anchor for every level and every image topk_scores = [] # #lvl Tensor, each of shape N x topk topk_proposals = [] level_ids = [] # #lvl Tensor, each of shape (topk,) batch_idx = torch.arange(num_images, device=device) for level_id, (proposals_i, logits_i) in enumerate( zip(proposals, pred_objectness_logits)): Hi_Wi_A = logits_i.shape[1] if isinstance(Hi_Wi_A, torch.Tensor): # it's a tensor in tracing num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk) else: num_proposals_i = min(Hi_Wi_A, pre_nms_topk) # sort is faster than topk: https://github.com/pytorch/pytorch/issues/22812 # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1) logits_i, idx = logits_i.sort(descending=True, dim=1) topk_scores_i = logits_i.narrow(1, 0, num_proposals_i) topk_idx = idx.narrow(1, 0, num_proposals_i) # each is N x topk topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 4 topk_proposals.append(topk_proposals_i) topk_scores.append(topk_scores_i) level_ids.append( torch.full((num_proposals_i, ), level_id, dtype=torch.int64, device=device)) # 2. Concat all levels together topk_scores = cat(topk_scores, dim=1) topk_proposals = cat(topk_proposals, dim=1) level_ids = cat(level_ids, dim=0) # 3. For each image, run a per-level NMS, and choose topk results. results: List[Instances] = [] for n, image_size in enumerate(image_sizes): boxes = Boxes(topk_proposals[n]) scores_per_img = topk_scores[n] lvl = level_ids valid_mask = torch.isfinite( boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img) if not valid_mask.all(): if training: raise FloatingPointError( "Predicted boxes or scores contain Inf/NaN. Training has diverged." ) boxes = boxes[valid_mask] scores_per_img = scores_per_img[valid_mask] lvl = lvl[valid_mask] boxes.clip(image_size) # filter empty boxes keep = boxes.nonempty(threshold=min_box_size) if _is_tracing() or keep.sum().item() != len(boxes): boxes, scores_per_img, lvl = boxes[keep], scores_per_img[ keep], lvl[keep] keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh) # In Detectron1, there was different behavior during training vs. testing. # (https://github.com/facebookresearch/Detectron/issues/459) # During training, topk is over the proposals from *all* images in the training batch. # During testing, it is over the proposals for each image separately. # As a result, the training behavior becomes batch-dependent, # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size. # This bug is addressed in Detectron2 to make the behavior independent of batch size. keep = keep[:post_nms_topk] # keep is already sorted res = Instances(image_size) res.proposal_boxes = boxes[keep] res.objectness_logits = scores_per_img[keep] results.append(res) return results
def get_uncertain_point_coords_with_randomness(coarse_logits, uncertainty_func, num_points, oversample_ratio, importance_sample_ratio): """ Sample points in [0, 1] x [0, 1] coordinate space based on their uncertainty. The unceratinties are calculated for each point using 'uncertainty_func' function that takes point's logit prediction as input. See PointRend paper for details. Args: coarse_logits (Tensor): A tensor of shape (N, C, Hmask, Wmask) or (N, 1, Hmask, Wmask) for class-specific or class-agnostic prediction. uncertainty_func: A function that takes a Tensor of shape (N, C, P) or (N, 1, P) that contains logit predictions for P points and returns their uncertainties as a Tensor of shape (N, 1, P). num_points (int): The number of points P to sample. oversample_ratio (int): Oversampling parameter. importance_sample_ratio (float): Ratio of points that are sampled via importnace sampling. Returns: point_coords (Tensor): A tensor of shape (N, P, 2) that contains the coordinates of P sampled points. """ assert oversample_ratio >= 1 assert importance_sample_ratio <= 1 and importance_sample_ratio >= 0 num_boxes = coarse_logits.shape[0] num_sampled = int(num_points * oversample_ratio) point_coords = torch.rand(num_boxes, num_sampled, 2, device=coarse_logits.device) point_logits = point_sample(coarse_logits, point_coords, align_corners=False) # It is crucial to calculate uncertainty based on the sampled prediction value for the points. # Calculating uncertainties of the coarse predictions first and sampling them for points leads # to incorrect results. # To illustrate this: assume uncertainty_func(logits)=-abs(logits), a sampled point between # two coarse predictions with -1 and 1 logits has 0 logits, and therefore 0 uncertainty value. # However, if we calculate uncertainties for the coarse predictions first, # both will have -1 uncertainty, and the sampled point will get -1 uncertainty. point_uncertainties = uncertainty_func(point_logits) num_uncertain_points = int(importance_sample_ratio * num_points) num_random_points = num_points - num_uncertain_points idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] shift = num_sampled * torch.arange( num_boxes, dtype=torch.long, device=coarse_logits.device) idx += shift[:, None] point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( num_boxes, num_uncertain_points, 2) if num_random_points > 0: point_coords = cat( [ point_coords, torch.rand(num_boxes, num_random_points, 2, device=coarse_logits.device), ], dim=1, ) return point_coords