def assign_boxes_to_levels(box_lists, min_level, max_level, canonical_box_size, canonical_level):
    """
    Map each box in `box_lists` to a feature map level index and return the
    assignment vector.

    Args:
        box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes,
            where N is the number of images in the batch.
        min_level (int): Smallest feature map level index. The input is considered index 0,
            the output of stage 1 is index 1, and so forth.
        max_level (int): Largest feature map level index.
        canonical_box_size (int): A canonical box size in pixels (sqrt(box area)).
        canonical_level (int): The feature map level index on which a canonically-sized box
            should be placed.

    Returns:
        A tensor of length M, where M is the total number of boxes aggregated over all
        N batch images. The memory layout corresponds to the concatenation of boxes from
        all images. Each element is the feature map index, as an offset from `min_level`,
        for the corresponding box (so value i means the box is at `min_level + i`).
    """
    eps = sys.float_info.epsilon
    box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists]))
    # Eqn.(1) in FPN paper
    level_assignments = torch.floor(
        canonical_level + torch.log2(box_sizes / canonical_box_size + eps)
    )
    # clamp level to (min, max), in case the box size is too large or too small
    # for the available feature maps
    level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level)
    return level_assignments.to(torch.int64) - min_level
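# --- Worked example (illustrative; not part of the library code) ---
# Eqn. (1) of the FPN paper assigns a box to level floor(k0 + log2(sqrt(area) / s0)),
# clamped to [min_level, max_level]. The standalone snippet below reproduces that arithmetic
# with plain tensors; the box sizes and level bounds are made up for illustration.
import torch

example_box_sizes = torch.tensor([32.0, 112.0, 224.0, 448.0, 1000.0])  # sqrt(area), in pixels
example_canonical_box_size, example_canonical_level = 224, 4
example_min_level, example_max_level = 2, 5
levels = torch.floor(
    example_canonical_level + torch.log2(example_box_sizes / example_canonical_box_size)
)
levels = torch.clamp(levels, min=example_min_level, max=example_max_level).to(torch.int64)
# levels - example_min_level == tensor([0, 1, 2, 3, 3]): offsets into the pooled feature maps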
def cat(instance_lists: List["Instances"]) -> "Instances":
    """
    Args:
        instance_lists (list[Instances])

    Returns:
        Instances
    """
    assert all(isinstance(i, Instances) for i in instance_lists)
    assert len(instance_lists) > 0
    if len(instance_lists) == 1:
        return instance_lists[0]

    image_size = instance_lists[0].image_size
    for i in instance_lists[1:]:
        assert i.image_size == image_size
    ret = Instances(image_size)
    for k in instance_lists[0]._fields.keys():
        values = [i.get(k) for i in instance_lists]
        v0 = values[0]
        if isinstance(v0, torch.Tensor):
            # use torch.cat directly; calling `cat` here would recurse into this function
            values = torch.cat(values, dim=0)
        elif isinstance(v0, list):
            values = list(itertools.chain(*values))
        elif hasattr(type(v0), "cat"):
            values = type(v0).cat(values)
        else:
            raise ValueError("Unsupported type {} for concatenation".format(type(v0)))
        ret.set(k, values)
    return ret
def permute_all_cls_and_box_to_N_HWA_K_and_concat(box_cls, box_delta, num_classes=80):
    """
    Rearrange the tensor layout from the network output, i.e.:
        list[Tensor]: #lvl tensors of shape (N, A x K, Hi, Wi)
    to per-image predictions, i.e.:
        Tensor: of shape (N x sum(Hi x Wi x A), K)
    """
    # for each feature level, permute the outputs to make them be in the
    # same format as the labels. Note that the labels are computed for
    # all feature levels concatenated, so we keep the same representation
    # for the objectness and the box_delta
    box_cls_flattened = [permute_to_N_HWA_K(x, num_classes) for x in box_cls]
    box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
    # concatenate on the first dimension (representing the feature levels), to
    # take into account the way the labels were generated (with all feature maps
    # being concatenated as well)
    box_cls = cat(box_cls_flattened, dim=1).view(-1, num_classes)
    box_delta = cat(box_delta_flattened, dim=1).view(-1, 4)
    return box_cls, box_delta
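# --- Assumed helper (sketch) ---
# `permute_to_N_HWA_K` is referenced above but not shown here. A minimal sketch consistent
# with the comment (turning an (N, A*K, Hi, Wi) head output into (N, Hi*Wi*A, K)) could look
# like the following; the actual helper in the codebase may differ in details.
def permute_to_N_HWA_K(tensor, K):
    """
    Transpose/reshape a tensor from (N, A x K, H, W) to (N, H x W x A, K).
    """
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)    # (N, A, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)  # (N, H, W, A, K)
    tensor = tensor.reshape(N, -1, K)       # (N, H*W*A, K)
    return tensor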
def cat(boxes_list: List["Boxes"]) -> "Boxes":
    """
    Concatenates a list of Boxes into a single Boxes

    Arguments:
        boxes_list (list[Boxes])

    Returns:
        Boxes: the concatenated Boxes
    """
    assert isinstance(boxes_list, (list, tuple))
    assert len(boxes_list) > 0
    assert all(isinstance(box, Boxes) for box in boxes_list)

    # use torch.cat on the underlying tensors; calling `cat` here would recurse into this function
    cat_boxes = type(boxes_list[0])(torch.cat([b.tensor for b in boxes_list], dim=0))
    return cat_boxes
def __init__(self, box2box_transform, pred_class_logits, pred_proposal_deltas,
             pred_cooperative_logits, proposals, smooth_l1_beta):
    """
    Args:
        box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
            box2box transform instance for proposal-to-detection transformations.
        pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
            logits for all R predicted object instances. Each row corresponds to a predicted
            object instance.
        pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
            class-specific or class-agnostic regression. It stores the predicted deltas that
            transform proposals into final box detections. B is the box dimension (4 or 5).
            When B is 4, each row is [dx, dy, dw, dh (, ....)].
            When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
        proposals (list[Instances]): A list of N Instances, where Instances i stores the
            proposals for image i, in the field "proposal_boxes". When training, each
            Instances must have ground-truth labels stored in the fields "gt_classes"
            and "gt_boxes".
        smooth_l1_beta (float): The transition point between L1 and L2 loss in the smooth L1
            loss function. When set to 0, the loss becomes L1. When set to +inf, the loss
            becomes constant 0.
    """
    self.box2box_transform = box2box_transform
    self.num_preds_per_image = [len(p) for p in proposals]
    self.pred_class_logits = pred_class_logits
    self.pred_proposal_deltas = pred_proposal_deltas
    self.pred_cooperative_logits = pred_cooperative_logits
    self.smooth_l1_beta = smooth_l1_beta

    box_type = type(proposals[0].proposal_boxes)
    # cat(..., dim=0) concatenates over all images in the batch
    self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
    assert not self.proposals.tensor.requires_grad, "Proposals should not require gradients!"
    self.image_shapes = [x.image_size for x in proposals]

    # The following fields should exist only when training.
    if proposals[0].has("gt_boxes"):
        self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals])
        assert proposals[0].has("gt_classes")
        self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)
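# --- Reference for `smooth_l1_beta` (sketch) ---
# To make the `smooth_l1_beta` docstring above concrete: a common form of the smooth L1
# (Huber-style) loss is sketched below. As beta -> 0 it reduces to |x - y| (plain L1), and for
# a fixed residual it tends to 0 as beta -> +inf, which matches the docstring. This is an
# illustrative sketch, not necessarily the exact loss function used elsewhere in this codebase.
import torch

def smooth_l1_reference(input, target, beta):
    diff = torch.abs(input - target)
    if beta < 1e-5:
        # beta == 0 would divide by zero below; fall back to plain L1
        return diff
    return torch.where(diff < beta, 0.5 * diff ** 2 / beta, diff - 0.5 * beta)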
def convert_boxes_to_pooler_format(box_lists):
    """
    Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops
    (see description under Returns).

    Args:
        box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes,
            where N is the number of images in the batch.

    Returns:
        When input is list[Boxes]:
            A tensor of shape (M, 5), where M is the total number of boxes aggregated over
            all N batch images. The 5 columns are (batch index, x0, y0, x1, y1), where
            batch index is the index in [0, N) identifying which batch image the box with
            corners at (x0, y0, x1, y1) comes from.
        When input is list[RotatedBoxes]:
            A tensor of shape (M, 6), where M is the total number of boxes aggregated over
            all N batch images. The 6 columns are
            (batch index, x_ctr, y_ctr, width, height, angle_degrees), where batch index is
            the index in [0, N) identifying which batch image the rotated box
            (x_ctr, y_ctr, width, height, angle_degrees) comes from.
    """

    def fmt_box_list(box_tensor, batch_index):
        repeated_index = torch.full(
            (len(box_tensor), 1),
            batch_index,
            dtype=box_tensor.dtype,
            device=box_tensor.device,
        )
        return cat((repeated_index, box_tensor), dim=1)

    pooler_fmt_boxes = cat(
        [fmt_box_list(box_list.tensor, i) for i, box_list in enumerate(box_lists)],
        dim=0,
    )
    return pooler_fmt_boxes
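# --- Layout example (illustrative) ---
# For a batch of two images with 2 and 1 axis-aligned boxes respectively, the pooler-format
# tensor has shape (3, 5): column 0 is the batch index, the remaining columns are
# (x0, y0, x1, y1). The coordinates below are made up.
import torch

boxes_img0 = torch.tensor([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 20.0, 30.0]])
boxes_img1 = torch.tensor([[2.0, 3.0, 8.0, 9.0]])
pooler_fmt = torch.cat(
    [
        torch.cat((torch.full((len(b), 1), i, dtype=b.dtype), b), dim=1)
        for i, b in enumerate([boxes_img0, boxes_img1])
    ],
    dim=0,
)
# pooler_fmt:
# tensor([[ 0.,  0.,  0., 10., 10.],
#         [ 0.,  5.,  5., 20., 30.],
#         [ 1.,  2.,  3.,  8.,  9.]])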
def find_top_rpn_proposals(
    proposals,
    pred_objectness_logits,
    images,
    nms_thresh,
    pre_nms_topk,
    post_nms_topk,
    min_box_side_len,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals, apply NMS,
    clip proposals, and remove small boxes. Return the `post_nms_topk` highest scoring
    proposals among all the feature maps if `training` is True, otherwise, returns the
    highest `post_nms_topk` scoring proposals for each feature map.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
            All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape
            (N, Hi*Wi*A).
        images (ImageList): Input images as an :class:`ImageList`.
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is per
            feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is total,
            over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels
            (absolute units wrt input images).
        training (bool): True if proposals are to be used in training, otherwise False.
            This arg exists only to support a legacy bug; look for the
            "NB: Legacy bug ..." comment.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i.
    """
    image_sizes = images.image_sizes  # in (h, w) order
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select top-k anchor for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, proposals_i, logits_i in zip(
        itertools.count(), proposals, pred_objectness_logits
    ):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]  # each is N x topk

        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device)
        )

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = Boxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_side_len)
        lvl = level_ids
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], level_ids[keep]

        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
def losses(self):
    """
    Return the losses from a set of RPN predictions and their associated ground-truth.

    Returns:
        dict[loss name -> loss value]: A dict mapping from loss name to loss value.
            Loss names are: `loss_rpn_cls` for objectness classification and
            `loss_rpn_loc` for proposal localization.
    """

    def resample(label):
        """
        Randomly sample a subset of positive and negative examples by overwriting
        the label vector to the ignore value (-1) for all elements that are not
        included in the sample.
        """
        pos_idx, neg_idx = subsample_labels(
            label, self.batch_size_per_image, self.positive_fraction, 0
        )
        # Fill with the ignore label (-1), then set positive and negative labels
        label.fill_(-1)
        label.scatter_(0, pos_idx, 1)
        label.scatter_(0, neg_idx, 0)
        return label

    gt_objectness_logits, gt_anchor_deltas = self._get_ground_truth()
    """
    gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is the
        total number of anchors in image i (i.e., len(anchors[i]))
    gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), B),
        where B is the box dimension
    """
    # Collect all objectness labels and delta targets over feature maps and images
    # The final ordering is L, N, H, W, A from slowest to fastest axis.
    num_anchors_per_map = [np.prod(x.shape[1:]) for x in self.pred_objectness_logits]
    num_anchors_per_image = sum(num_anchors_per_map)

    # Stack to: (N, num_anchors_per_image)
    gt_objectness_logits = torch.stack(
        [resample(label) for label in gt_objectness_logits], dim=0
    )

    # Log the number of positive/negative anchors per-image that's used in training
    num_pos_anchors = (gt_objectness_logits == 1).sum().item()
    num_neg_anchors = (gt_objectness_logits == 0).sum().item()
    storage = get_event_storage()
    storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / self.num_images)
    storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / self.num_images)

    assert gt_objectness_logits.shape[1] == num_anchors_per_image
    # Split to tuple of L tensors, each with shape (N, num_anchors_per_map)
    gt_objectness_logits = torch.split(gt_objectness_logits, num_anchors_per_map, dim=1)
    # Concat from all feature maps
    gt_objectness_logits = cat([x.flatten() for x in gt_objectness_logits], dim=0)

    # Stack to: (N, num_anchors_per_image, B)
    gt_anchor_deltas = torch.stack(gt_anchor_deltas, dim=0)
    assert gt_anchor_deltas.shape[1] == num_anchors_per_image
    B = gt_anchor_deltas.shape[2]  # box dimension (4 or 5)
    # Split to tuple of L tensors, each with shape (N, num_anchors_per_map, B)
    gt_anchor_deltas = torch.split(gt_anchor_deltas, num_anchors_per_map, dim=1)
    # Concat from all feature maps
    gt_anchor_deltas = cat([x.reshape(-1, B) for x in gt_anchor_deltas], dim=0)

    # Collect all objectness logits and delta predictions over feature maps
    # and images to arrive at the same shape as the labels and targets
    # The final ordering is L, N, H, W, A from slowest to fastest axis.
    pred_objectness_logits = cat(
        [
            # Reshape: (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N*Hi*Wi*A, )
            x.permute(0, 2, 3, 1).flatten()
            for x in self.pred_objectness_logits
        ],
        dim=0,
    )
    pred_anchor_deltas = cat(
        [
            # Reshape: (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B)
            #          -> (N*Hi*Wi*A, B)
            x.view(x.shape[0], -1, B, x.shape[-2], x.shape[-1])
            .permute(0, 3, 4, 1, 2)
            .reshape(-1, B)
            for x in self.pred_anchor_deltas
        ],
        dim=0,
    )
    objectness_loss, localization_loss = rpn_losses(
        gt_objectness_logits,
        gt_anchor_deltas,
        pred_objectness_logits,
        pred_anchor_deltas,
        self.smooth_l1_beta,
    )
    normalizer = 1.0 / (self.batch_size_per_image * self.num_images)
    loss_cls = objectness_loss * normalizer  # cls: classification loss
    loss_loc = localization_loss * normalizer  # loc: localization loss
    losses = {"loss_rpn_cls": loss_cls, "loss_rpn_loc": loss_loc}

    return losses
def fmt_box_list(box_tensor, batch_index):
    repeated_index = torch.full(
        (len(box_tensor), 1),
        batch_index,
        dtype=box_tensor.dtype,
        device=box_tensor.device,
    )
    return cat((repeated_index, box_tensor), dim=1)
def inference_single_image(self, box_cls, box_delta, anchors, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W x A, K)
        box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
        anchors (list[Boxes]): list of #feature levels. Each entry contains
            a Boxes object, which contains all the anchors for that
            image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors):
        # (HxWxAxK,)
        box_cls_i = box_cls_i.flatten().sigmoid_()

        # Keep top k scoring indices only.
        num_topk = min(self.topk_candidates, box_reg_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        anchor_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[anchor_idxs]
        anchors_i = anchors_i[anchor_idxs]
        # predict boxes
        predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]
    keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold)
    keep = keep[: self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
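# --- Flat-index decoding example (illustrative) ---
# In `inference_single_image` the per-level scores are flattened over (H*W*A) anchors x K
# classes, so a flat index i decodes as anchor = i // K and class = i % K. A tiny made-up
# example with K = 3 classes:
num_classes_example = 3
flat_idxs = [0, 4, 11]  # indices into a (num_anchors * K,) score vector
anchor_idxs_example = [i // num_classes_example for i in flat_idxs]  # [0, 1, 3]
class_idxs_example = [i % num_classes_example for i in flat_idxs]    # [0, 1, 2]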