def visualize_training(self, batched_inputs, proposals):
    """
    A function used to visualize images and proposals. It shows ground truth
    bounding boxes on the original image and up to 20 predicted object
    proposals on the original image. Users can implement different
    visualization functions for different models.

    Args:
        batched_inputs (list): a list that contains input to the model.
        proposals (list): a list that contains predicted proposals. Both
            batched_inputs and proposals should have the same length.
    """
    storage = get_event_storage()
    max_vis_prop = 20

    for input, prop in zip(batched_inputs, proposals):
        img = input["image"].cpu().numpy()
        assert img.shape[0] == 3, "Images should have 3 channels."
        if self.input_format == "BGR":
            img = img[::-1, :, :]
        img = img.transpose(1, 2, 0)
        v_gt = Visualizer(img, None)
        v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
        anno_img = v_gt.get_image()
        box_size = min(len(prop.proposal_boxes), max_vis_prop)
        v_pred = Visualizer(img, None)
        v_pred = v_pred.overlay_instances(
            boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
        )
        prop_img = v_pred.get_image()
        vis_img = np.concatenate((anno_img, prop_img), axis=1)
        vis_img = vis_img.transpose(2, 0, 1)
        vis_name = " 1. GT bounding boxes 2. Predicted proposals"
        storage.put_image(vis_name, vis_img)
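# A minimal sketch (not part of the model code) of the axis/channel handling used
# above, with made-up tensor values: a CHW BGR image tensor is converted to an HWC
# RGB array for the Visualizer, and the side-by-side panel is converted back to CHW
# before EventStorage.put_image.
import numpy as np
import torch

chw_bgr = torch.randint(0, 256, (3, 4, 6), dtype=torch.uint8)  # (C, H, W), BGR order
img = chw_bgr.cpu().numpy()
img = img[::-1, :, :]          # BGR -> RGB by reversing the channel axis
img = img.transpose(1, 2, 0)   # (C, H, W) -> (H, W, C) for drawing
anno_img, prop_img = img, img  # stand-ins for the two rendered panels
vis_img = np.concatenate((anno_img, prop_img), axis=1)  # GT panel | proposal panel
vis_img = vis_img.transpose(2, 0, 1)                    # back to (C, H, W) for put_image
assert vis_img.shape == (3, 4, 12)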
def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer):
    """
    Arguments:
        pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number
            of instances in the batch, K is the number of keypoints, and S is the side length
            of the keypoint heatmap. The values are spatial logits.
        instances (list[Instances]): A list of M Instances, where M is the batch size.
            These instances are predictions from the model that are in 1:1 correspondence
            with pred_keypoint_logits. Each Instances should contain a `gt_keypoints` field
            containing a `structures.Keypoints` instance.
        normalizer (float): Normalize the loss by this amount.
            If not specified, we normalize by the number of visible keypoints in the minibatch.

    Returns a scalar tensor containing the loss.
    """
    heatmaps = []
    valid = []

    keypoint_side_len = pred_keypoint_logits.shape[2]
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        keypoints = instances_per_image.gt_keypoints
        heatmaps_per_image, valid_per_image = keypoints.to_heatmap(
            instances_per_image.proposal_boxes.tensor, keypoint_side_len
        )
        heatmaps.append(heatmaps_per_image.view(-1))
        valid.append(valid_per_image.view(-1))

    if len(heatmaps):
        keypoint_targets = cat(heatmaps, dim=0)
        valid = cat(valid, dim=0).to(dtype=torch.uint8)
        valid = torch.nonzero(valid).squeeze(1)

    # If there are no instances or no visible keypoints, return a zero loss
    # (and log the skipped batch) instead of computing a loss over empty tensors.
    if len(heatmaps) == 0 or valid.numel() == 0:
        global _TOTAL_SKIPPED
        _TOTAL_SKIPPED += 1
        storage = get_event_storage()
        storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False)
        return pred_keypoint_logits.sum() * 0

    N, K, H, W = pred_keypoint_logits.shape
    pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W)

    keypoint_loss = F.cross_entropy(
        pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum"
    )

    # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch
    if normalizer is None:
        normalizer = valid.numel()
    keypoint_loss /= normalizer

    return keypoint_loss
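# A small synthetic-tensor sketch (not part of the library) of the reshaping in the
# loss above: each of the N*K keypoints becomes one (S*S)-way classification problem,
# cross-entropy is summed over the visible keypoints only, and then divided by the
# default normalizer (the number of visible keypoints). All values are made up.
import torch
import torch.nn.functional as F

N, K, S = 2, 17, 56
pred_keypoint_logits = torch.randn(N, K, S, S)
keypoint_targets = torch.randint(0, S * S, (N * K,))  # flattened heatmap index per keypoint
valid = torch.arange(0, N * K, 2)                     # pretend every other keypoint is visible

logits = pred_keypoint_logits.view(N * K, S * S)
keypoint_loss = F.cross_entropy(logits[valid], keypoint_targets[valid], reduction="sum")
keypoint_loss = keypoint_loss / valid.numel()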
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances (optional): groundtruth :class:`Instances`
            * proposals (optional): :class:`Instances`, precomputed proposals.

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used in inference.
              See :meth:`postprocess` for details.

    Returns:
        list[dict]:
            Each dict is the output for one input image.
            The dict contains one key "instances" whose value is a :class:`Instances`.
            The :class:`Instances` object has the following keys:
            "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
    """
    if not self.training:
        return self.inference(batched_inputs)

    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
        )
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)

    if self.proposal_generator:
        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
    else:
        assert "proposals" in batched_inputs[0]
        proposals = [x["proposals"].to(self.device) for x in batched_inputs]
        proposal_losses = {}

    _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
    if self.vis_period > 0:
        storage = get_event_storage()
        if storage.iter % self.vis_period == 0:
            self.visualize_training(batched_inputs, proposals)

    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    return losses
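# A hedged sketch of the input format this forward() expects, assuming the
# detectron2 structures `Instances` and `Boxes` are available; the dict keys follow
# the docstring above and the tensor values are made up.
import torch
from detectron2.structures import Boxes, Instances

h, w = 480, 640
gt = Instances((h, w))
gt.gt_boxes = Boxes(torch.tensor([[10.0, 20.0, 200.0, 300.0]]))  # one GT box (XYXY)
gt.gt_classes = torch.tensor([3])                                # its class index

batched_inputs = [{
    "image": torch.randint(0, 256, (3, h, w), dtype=torch.uint8),  # (C, H, W)
    "instances": gt,            # ground truth; only needed during training
    "height": h, "width": w,    # desired output resolution at inference time
}]
# In training mode the model would return a dict of losses:
# losses = model(batched_inputs)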
def _match_and_label_boxes(self, proposals, stage, targets):
    """
    Match proposals with groundtruth using the matcher at the given stage.
    Label the proposals as foreground or background based on the match.

    Args:
        proposals (list[Instances]): One Instances for each image, with
            the field "proposal_boxes".
        stage (int): the current stage
        targets (list[Instances]): the ground truth instances

    Returns:
        list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes"
    """
    num_fg_samples, num_bg_samples = [], []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
        )
        # proposal_labels are 0 or 1
        matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix)
        if len(targets_per_image) > 0:
            gt_classes = targets_per_image.gt_classes[matched_idxs]
            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
            gt_classes[proposal_labels == 0] = self.num_classes
            gt_boxes = targets_per_image.gt_boxes[matched_idxs]
        else:
            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
            gt_boxes = Boxes(
                targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4))
            )
        proposals_per_image.gt_classes = gt_classes
        proposals_per_image.gt_boxes = gt_boxes

        num_fg_samples.append((proposal_labels == 1).sum().item())
        num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1])

    # Log the number of fg/bg samples in each stage
    storage = get_event_storage()
    storage.put_scalar(
        "stage{}/roi_head/num_fg_samples".format(stage),
        sum(num_fg_samples) / len(num_fg_samples),
    )
    storage.put_scalar(
        "stage{}/roi_head/num_bg_samples".format(stage),
        sum(num_bg_samples) / len(num_bg_samples),
    )
    return proposals
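# A small synthetic-tensor sketch of the labeling rule above: matched proposals copy
# the class of their matched ground-truth box, and unmatched proposals (matcher label
# 0) are assigned the background label num_classes. Values are made up.
import torch

num_classes = 80
gt_classes_per_image = torch.tensor([2, 15, 40])  # classes of 3 GT boxes
matched_idxs = torch.tensor([0, 2, 2, 1])         # best-matching GT index per proposal
proposal_labels = torch.tensor([1, 1, 0, 1])      # 1 = matched, 0 = unmatched

gt_classes = gt_classes_per_image[matched_idxs]   # [2, 40, 40, 15]
gt_classes[proposal_labels == 0] = num_classes    # unmatched -> background (80)
assert gt_classes.tolist() == [2, 40, 80, 15]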
def select_proposals_with_visible_keypoints(proposals):
    """
    Args:
        proposals (list[Instances]): a list of N Instances, where N is the
            number of images.

    Returns:
        proposals: only contains proposals with at least one visible keypoint.

    Note that this is still slightly different from Detectron.
    In Detectron, proposals for training keypoint head are re-sampled from
    all the proposals with IOU>threshold & >=1 visible keypoint.

    Here, the proposals are first sampled from all proposals with
    IOU>threshold, then proposals with no visible keypoint are filtered out.
    This strategy seems to make no difference on Detectron and is easier to implement.
    """
    ret = []
    all_num_fg = []
    for proposals_per_image in proposals:
        # If empty/unannotated image (hard negatives), skip filtering for train
        if len(proposals_per_image) == 0:
            ret.append(proposals_per_image)
            continue
        gt_keypoints = proposals_per_image.gt_keypoints.tensor  # #fg x K x 3
        vis_mask = gt_keypoints[:, :, 2] >= 1
        xs, ys = gt_keypoints[:, :, 0], gt_keypoints[:, :, 1]
        proposal_boxes = proposals_per_image.proposal_boxes.tensor.unsqueeze(dim=1)  # #fg x 1 x 4
        kp_in_box = (
            (xs >= proposal_boxes[:, :, 0])
            & (xs <= proposal_boxes[:, :, 2])
            & (ys >= proposal_boxes[:, :, 1])
            & (ys <= proposal_boxes[:, :, 3])
        )
        selection = (kp_in_box & vis_mask).any(dim=1)
        selection_idxs = torch.nonzero(selection).squeeze(1)
        all_num_fg.append(selection_idxs.numel())
        ret.append(proposals_per_image[selection_idxs])

    storage = get_event_storage()
    storage.put_scalar("keypoint_head/num_fg_samples", np.mean(all_num_fg))
    return ret
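# A synthetic-tensor sketch of the selection rule above: a proposal is kept if at
# least one of its ground-truth keypoints is both marked visible and lies inside the
# proposal box. Shapes follow the in-line comments of the function; values are made up.
import torch

gt_keypoints = torch.tensor([                      # #fg x K x 3, last dim = (x, y, visibility)
    [[15.0, 15.0, 2.0], [90.0, 90.0, 0.0]],
    [[90.0, 90.0, 1.0], [95.0, 95.0, 0.0]],
])
proposal_boxes = torch.tensor([                    # #fg x 4 in (x1, y1, x2, y2)
    [10.0, 10.0, 50.0, 50.0],
    [10.0, 10.0, 50.0, 50.0],
]).unsqueeze(dim=1)                                # #fg x 1 x 4

vis_mask = gt_keypoints[:, :, 2] >= 1
xs, ys = gt_keypoints[:, :, 0], gt_keypoints[:, :, 1]
kp_in_box = (
    (xs >= proposal_boxes[:, :, 0]) & (xs <= proposal_boxes[:, :, 2])
    & (ys >= proposal_boxes[:, :, 1]) & (ys <= proposal_boxes[:, :, 3])
)
selection = (kp_in_box & vis_mask).any(dim=1)
assert selection.tolist() == [True, False]         # only the first proposal survives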
def _forward_box(self, features, proposals, targets=None):
    head_outputs = []
    image_sizes = [x.image_size for x in proposals]
    for k in range(self.num_cascade_stages):
        if k > 0:
            # The output boxes of the previous stage are the input proposals of the next stage
            proposals = self._create_proposals_from_boxes(
                head_outputs[-1].predict_boxes(), image_sizes
            )
            if self.training:
                proposals = self._match_and_label_boxes(proposals, k, targets)
        head_outputs.append(self._run_stage(features, proposals, k))

    if self.training:
        losses = {}
        storage = get_event_storage()
        for stage, output in enumerate(head_outputs):
            with storage.name_scope("stage{}".format(stage)):
                stage_losses = output.losses()
            losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()})
        return losses
    else:
        # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1)
        scores_per_stage = [h.predict_probs() for h in head_outputs]

        # Average the scores across heads
        scores = [
            sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
            for scores_per_image in zip(*scores_per_stage)
        ]
        # Use the boxes of the last head
        boxes = head_outputs[-1].predict_boxes()
        pred_instances, _ = fast_rcnn_inference(
            boxes,
            scores,
            image_sizes,
            self.test_score_thresh,
            self.test_nms_thresh,
            self.test_detections_per_img,
        )
        return pred_instances
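# A synthetic-tensor sketch of the inference-time averaging above: each stage yields
# per-image class probabilities of shape Ri x (K+1), and the final scores are the
# element-wise mean over the cascade stages. Shapes and values are made up.
import torch

num_cascade_stages = 3
# one image, 2 proposals, 3 classes + background, per stage
scores_per_stage = [[torch.rand(2, 4)] for _ in range(num_cascade_stages)]

scores = [
    sum(list(scores_per_image)) * (1.0 / num_cascade_stages)
    for scores_per_image in zip(*scores_per_stage)
]
assert scores[0].shape == (2, 4)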
def _log_accuracy(self):
    """
    Log the accuracy metrics to EventStorage.
    """
    num_instances = self.gt_classes.numel()
    pred_classes = self.pred_class_logits.argmax(dim=1)
    bg_class_ind = self.pred_class_logits.shape[1] - 1

    fg_inds = (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind)
    num_fg = fg_inds.nonzero().numel()
    fg_gt_classes = self.gt_classes[fg_inds]
    fg_pred_classes = pred_classes[fg_inds]

    num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel()
    num_accurate = (pred_classes == self.gt_classes).nonzero().numel()
    fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel()

    storage = get_event_storage()
    storage.put_scalar("fast_rcnn/cls_accuracy", num_accurate / num_instances)
    if num_fg > 0:
        storage.put_scalar("fast_rcnn/fg_cls_accuracy", fg_num_accurate / num_fg)
        storage.put_scalar("fast_rcnn/false_negative", num_false_negative / num_fg)
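# A synthetic-tensor sketch of the three metrics logged above; the last logit column
# plays the role of the background class. Values are made up.
import torch

pred_class_logits = torch.randn(6, 81)            # 6 proposals, 80 classes + background
gt_classes = torch.tensor([3, 80, 10, 80, 3, 7])  # 80 = background

pred_classes = pred_class_logits.argmax(dim=1)
bg_class_ind = pred_class_logits.shape[1] - 1
fg_inds = (gt_classes >= 0) & (gt_classes < bg_class_ind)
num_fg = fg_inds.nonzero().numel()

cls_accuracy = (pred_classes == gt_classes).nonzero().numel() / gt_classes.numel()
fg_cls_accuracy = (pred_classes[fg_inds] == gt_classes[fg_inds]).nonzero().numel() / num_fg
false_negative = (pred_classes[fg_inds] == bg_class_ind).nonzero().numel() / num_fg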
def losses(self):
    """
    Return the losses from a set of RPN predictions and their associated ground-truth.

    Returns:
        dict[loss name -> loss value]: A dict mapping from loss name to loss value.
            Loss names are: `loss_rpn_cls` for objectness classification and
            `loss_rpn_loc` for proposal localization.
    """

    def resample(label):
        """
        Randomly sample a subset of positive and negative examples by overwriting
        the label vector to the ignore value (-1) for all elements that are not
        included in the sample.
        """
        pos_idx, neg_idx = subsample_labels(
            label, self.batch_size_per_image, self.positive_fraction, 0
        )
        # Fill with the ignore label (-1), then set positive and negative labels
        label.fill_(-1)
        label.scatter_(0, pos_idx, 1)
        label.scatter_(0, neg_idx, 0)
        return label

    gt_objectness_logits, gt_anchor_deltas = self._get_ground_truth()
    """
    gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is the
        total number of anchors in image i (i.e., len(anchors[i]))
    gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), B),
        where B is the box dimension
    """
    # Collect all objectness labels and delta targets over feature maps and images
    # The final ordering is L, N, H, W, A from slowest to fastest axis.
    num_anchors_per_map = [np.prod(x.shape[1:]) for x in self.pred_objectness_logits]
    num_anchors_per_image = sum(num_anchors_per_map)

    # Stack to: (N, num_anchors_per_image)
    gt_objectness_logits = torch.stack(
        [resample(label) for label in gt_objectness_logits], dim=0
    )

    # Log the number of positive/negative anchors per-image that's used in training
    num_pos_anchors = (gt_objectness_logits == 1).sum().item()
    num_neg_anchors = (gt_objectness_logits == 0).sum().item()
    storage = get_event_storage()
    storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / self.num_images)
    storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / self.num_images)

    assert gt_objectness_logits.shape[1] == num_anchors_per_image
    # Split to tuple of L tensors, each with shape (N, num_anchors_per_map)
    gt_objectness_logits = torch.split(gt_objectness_logits, num_anchors_per_map, dim=1)
    # Concat from all feature maps
    gt_objectness_logits = cat([x.flatten() for x in gt_objectness_logits], dim=0)

    # Stack to: (N, num_anchors_per_image, B)
    gt_anchor_deltas = torch.stack(gt_anchor_deltas, dim=0)
    assert gt_anchor_deltas.shape[1] == num_anchors_per_image
    B = gt_anchor_deltas.shape[2]  # box dimension (4 or 5)
    # Split to tuple of L tensors, each with shape (N, num_anchors_per_map, B)
    gt_anchor_deltas = torch.split(gt_anchor_deltas, num_anchors_per_map, dim=1)
    # Concat from all feature maps
    gt_anchor_deltas = cat([x.reshape(-1, B) for x in gt_anchor_deltas], dim=0)

    # Collect all objectness logits and delta predictions over feature maps
    # and images to arrive at the same shape as the labels and targets
    # The final ordering is L, N, H, W, A from slowest to fastest axis.
    pred_objectness_logits = cat(
        [
            # Reshape: (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N*Hi*Wi*A, )
            x.permute(0, 2, 3, 1).flatten()
            for x in self.pred_objectness_logits
        ],
        dim=0,
    )
    pred_anchor_deltas = cat(
        [
            # Reshape: (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B)
            #          -> (N*Hi*Wi*A, B)
            x.view(x.shape[0], -1, B, x.shape[-2], x.shape[-1])
            .permute(0, 3, 4, 1, 2)
            .reshape(-1, B)
            for x in self.pred_anchor_deltas
        ],
        dim=0,
    )

    objectness_loss, localization_loss = rpn_losses(
        gt_objectness_logits,
        gt_anchor_deltas,
        pred_objectness_logits,
        pred_anchor_deltas,
        self.smooth_l1_beta,
    )
    normalizer = 1.0 / (self.batch_size_per_image * self.num_images)
    loss_cls = objectness_loss * normalizer  # cls: classification loss
    loss_loc = localization_loss * normalizer  # loc: localization loss
    losses = {"loss_rpn_cls": loss_cls, "loss_rpn_loc": loss_loc}

    return losses
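# A small synthetic-tensor sketch of the resample() helper above: every anchor that is
# not in the sampled positive/negative subsets is overwritten with the ignore label -1.
# The index values are made up.
import torch

label = torch.tensor([1, 1, 1, 0, 0, 0, 0, 0])  # anchor labels before sampling
pos_idx = torch.tensor([0, 2])                  # sampled positive anchor indices
neg_idx = torch.tensor([4, 5])                  # sampled negative anchor indices

label.fill_(-1)                                 # ignore everything by default
label.scatter_(0, pos_idx, 1)                   # restore the sampled positives
label.scatter_(0, neg_idx, 0)                   # restore the sampled negatives
assert label.tolist() == [1, -1, 1, -1, 0, 0, -1, -1]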
def label_and_sample_proposals(self, proposals, targets):
    """
    Prepare some proposals to be used to train the ROI heads.
    It performs box matching between `proposals` and `targets`, and assigns
    training labels to the proposals.
    It returns ``self.batch_size_per_image`` random samples from proposals and groundtruth
    boxes, with a fraction of positives that is no larger than
    ``self.positive_sample_fraction``.

    Args:
        See :meth:`ROIHeads.forward`

    Returns:
        list[Instances]:
            length `N` list of `Instances`s containing the proposals
            sampled for training. Each `Instances` has the following fields:

            - proposal_boxes: the proposal boxes
            - gt_boxes: the ground-truth box that the proposal is assigned to
              (this is only meaningful if the proposal has a label > 0; if label = 0
              then the ground-truth box is random)

            Other fields such as "gt_classes", "gt_masks", that's included in `targets`.
    """
    gt_boxes = [x.gt_boxes for x in targets]
    # Augment proposals with ground-truth boxes.
    # In the case of learned proposals (e.g., RPN), when training starts
    # the proposals will be low quality due to random initialization.
    # It's possible that none of these initial
    # proposals have high enough overlap with the gt objects to be used
    # as positive examples for the second stage components (box head,
    # cls head, mask head). Adding the gt boxes to the set of proposals
    # ensures that the second stage components will have some positive
    # examples from the start of training. For RPN, this augmentation improves
    # convergence and empirically improves box AP on COCO by about 0.5
    # points (under one tested configuration).
    if self.proposal_append_gt:
        proposals = add_ground_truth_to_proposals(gt_boxes, proposals)

    proposals_with_gt = []

    num_fg_samples = []
    num_bg_samples = []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        has_gt = len(targets_per_image) > 0
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
        )
        matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
        sampled_idxs, gt_classes = self._sample_proposals(
            matched_idxs, matched_labels, targets_per_image.gt_classes
        )

        # Set target attributes of the sampled proposals:
        proposals_per_image = proposals_per_image[sampled_idxs]
        proposals_per_image.gt_classes = gt_classes

        # We index all the attributes of targets that start with "gt_"
        # and have not been added to proposals yet (="gt_classes").
        if has_gt:
            sampled_targets = matched_idxs[sampled_idxs]
            # NOTE: here the indexing wastes some compute, because heads
            # like masks, keypoints, etc, will filter the proposals again,
            # (by foreground/background, or number of keypoints in the image, etc)
            # so we essentially index the data twice.
            for (trg_name, trg_value) in targets_per_image.get_fields().items():
                if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
                    proposals_per_image.set(trg_name, trg_value[sampled_targets])
        else:
            gt_boxes = Boxes(
                targets_per_image.gt_boxes.tensor.new_zeros((len(sampled_idxs), 4))
            )
            proposals_per_image.gt_boxes = gt_boxes

        num_bg_samples.append((gt_classes == self.num_classes).sum().item())
        num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
        proposals_with_gt.append(proposals_per_image)

    # Log the number of fg/bg samples that are selected for training ROI heads
    storage = get_event_storage()
    storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
    storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))

    return proposals_with_gt
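# A hedged sketch, assuming detectron2's `Instances`/`Boxes`, of the field-copying
# step above: every "gt_*" field of the matched targets is indexed with the sampled
# match indices and attached to the sampled proposals. All tensors are made up.
import torch
from detectron2.structures import Boxes, Instances

targets_per_image = Instances((480, 640))
targets_per_image.gt_boxes = Boxes(torch.tensor(
    [[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 20.0, 20.0], [8.0, 8.0, 30.0, 30.0]]))
targets_per_image.gt_classes = torch.tensor([2, 15, 40])

proposals_per_image = Instances((480, 640))
proposals_per_image.proposal_boxes = Boxes(torch.tensor(
    [[1.0, 1.0, 9.0, 9.0], [6.0, 6.0, 19.0, 19.0], [7.0, 7.0, 29.0, 29.0], [0.0, 0.0, 5.0, 5.0]]))
proposals_per_image.gt_classes = torch.tensor([2, 40, 40, 80])  # already set by the sampling step

sampled_targets = torch.tensor([0, 2, 2, 1])  # matched GT index for each sampled proposal
for trg_name, trg_value in targets_per_image.get_fields().items():
    if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
        proposals_per_image.set(trg_name, trg_value[sampled_targets])
assert len(proposals_per_image.gt_boxes) == 4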
def mask_rcnn_loss(pred_mask_logits, instances):
    """
    Compute the mask prediction loss defined in the Mask R-CNN paper.

    Args:
        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
            for class-specific or class-agnostic, where B is the total number of predicted masks
            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
            and width of the mask predictions. The values are logits.
        instances (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. These instances are in 1:1 correspondence with the pred_mask_logits.
            The ground-truth labels (class, box, mask, ...) associated with each instance
            are stored in fields.

    Returns:
        mask_loss (Tensor): A scalar tensor containing the loss.
    """
    cls_agnostic_mask = pred_mask_logits.size(1) == 1
    total_num_masks = pred_mask_logits.size(0)
    mask_side_len = pred_mask_logits.size(2)
    assert pred_mask_logits.size(2) == pred_mask_logits.size(3), "Mask prediction must be square!"

    gt_classes = []
    gt_masks = []
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        if not cls_agnostic_mask:
            gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
            gt_classes.append(gt_classes_per_image)

        gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
            instances_per_image.proposal_boxes.tensor, mask_side_len
        ).to(device=pred_mask_logits.device)
        # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
        gt_masks.append(gt_masks_per_image)

    if len(gt_masks) == 0:
        return pred_mask_logits.sum() * 0

    gt_masks = cat(gt_masks, dim=0)

    if cls_agnostic_mask:
        pred_mask_logits = pred_mask_logits[:, 0]
    else:
        indices = torch.arange(total_num_masks)
        gt_classes = cat(gt_classes, dim=0)
        pred_mask_logits = pred_mask_logits[indices, gt_classes]

    if gt_masks.dtype == torch.bool:
        gt_masks_bool = gt_masks
    else:
        # Here we allow gt_masks to be float as well (depending on the implementation of rasterize())
        gt_masks_bool = gt_masks > 0.5

    # Log the training accuracy (using gt classes and 0.5 threshold)
    mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
    mask_accuracy = 1 - (mask_incorrect.sum().item() / max(mask_incorrect.numel(), 1.0))
    num_positive = gt_masks_bool.sum().item()
    false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
        gt_masks_bool.numel() - num_positive, 1.0
    )
    false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(num_positive, 1.0)

    storage = get_event_storage()
    storage.put_scalar("mask_rcnn/accuracy", mask_accuracy)
    storage.put_scalar("mask_rcnn/false_positive", false_positive)
    storage.put_scalar("mask_rcnn/false_negative", false_negative)

    mask_loss = F.binary_cross_entropy_with_logits(
        pred_mask_logits, gt_masks.to(dtype=torch.float32), reduction="mean"
    )
    return mask_loss
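# A synthetic-tensor sketch of the class-specific branch of the loss above: for each
# predicted instance only the mask channel of its ground-truth class enters the binary
# cross-entropy. Shapes and values are made up.
import torch
import torch.nn.functional as F

B, C, M = 4, 80, 28                              # instances, foreground classes, mask side length
pred_mask_logits = torch.randn(B, C, M, M)
gt_classes = torch.tensor([3, 17, 17, 52])
gt_masks = (torch.rand(B, M, M) > 0.5).float()   # rasterized GT masks in {0, 1}

indices = torch.arange(B)
pred_per_class = pred_mask_logits[indices, gt_classes]  # (B, M, M): one channel per instance
mask_loss = F.binary_cross_entropy_with_logits(pred_per_class, gt_masks, reduction="mean")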
def label_and_sample_proposals(self, proposals, targets):
    """
    Prepare some proposals to be used to train the RROI heads.
    It performs box matching between `proposals` and `targets`, and assigns
    training labels to the proposals.
    It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes,
    with a fraction of positives that is no larger than `self.positive_sample_fraction`.

    Args:
        See :meth:`StandardROIHeads.forward`

    Returns:
        list[Instances]: length `N` list of `Instances`s containing the proposals
            sampled for training. Each `Instances` has the following fields:

            - proposal_boxes: the rotated proposal boxes
            - gt_boxes: the ground-truth rotated boxes that the proposal is assigned to
              (this is only meaningful if the proposal has a label > 0; if label = 0
              then the ground-truth box is random)
            - gt_classes: the ground-truth classification label for each proposal
    """
    gt_boxes = [x.gt_boxes for x in targets]
    if self.proposal_append_gt:
        proposals = add_ground_truth_to_proposals(gt_boxes, proposals)

    proposals_with_gt = []

    num_fg_samples = []
    num_bg_samples = []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        has_gt = len(targets_per_image) > 0
        match_quality_matrix = pairwise_iou_rotated(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
        )
        matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
        sampled_idxs, gt_classes = self._sample_proposals(
            matched_idxs, matched_labels, targets_per_image.gt_classes
        )

        proposals_per_image = proposals_per_image[sampled_idxs]
        proposals_per_image.gt_classes = gt_classes

        if has_gt:
            sampled_targets = matched_idxs[sampled_idxs]
            proposals_per_image.gt_boxes = targets_per_image.gt_boxes[sampled_targets]
        else:
            gt_boxes = RotatedBoxes(
                targets_per_image.gt_boxes.tensor.new_zeros((len(sampled_idxs), 5))
            )
            proposals_per_image.gt_boxes = gt_boxes

        num_bg_samples.append((gt_classes == self.num_classes).sum().item())
        num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
        proposals_with_gt.append(proposals_per_image)

    # Log the number of fg/bg samples that are selected for training ROI heads
    storage = get_event_storage()
    storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
    storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))

    return proposals_with_gt
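# A hedged sketch, assuming detectron2's `RotatedBoxes`, of the empty-image branch
# above: unannotated images get dummy ground-truth rotated boxes whose five parameters
# (cx, cy, w, h, angle in degrees) are all zero. The count is made up.
import torch
from detectron2.structures import RotatedBoxes

num_sampled = 4
gt_boxes = RotatedBoxes(torch.zeros((num_sampled, 5)))  # (cx, cy, w, h, angle) per box
assert gt_boxes.tensor.shape == (num_sampled, 5)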