# Shared imports for the snippets below. Import paths assume a
# detectron2-style layout (`cat` is the detectron2.layers wrapper around
# torch.cat); adjust them to this repo's actual module names if they differ.
# Project-specific helpers referenced below (e.g. point_sample,
# get_point_coords_wrt_image, add_ground_truth_to_proposals, dataset_id_map)
# are assumed to be defined elsewhere in the repo.
import datetime
import itertools
import logging
from typing import Dict, List, Tuple, Union

import numpy as np
import pycocotools.mask as mask_util
import torch
from torch.nn import functional as F

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.layers import batched_nms, cat
from detectron2.structures import (
    BitMasks,
    Boxes,
    BoxMode,
    Instances,
    Keypoints,
    PolygonMasks,
    pairwise_iou,
    polygons_to_bitmask,
)
from detectron2.utils.events import get_event_storage
from detectron2.utils.memory import retry_if_cuda_oom

logger = logging.getLogger(__name__)


def transform_proposals(dataset_dict, image_shape, transforms, *, proposal_topk, min_box_size=0):
    """Apply `transforms` to the precomputed proposals in `dataset_dict`, then
    clip, drop empty boxes, and keep the top-k proposals by objectness."""
    if "proposal_boxes" in dataset_dict:
        # Convert the stored boxes to absolute XYXY before transforming them.
        boxes = transforms.apply_box(
            BoxMode.convert(
                dataset_dict.pop("proposal_boxes"),
                dataset_dict.pop("proposal_bbox_mode"),
                BoxMode.XYXY_ABS,
            )
        )
        boxes = Boxes(boxes)
        objectness_logits = torch.as_tensor(
            dataset_dict.pop("proposal_objectness_logits").astype("float32")
        )

        boxes.clip(image_shape)
        keep = boxes.nonempty(threshold=min_box_size)
        boxes = boxes[keep]
        objectness_logits = objectness_logits[keep]

        proposals = Instances(image_shape)
        proposals.proposal_boxes = boxes[:proposal_topk]
        proposals.objectness_logits = objectness_logits[:proposal_topk]
        dataset_dict["proposals"] = proposals
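
# Minimal usage sketch for transform_proposals (a hedged example, not part of
# the library): NoOpTransform stands in for a real augmentation, and the
# field values are illustrative.
def _demo_transform_proposals():
    from detectron2.data.transforms import NoOpTransform

    dataset_dict = {
        "proposal_boxes": np.array([[10.0, 10.0, 50.0, 60.0]], dtype="float32"),
        "proposal_bbox_mode": BoxMode.XYXY_ABS,
        "proposal_objectness_logits": np.array([2.5], dtype="float32"),
    }
    transform_proposals(dataset_dict, (100, 100), NoOpTransform(), proposal_topk=1000)
    return dataset_dict["proposals"]  # Instances with at most `proposal_topk` boxes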
def _forward_box(
    self, features: Dict[str, torch.Tensor], proposals: List[Instances]
) -> Union[Dict[str, torch.Tensor], List[Instances]]:
    """Run the box head on the given proposals: returns losses in training
    and predicted Instances in inference."""
    features = [features[f] for f in self.box_in_features]
    box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
    box_features = self.box_head(box_features)
    predictions = self.box_predictor(box_features)
    del box_features

    if self.training:
        losses = self.box_predictor.losses(predictions, proposals)
        if self.train_on_pred_boxes:
            # Replace the proposal boxes with the predicted boxes so that
            # downstream heads (e.g. mask) train on the refined boxes.
            with torch.no_grad():
                pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
                    predictions, proposals
                )
                for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
                    proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image)
        return losses
    else:
        pred_instances, _ = self.box_predictor.inference(predictions, proposals)
        return pred_instances
def __init__(
    self,
    box2box_transform,
    pred_class_logits,
    pred_proposal_deltas,
    proposals,
    smooth_l1_beta=0,
):
    """Cache the inputs needed to compute Fast R-CNN losses and predictions."""
    self.box2box_transform = box2box_transform
    self.num_preds_per_image = [len(p) for p in proposals]
    self.pred_class_logits = pred_class_logits
    self.pred_proposal_deltas = pred_proposal_deltas
    self.smooth_l1_beta = smooth_l1_beta
    self.image_shapes = [x.image_size for x in proposals]

    if len(proposals):
        box_type = type(proposals[0].proposal_boxes)
        # Concatenate the per-image proposals into a single Boxes object.
        self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
        assert (
            not self.proposals.tensor.requires_grad
        ), "Proposals should not require gradients!"

        # Ground-truth fields are only present during training.
        if proposals[0].has("gt_boxes"):
            self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals])
            assert proposals[0].has("gt_classes")
            self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)
    else:
        self.proposals = Boxes(torch.zeros(0, 4, device=self.pred_proposal_deltas.device))
    self._no_instances = len(proposals) == 0
def get_ground_truth(self, anchors, targets):
    """Assign a class label and a box-regression target to every anchor."""
    gt_classes = []
    gt_anchors_deltas = []
    anchors = Boxes.cat(anchors)  # (R, 4): all anchors of one image

    for targets_per_image in targets:
        match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, anchors)
        gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix)

        has_gt = len(targets_per_image) > 0
        if has_gt:
            # Regression targets are the deltas from each anchor to its matched gt box.
            matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs]
            gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
                anchors.tensor, matched_gt_boxes.tensor
            )

            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Anchors labeled 0 are background; label -1 is ignored by the loss.
            gt_classes_i[anchor_labels == 0] = self.num_classes
            gt_classes_i[anchor_labels == -1] = -1
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes
            gt_anchors_reg_deltas_i = torch.zeros_like(anchors.tensor)

        gt_classes.append(gt_classes_i)
        gt_anchors_deltas.append(gt_anchors_reg_deltas_i)

    return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
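
# Hedged sketch of the matcher contract relied on above (the Matcher import
# path assumes detectron2; thresholds are illustrative): label 1 marks a
# foreground anchor, 0 background, and -1 an anchor ignored by the loss.
def _demo_anchor_matching():
    from detectron2.modeling.matcher import Matcher

    matcher = Matcher(thresholds=[0.4, 0.5], labels=[0, -1, 1], allow_low_quality_matches=True)
    iou = torch.tensor([[0.9, 0.45, 0.1]])  # 1 gt box x 3 anchors
    matched_idxs, anchor_labels = matcher(iou)
    # matched_idxs -> [0, 0, 0]; anchor_labels -> [1, -1, 0]
    return matched_idxs, anchor_labels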
def point_sample_fine_grained_features(features_list, feature_scales, boxes, point_coords):
    cat_boxes = Boxes.cat(boxes)
    num_boxes = [len(b) for b in boxes]

    point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords)
    split_point_coords_wrt_image = torch.split(point_coords_wrt_image, num_boxes)

    point_features = []
    for idx_img, point_coords_wrt_image_per_image in enumerate(split_point_coords_wrt_image):
        point_features_per_image = []
        for idx_feature, feature_map in enumerate(features_list):
            h, w = feature_map.shape[-2:]
            scale = torch.tensor([w, h], device=feature_map.device) / feature_scales[idx_feature]
            point_coords_scaled = point_coords_wrt_image_per_image / scale
            point_features_per_image.append(
                point_sample(
                    feature_map[idx_img].unsqueeze(0),
                    point_coords_scaled.unsqueeze(0),
                    align_corners=False,
                )
                .squeeze(0)
                .transpose(1, 0)
            )
        point_features.append(cat(point_features_per_image, dim=1))

    return cat(point_features, dim=0), point_coords_wrt_image
def _match_and_label_boxes(self, proposals, stage, targets):
    """Match proposals to ground truth at this cascade stage and attach
    gt_classes/gt_boxes to them."""
    num_fg_samples, num_bg_samples = [], []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
        )
        # proposal_labels are 0 or 1 (each stage has its own IoU threshold).
        matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix)
        if len(targets_per_image) > 0:
            gt_classes = targets_per_image.gt_classes[matched_idxs]
            # Proposals with label 0 are treated as background.
            gt_classes[proposal_labels == 0] = self.num_classes
            gt_boxes = targets_per_image.gt_boxes[matched_idxs]
        else:
            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
            gt_boxes = Boxes(
                targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4))
            )
        proposals_per_image.gt_classes = gt_classes
        proposals_per_image.gt_boxes = gt_boxes

        num_fg_samples.append((proposal_labels == 1).sum().item())
        num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1])

    # Log the average number of fg/bg samples per image for this stage.
    storage = get_event_storage()
    storage.put_scalar(
        "stage{}/roi_head/num_fg_samples".format(stage),
        sum(num_fg_samples) / len(num_fg_samples),
    )
    storage.put_scalar(
        "stage{}/roi_head/num_bg_samples".format(stage),
        sum(num_bg_samples) / len(num_bg_samples),
    )
    return proposals
def label_and_sample_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]):
    """Label each anchor as fg/bg/ignored for RPN training and return the
    matched gt box for every anchor."""
    anchors = Boxes.cat(anchors)

    gt_boxes = [x.gt_boxes for x in gt_instances]
    image_sizes = [x.image_size for x in gt_instances]
    del gt_instances

    gt_labels = []
    matched_gt_boxes = []
    for image_size_i, gt_boxes_i in zip(image_sizes, gt_boxes):
        # The IoU matrix can be very large; retry on CPU if CUDA runs out of memory.
        match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors)
        matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
        gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
        del match_quality_matrix

        if self.anchor_boundary_thresh >= 0:
            # Ignore anchors that stick out of the image beyond the threshold.
            anchors_inside_image = anchors.inside_box(image_size_i, self.anchor_boundary_thresh)
            gt_labels_i[~anchors_inside_image] = -1

        # Randomly subsample positives and negatives to the configured batch size.
        gt_labels_i = self._subsample_labels(gt_labels_i)

        if len(gt_boxes_i) == 0:
            matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
        else:
            matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor

        gt_labels.append(gt_labels_i)
        matched_gt_boxes.append(matched_gt_boxes_i)
    return gt_labels, matched_gt_boxes
def label_anchors(self, anchors, gt_instances):
    """Assign a gt class and a gt box to every anchor (no subsampling)."""
    anchors = Boxes.cat(anchors)  # (R, 4): all anchors of one image
    gt_labels = []
    matched_gt_boxes = []
    for gt_per_image in gt_instances:
        match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
        matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix)
        del match_quality_matrix

        if len(gt_per_image) > 0:
            matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]
            gt_labels_i = gt_per_image.gt_classes[matched_idxs]
            # Background anchors get the "num_classes" label; ignored ones get -1.
            gt_labels_i[anchor_labels == 0] = self.num_classes
            gt_labels_i[anchor_labels == -1] = -1
        else:
            matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
            gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes

        gt_labels.append(gt_labels_i)
        matched_gt_boxes.append(matched_gt_boxes_i)

    return gt_labels, matched_gt_boxes
def _create_proposals_from_boxes(self, boxes, image_sizes):
    """Wrap the predicted boxes of the previous cascade stage into Instances,
    clipping them to the image and (in training) dropping empty boxes."""
    boxes = [Boxes(b.detach()) for b in boxes]
    proposals = []
    for boxes_per_image, image_size in zip(boxes, image_sizes):
        boxes_per_image.clip(image_size)
        if self.training:
            # Empty boxes are only filtered in training, so that at inference
            # time the per-stage scores stay aligned across stages.
            boxes_per_image = boxes_per_image[boxes_per_image.nonempty()]
        prop = Instances(image_size)
        prop.proposal_boxes = boxes_per_image
        proposals.append(prop)
    return proposals
def fast_rcnn_inference_single_image(
    boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image
):
    """Single-image Fast R-CNN inference: filter by score, clip boxes,
    apply per-class NMS, and keep at most `topk_per_image` detections."""
    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
    if not valid_mask.all():
        boxes = boxes[valid_mask]
        scores = scores[valid_mask]

    scores = scores[:, :-1]  # drop the background column
    num_bbox_reg_classes = boxes.shape[1] // 4
    boxes = Boxes(boxes.reshape(-1, 4))
    boxes.clip(image_shape)
    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4

    # filter_inds holds (box index, class index) pairs above the score threshold.
    filter_mask = scores > score_thresh  # R x C
    filter_inds = filter_mask.nonzero()
    if num_bbox_reg_classes == 1:
        # Class-agnostic regression: a single box per proposal.
        boxes = boxes[filter_inds[:, 0], 0]
    else:
        boxes = boxes[filter_mask]
    scores = scores[filter_mask]

    # Per-class NMS, then keep the top scoring detections.
    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
    if topk_per_image >= 0:
        keep = keep[:topk_per_image]
    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]

    result = Instances(image_shape)
    result.pred_boxes = Boxes(boxes)
    result.scores = scores
    result.pred_classes = filter_inds[:, 1]
    return result, filter_inds[:, 0]
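
# Minimal usage sketch (hedged): random inputs with the shapes the function
# expects -- scores of shape (R, K+1) with a trailing background column, and
# boxes of shape (R, K*4).
def _demo_fast_rcnn_inference():
    num_classes = 3
    boxes = torch.rand(10, num_classes * 4) * 100
    scores = torch.softmax(torch.randn(10, num_classes + 1), dim=1)
    instances, kept_rows = fast_rcnn_inference_single_image(
        boxes, scores, (100, 100), score_thresh=0.05, nms_thresh=0.5, topk_per_image=5
    )
    return instances  # at most 5 detections with pred_boxes/scores/pred_classes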
def label_and_sample_proposals(
    self, proposals: List[Instances], targets: List[Instances]
) -> List[Instances]:
    """Match proposals to ground truth, subsample a fixed fraction of
    foreground, and copy the matched gt fields onto the sampled proposals."""
    gt_boxes = [x.gt_boxes for x in targets]
    if self.proposal_append_gt:
        # Adding gt boxes as proposals guarantees some high-quality positives.
        proposals = add_ground_truth_to_proposals(gt_boxes, proposals)

    proposals_with_gt = []
    num_fg_samples = []
    num_bg_samples = []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        has_gt = len(targets_per_image) > 0
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
        )
        matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
        sampled_idxs, gt_classes = self._sample_proposals(
            matched_idxs, matched_labels, targets_per_image.gt_classes
        )

        proposals_per_image = proposals_per_image[sampled_idxs]
        proposals_per_image.gt_classes = gt_classes

        if has_gt:
            sampled_targets = matched_idxs[sampled_idxs]
            # Copy every gt_* field of the matched targets onto the proposals.
            for (trg_name, trg_value) in targets_per_image.get_fields().items():
                if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
                    proposals_per_image.set(trg_name, trg_value[sampled_targets])
        else:
            gt_boxes = Boxes(
                targets_per_image.gt_boxes.tensor.new_zeros((len(sampled_idxs), 4))
            )
            proposals_per_image.gt_boxes = gt_boxes

        num_bg_samples.append((gt_classes == self.num_classes).sum().item())
        num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
        proposals_with_gt.append(proposals_per_image)

    storage = get_event_storage()
    storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
    storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))

    return proposals_with_gt
def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """Convert a list of COCO-style annotation dicts into an Instances object
    with gt_boxes, gt_classes, and optionally gt_masks/gt_keypoints."""
    boxes = [
        BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos
    ]
    target = Instances(image_size)
    target.gt_boxes = Boxes(boxes)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            masks = PolygonMasks(segms)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # Polygons: rasterize into a full-image bitmask.
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO-style RLE.
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert (
                        segm.ndim == 2
                    ), f"Expect segmentation of 2 dimensions, got {segm.ndim}."
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks!"
                        "Supported types are: polygons as list[list[float] or ndarray],"
                        " COCO-style RLE as a dict, or a full-image segmentation mask "
                        "as a 2D ndarray.".format(type(segm))
                    )
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks])
            )
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = [obj.get("keypoints", []) for obj in annos]
        target.gt_keypoints = Keypoints(kpts)

    return target
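
# Illustrative input for annotations_to_instances (hedged example): one XYWH
# box with a polygon mask, following the COCO-style dataset-dict fields above.
def _demo_annotations_to_instances():
    annos = [
        {
            "bbox": [10.0, 10.0, 40.0, 50.0],
            "bbox_mode": BoxMode.XYWH_ABS,
            "category_id": 0,
            "segmentation": [[10.0, 10.0, 50.0, 10.0, 50.0, 60.0, 10.0, 60.0]],
        }
    ]
    return annotations_to_instances(annos, (100, 100))  # gt_boxes, gt_classes, gt_masks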
def inference_single_image(self, box_cls, box_delta, anchors, image_size):
    """Decode per-level predictions for one image, then apply class-aware NMS."""
    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level.
    for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors):
        # (HxWxAxK,): softmax scores with the background column dropped.
        box_cls_i = F.softmax(box_cls_i, dim=-1)[:, :-1].flatten()

        # Keep at most one candidate per anchor position, then threshold by score.
        num_topk = box_reg_i.size(0)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        # Recover the anchor index and class index from the flattened index.
        anchor_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[anchor_idxs]
        anchors_i = anchors_i[anchor_idxs]
        predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]
    keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold)
    keep = keep[: self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
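
# Worked micro-example (hedged, synthetic numbers) of the flattened-index
# bookkeeping above: flattening an (A, K) score matrix maps entry (a, k) to
# index a * K + k, so anchor and class indices are recovered with // and %.
def _demo_flat_index_decode():
    num_classes = 4
    topk_idxs = torch.tensor([0, 5, 11])
    anchor_idxs = topk_idxs // num_classes  # -> [0, 1, 2]
    classes_idxs = topk_idxs % num_classes  # -> [0, 1, 3]
    return anchor_idxs, classes_idxs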
def create_instances(predictions, image_size, conf_threshold=0.5):
    """Build an Instances object from COCO-format prediction dicts, keeping
    only detections above `conf_threshold`."""
    ret = Instances(image_size)

    score = np.asarray([x["score"] for x in predictions])
    chosen = (score > conf_threshold).nonzero()[0]
    score = score[chosen]
    bbox = np.asarray([predictions[i]["bbox"] for i in chosen]).reshape(-1, 4)
    labels = np.asarray([dataset_id_map(predictions[i]["category_id"]) for i in chosen])

    ret.scores = score
    ret.pred_boxes = Boxes(BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS))
    ret.pred_classes = labels

    try:
        ret.pred_masks = [predictions[i]["segmentation"] for i in chosen]
    except KeyError:
        pass
    return ret
def _evaluate_box_proposals(dataset_predictions, lvis_api, thresholds=None, area="all", limit=None):
    """Compute average recall of the proposals against LVIS ground truth,
    following the official COCO/LVIS proposal-evaluation protocol."""
    # Map an area name to the index of its [min, max] pixel-area range.
    areas = {
        "all": 0,
        "small": 1,
        "medium": 2,
        "large": 3,
        "96-128": 4,
        "128-256": 5,
        "256-512": 6,
        "512-inf": 7,
    }
    area_ranges = [
        [0 ** 2, 1e5 ** 2],    # all
        [0 ** 2, 32 ** 2],     # small
        [32 ** 2, 96 ** 2],    # medium
        [96 ** 2, 1e5 ** 2],   # large
        [96 ** 2, 128 ** 2],   # 96-128
        [128 ** 2, 256 ** 2],  # 128-256
        [256 ** 2, 512 ** 2],  # 256-512
        [512 ** 2, 1e5 ** 2],  # 512-inf
    ]
    assert area in areas, f"Unknown area range: {area}"
    area_range = area_ranges[areas[area]]
    gt_overlaps = []
    num_pos = 0

    for prediction_dict in dataset_predictions:
        predictions = prediction_dict["proposals"]

        # Sort proposals by objectness in descending order.
        inds = predictions.objectness_logits.sort(descending=True)[1]
        predictions = predictions[inds]

        ann_ids = lvis_api.get_ann_ids(img_ids=[prediction_dict["image_id"]])
        anno = lvis_api.load_anns(ann_ids)
        gt_boxes = [
            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno
        ]
        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)
        gt_boxes = Boxes(gt_boxes)
        gt_areas = torch.as_tensor([obj["area"] for obj in anno])

        if len(gt_boxes) == 0 or len(predictions) == 0:
            continue

        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
        gt_boxes = gt_boxes[valid_gt_inds]
        num_pos += len(gt_boxes)
        if len(gt_boxes) == 0:
            continue

        if limit is not None and len(predictions) > limit:
            predictions = predictions[:limit]

        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)

        _gt_overlaps = torch.zeros(len(gt_boxes))
        for j in range(min(len(predictions), len(gt_boxes))):
            # Greedily pick the (proposal, gt) pair with the largest IoU ...
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            assert gt_ovr >= 0
            box_ind = argmax_overlaps[gt_ind]
            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert _gt_overlaps[j] == gt_ovr
            # ... then remove both from further consideration.
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1
        gt_overlaps.append(_gt_overlaps)

    gt_overlaps = (
        torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
    )
    gt_overlaps, _ = torch.sort(gt_overlaps)

    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    recalls = torch.zeros_like(thresholds)
    # Compute recall for each IoU threshold.
    for i, t in enumerate(thresholds):
        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
    ar = recalls.mean()
    return {
        "ar": ar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": num_pos,
    }
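
# Worked micro-example of the recall/AR arithmetic above (hedged, synthetic
# numbers): given the sorted best-IoU-per-gt values and the number of ground
# truths, recall at threshold t is the fraction of gts covered with IoU >= t.
def _demo_average_recall():
    gt_overlaps = torch.tensor([0.2, 0.55, 0.8, 0.95])  # best IoU per gt, sorted
    num_pos = 4
    thresholds = torch.arange(0.5, 0.95 + 1e-5, 0.05)
    recalls = torch.stack([(gt_overlaps >= t).float().sum() / num_pos for t in thresholds])
    return recalls.mean()  # the "ar" value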
def forward(self, features):
    grid_sizes = [feature_map.shape[-2:] for feature_map in features]
    anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
    return [Boxes(x) for x in anchors_over_all_feature_maps]
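
# Hedged sketch of the forward contract, using detectron2's
# DefaultAnchorGenerator as a stand-in for whatever generator defines
# _grid_anchors in this repo; the sizes/ratios/strides are illustrative.
def _demo_anchor_generator():
    from detectron2.modeling.anchor_generator import DefaultAnchorGenerator

    gen = DefaultAnchorGenerator(sizes=[[32, 64]], aspect_ratios=[[0.5, 1.0]], strides=[8])
    features = [torch.rand(1, 8, 25, 25)]  # one feature map
    anchors = gen(features)  # list[Boxes]: 25 * 25 * 4 = 2500 anchors here
    return anchors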
def find_top_rpn_proposals(
    proposals: List[torch.Tensor],
    pred_objectness_logits: List[torch.Tensor],
    image_sizes: List[Tuple[int, int]],
    nms_thresh: float,
    pre_nms_topk: int,
    post_nms_topk: int,
    min_box_size: int,
    training: bool,
):
    """For each image, select the top pre-NMS proposals of every level, run
    NMS across levels, and return the top post-NMS proposals as Instances."""
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select the top-k proposals of every level and every image.
    topk_scores = []  # #lvl tensors, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl tensors, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, proposals_i, logits_i in zip(
        itertools.count(), proposals, pred_objectness_logits
    ):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]

        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device)
        )

    # 2. Concatenate all levels together.
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run per-level NMS and choose the post-NMS top-k.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = Boxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        lvl = level_ids

        valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
        if not valid_mask.all():
            if training:
                raise FloatingPointError(
                    "Predicted boxes or scores contain Inf/NaN. Training has diverged."
                )
            boxes = boxes[valid_mask]
            scores_per_img = scores_per_img[valid_mask]
            lvl = lvl[valid_mask]
        boxes.clip(image_size)

        # Filter empty boxes.
        keep = boxes.nonempty(threshold=min_box_size)
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep]

        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
        keep = keep[:post_nms_topk]  # keep is already sorted by score

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
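
# Minimal usage sketch (hedged): one feature level with random predictions.
# Shapes follow the function's expectations -- proposals (N, Hi*Wi*A, 4) in
# XYXY order and objectness logits (N, Hi*Wi*A).
def _demo_find_top_rpn_proposals():
    proposals = [torch.rand(2, 100, 4) * 50]
    logits = [torch.randn(2, 100)]
    return find_top_rpn_proposals(
        proposals,
        logits,
        image_sizes=[(64, 64), (64, 64)],
        nms_thresh=0.7,
        pre_nms_topk=50,
        post_nms_topk=10,
        min_box_size=0,
        training=False,
    )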
def convert_to_coco_dict(dataset_name):
    """Convert a registered dataset into COCO's json dict format."""
    dataset_dicts = DatasetCatalog.get(dataset_name)
    metadata = MetadataCatalog.get(dataset_name)

    # Map contiguous training ids back to the original dataset ids, if any.
    if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
        reverse_id_mapping = {
            v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items()
        }
        reverse_id_mapper = lambda contiguous_id: reverse_id_mapping[contiguous_id]
    else:
        reverse_id_mapper = lambda contiguous_id: contiguous_id

    categories = [
        {"id": reverse_id_mapper(id), "name": name}
        for id, name in enumerate(metadata.thing_classes)
    ]

    logger.info("Converting dataset dicts into COCO format")
    coco_images = []
    coco_annotations = []

    for image_id, image_dict in enumerate(dataset_dicts):
        coco_image = {
            "id": image_dict.get("image_id", image_id),
            "width": image_dict["width"],
            "height": image_dict["height"],
            "file_name": image_dict["file_name"],
        }
        coco_images.append(coco_image)

        anns_per_image = image_dict["annotations"]
        for annotation in anns_per_image:
            coco_annotation = {}

            # COCO requires XYWH_ABS boxes.
            bbox = annotation["bbox"]
            bbox_mode = annotation["bbox_mode"]
            bbox = BoxMode.convert(bbox, bbox_mode, BoxMode.XYWH_ABS)

            # Compute the area from the segmentation if present; otherwise
            # fall back to the box area.
            if "segmentation" in annotation:
                segmentation = annotation["segmentation"]
                if isinstance(segmentation, list):
                    polygons = PolygonMasks([segmentation])
                    area = polygons.area()[0].item()
                elif isinstance(segmentation, dict):  # RLE
                    area = mask_util.area(segmentation).item()
                else:
                    raise TypeError(f"Unknown segmentation type {type(segmentation)}!")
            else:
                bbox_xy = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
                area = Boxes([bbox_xy]).area()[0].item()

            if "keypoints" in annotation:
                keypoints = annotation["keypoints"]
                for idx, v in enumerate(keypoints):
                    if idx % 3 != 2:
                        # COCO stores keypoint coordinates as integer pixel
                        # indices, so shift by 0.5 for format consistency.
                        keypoints[idx] = v - 0.5
                if "num_keypoints" in annotation:
                    num_keypoints = annotation["num_keypoints"]
                else:
                    num_keypoints = sum(kp > 0 for kp in keypoints[2::3])

            coco_annotation["id"] = len(coco_annotations) + 1  # COCO ids start at 1
            coco_annotation["image_id"] = coco_image["id"]
            coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
            coco_annotation["area"] = float(area)
            coco_annotation["iscrowd"] = annotation.get("iscrowd", 0)
            coco_annotation["category_id"] = reverse_id_mapper(annotation["category_id"])

            if "keypoints" in annotation:
                coco_annotation["keypoints"] = keypoints
                coco_annotation["num_keypoints"] = num_keypoints

            if "segmentation" in annotation:
                seg = coco_annotation["segmentation"] = annotation["segmentation"]
                if isinstance(seg, dict):  # RLE
                    counts = seg["counts"]
                    if not isinstance(counts, str):
                        # Make the RLE json-serializable.
                        seg["counts"] = counts.decode("ascii")

            coco_annotations.append(coco_annotation)

    logger.info(
        "Conversion finished, "
        f"#images: {len(coco_images)}, #annotations: {len(coco_annotations)}"
    )

    info = {
        "date_created": str(datetime.datetime.now()),
        "description": "Automatically generated COCO json file for tkdetection.",
    }
    coco_dict = {
        "info": info,
        "images": coco_images,
        "annotations": coco_annotations,
        "categories": categories,
        "licenses": None,
    }
    return coco_dict
def inference_on_single_feature_map(
    self, locations, box_cls, box_regression, centerness, image_sizes
):
    """Decode FCOS-style predictions from one feature map for a batch of images."""
    N, C, H, W = box_cls.shape

    # Put the predictions in (N, H*W, ...) layout.
    box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1)
    box_cls = box_cls.reshape(N, -1, C).sigmoid()
    box_regression = box_regression.view(N, 4, H, W).permute(0, 2, 3, 1)
    box_regression = box_regression.reshape(N, -1, 4)
    centerness = centerness.view(N, 1, H, W).permute(0, 2, 3, 1)
    centerness = centerness.reshape(N, -1).sigmoid()

    candidate_inds = box_cls > self.score_threshold
    pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
    pre_nms_top_n = pre_nms_top_n.clamp(max=self.topk_candidates)

    # Down-weight classification scores by centerness.
    box_cls = box_cls * centerness[:, :, None]

    results = []
    for i in range(N):
        per_box_cls = box_cls[i]
        per_candidate_inds = candidate_inds[i]
        per_box_cls = per_box_cls[per_candidate_inds]

        per_candidate_nonzeros = per_candidate_inds.nonzero()
        per_box_loc = per_candidate_nonzeros[:, 0]
        per_class = per_candidate_nonzeros[:, 1]

        per_box_regression = box_regression[i]
        per_box_regression = per_box_regression[per_box_loc]
        per_locations = locations[per_box_loc]

        per_pre_nms_top_n = pre_nms_top_n[i]
        if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
            per_box_cls, top_k_indices = per_box_cls.topk(
                per_pre_nms_top_n.item(), sorted=False
            )
            per_class = per_class[top_k_indices]
            per_box_regression = per_box_regression[top_k_indices]
            per_locations = per_locations[top_k_indices]

        # Distances (l, t, r, b) from each location decode to an XYXY box.
        detections = torch.stack(
            [
                per_locations[:, 0] - per_box_regression[:, 0],
                per_locations[:, 1] - per_box_regression[:, 1],
                per_locations[:, 0] + per_box_regression[:, 2],
                per_locations[:, 1] + per_box_regression[:, 3],
            ],
            dim=1,
        )

        result = Instances(image_sizes[i])
        detections = Boxes(detections)
        detections.clip(image_sizes[i])
        result.pred_boxes = detections
        # sqrt undoes the centerness multiplication above: the final score is
        # the geometric mean of classification score and centerness.
        result.scores = torch.sqrt(per_box_cls)
        result.pred_classes = per_class
        results.append(result)

    return results