def inference_single_image(self, conf_pred_per_image, loc_pred_per_image,
                           default_boxes, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Args:
        conf_pred_per_image (list[Tensor]): list of #feature levels. Each entry
            contains a tensor of size [Hi x Wi x D, C].
        loc_pred_per_image (list[Tensor]): same shape as 'conf_pred_per_image',
            except that C becomes 4.
        default_boxes (list[Boxes]): a list of Boxes elements. Each element
            contains the default boxes of one image on a specific feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    # predicted confidence
    conf_pred = torch.cat(conf_pred_per_image, dim=0)  # [R, C]
    conf_pred = conf_pred.softmax(dim=1)

    # predicted boxes
    loc_pred = torch.cat(loc_pred_per_image, dim=0)  # [R, 4]
    default_boxes = Boxes.cat(default_boxes)  # [R, 4]
    boxes_pred = self.box2box_transform.apply_deltas(loc_pred, default_boxes.tensor)

    num_boxes, num_classes = conf_pred.shape
    boxes_pred = boxes_pred.view(num_boxes, 1, 4).expand(
        num_boxes, num_classes, 4)  # [R, C, 4]
    labels = torch.arange(num_classes, device=self.device)  # [0, ..., C - 1]
    labels = labels.view(1, num_classes).expand_as(conf_pred)  # [R, C]

    # remove predictions with the background label
    boxes_pred = boxes_pred[:, :-1]
    conf_pred = conf_pred[:, :-1]
    labels = labels[:, :-1]

    # batch everything, by making every class prediction a separate instance
    boxes_pred = boxes_pred.reshape(-1, 4)
    conf_pred = conf_pred.reshape(-1)
    labels = labels.reshape(-1)

    # remove low-scoring boxes
    indices = torch.nonzero(conf_pred > self.score_threshold, as_tuple=False).squeeze(1)
    boxes_pred, conf_pred, labels = boxes_pred[indices], conf_pred[indices], labels[indices]

    keep = generalized_batched_nms(boxes_pred, conf_pred, labels,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_pred[keep])
    result.scores = conf_pred[keep]
    result.pred_classes = labels[keep]
    return result
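# The per-class expansion above (duplicating each box once per foreground class,
# dropping the background column, thresholding, then running class-aware NMS) can be
# illustrated without the framework. The sketch below is a minimal, self-contained
# version of that flow; it uses torchvision.ops.batched_nms in place of
# generalized_batched_nms, already-decoded boxes, and made-up thresholds.

import torch
from torchvision.ops import batched_nms

# Toy inputs: R = 3 decoded boxes in (x1, y1, x2, y2), C = 3 classes + background.
boxes_pred = torch.tensor([[0., 0., 10., 10.],
                           [1., 1., 11., 11.],
                           [50., 50., 60., 60.]])
conf_pred = torch.tensor([[0.70, 0.10, 0.05, 0.15],
                          [0.60, 0.05, 0.05, 0.30],
                          [0.05, 0.80, 0.05, 0.10]])  # already softmaxed, last column = background

num_boxes, num_classes = conf_pred.shape
boxes_pred = boxes_pred.view(num_boxes, 1, 4).expand(num_boxes, num_classes, 4)
labels = torch.arange(num_classes).view(1, num_classes).expand_as(conf_pred)

# drop the background column, then flatten so every (box, class) pair is a candidate
boxes_pred = boxes_pred[:, :-1].reshape(-1, 4)
conf_pred = conf_pred[:, :-1].reshape(-1)
labels = labels[:, :-1].reshape(-1)

score_threshold, nms_threshold = 0.1, 0.5  # made-up values
keep_mask = conf_pred > score_threshold
boxes_pred, conf_pred, labels = boxes_pred[keep_mask], conf_pred[keep_mask], labels[keep_mask]

# box 1 is suppressed by box 0 (same class, high overlap); box 2 survives as class 1
keep = batched_nms(boxes_pred, conf_pred, labels, nms_threshold)
print(boxes_pred[keep], conf_pred[keep], labels[keep])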
def fast_rcnn_inference_single_image(boxes, scores, image_shape, score_thresh,
                                     nms_thresh, nms_type, topk_per_image):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Args:
        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
        per image.

    Returns:
        Same as `fast_rcnn_inference`, but for only one image.
    """
    scores = scores[:, :-1]
    num_bbox_reg_classes = boxes.shape[1] // 4
    # Convert to Boxes to use the `clip` function ...
    boxes = Boxes(boxes.reshape(-1, 4))
    boxes.clip(image_shape)
    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4

    # Filter results based on detection scores
    filter_mask = scores > score_thresh  # R x K
    # R' x 2. First column contains indices of the R predictions;
    # second column contains indices of classes.
    filter_inds = filter_mask.nonzero(as_tuple=False)
    if num_bbox_reg_classes == 1:
        boxes = boxes[filter_inds[:, 0], 0]
    else:
        boxes = boxes[filter_mask]
    scores = scores[filter_mask]

    # Apply per-class NMS
    keep = generalized_batched_nms(boxes, scores, filter_inds[:, 1],
                                   nms_thresh, nms_type=nms_type)
    if topk_per_image >= 0:
        keep = keep[:topk_per_image]
    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]

    result = Instances(image_shape)
    result.pred_boxes = Boxes(boxes)
    result.scores = scores
    result.pred_classes = filter_inds[:, 1]
    return result, filter_inds[:, 0]
def _inference_one_image(self, inputs):
    augmented_inputs = self.tta_mapper(inputs)
    assert len({x["file_name"] for x in augmented_inputs}) == 1, \
        "inference different images"
    heights = [k["height"] for k in augmented_inputs]
    widths = [k["width"] for k in augmented_inputs]
    assert len(set(heights)) == 1 and len(set(widths)) == 1, \
        "Augmented versions of the inputs should have the same original resolution!"
    height = heights[0]
    width = widths[0]

    # 1. Detect boxes from all augmented versions
    all_boxes = []
    all_scores = []
    all_classes = []

    for single_input in augmented_inputs:
        do_hflip = single_input.pop("horiz_flip", False)
        # 1.1: forward with a single augmented image
        output = self.model._inference_for_ms_test([single_input])
        # 1.2: union the results
        pred_boxes = output.get("pred_boxes").tensor
        if do_hflip:
            pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]]
        all_boxes.append(pred_boxes)
        all_scores.append(output.get("scores"))
        all_classes.append(output.get("pred_classes"))

    boxes_all = torch.cat(all_boxes, dim=0)
    scores_all = torch.cat(all_scores, dim=0)
    class_idxs_all = torch.cat(all_classes, dim=0)

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.model.nms_threshold,
                                   nms_type=self.model.nms_type)
    keep = keep[:self.model.max_detections_per_image]

    result = Instances((height, width))
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return {"instances": result}
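# The horizontal-flip handling above maps boxes predicted on the flipped image back
# to the original frame by swapping and mirroring the x-coordinates. A tiny
# self-contained check of that transform (made-up box and image width):

import torch

width = 100.
# A box predicted on the horizontally flipped image, in (x1, y1, x2, y2).
pred_boxes = torch.tensor([[10., 20., 30., 40.]])

# Undo the flip: x1' = width - x2, x2' = width - x1 (y-coordinates unchanged).
pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]]
print(pred_boxes)  # tensor([[70., 20., 90., 40.]])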
def inference_single_image(self, box_cls, box_center, border_cls, border_delta,
                           bd_based_box, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains a
            tensor of size (H x W, K) with classification scores.
        box_center (list[Tensor]): same shape as 'box_cls', except that K becomes 1
            (centerness).
        border_cls (list[Tensor]): same shape as 'box_cls'; border classification scores.
        border_delta (list[Tensor]): same shape as 'box_cls', except that K becomes 4;
            border regression deltas.
        bd_based_box (list[Tensor]): list of #feature levels. Each entry contains a
            tensor of size (H x W, 4) with the coarse boxes used as the basis for
            border refinement.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []
    border_bbox_std = bd_based_box[0].new_tensor(self.border_bbox_std)

    # Iterate over every feature level
    for box_cls_i, box_ctr_i, bd_box_cls_i, bd_box_reg_i, bd_based_box_i in zip(
            box_cls, box_center, border_cls, border_delta, bd_based_box):
        # (HxW, K)
        box_cls_i = box_cls_i.sigmoid_()
        box_ctr_i = box_ctr_i.sigmoid_()
        bd_box_cls_i = bd_box_cls_i.sigmoid_()

        predicted_prob = (box_cls_i * box_ctr_i).sqrt()

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold

        predicted_prob = predicted_prob * bd_box_cls_i
        predicted_prob = predicted_prob[keep_idxs]

        # Keep top-k scoring indices only.
        num_topk = min(self.topk_candidates, predicted_prob.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = predicted_prob.sort(descending=True)
        topk_idxs = topk_idxs[:num_topk]

        keep_idxs = keep_idxs.nonzero(as_tuple=False)
        keep_idxs = keep_idxs[topk_idxs]
        keep_box_idxs = keep_idxs[:, 0]
        classes_idxs = keep_idxs[:, 1]

        predicted_prob = predicted_prob[:num_topk]
        bd_box_reg_i = bd_box_reg_i[keep_box_idxs]
        bd_based_box_i = bd_based_box_i[keep_box_idxs]

        det_wh = (bd_based_box_i[..., 2:4] - bd_based_box_i[..., :2])
        det_wh = torch.cat([det_wh, det_wh], dim=1)
        predicted_boxes = bd_based_box_i + (bd_box_reg_i * border_bbox_std * det_wh)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob.sqrt())
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)
    boxes_all = boxes_all[keep]
    scores_all = scores_all[keep]
    class_idxs_all = class_idxs_all[keep]

    number_of_detections = len(keep)
    # Limit to max_per_image detections **over all classes**
    if number_of_detections > self.max_detections_per_image > 0:
        image_thresh, _ = torch.kthvalue(
            scores_all,
            number_of_detections - self.max_detections_per_image + 1)
        keep = scores_all >= image_thresh.item()
        keep = torch.nonzero(keep, as_tuple=False).squeeze(1)
        boxes_all = boxes_all[keep]
        scores_all = scores_all[keep]
        class_idxs_all = class_idxs_all[keep]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all)
    result.scores = scores_all
    result.pred_classes = class_idxs_all
    return result
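# The border refinement step above scales the predicted deltas by a fixed std and by
# the width/height of the coarse box before adding them back. A small standalone
# illustration of that arithmetic with made-up numbers:

import torch

border_bbox_std = torch.tensor([0.1, 0.1, 0.1, 0.1])   # assumed config value
bd_based_box = torch.tensor([[10., 10., 30., 50.]])    # coarse box, (x1, y1, x2, y2)
bd_box_reg = torch.tensor([[0.5, -0.5, 1.0, 0.0]])     # predicted border deltas

det_wh = bd_based_box[..., 2:4] - bd_based_box[..., :2]  # (w, h) = (20, 40)
det_wh = torch.cat([det_wh, det_wh], dim=1)              # (20, 40, 20, 40)
refined = bd_based_box + bd_box_reg * border_bbox_std * det_wh
print(refined)  # tensor([[11., 8., 32., 50.]])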
def inference_single_image(self, box_cls, box_delta, box_center, shifts, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains a
            tensor of size (H x W, K).
        box_delta (list[Tensor]): same shape as 'box_cls', except that K becomes 4.
        box_center (list[Tensor]): same shape as 'box_cls', except that K becomes 1.
        shifts (list[Tensor]): list of #feature levels. Each entry contains a tensor
            with all the shifts for that image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, box_ctr_i, shifts_i in zip(
            box_cls, box_delta, box_center, shifts):
        # (HxWxK,)
        box_cls_i = box_cls_i.flatten().sigmoid_()

        # Keep top-k scoring indices only.
        num_topk = min(self.topk_candidates, box_reg_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        shift_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[shift_idxs]
        shifts_i = shifts_i[shift_idxs]
        # predict boxes
        predicted_boxes = self.shift2box_transform.apply_deltas(box_reg_i, shifts_i)

        box_ctr_i = box_ctr_i.flatten().sigmoid_()[shift_idxs]
        predicted_prob = torch.sqrt(predicted_prob * box_ctr_i)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
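# The `//` and `%` decomposition above recovers the spatial location and the class
# from an index into the flattened (HxW, K) score map. A minimal self-contained
# sketch of that step (made-up sizes and threshold):

import torch

num_classes = 3                            # K
box_cls = torch.tensor([[0.1, 0.9, 0.2],
                        [0.8, 0.3, 0.7]])  # (HxW, K) scores for 2 locations
flat = box_cls.flatten()                   # length HxW * K

predicted_prob, topk_idxs = flat.sort(descending=True)
num_topk = 3
predicted_prob, topk_idxs = predicted_prob[:num_topk], topk_idxs[:num_topk]

keep = predicted_prob > 0.5                # score threshold (made up)
predicted_prob, topk_idxs = predicted_prob[keep], topk_idxs[keep]

shift_idxs = topk_idxs // num_classes      # which location each score came from
classes_idxs = topk_idxs % num_classes     # which class each score belongs to
print(shift_idxs, classes_idxs)            # tensor([0, 1, 1]) tensor([1, 0, 2])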
def inference_single_image(self, pred_logits, pred_deltas, pred_masks, anchors,
                           indexes, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        pred_logits (list[Tensor]): list of #feature levels. Each entry contains a
            tensor of size (AxHxW, K).
        pred_deltas (list[Tensor]): same shape as 'pred_logits', except that K becomes 4.
        pred_masks (list[list[Tensor]]): list of #feature levels, each of which is a
            list of #anchors. Each entry contains a tensor of size (M_i*M_i, H, W).
            `None` if mask_on=False.
        anchors (list[Boxes]): list of #feature levels. Each entry contains a Boxes
            object with all the anchors for that image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    pred_logits = pred_logits.flatten().sigmoid_()
    # We get top locations across all levels to accelerate the inference speed,
    # which does not seem to affect the accuracy.
    # First select values above the threshold
    logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0]
    # Then get the top values
    num_topk = min(self.topk_candidates, logits_top_idxs.shape[0])
    pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort(descending=True)
    # Keep top k scoring values
    pred_prob = pred_prob[:num_topk]
    # Keep top k values
    top_idxs = logits_top_idxs[topk_idxs[:num_topk]]

    # class index
    cls_idxs = top_idxs % self.num_classes
    # HWA index
    top_idxs //= self.num_classes

    # predict boxes
    pred_boxes = self.box2box_transform.apply_deltas(pred_deltas[top_idxs],
                                                     anchors[top_idxs].tensor)
    # apply nms
    keep = generalized_batched_nms(pred_boxes, pred_prob, cls_idxs,
                                   self.nms_threshold, nms_type=self.nms_type)
    # pick the top ones
    keep = keep[:self.detections_im]

    results = Instances(image_size)
    results.pred_boxes = Boxes(pred_boxes[keep])
    results.scores = pred_prob[keep]
    results.pred_classes = cls_idxs[keep]

    # deal with masks
    result_masks, result_anchors = [], None
    if self.mask_on:
        # index and anchors, useful for masks
        top_indexes = indexes[top_idxs]
        top_anchors = anchors[top_idxs]
        result_indexes = top_indexes[keep]
        result_anchors = top_anchors[keep]
        # Get masks and do sigmoid
        for lvl, _, h, w, anc in result_indexes.tolist():
            cur_size = self.mask_sizes[anc] * (2**lvl if self.bipyramid_on else 1)
            result_masks.append(
                torch.sigmoid(pred_masks[lvl][anc][:, h, w].view(1, cur_size, cur_size)))

    return results, (result_masks, result_anchors)
def inference_single_image(self, cls_logits, pts_refine, pts_strides, points, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        cls_logits (list[Tensor]): list of #feature levels. Each entry contains a
            tensor of size (H x W, K).
        pts_refine (list[Tensor]): same shape as 'cls_logits', except that K becomes
            2 * num_points.
        pts_strides (list[Tensor]): list of #feature levels. Each entry contains a
            tensor of size (H x W, ).
        points (list[Tensor]): list of #feature levels. Each entry contains a tensor
            with all the points for that image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    assert len(cls_logits) == len(pts_refine) == len(pts_strides)

    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for cls_logits_i, pts_refine_i, points_i, pts_strides_i in zip(
            cls_logits, pts_refine, points, pts_strides):
        bbox_pos_center = torch.cat([points_i, points_i], dim=1)
        bbox_pred = self.pts_to_bbox(pts_refine_i)
        bbox_pred = bbox_pred * pts_strides_i.reshape(-1, 1) + bbox_pos_center
        bbox_pred[:, 0].clamp_(min=0, max=image_size[1])
        bbox_pred[:, 1].clamp_(min=0, max=image_size[0])
        bbox_pred[:, 2].clamp_(min=0, max=image_size[1])
        bbox_pred[:, 3].clamp_(min=0, max=image_size[0])

        # (HxWxK, )
        point_cls_i = cls_logits_i.flatten().sigmoid_()

        # keep top-k scoring indices only
        num_topk = min(self.topk_candidates, point_cls_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = point_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        point_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        predicted_boxes = bbox_pred[point_idxs]

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45, nms_type='normal'):
    """
    Postprocess the raw output of a YOLO model: convert boxes from center form to
    corner form, assign each detection its highest-scoring class, and perform
    class-wise non-maximum suppression.

    Args:
        prediction (torch.Tensor): tensor of shape :math:`(N, B, 5 + num_classes)`,
            where :math:`N` is the number of images in the batch and :math:`B` is
            the number of predicted boxes per image. The last axis consists of
            :math:`xc, yc, w, h, obj_conf` followed by the per-class scores, where
            `xc` and `yc` represent the center of a bounding box.
        num_classes (int): number of dataset classes.
        conf_thre (float): confidence threshold ranging from 0 to 1, which is
            defined in the config file.
        nms_thre (float): IoU threshold of non-maximum suppression, ranging from 0 to 1.
        nms_type (str): NMS variant passed to `generalized_batched_nms`.

    Returns:
        output (list[torch.Tensor or None]): one entry per image. Each entry is a
            tensor whose rows are ordered as
            (x1, y1, x2, y2, obj_conf, class_conf, class_pred), or None if no
            detection survives the thresholds.
    """
    # convert (xc, yc, w, h) to (x1, y1, x2, y2)
    box_corner = prediction.new(prediction.shape)
    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
    prediction[:, :, :4] = box_corner[:, :, :4]

    output = [None for _ in range(len(prediction))]
    for i, image_pred in enumerate(prediction):
        # If none are remaining => process next image
        if not image_pred.size(0):
            continue
        # Get score and class with highest confidence
        class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)

        conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze()
        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
        detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
        detections = detections[conf_mask]
        if not detections.size(0):
            continue

        confidence = detections[:, 4] * detections[:, 5]
        nms_out_index = generalized_batched_nms(detections[:, :4], confidence,
                                                detections[:, -1], nms_thre,
                                                nms_type=nms_type)
        detections[:, 4] = confidence / detections[:, 5]
        detections = detections[nms_out_index]

        # Iterate through all predicted classes
        unique_labels = detections[:, -1].unique()
        for c in unique_labels:
            # Get the detections with this particular class
            detections_class = detections[detections[:, -1] == c]
            if output[i] is None:
                output[i] = detections_class
            else:
                output[i] = torch.cat((output[i], detections_class))

    return output
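# The first two stages of postprocess (center-form to corner-form conversion, and
# combining objectness with the best class score before thresholding) can be checked
# in isolation. A self-contained sketch with dummy prediction rows (made-up numbers,
# num_classes = 3):

import torch

num_classes = 3
# One image, two predicted boxes: (xc, yc, w, h, obj_conf, cls_0, cls_1, cls_2)
prediction = torch.tensor([[[50., 50., 20., 10., 0.9, 0.1, 0.8, 0.1],
                            [10., 10.,  4.,  4., 0.2, 0.6, 0.2, 0.2]]])

# center form -> corner form, as done at the top of postprocess()
box_corner = prediction.clone()
box_corner[..., 0] = prediction[..., 0] - prediction[..., 2] / 2
box_corner[..., 1] = prediction[..., 1] - prediction[..., 3] / 2
box_corner[..., 2] = prediction[..., 0] + prediction[..., 2] / 2
box_corner[..., 3] = prediction[..., 1] + prediction[..., 3] / 2
print(box_corner[0, :, :4])  # [[40., 45., 60., 55.], [8., 8., 12., 12.]]

# best class and combined confidence used for thresholding
image_pred = prediction[0]
class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
combined = image_pred[:, 4] * class_conf.squeeze()
print(combined)  # tensor([0.7200, 0.1200]) -> only the first box passes conf_thre = 0.7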
def find_top_rpn_proposals(
    proposals,
    pred_objectness_logits,
    images,
    nms_thresh,
    nms_type,
    pre_nms_topk,
    post_nms_topk,
    min_box_side_len,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps if `training` is True,
    otherwise, return the highest `post_nms_topk` scoring proposals for each
    feature map.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape
            (N, Hi*Wi*A, 4). All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has
            shape (N, Hi*Wi*A).
        images (ImageList): Input images as an :class:`ImageList`.
        nms_thresh (float): IoU threshold to use for NMS.
        nms_type (str): NMS variant passed to `generalized_batched_nms`.
        pre_nms_topk (int): number of top k scoring proposals to keep before
            applying NMS. When RPN is run on multiple feature maps (as in FPN),
            this number is per feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after
            applying NMS. When RPN is run on multiple feature maps (as in FPN),
            this number is total, over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels
            (absolute units wrt input images).
        training (bool): True if proposals are to be used in training, otherwise
            False. This arg exists only to support a legacy bug; look for the
            "NB: Legacy bug ..." comment.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i.
    """
    image_sizes = images.image_sizes  # in (h, w) order
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select top-k anchors for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, proposals_i, logits_i in zip(itertools.count(), proposals,
                                               pred_objectness_logits):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i, ), level_id, dtype=torch.int64, device=device))

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = Boxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_side_len)
        lvl = level_ids
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], level_ids[keep]

        keep = generalized_batched_nms(boxes.tensor, scores_per_img, lvl,
                                       nms_thresh, nms_type=nms_type)
        # In Detectron1, there was a different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the
        # training batch. During testing, it is over the proposals for each image
        # separately. As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
        # This bug is addressed in cvpods to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
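# Step 1 above selects the pre-NMS top-k per level with `sort` and gathers the
# matching proposals via advanced indexing. A minimal standalone version of that
# gather, with made-up sizes:

import torch

num_images, Hi_Wi_A, pre_nms_topk = 2, 5, 3
logits_i = torch.randn(num_images, Hi_Wi_A)        # (N, Hi*Wi*A)
proposals_i = torch.randn(num_images, Hi_Wi_A, 4)  # (N, Hi*Wi*A, 4)
batch_idx = torch.arange(num_images)

num_proposals_i = min(pre_nms_topk, Hi_Wi_A)
logits_i, idx = logits_i.sort(descending=True, dim=1)
topk_scores_i = logits_i[batch_idx, :num_proposals_i]  # (N, topk)
topk_idx = idx[batch_idx, :num_proposals_i]            # (N, topk)

# batch_idx[:, None] broadcasts against topk_idx, so proposal j of image n is
# gathered for every (n, j) pair -> result has shape (N, topk, 4)
topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]
print(topk_scores_i.shape, topk_proposals_i.shape)  # torch.Size([2, 3]) torch.Size([2, 3, 4])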
def inference_single_image(self, box_cls, box_delta, box_center, box_param, shifts,
                           image_size, fpn_levels, img_id):
    """
    Single-image inference. Same as the plain detector's `inference_single_image`,
    except that per-location instance parameters, shifts, FPN levels, and the image
    index are attached to the returned Instances.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []
    box_params_all = []
    shifts_all = []
    fpn_levels_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, box_ctr_i, box_param_i, shifts_i, fpn_level_i in zip(
            box_cls, box_delta, box_center, box_param, shifts, fpn_levels):
        box_cls_i = box_cls_i.flatten().sigmoid_()
        if self.thresh_with_centerness:
            box_ctr_i = box_ctr_i.expand((-1, self.num_classes)).flatten().sigmoid()
            box_cls_i = box_cls_i * box_ctr_i

        # Keep top-k scoring indices only.
        num_topk = min(self.topk_candidates, box_reg_i.shape[0])
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score (after topk)
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        shift_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[shift_idxs]
        shifts_i = shifts_i[shift_idxs]
        fpn_level_i = fpn_level_i[shift_idxs]
        # predict boxes
        predicted_boxes = self.shift2box_transform.apply_deltas(box_reg_i, shifts_i)

        if not self.thresh_with_centerness:
            box_ctr_i = box_ctr_i.flatten().sigmoid_()[shift_idxs]
            predicted_prob = predicted_prob * box_ctr_i

        # instance conv params for the predicted boxes
        box_param = box_param_i[shift_idxs]

        boxes_all.append(predicted_boxes)
        scores_all.append(torch.sqrt(predicted_prob))
        class_idxs_all.append(classes_idxs)
        box_params_all.append(box_param)
        shifts_all.append(shifts_i)
        fpn_levels_all.append(fpn_level_i)

    boxes_all, scores_all, class_idxs_all, box_params_all, shifts_all, fpn_levels_all = [
        cat(x) for x in [
            boxes_all, scores_all, class_idxs_all, box_params_all, shifts_all,
            fpn_levels_all
        ]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    im_inds = scores_all.new_ones(len(scores_all), dtype=torch.long) * img_id

    proposals_i = Instances(image_size)
    proposals_i.pred_boxes = Boxes(boxes_all[keep])
    proposals_i.scores = scores_all[keep]
    proposals_i.pred_classes = class_idxs_all[keep]
    proposals_i.inst_parmas = box_params_all[keep]
    proposals_i.fpn_levels = fpn_levels_all[keep]
    proposals_i.shifts = shifts_all[keep]
    proposals_i.im_inds = im_inds[keep]

    return proposals_i