def test_rroi_heads(self):
    torch.manual_seed(121)
    cfg = RCNNConfig()
    cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator"
    # PROPOSAL_GENERATOR: "RRPN"
    # ROI_HEADS: "RROIHeads"
    # ROI_BOX_HEAD.NAME: "FastRCNNConvFCHead"

    def build_box_head(cfg, input_shape):
        return FastRCNNConvFCHead(cfg, input_shape)

    cfg.build_box_head = build_box_head
    cfg.MODEL.RESNETS.DEPTH = 50
    cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2
    cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1)
    cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead"
    cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignRotated"
    cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5, 1)
    backbone = build_backbone(cfg)
    num_images = 2
    images_tensor = torch.rand(num_images, 20, 30)
    image_sizes = [(10, 10), (20, 30)]
    images = ImageList(images_tensor, image_sizes)
    num_channels = 1024
    features = {"res4": torch.rand(num_images, num_channels, 1, 2)}

    image_shape = (15, 15)
    gt_boxes0 = torch.tensor([[2, 2, 2, 2, 30], [4, 4, 4, 4, 0]], dtype=torch.float32)
    gt_instance0 = Instances(image_shape)
    gt_instance0.gt_boxes = RotatedBoxes(gt_boxes0)
    gt_instance0.gt_classes = torch.tensor([2, 1])
    gt_boxes1 = torch.tensor([[1.5, 5.5, 1, 3, 0], [8.5, 4, 3, 2, -50]], dtype=torch.float32)
    gt_instance1 = Instances(image_shape)
    gt_instance1.gt_boxes = RotatedBoxes(gt_boxes1)
    gt_instance1.gt_classes = torch.tensor([1, 2])
    gt_instances = [gt_instance0, gt_instance1]

    # currently using DefaultAnchorGenerator in RRPN
    proposal_generator = RRPN(cfg, backbone.output_shape())
    roi_heads = RROIHeads(cfg, backbone.output_shape())

    with EventStorage():  # capture events in a new storage to discard them
        proposals, proposal_losses = proposal_generator(images, features, gt_instances)
        _, detector_losses = roi_heads(images, features, proposals, gt_instances)

    expected_losses = {
        "loss_cls": torch.tensor(4.381618499755859),
        "loss_box_reg": torch.tensor(0.0011829272843897343),
    }
    for name in expected_losses.keys():
        err_msg = "detector_losses[{}] = {}, expected losses = {}".format(
            name, detector_losses[name], expected_losses[name]
        )
        self.assertTrue(torch.allclose(detector_losses[name], expected_losses[name]), err_msg)
def inference(self, box_cls, box_pred, image_sizes):
    """
    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_proposals, K).
            The tensor predicts the classification probability for each proposal.
        box_pred (Tensor): tensor of shape (batch_size, num_proposals, 4).
            The tensor predicts 4-vector (x, y, w, h) box regression values for
            every proposal.
        image_sizes (List[torch.Size]): the input image sizes

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(box_cls) == len(image_sizes)

    results = []
    if self.use_focal:
        scores = torch.sigmoid(box_cls)
        labels = torch.arange(self.num_classes, device=self.device). \
            unsqueeze(0).repeat(self.num_proposals, 1).flatten(0, 1)

        for i, (scores_per_image, box_pred_per_image, image_size) in enumerate(
                zip(scores, box_pred, image_sizes)):
            result = Instances(image_size)
            scores_per_image, topk_indices = scores_per_image.flatten(
                0, 1).topk(self.num_proposals, sorted=False)
            labels_per_image = labels[topk_indices]
            box_pred_per_image = box_pred_per_image.view(-1, 1, 4).repeat(
                1, self.num_classes, 1).view(-1, 4)
            box_pred_per_image = box_pred_per_image[topk_indices]

            result.pred_boxes = Boxes(box_pred_per_image)
            result.scores = scores_per_image
            result.pred_classes = labels_per_image
            results.append(result)
    else:
        scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)

        for i, (scores_per_image, labels_per_image, box_pred_per_image, image_size) \
                in enumerate(zip(scores, labels, box_pred, image_sizes)):
            result = Instances(image_size)
            result.pred_boxes = Boxes(box_pred_per_image)
            result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0])
            result.scores = scores_per_image
            result.pred_classes = labels_per_image
            results.append(result)

    return results
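# A hedged, self-contained sketch (plain PyTorch, no model required) of the index
# bookkeeping in the focal branch above: flattening an (N, K) score grid and taking
# top-k yields flat indices whose class id is given by the precomputed repeated
# `labels` vector, and whose proposal id is the integer quotient by K.
import torch

num_proposals, num_classes = 3, 4
scores = torch.rand(num_proposals, num_classes)
labels = torch.arange(num_classes).unsqueeze(0).repeat(num_proposals, 1).flatten(0, 1)

topk_scores, topk_indices = scores.flatten(0, 1).topk(num_proposals, sorted=False)
topk_labels = labels[topk_indices]        # class id of each kept prediction
topk_rows = topk_indices // num_classes   # proposal id of each kept prediction
assert torch.equal(topk_labels, topk_indices % num_classes)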
def annotations_to_instances_rotated(annos, image_size): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. Compared to `annotations_to_instances`, this function is for rotated boxes only Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: Containing fields "gt_boxes", "gt_classes", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = [obj["bbox"] for obj in annos] target = Instances(image_size) boxes = target.gt_boxes = RotatedBoxes(boxes) boxes.clip(image_size) classes = [obj["category_id"] for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes return target
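# Hedged usage sketch for `annotations_to_instances_rotated`. The (cx, cy, w, h, angle)
# box convention and the `cvpods.structures` import path are assumptions.
import torch
from cvpods.structures import Instances, RotatedBoxes  # assumed import path

annos = [
    {"bbox": [10.0, 10.0, 8.0, 4.0, 30.0], "category_id": 0},
    {"bbox": [40.0, 25.0, 6.0, 6.0, -15.0], "category_id": 2},
]
target = annotations_to_instances_rotated(annos, image_size=(64, 64))
assert target.gt_boxes.tensor.shape == (2, 5)
assert target.gt_classes.tolist() == [0, 2]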
def create_instances(predictions, image_size): ret = Instances(image_size) score = np.asarray([x["score"] for x in predictions]) chosen = (score > args.conf_threshold).nonzero()[0] score = score[chosen] bbox = np.asarray([predictions[i]["bbox"] for i in chosen]) if score.shape[0] == 0: bbox = np.zeros((0, 4)) else: bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) labels = np.asarray( [dataset_id_map(predictions[i]["category_id"]) for i in chosen]) ret.scores = score ret.pred_boxes = Boxes(bbox) ret.pred_classes = labels try: ret.pred_masks = [predictions[i]["segmentation"] for i in chosen] except KeyError: pass return ret
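# The XYWH_ABS -> XYXY_ABS conversion used in `create_instances` simply adds the
# width and height to the top-left corner; a quick check with a cvpods-style
# BoxMode (the import path is an assumption).
import numpy as np
from cvpods.structures import BoxMode  # assumed import path

xywh = np.array([[10.0, 20.0, 30.0, 40.0]])
xyxy = BoxMode.convert(xywh, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
assert np.allclose(xyxy, [[10.0, 20.0, 40.0, 60.0]])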
def inference(self, images):
    r"""
    Args:
        images (ImageList): ImageList in cvpods.structures.
    """
    n, c, h, w = images.tensor.shape
    # (x | 31) + 1 rounds up to the next multiple of 32 (strictly greater than x)
    new_h, new_w = (h | 31) + 1, (w | 31) + 1
    center_wh = np.array([w // 2, h // 2], dtype=np.float32)
    size_wh = np.array([new_w, new_h], dtype=np.float32)
    down_scale = self.cfg.MODEL.CENTERNET.DOWN_SCALE
    img_info = dict(center=center_wh, size=size_wh,
                    height=new_h // down_scale,
                    width=new_w // down_scale)

    pad_value = [-x / y for x, y in zip(self.mean, self.std)]
    aligned_img = torch.Tensor(pad_value).reshape(
        (1, -1, 1, 1)).expand(n, c, new_h, new_w)
    aligned_img = aligned_img.to(images.tensor.device)

    pad_w, pad_h = math.ceil((new_w - w) / 2), math.ceil((new_h - h) / 2)
    aligned_img[..., pad_h:h + pad_h, pad_w:w + pad_w] = images.tensor

    features = self.backbone(aligned_img)
    up_fmap = self.upsample(features["res5"])
    pred_dict = self.head(up_fmap)
    results = self.decode_prediction(pred_dict, img_info)

    ori_w, ori_h = img_info['center'] * 2
    det_instance = Instances((int(ori_h), int(ori_w)), **results)

    return [{"instances": det_instance}]
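# Quick standalone check of the `(x | 31) + 1` padding trick used above: OR-ing with
# 31 sets the five lowest bits, so adding 1 always lands on a multiple of 32 that is
# strictly greater than x.
for x in (1, 31, 32, 100, 511):
    padded = (x | 31) + 1
    assert padded % 32 == 0 and padded > x
    print(x, "->", padded)  # 1 -> 32, 31 -> 32, 32 -> 64, 100 -> 128, 511 -> 512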
def inference_single_image(self, conf_pred_per_image, loc_pred_per_image, default_boxes, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Args:
        conf_pred_per_image (list[Tensor]): list of #feature levels. Each entry
            contains tensor of size [Hi x Wi x D, C].
        loc_pred_per_image (list[Tensor]): same shape as 'conf_pred_per_image' except
            that C becomes 4.
        default_boxes (list['Boxes']): a list of 'Boxes' elements. The Boxes contains
            default boxes of one image on the specific feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    # predict confidence
    conf_pred = torch.cat(conf_pred_per_image, dim=0)  # [R, C]
    conf_pred = conf_pred.softmax(dim=1)

    # predict boxes
    loc_pred = torch.cat(loc_pred_per_image, dim=0)  # [R, 4]
    default_boxes = Boxes.cat(default_boxes)  # [R, 4]
    boxes_pred = self.box2box_transform.apply_deltas(
        loc_pred, default_boxes.tensor)

    num_boxes, num_classes = conf_pred.shape
    boxes_pred = boxes_pred.view(num_boxes, 1, 4).expand(
        num_boxes, num_classes, 4)  # [R, C, 4]
    labels = torch.arange(num_classes, device=self.device)  # [0, ..., C-1]
    labels = labels.view(1, num_classes).expand_as(conf_pred)  # [R, C]

    # remove predictions with the background label
    boxes_pred = boxes_pred[:, :-1]
    conf_pred = conf_pred[:, :-1]
    labels = labels[:, :-1]

    # batch everything, by making every class prediction be a separate instance
    boxes_pred = boxes_pred.reshape(-1, 4)
    conf_pred = conf_pred.reshape(-1)
    labels = labels.reshape(-1)

    # remove low scoring boxes
    indices = torch.nonzero(conf_pred > self.score_threshold, as_tuple=False).squeeze(1)
    boxes_pred, conf_pred, labels = boxes_pred[indices], conf_pred[indices], labels[indices]

    keep = generalized_batched_nms(boxes_pred, conf_pred, labels,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_pred[keep])
    result.scores = conf_pred[keep]
    result.pred_classes = labels[keep]
    return result
def test_roi_heads(self): torch.manual_seed(121) cfg = RCNNConfig() # PROPOSAL_GENERATOR: "RPN" # ROI_HEADS: "StandardROIHeads" # ROI_BOX_HEAD: "FastRCNNConvFCHead" cfg.MODEL.RESNETS.DEPTH = 50 cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2 cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2" cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5) def build_box_head(cfg, input_shape): return FastRCNNConvFCHead(cfg, input_shape) cfg.build_box_head = build_box_head backbone = build_backbone(cfg) num_images = 2 images_tensor = torch.rand(num_images, 20, 30) image_sizes = [(10, 10), (20, 30)] images = ImageList(images_tensor, image_sizes) num_channels = 1024 features = {"res4": torch.rand(num_images, num_channels, 1, 2)} image_shape = (15, 15) gt_boxes0 = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) gt_instance0 = Instances(image_shape) gt_instance0.gt_boxes = Boxes(gt_boxes0) gt_instance0.gt_classes = torch.tensor([2, 1]) gt_boxes1 = torch.tensor([[1, 5, 2, 8], [7, 3, 10, 5]], dtype=torch.float32) gt_instance1 = Instances(image_shape) gt_instance1.gt_boxes = Boxes(gt_boxes1) gt_instance1.gt_classes = torch.tensor([1, 2]) gt_instances = [gt_instance0, gt_instance1] proposal_generator = RPN(cfg, backbone.output_shape()) roi_heads = StandardROIHeads(cfg, backbone.output_shape()) with EventStorage(): # capture events in a new storage to discard them proposals, proposal_losses = proposal_generator(images, features, gt_instances) _, detector_losses = roi_heads(images, features, proposals, gt_instances) expected_losses = { "loss_cls": torch.tensor(4.4236516953), "loss_box_reg": torch.tensor(0.0091214813), } for name in expected_losses.keys(): self.assertTrue(torch.allclose(detector_losses[name], expected_losses[name]))
def set_task_related():
    """
    Set some task-related info and simplification helpers; keeping them here
    is more efficient. Everything goes under the root of `batched_inputs`.
    """
    gt = dd['annotations'][0]
    dataset_dict['gt_ins'] = Instances(
        (224, 224),
        gt_boxes=Boxes([gt['bbox'].tolist()]),
        gt_classes=torch.IntTensor([1]))
    props = [item['bbox'] for item in dd['annotations'][1:]]
    feats = [item['feat'] for item in dd['annotations'][1:]]
    classes = [item['class'] for item in dd['annotations'][1:]]
    dd['props_ins'] = Instances(
        (224, 224),
        proposal_boxes=Boxes(props),
        features=torch.from_numpy(np.array(feats)),
        classes=torch.from_numpy(np.array(classes)))
def _inference_one_image(self, inputs): augmented_inputs = self.tta_mapper(inputs) assert len({x["file_name"] for x in augmented_inputs}) == 1, "inference different images" heights = [k["height"] for k in augmented_inputs] widths = [k["width"] for k in augmented_inputs] assert ( len(set(heights)) == 1 and len(set(widths)) == 1 ), "Augmented version of the inputs should have the same original resolution!" height = heights[0] width = widths[0] # 1. Detect boxes from all augmented versions # TODO wangfeng02: use box structures instead of boxes, scores and classes all_boxes = [] all_scores = [] all_classes = [] factors = 2 if self.tta_mapper.flip else 1 if self.enable_scale_filter: assert len(augmented_inputs) == len(self.scale_ranges) * factors for i, single_input in enumerate(augmented_inputs): do_hflip = single_input.pop("horiz_flip", False) # 1.1: forward with single augmented image output = self.model._inference_for_ms_test([single_input]) # 1.2: union the results pred_boxes = output.get("pred_boxes").tensor if do_hflip: pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]] pred_scores = output.get("scores") pred_classes = output.get("pred_classes") if self.enable_scale_filter: keep = filter_boxes(pred_boxes, *self.scale_ranges[i // factors]) pred_boxes = pred_boxes[keep] pred_scores = pred_scores[keep] pred_classes = pred_classes[keep] all_boxes.append(pred_boxes) all_scores.append(pred_scores) all_classes.append(pred_classes) boxes_all = torch.cat(all_boxes, dim=0) scores_all = torch.cat(all_scores, dim=0) class_idxs_all = torch.cat(all_classes, dim=0) boxes_all, scores_all, class_idxs_all = merge_result_from_multi_scales( boxes_all, scores_all, class_idxs_all, nms_type="soft_vote", vote_thresh=0.65, max_detection=self.max_detection ) result = Instances((height, width)) result.pred_boxes = Boxes(boxes_all) result.scores = scores_all result.pred_classes = class_idxs_all return {"instances": result}
def _postprocess(results, result_mask_info, output_height, output_width, mask_threshold=0.5):
    """
    Post-process the output boxes for TensorMask. The input images are often resized
    when entering an object detector. As a result, we often need the outputs of the
    detector in a different resolution from its inputs.

    This function will postprocess the raw outputs of TensorMask to produce outputs
    according to the desired output resolution.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place. Note that it does not contain the
            field `pred_masks`, which is provided by another input `result_mask_info`.
        result_mask_info (tuple(list[Tensor], Boxes)): a pair of mask-related results.
            The first item is a list of #detection tensors, each being a predicted mask.
            The second item is the anchors corresponding to the predicted masks.
        output_height, output_width: the desired output resolution.

    Returns:
        Instances: the postprocessed output from the model, based on the output resolution
    """
    scale_x, scale_y = (
        output_width / results.image_size[1],
        output_height / results.image_size[0],
    )
    results = Instances((output_height, output_width), **results.get_fields())

    output_boxes = results.pred_boxes
    output_boxes.tensor[:, 0::2] *= scale_x
    output_boxes.tensor[:, 1::2] *= scale_y
    output_boxes.clip(results.image_size)

    inds_nonempty = output_boxes.nonempty()
    results = results[inds_nonempty]
    result_masks, result_anchors = result_mask_info
    if result_masks:
        result_anchors.tensor[:, 0::2] *= scale_x
        result_anchors.tensor[:, 1::2] *= scale_y
        result_masks = [
            x for (i, x) in zip(inds_nonempty.tolist(), result_masks) if i
        ]
        results.pred_masks = _paste_mask_lists_in_image(
            result_masks,
            result_anchors[inds_nonempty],
            results.image_size,
            threshold=mask_threshold,
        )
    return results
def test_int_indexing(self): attr1 = torch.tensor([[0.0, 0.0, 1.0], [0.0, 0.0, 0.5], [0.0, 0.0, 1.0], [0.0, 0.5, 0.5]]) attr2 = torch.tensor([0.1, 0.2, 0.3, 0.4]) instances = Instances((100, 100)) instances.attr1 = attr1 instances.attr2 = attr2 for i in range(-len(instances), len(instances)): inst = instances[i] self.assertEqual((inst.attr1 == attr1[i]).all(), True) self.assertEqual((inst.attr2 == attr2[i]).all(), True) self.assertRaises(IndexError, lambda: instances[len(instances)]) self.assertRaises(IndexError, lambda: instances[-len(instances) - 1])
def detector_postprocess(results, output_height, output_width, mask_threshold=0.5): """ Resize the output instances. The input images are often resized when entering an object detector. As a result, we often need the outputs of the detector in a different resolution from its inputs. This function will resize the raw outputs of an R-CNN detector to produce outputs according to the desired output resolution. Args: results (Instances): the raw outputs from the detector. `results.image_size` contains the input image resolution the detector sees. This object might be modified in-place. output_height, output_width: the desired output resolution. Returns: Instances: the resized output from the model, based on the output resolution """ scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0]) results = Instances((output_height, output_width), **results.get_fields()) if results.has("pred_boxes"): output_boxes = results.pred_boxes elif results.has("proposal_boxes"): output_boxes = results.proposal_boxes output_boxes.scale(scale_x, scale_y) output_boxes.clip(results.image_size) results = results[output_boxes.nonempty()] if results.has("pred_masks"): results.pred_masks = paste_masks_in_image( results.pred_masks[:, 0, :, :], # N, 1, M, M results.pred_boxes, results.image_size, threshold=mask_threshold, ) if results.has("pred_keypoints"): results.pred_keypoints[:, :, 0] *= scale_x results.pred_keypoints[:, :, 1] *= scale_y return results
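# Minimal usage sketch for `detector_postprocess` with synthetic predictions; the
# `cvpods.structures` import path is an assumption.
import torch
from cvpods.structures import Boxes, Instances  # assumed import path

raw = Instances((480, 640))  # the resolution the detector actually saw
raw.pred_boxes = Boxes(torch.tensor([[10.0, 10.0, 100.0, 80.0]]))
raw.scores = torch.tensor([0.9])
raw.pred_classes = torch.tensor([3])

out = detector_postprocess(raw, output_height=960, output_width=1280)
# boxes are rescaled by (1280 / 640, 960 / 480) = (2.0, 2.0)
assert torch.allclose(out.pred_boxes.tensor, torch.tensor([[20.0, 20.0, 200.0, 160.0]]))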
def fast_rcnn_inference_single_image(boxes, scores, image_shape, score_thresh, nms_thresh, nms_type, topk_per_image): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Args: Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes per image. Returns: Same as `fast_rcnn_inference`, but for only one image. """ scores = scores[:, :-1] num_bbox_reg_classes = boxes.shape[1] // 4 # Convert to Boxes to use the `clip` function ... boxes = Boxes(boxes.reshape(-1, 4)) boxes.clip(image_shape) boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 # Filter results based on detection scores filter_mask = scores > score_thresh # R x K # R' x 2. First column contains indices of the R predictions; # Second column contains indices of classes. filter_inds = filter_mask.nonzero(as_tuple=False) if num_bbox_reg_classes == 1: boxes = boxes[filter_inds[:, 0], 0] else: boxes = boxes[filter_mask] scores = scores[filter_mask] # Apply per-class NMS keep = generalized_batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh, nms_type=nms_type) if topk_per_image >= 0: keep = keep[:topk_per_image] boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep] result = Instances(image_shape) result.pred_boxes = Boxes(boxes) result.scores = scores result.pred_classes = filter_inds[:, 1] return result, filter_inds[:, 0]
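# Hedged sketch of calling `fast_rcnn_inference_single_image` directly on random
# inputs; the "normal" nms_type string for `generalized_batched_nms` is an assumption.
import torch

torch.manual_seed(0)
num_preds, num_classes = 8, 5
x1y1 = torch.rand(num_preds, num_classes, 2) * 30
x2y2 = x1y1 + torch.rand(num_preds, num_classes, 2) * 20 + 1.0
boxes = torch.cat([x1y1, x2y2], dim=-1).reshape(num_preds, num_classes * 4)
scores = torch.rand(num_preds, num_classes + 1)  # last column plays the background role
scores = scores / scores.sum(dim=1, keepdim=True)

result, kept_rows = fast_rcnn_inference_single_image(
    boxes, scores, image_shape=(60, 60),
    score_thresh=0.05, nms_thresh=0.5, nms_type="normal", topk_per_image=10,
)
print(len(result), result.pred_classes)  # at most 10 surviving detections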
def _inference_one_image(self, inputs): augmented_inputs = self.tta_mapper(inputs) assert len({x["file_name"] for x in augmented_inputs }) == 1, "inference different images" heights = [k["height"] for k in augmented_inputs] widths = [k["width"] for k in augmented_inputs] assert ( len(set(heights)) == 1 and len(set(widths)) == 1 ), "Augmented version of the inputs should have the same original resolution!" height = heights[0] width = widths[0] # 1. Detect boxes from all augmented versions all_boxes = [] all_scores = [] all_classes = [] for single_input in augmented_inputs: do_hflip = single_input.pop("horiz_flip", False) # 1.1: forward with single augmented image output = self.model._inference_for_ms_test([single_input]) # 1.2: union the results pred_boxes = output.get("pred_boxes").tensor if do_hflip: pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]] all_boxes.append(pred_boxes) all_scores.append(output.get("scores")) all_classes.append(output.get("pred_classes")) boxes_all = torch.cat(all_boxes, dim=0) scores_all = torch.cat(all_scores, dim=0) class_idxs_all = torch.cat(all_classes, dim=0) keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all, self.model.nms_threshold, nms_type=self.model.nms_type) keep = keep[:self.model.max_detections_per_image] result = Instances((height, width)) result.pred_boxes = Boxes(boxes_all[keep]) result.scores = scores_all[keep] result.pred_classes = class_idxs_all[keep] return {"instances": result}
def test_fast_rcnn_rotated(self): torch.manual_seed(132) cfg = RCNNConfig() cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5, 1) box2box_transform = Box2BoxTransformRotated(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS) box_head_output_size = 8 num_classes = 5 cls_agnostic_bbox_reg = False box_predictor = FastRCNNOutputLayers( box_head_output_size, num_classes, cls_agnostic_bbox_reg, box_dim=5 ) feature_pooled = torch.rand(2, box_head_output_size) pred_class_logits, pred_proposal_deltas = box_predictor(feature_pooled) image_shape = (10, 10) proposal_boxes = torch.tensor( [[2, 1.95, 2.4, 1.7, 0], [4.65, 5.25, 4.7, 5.5, 0]], dtype=torch.float32 ) gt_boxes = torch.tensor([[2, 2, 2, 2, 0], [4, 4, 4, 4, 0]], dtype=torch.float32) result = Instances(image_shape) result.proposal_boxes = RotatedBoxes(proposal_boxes) result.gt_boxes = RotatedBoxes(gt_boxes) result.gt_classes = torch.tensor([1, 2]) proposals = [] proposals.append(result) smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA outputs = RotatedFastRCNNOutputs( box2box_transform, pred_class_logits, pred_proposal_deltas, proposals, smooth_l1_beta ) with EventStorage(): # capture events in a new storage to discard them losses = outputs.losses() # Note: the expected losses are slightly different even if # the boxes are essentially the same as in the FastRCNNOutput test, because # bbox_pred in FastRCNNOutputLayers have different Linear layers/initialization # between the two cases. expected_losses = { "loss_cls": torch.tensor(1.7920907736), "loss_box_reg": torch.tensor(4.0410838127), } for name in expected_losses.keys(): assert torch.allclose(losses[name], expected_losses[name])
def to_d2_instances_list(instances_list):
    """
    Convert InstancesList to List[Instances]. The input `instances_list` can also be
    a List[Instances], in which case this method is a no-op.
    """
    if not isinstance(instances_list, InstancesList):
        assert all(isinstance(x, Instances) for x in instances_list)
        return instances_list

    ret = []
    for i, info in enumerate(instances_list.im_info):
        instances = Instances(
            torch.Size([int(info[0].item()), int(info[1].item())]))

        ids = instances_list.indices == i
        for k, v in instances_list.batch_extra_fields.items():
            if isinstance(v, torch.Tensor):
                instances.set(k, v[ids])
                continue
            elif isinstance(v, Boxes):
                instances.set(k, v[ids, -4:])
                continue

            target_type, tensor_source = v
            assert isinstance(tensor_source, torch.Tensor)
            assert tensor_source.shape[0] == instances_list.indices.shape[0]
            tensor_source = tensor_source[ids]
            if issubclass(target_type, Boxes):
                instances.set(k, Boxes(tensor_source[:, -4:]))
            elif issubclass(target_type, Keypoints):
                instances.set(k, Keypoints(tensor_source))
            elif issubclass(target_type, torch.Tensor):
                instances.set(k, tensor_source)
            else:
                raise ValueError(
                    "Can't handle target type: {}".format(target_type))

        ret.append(instances)
    return ret
def transform_proposals(dataset_dict, image_shape, transforms, min_box_side_len, proposal_topk):
    """
    Apply transformations to the proposals in dataset_dict, if any.

    Args:
        dataset_dict (dict): a dict read from the dataset, possibly contains fields
            "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode"
        image_shape (tuple): height, width
        transforms (TransformList): the transforms to apply to the proposals
        min_box_side_len (int): keep proposals with at least this size
        proposal_topk (int): only keep top-K scoring proposals

    The input dict is modified in-place, with the above-mentioned keys removed. A new
    key "proposals" will be added. Its value is an `Instances` object which contains
    the transformed proposals in its field "proposal_boxes" and "objectness_logits".
    """
    if "proposal_boxes" in dataset_dict:
        # Transform proposal boxes
        boxes = transforms.apply_box(
            BoxMode.convert(
                dataset_dict.pop("proposal_boxes"),
                dataset_dict.pop("proposal_bbox_mode"),
                BoxMode.XYXY_ABS,
            ))
        boxes = Boxes(boxes)
        objectness_logits = torch.as_tensor(
            dataset_dict.pop("proposal_objectness_logits").astype("float32"))

        boxes.clip(image_shape)
        keep = boxes.nonempty(threshold=min_box_side_len)
        boxes = boxes[keep]
        objectness_logits = objectness_logits[keep]

        proposals = Instances(image_shape)
        proposals.proposal_boxes = boxes[:proposal_topk]
        proposals.objectness_logits = objectness_logits[:proposal_topk]
        dataset_dict["proposals"] = proposals
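# Hedged usage sketch for `transform_proposals`; `NoOpTransform` and `TransformList`
# are taken from fvcore (an assumed import path), and `BoxMode` is the same enum the
# function itself uses.
import numpy as np
from fvcore.transforms.transform import NoOpTransform, TransformList  # assumed

dataset_dict = {
    "proposal_boxes": np.array([[5.0, 5.0, 20.0, 20.0], [0.0, 0.0, 2.0, 2.0]]),
    "proposal_bbox_mode": BoxMode.XYXY_ABS,
    "proposal_objectness_logits": np.array([2.0, -1.0]),
}
transform_proposals(dataset_dict, image_shape=(32, 32),
                    transforms=TransformList([NoOpTransform()]),
                    min_box_side_len=1, proposal_topk=100)
print(dataset_dict["proposals"])  # Instances with proposal_boxes / objectness_logits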
def _create_proposals_from_boxes(self, boxes, image_sizes): """ Args: boxes (list[Tensor]): per-image predicted boxes, each of shape Ri x 4 image_sizes (list[tuple]): list of image shapes in (h, w) Returns: list[Instances]: per-image proposals with the given boxes. """ # Just like RPN, the proposals should not have gradients boxes = [Boxes(b.detach()) for b in boxes] proposals = [] for boxes_per_image, image_size in zip(boxes, image_sizes): boxes_per_image.clip(image_size) if self.training: # do not filter empty boxes at inference time, # because the scores from each stage need to be aligned and added later boxes_per_image = boxes_per_image[boxes_per_image.nonempty()] prop = Instances(image_size) prop.proposal_boxes = boxes_per_image proposals.append(prop) return proposals
def test_fast_rcnn(self): torch.manual_seed(132) cfg = RCNNConfig() cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5) box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS) box_head_output_size = 8 num_classes = 5 cls_agnostic_bbox_reg = False box_predictor = FastRCNNOutputLayers( box_head_output_size, num_classes, cls_agnostic_bbox_reg, box_dim=4 ) feature_pooled = torch.rand(2, box_head_output_size) pred_class_logits, pred_proposal_deltas = box_predictor(feature_pooled) image_shape = (10, 10) proposal_boxes = torch.tensor([[0.8, 1.1, 3.2, 2.8], [2.3, 2.5, 7, 8]], dtype=torch.float32) gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) result = Instances(image_shape) result.proposal_boxes = Boxes(proposal_boxes) result.gt_boxes = Boxes(gt_boxes) result.gt_classes = torch.tensor([1, 2]) proposals = [] proposals.append(result) smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA outputs = FastRCNNOutputs( box2box_transform, pred_class_logits, pred_proposal_deltas, proposals, smooth_l1_beta ) with EventStorage(): # capture events in a new storage to discard them losses = outputs.losses() expected_losses = { "loss_cls": torch.tensor(1.7951188087), "loss_box_reg": torch.tensor(4.0357131958), } for name in expected_losses.keys(): assert torch.allclose(losses[name], expected_losses[name])
def postprocess(self, results, output_height, output_width, resized_in_h,
                resized_in_w, padded_im_h, padded_im_w):
    scale_x, scale_y = (output_width / resized_in_w,
                        output_height / resized_in_h)

    # gather detection results into an Instances object
    results = Instances((output_height, output_width), **results.get_fields())

    # scale detection boxes from the resized, padded image space to the source
    # image space, then clip
    output_boxes = results.pred_boxes
    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)
    # filter empty detections in the source image space
    results = results[output_boxes.nonempty()]

    if results.has("pred_global_logits"):
        mask_h, mask_w = results.pred_global_logits.shape[-2:]
        factor_h = padded_im_h // mask_h
        factor_w = padded_im_w // mask_w
        assert factor_h == factor_w
        factor = factor_h
        # upsample instance masks (aligned) to the resized, padded image shape
        pred_global_masks = aligned_bilinear(
            results.pred_global_logits.sigmoid(), factor)
        pred_global_masks = pred_global_masks[:, :, :resized_in_h, :resized_in_w]
        # scale masks from the resized image shape to the source image shape;
        # this is the inverse of an opencv/PIL interpolation, where
        # align_corners is False
        pred_global_masks = F.interpolate(pred_global_masks,
                                          size=(output_height, output_width),
                                          mode="bilinear",
                                          align_corners=False)
        pred_global_masks = pred_global_masks[:, 0, :, :]
        # filter out predicted masks with low confidence scores
        results.pred_masks = pred_global_masks > self.infer_mask_threshold

    return results
def add_ground_truth_to_proposals_single_image(gt_boxes, proposals): """ Augment `proposals` with ground-truth boxes from `gt_boxes`. Args: Same as `add_ground_truth_to_proposals`, but with gt_boxes and proposals per image. Returns: Same as `add_ground_truth_to_proposals`, but for only one image. """ device = proposals.objectness_logits.device # Concatenating gt_boxes with proposals requires them to have the same fields # Assign all ground-truth boxes an objectness logit corresponding to P(object) \approx 1. gt_logit_value = math.log((1.0 - 1e-10) / (1 - (1.0 - 1e-10))) gt_logits = gt_logit_value * torch.ones(len(gt_boxes), device=device) gt_proposal = Instances(proposals.image_size) gt_proposal.proposal_boxes = gt_boxes gt_proposal.objectness_logits = gt_logits new_proposals = Instances.cat([proposals, gt_proposal]) return new_proposals
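# Minimal sketch of augmenting proposals with ground truth; the `cvpods.structures`
# import path is an assumption.
import torch
from cvpods.structures import Boxes, Instances  # assumed import path

proposals = Instances((50, 50))
proposals.proposal_boxes = Boxes(torch.tensor([[0.0, 0.0, 10.0, 10.0]]))
proposals.objectness_logits = torch.tensor([1.5])

gt_boxes = Boxes(torch.tensor([[5.0, 5.0, 20.0, 20.0]]))
merged = add_ground_truth_to_proposals_single_image(gt_boxes, proposals)
assert len(merged) == 2
# the appended gt box carries logit(1 - 1e-10) ~= 23.0, i.e. P(object) ~= 1
print(merged.objectness_logits)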
def inference_single_image(self, box_cls, box_delta, box_center, shifts, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W, K)
        box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
        box_center (list[Tensor]): Same shape as 'box_cls' except that K becomes 1.
        shifts (list[Tensor]): list of #feature levels. Each entry contains
            a tensor, which contains all the shifts for that
            image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, box_ctr_i, shifts_i in zip(
            box_cls, box_delta, box_center, shifts):
        # (HxWxK,)
        box_cls_i = box_cls_i.flatten().sigmoid_()

        # Keep top k scoring indices only.
        num_topk = min(self.topk_candidates, box_reg_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        shift_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[shift_idxs]
        shifts_i = shifts_i[shift_idxs]
        # predict boxes
        predicted_boxes = self.shift2box_transform.apply_deltas(
            box_reg_i, shifts_i)

        box_ctr_i = box_ctr_i.flatten().sigmoid_()[shift_idxs]
        predicted_prob = torch.sqrt(predicted_prob * box_ctr_i)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances: Instances Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: dict[str: Tensor]: mapping from a named loss to a tensor storing the loss. Used during training only. """ images, labels = self.preprocess_image(batched_inputs, self.training) # batched_inputs[0]['image'] = images.tensor[0].cpu() * 255 # self.visualize_data(batched_inputs[0]) x = images.tensor img_size = x.shape[-2:] def _branch(_embedding, _in): for i, e in enumerate(_embedding): _in = e(_in) if i == 4: out_branch = _in return _in, out_branch # backbone # x2, x1, x0 = self.backbone(x) out_features = self.backbone(x) features = [out_features[f] for f in self.in_features] [x2, x1, x0] = features # yolo branch 0 out0, out0_branch = _branch(self.out0, x0) # yolo branch 1 x1_in = self.out1_cbl(out0_branch) x1_in = self.out1_upsample(x1_in) x1_in = torch.cat([x1_in, x1], 1) out1, out1_branch = _branch(self.out1, x1_in) # yolo branch 2 x2_in = self.out2_cbl(out1_branch) x2_in = self.out2_upsample(x2_in) x2_in = torch.cat([x2_in, x2], 1) out2, out2_branch = _branch(self.out2, x2_in) outputs = [out0, out1, out2] if self.training: losses = [ loss_evaluator(out, labels, img_size) for out, loss_evaluator in zip(outputs, self.loss_evaluators) ] keys = [ "loss_x", "loss_y", "loss_w", "loss_h", "loss_conf", "loss_cls" ] losses_dict = {} for key in keys: losses_dict[key] = sum([loss[key] for loss in losses]) return losses_dict else: predictions_list = [ loss_evaluator(out, labels, img_size) for out, loss_evaluator in zip(outputs, self.loss_evaluators) ] predictions = torch.cat(predictions_list, 1) detections = postprocess(predictions, self.num_classes, self.conf_threshold, self.nms_threshold, nms_type=self.nms_type) results = [] for idx, out in enumerate(detections): if out is None: out = x.new_zeros((0, 7)) # image_size = images.image_sizes[idx] image_size = img_size result = Instances(image_size) result.pred_boxes = Boxes(out[:, :4]) result.scores = out[:, 5] * out[:, 4] result.pred_classes = out[:, -1] results.append(result) processed_results = [] for results_per_image, input_per_image, image_size in zip( results, batched_inputs, images.image_sizes): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) processed_results.append({"instances": r}) return processed_results
def inference_single_image(self, pred_logits, pred_deltas, pred_masks, anchors, indexes, image_size): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Arguments: pred_logits (list[Tensor]): list of #feature levels. Each entry contains tensor of size (AxHxW, K) pred_deltas (list[Tensor]): Same shape as 'pred_logits' except that K becomes 4. pred_masks (list[list[Tensor]]): List of #feature levels, each is a list of #anchors. Each entry contains tensor of size (M_i*M_i, H, W). `None` if mask_on=False. anchors (list[Boxes]): list of #feature levels. Each entry contains a Boxes object, which contains all the anchors for that image in that feature level. image_size (tuple(H, W)): a tuple of the image height and width. Returns: Same as `inference`, but for only one image. """ pred_logits = pred_logits.flatten().sigmoid_() # We get top locations across all levels to accelerate the inference speed, # which does not seem to affect the accuracy. # First select values above the threshold logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0] # Then get the top values num_topk = min(self.topk_candidates, logits_top_idxs.shape[0]) pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort( descending=True) # Keep top k scoring values pred_prob = pred_prob[:num_topk] # Keep top k values top_idxs = logits_top_idxs[topk_idxs[:num_topk]] # class index cls_idxs = top_idxs % self.num_classes # HWA index top_idxs //= self.num_classes # predict boxes pred_boxes = self.box2box_transform.apply_deltas( pred_deltas[top_idxs], anchors[top_idxs].tensor) # apply nms keep = generalized_batched_nms(pred_boxes, pred_prob, cls_idxs, self.nms_threshold, nms_type=self.nms_type) # pick the top ones keep = keep[:self.detections_im] results = Instances(image_size) results.pred_boxes = Boxes(pred_boxes[keep]) results.scores = pred_prob[keep] results.pred_classes = cls_idxs[keep] # deal with masks result_masks, result_anchors = [], None if self.mask_on: # index and anchors, useful for masks top_indexes = indexes[top_idxs] top_anchors = anchors[top_idxs] result_indexes = top_indexes[keep] result_anchors = top_anchors[keep] # Get masks and do sigmoid for lvl, _, h, w, anc in result_indexes.tolist(): cur_size = self.mask_sizes[anc] * (2**lvl if self.bipyramid_on else 1) result_masks.append( torch.sigmoid(pred_masks[lvl][anc][:, h, w].view( 1, cur_size, cur_size))) return results, (result_masks, result_anchors)
def inference_single_image(self, box_cls, box_center, border_cls, border_delta,
                           bd_based_box, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W, K)
        box_center (list[Tensor]): Same shape as 'box_cls' except that K becomes 1.
        border_cls (list[Tensor]): Same shape as 'box_cls'; the border
            classification scores.
        border_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
        bd_based_box (list[Tensor]): list of #feature levels. Each entry contains
            the border-based boxes for that image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []
    border_bbox_std = bd_based_box[0].new_tensor(self.border_bbox_std)

    # Iterate over every feature level
    for box_cls_i, box_ctr_i, bd_box_cls_i, bd_box_reg_i, bd_based_box_i in zip(
            box_cls, box_center, border_cls, border_delta, bd_based_box):
        # (HxWxK,)
        box_cls_i = box_cls_i.sigmoid_()
        box_ctr_i = box_ctr_i.sigmoid_()
        bd_box_cls_i = bd_box_cls_i.sigmoid_()

        predicted_prob = (box_cls_i * box_ctr_i).sqrt()

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold

        predicted_prob = predicted_prob * bd_box_cls_i

        predicted_prob = predicted_prob[keep_idxs]
        # Keep top k scoring indices only.
        num_topk = min(self.topk_candidates, predicted_prob.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = predicted_prob.sort(descending=True)
        topk_idxs = topk_idxs[:num_topk]

        keep_idxs = keep_idxs.nonzero()
        keep_idxs = keep_idxs[topk_idxs]
        keep_box_idxs = keep_idxs[:, 0]
        classes_idxs = keep_idxs[:, 1]

        predicted_prob = predicted_prob[:num_topk]

        bd_box_reg_i = bd_box_reg_i[keep_box_idxs]
        bd_based_box_i = bd_based_box_i[keep_box_idxs]

        det_wh = (bd_based_box_i[..., 2:4] - bd_based_box_i[..., :2])
        det_wh = torch.cat([det_wh, det_wh], dim=1)
        predicted_boxes = bd_based_box_i + (bd_box_reg_i * border_bbox_std * det_wh)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob.sqrt())
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)

    boxes_all = boxes_all[keep]
    scores_all = scores_all[keep]
    class_idxs_all = class_idxs_all[keep]

    number_of_detections = len(keep)
    # Limit to max_per_image detections **over all classes**
    if number_of_detections > self.max_detections_per_image > 0:
        image_thresh, _ = torch.kthvalue(
            scores_all,
            number_of_detections - self.max_detections_per_image + 1)
        keep = scores_all >= image_thresh.item()
        keep = torch.nonzero(keep).squeeze(1)
        boxes_all = boxes_all[keep]
        scores_all = scores_all[keep]
        class_idxs_all = class_idxs_all[keep]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all)
    result.scores = scores_all
    result.pred_classes = class_idxs_all
    return result
def find_top_rpn_proposals( proposals, pred_objectness_logits, images, nms_thresh, pre_nms_topk, post_nms_topk, min_box_side_len, training, # pylint: disable=W0613 ): """ For each feature map, select the `pre_nms_topk` highest scoring proposals, apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk` highest scoring proposals among all the feature maps if `training` is True, otherwise, returns the highest `post_nms_topk` scoring proposals for each feature map. Args: proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4). All proposal predictions on the feature maps. pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A). images (ImageList): Input images as an :class:`ImageList`. nms_thresh (float): IoU threshold to use for NMS pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is per feature map. post_nms_topk (int): number of top k scoring proposals to keep after applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is total, over all feature maps. min_box_side_len (float): minimum proposal box side length in pixels (absolute units wrt input images). training (bool): True if proposals are to be used in training, otherwise False. This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..." comment. Returns: proposals (list[Instances]): list of N Instances. The i-th Instances stores post_nms_topk object proposals for image i. """ image_sizes = images.image_sizes # in (h, w) order num_images = len(image_sizes) device = proposals[0].device # 1. Select top-k anchor for every level and every image topk_scores = [] # #lvl Tensor, each of shape N x topk topk_proposals = [] level_ids = [] # #lvl Tensor, each of shape (topk,) batch_idx = torch.arange(num_images, device=device) for level_id, proposals_i, logits_i in zip(itertools.count(), proposals, pred_objectness_logits): Hi_Wi_A = logits_i.shape[1] num_proposals_i = min(pre_nms_topk, Hi_Wi_A) # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812) # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1) logits_i, idx = logits_i.sort(descending=True, dim=1) topk_scores_i = logits_i[batch_idx, :num_proposals_i] topk_idx = idx[batch_idx, :num_proposals_i] # each is N x topk topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 4 topk_proposals.append(topk_proposals_i) topk_scores.append(topk_scores_i) level_ids.append( torch.full((num_proposals_i, ), level_id, dtype=torch.int64, device=device)) # 2. Concat all levels together topk_scores = cat(topk_scores, dim=1) topk_proposals = cat(topk_proposals, dim=1) level_ids = cat(level_ids, dim=0) # 3. For each image, run a per-level NMS, and choose topk results. results = [] for n, image_size in enumerate(image_sizes): boxes = Boxes(topk_proposals[n]) scores_per_img = topk_scores[n] # boxes.clip(image_size) # filter empty boxes keep = boxes.nonempty(threshold=min_box_side_len) lvl = level_ids if keep.sum().item() != len(boxes): boxes, scores_per_img, lvl = boxes[keep], scores_per_img[ keep], level_ids[keep] keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh) # In Detectron1, there was different behavior during training vs. testing. # (https://github.com/facebookresearch/Detectron/issues/459) # During training, topk is over the proposals from *all* images in the training batch. 
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
        # This bug is addressed in cvpods to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)

    return results
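# Hedged single-level, single-image sketch for `find_top_rpn_proposals`; the
# `cvpods.structures` ImageList import path is an assumption.
import torch
from cvpods.structures import ImageList  # assumed import path

N, A = 1, 6  # one image, six anchors on one feature level
x1y1 = torch.rand(N, A, 2) * 20
proposals = [torch.cat([x1y1, x1y1 + 10.0], dim=-1)]  # list of (N, Hi*Wi*A, 4)
logits = [torch.randn(N, A)]                          # list of (N, Hi*Wi*A)
images = ImageList(torch.zeros(N, 3, 32, 32), [(32, 32)])

out = find_top_rpn_proposals(proposals, logits, images,
                             nms_thresh=0.7, pre_nms_topk=6, post_nms_topk=3,
                             min_box_side_len=0, training=False)
print(out[0].proposal_boxes, out[0].objectness_logits)  # at most 3 proposals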
def inference_single_image(self, cate_preds, seg_preds, featmap_size, img_shape, ori_shape): """ Args: cate_preds, seg_preds: see: method: `inference`. featmap_size (list[tuple]): feature map size per level. img_shape (tuple): the size of the image fed into the model (height and width). ori_shape (tuple): original image shape (height and width). Returns: result (Instances): predicted results of single image after post-processing. """ assert len(cate_preds) == len(seg_preds) result = Instances(ori_shape) # overall info. h, w = img_shape upsampled_size_out = (featmap_size[0] * 4, featmap_size[1] * 4) # process. inds = (cate_preds > self.score_threshold) # category scores. cate_scores = cate_preds[inds] if len(cate_scores) == 0: return result # category labels. inds = inds.nonzero(as_tuple=False) cate_labels = inds[:, 1] # strides. size_trans = cate_labels.new_tensor(self.seg_num_grids).pow(2).cumsum( 0) # [1600, 2896, 3472, 3728, 3872] strides = cate_scores.new_ones(size_trans[-1]) n_stage = len(self.seg_num_grids) strides[:size_trans[0]] *= self.feature_strides[0] for ind_ in range(1, n_stage): strides[size_trans[ind_ - 1]:size_trans[ind_]] *= self.feature_strides[ ind_] strides = strides[inds[:, 0]] # masks. seg_preds = seg_preds[inds[:, 0]] seg_masks = seg_preds > self.mask_threshold sum_masks = seg_masks.sum((1, 2)).float() # filter. keep = sum_masks > strides if keep.sum() == 0: return result seg_masks = seg_masks[keep, ...] seg_preds = seg_preds[keep, ...] sum_masks = sum_masks[keep] cate_scores = cate_scores[keep] cate_labels = cate_labels[keep] # mask scoring. seg_scores = (seg_preds * seg_masks.float()).sum((1, 2)) / sum_masks cate_scores *= seg_scores # sort and keep top nms_pre sort_inds = torch.argsort(cate_scores, descending=True) if len(sort_inds) > self.nms_per_image: sort_inds = sort_inds[:self.nms_per_image] seg_masks = seg_masks[sort_inds, :, :] seg_preds = seg_preds[sort_inds, :, :] sum_masks = sum_masks[sort_inds] cate_scores = cate_scores[sort_inds] cate_labels = cate_labels[sort_inds] # Matrix NMS cate_scores = matrix_nms(seg_masks, cate_labels, cate_scores, kernel=self.nms_kernel, sigma=self.nms_sigma, sum_masks=sum_masks) # filter. keep = cate_scores >= self.update_threshold if keep.sum() == 0: return result seg_preds = seg_preds[keep, :, :] cate_scores = cate_scores[keep] cate_labels = cate_labels[keep] # sort and keep top_k sort_inds = torch.argsort(cate_scores, descending=True) if len(sort_inds) > self.max_detections_per_image: sort_inds = sort_inds[:self.max_detections_per_image] seg_preds = seg_preds[sort_inds, :, :] cate_scores = cate_scores[sort_inds] cate_labels = cate_labels[sort_inds] seg_preds = F.interpolate(seg_preds.unsqueeze(0), size=upsampled_size_out, mode='bilinear')[:, :, :h, :w] seg_masks = F.interpolate(seg_preds, size=ori_shape, mode='bilinear').squeeze(0) seg_masks = seg_masks > self.mask_threshold seg_masks = BitMasks(seg_masks) result.pred_masks = seg_masks result.pred_boxes = seg_masks.get_bounding_boxes() result.scores = cate_scores result.pred_classes = cate_labels return result
def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
    """
    A function to assemble caffe2 model's outputs (i.e. Dict[str, Tensor])
    to detectron2's format (i.e. a list of Instances).
    This only works when the model follows the Caffe2 detectron's naming convention.

    Args:
        image_sizes (List[List[int, int]]): [H, W] of every image.
        tensor_outputs (Dict[str, Tensor]): external_output to its tensor.
        force_mask_on (Bool): if true, it makes sure there will be pred_masks even
            if the mask is not found from tensor_outputs (usually due to model crash)
    """
    results = [Instances(image_size) for image_size in image_sizes]

    batch_splits = tensor_outputs.get("batch_splits", None)
    if batch_splits:
        raise NotImplementedError()
    assert len(image_sizes) == 1
    result = results[0]

    bbox_nms = tensor_outputs["bbox_nms"]
    score_nms = tensor_outputs["score_nms"]
    class_nms = tensor_outputs["class_nms"]
    # Detection will always succeed because Conv supports 0-batch
    assert bbox_nms is not None
    assert score_nms is not None
    assert class_nms is not None
    result.pred_boxes = Boxes(bbox_nms)
    result.scores = score_nms
    result.pred_classes = class_nms.to(torch.int64)

    mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
    if mask_fcn_probs is not None:
        # finish the mask pred
        mask_probs_pred = mask_fcn_probs
        num_masks = mask_probs_pred.shape[0]
        class_pred = result.pred_classes
        indices = torch.arange(num_masks, device=class_pred.device)
        mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
        result.pred_masks = mask_probs_pred
    elif force_mask_on:
        # NOTE: there's no way to know the height/width of mask here, it won't be
        # used anyway when batch size is 0, so just set them to 0.
        result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)

    keypoints_out = tensor_outputs.get("keypoints_out", None)
    kps_score = tensor_outputs.get("kps_score", None)
    if keypoints_out is not None:
        # keypoints_out: [N, 4, #keypoints], where 4 is in order of (x, y, score, prob)
        keypoints_tensor = keypoints_out
        # NOTE: it's possible that prob is not calculated if "should_output_softmax"
        # is set to False in HeatmapMaxKeypoint, so just use the raw score; it seems
        # this doesn't affect mAP. TODO: check more carefully.
        keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
        result.pred_keypoints = keypoint_xyp
    elif kps_score is not None:
        # keypoint heatmap to sparse data structure
        pred_keypoint_logits = kps_score
        keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])

    return results
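# Hedged sketch of assembling Caffe2-style outputs; the tensor names follow the
# convention described in the docstring above, and the values are synthetic.
import torch

tensor_outputs = {
    "bbox_nms": torch.tensor([[2.0, 3.0, 30.0, 40.0]]),
    "score_nms": torch.tensor([0.88]),
    "class_nms": torch.tensor([7.0]),
}
results = assemble_rcnn_outputs_by_name([[64, 64]], tensor_outputs)
res = results[0]
print(res.pred_boxes, res.scores, res.pred_classes)  # pred_classes cast to int64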
def _inference_one_image(self, input): """ Args: input (dict): one dataset dict Returns: dict: one output dict """ augmented_inputs = self.tta_mapper(input) do_hflip = [k.pop("horiz_flip", False) for k in augmented_inputs] heights = [k["height"] for k in augmented_inputs] widths = [k["width"] for k in augmented_inputs] assert ( len(set(heights)) == 1 and len(set(widths)) == 1 ), "Augmented version of the inputs should have the same original resolution!" height = heights[0] width = widths[0] # 1. Detect boxes from all augmented versions # 1.1: forward with all augmented images with self._turn_off_roi_head("mask_on"), self._turn_off_roi_head( "keypoint_on"): # temporarily disable mask/keypoint head outputs = self._batch_inference(augmented_inputs, do_postprocess=False) # 1.2: union the results all_boxes = [] all_scores = [] all_classes = [] for idx, output in enumerate(outputs): rescaled_output = detector_postprocess(output, height, width) pred_boxes = rescaled_output.pred_boxes.tensor if do_hflip[idx]: pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]] all_boxes.append(pred_boxes) all_scores.extend(rescaled_output.scores) all_classes.extend(rescaled_output.pred_classes) all_boxes = torch.cat(all_boxes, dim=0).cpu() num_boxes = len(all_boxes) # 1.3: select from the union of all results num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES # +1 because fast_rcnn_inference expects background scores as well all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device) for idx, cls, score in zip(count(), all_classes, all_scores): all_scores_2d[idx, cls] = score merged_instances, _ = fast_rcnn_inference_single_image( all_boxes, all_scores_2d, (height, width), 1e-8, self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST, self.cfg.MODEL.NMS_TYPE, self.cfg.TEST.DETECTIONS_PER_IMAGE, ) if not self.cfg.MODEL.MASK_ON: return {"instances": merged_instances} # 2. Use the detected boxes to obtain masks # 2.1: rescale the detected boxes augmented_instances = [] for idx, input in enumerate(augmented_inputs): actual_height, actual_width = input["image"].shape[1:3] scale_x = actual_width * 1.0 / width scale_y = actual_height * 1.0 / height pred_boxes = merged_instances.pred_boxes.clone() pred_boxes.tensor[:, 0::2] *= scale_x pred_boxes.tensor[:, 1::2] *= scale_y if do_hflip[idx]: pred_boxes.tensor[:, [ 0, 2 ]] = actual_width - pred_boxes.tensor[:, [2, 0]] aug_instances = Instances( image_size=(actual_height, actual_width), pred_boxes=pred_boxes, pred_classes=merged_instances.pred_classes, scores=merged_instances.scores, ) augmented_instances.append(aug_instances) # 2.2: run forward on the detected boxes outputs = self._batch_inference(augmented_inputs, augmented_instances, do_postprocess=False) for idx, output in enumerate(outputs): if do_hflip[idx]: output.pred_masks = output.pred_masks.flip(dims=[3]) # 2.3: average the predictions all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0) avg_pred_masks = torch.mean(all_pred_masks, dim=0) output = outputs[0] output.pred_masks = avg_pred_masks output = detector_postprocess(output, height, width) return {"instances": output}
def inference_single_image(self, cls_logits, pts_refine, pts_strides, points, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        cls_logits (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W, K)
        pts_refine (list[Tensor]): Same shape as 'cls_logits' except that K
            becomes 2 * num_points.
        pts_strides (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W, )
        points (list[Tensor]): list of #feature levels. Each entry contains
            a tensor, which contains all the points for that
            image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    assert len(cls_logits) == len(pts_refine) == len(pts_strides)

    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for cls_logits_i, pts_refine_i, points_i, pts_strides_i in zip(
            cls_logits, pts_refine, points, pts_strides):
        bbox_pos_center = torch.cat([points_i, points_i], dim=1)
        bbox_pred = self.pts_to_bbox(pts_refine_i)
        bbox_pred = bbox_pred * pts_strides_i.reshape(-1, 1) + bbox_pos_center
        bbox_pred[:, 0].clamp_(min=0, max=image_size[1])
        bbox_pred[:, 1].clamp_(min=0, max=image_size[0])
        bbox_pred[:, 2].clamp_(min=0, max=image_size[1])
        bbox_pred[:, 3].clamp_(min=0, max=image_size[0])

        # (HxWxK, )
        point_cls_i = cls_logits_i.flatten().sigmoid_()

        # keep top k scoring indices only
        num_topk = min(self.topk_candidates, point_cls_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = point_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        point_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        predicted_boxes = bbox_pred[point_idxs]

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result