def inference_single_image(self, conf_pred_per_image, loc_pred_per_image,
                           default_boxes, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Args:
        conf_pred_per_image (list[Tensor]): list of #feature levels. Each entry
            contains tensor of size [Hi x Wi x D, C].
        loc_pred_per_image (list[Tensor]): same shape as 'conf_pred_per_image'
            except that C becomes 4.
        default_boxes (list['Boxes']): a list of 'Boxes' elements. The Boxes
            contains default boxes of one image on the specific feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    # predicted confidence
    conf_pred = torch.cat(conf_pred_per_image, dim=0)  # [R, C]
    conf_pred = conf_pred.softmax(dim=1)

    # predicted boxes
    loc_pred = torch.cat(loc_pred_per_image, dim=0)  # [R, 4]
    default_boxes = Boxes.cat(default_boxes)  # [R, 4]
    boxes_pred = self.box2box_transform.apply_deltas(
        loc_pred, default_boxes.tensor)

    num_boxes, num_classes = conf_pred.shape
    boxes_pred = boxes_pred.view(num_boxes, 1, 4).expand(
        num_boxes, num_classes, 4)  # [R, C, 4]
    labels = torch.arange(num_classes, device=self.device)  # [0, ..., C-1]
    labels = labels.view(1, num_classes).expand_as(conf_pred)  # [R, C]

    # remove predictions with the background label (the last class index)
    boxes_pred = boxes_pred[:, :-1]
    conf_pred = conf_pred[:, :-1]
    labels = labels[:, :-1]

    # batch everything, by making every class prediction be a separate instance
    boxes_pred = boxes_pred.reshape(-1, 4)
    conf_pred = conf_pred.reshape(-1)
    labels = labels.reshape(-1)

    # remove low scoring boxes
    indices = torch.nonzero(conf_pred > self.score_threshold, as_tuple=False).squeeze(1)
    boxes_pred, conf_pred, labels = boxes_pred[indices], conf_pred[indices], labels[indices]

    keep = generalized_batched_nms(boxes_pred, conf_pred, labels,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_pred[keep])
    result.scores = conf_pred[keep]
    result.pred_classes = labels[keep]
    return result
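# Illustrative sketch (not part of the original code) of the per-class expansion
# used above, with torchvision's `batched_nms` standing in for
# `generalized_batched_nms`: every class column becomes its own candidate
# detection before score thresholding and class-aware NMS.
import torch
from torchvision.ops import batched_nms

R, C = 6, 3                                   # 6 boxes, 3 foreground classes
conf = torch.rand(R, C)                       # per-class scores, shape [R, C]
boxes = torch.rand(R, 4) * 100
boxes[:, 2:] += boxes[:, :2]                  # ensure x2 > x1 and y2 > y1

boxes_rc = boxes.view(R, 1, 4).expand(R, C, 4).reshape(-1, 4)   # [R*C, 4]
labels = torch.arange(C).view(1, C).expand(R, C).reshape(-1)    # [R*C]
scores = conf.reshape(-1)                                       # [R*C]

keep_mask = scores > 0.5                      # score threshold
boxes_rc, scores, labels = boxes_rc[keep_mask], scores[keep_mask], labels[keep_mask]
keep = batched_nms(boxes_rc, scores, labels, iou_threshold=0.5)
print(boxes_rc[keep].shape, scores[keep], labels[keep])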
def get_transform(self, img, annotations):
    """
    Args:
        img (ndarray): of shape HxWxC (RGB). The array can be of type uint8 in
            range [0, 255], or floating point in range [0, 255].
        annotations (list[dict]): Each item in the list is a bbox label of an
            object. The object is represented by a dict, which contains:

            - bbox (list): bbox coordinates, top left and bottom right.
            - bbox_mode (str): bbox label mode, for example: `XYXY_ABS`,
              `XYWH_ABS` and so on...
    """
    sample_mode = (1, *self.min_ious, 0)
    h, w = img.shape[:2]

    boxes = list()
    for obj in annotations:
        boxes.append(BoxMode.convert(obj["bbox"], obj["bbox_mode"],
                                     BoxMode.XYXY_ABS))
    boxes = torch.tensor(boxes)

    while True:
        mode = np.random.choice(sample_mode)
        if mode == 1:
            return NoOpTransform()

        min_iou = mode
        for i in range(50):
            new_w = np.random.uniform(self.min_crop_size * w, w)
            new_h = np.random.uniform(self.min_crop_size * h, h)

            # h / w in [0.5, 2]
            if new_h / new_w < 0.5 or new_h / new_w > 2:
                continue

            left = np.random.uniform(w - new_w)
            top = np.random.uniform(h - new_h)

            patch = np.array(
                (int(left), int(top), int(left + new_w), int(top + new_h)))
            overlaps = pairwise_iou(
                Boxes(patch.reshape(-1, 4)), Boxes(boxes.reshape(-1, 4))
            )
            if overlaps.min() < min_iou:
                continue

            # the centers of the boxes should be inside the cropped image
            center = (boxes[:, :2] + boxes[:, 2:]) / 2
            mask = ((center[:, 0] > patch[0]) * (center[:, 1] > patch[1])
                    * (center[:, 0] < patch[2]) * (center[:, 1] < patch[3]))
            if not mask.any():
                continue
            return IoUCropTransform(int(left), int(top), int(new_w), int(new_h))
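# Illustrative sketch (plain NumPy, independent of cvpods) of the acceptance test
# above: a candidate crop is kept only if its IoU with every ground-truth box
# reaches `min_iou` and at least one box center falls inside the crop.
import numpy as np

def iou_one_vs_many(patch, boxes):
    # patch: (4,), boxes: (N, 4), both in XYXY format
    lt = np.maximum(patch[:2], boxes[:, :2])
    rb = np.minimum(patch[2:], boxes[:, 2:])
    wh = np.clip(rb - lt, 0, None)
    inter = wh[:, 0] * wh[:, 1]
    area_p = (patch[2] - patch[0]) * (patch[3] - patch[1])
    area_b = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    return inter / (area_p + area_b - inter)

gt_boxes = np.array([[10, 10, 50, 50], [60, 60, 90, 90]], dtype=np.float32)
patch = np.array([0, 0, 70, 70], dtype=np.float32)

ious = iou_one_vs_many(patch, gt_boxes)
centers = (gt_boxes[:, :2] + gt_boxes[:, 2:]) / 2
inside = np.all((centers > patch[:2]) & (centers < patch[2:]), axis=1)
accept = ious.min() >= 0.3 and inside.any()
print(ious, inside, accept)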
def get_ground_truth(self, anchors, bbox_preds, targets): anchors = [Boxes.cat(anchors_i) for anchors_i in anchors] N = len(anchors) # list[Tensor(R, 4)], one for each image all_anchors = Boxes.cat(anchors).tensor.reshape(N, -1, 4) # Boxes(Tensor(N*R, 4)) box_delta = cat(bbox_preds, dim=1) # box_pred: xyxy; targets: xyxy box_pred = self.box2box_transform.apply_deltas(box_delta, all_anchors) indices = self.matcher(box_pred, all_anchors, targets) return indices
def inference(self, box_cls, box_pred, image_sizes): """ Arguments: box_cls (Tensor): tensor of shape (batch_size, num_proposals, K). The tensor predicts the classification probability for each proposal. box_pred (Tensor): tensors of shape (batch_size, num_proposals, 4). The tensor predicts 4-vector (x,y,w,h) box regression values for every proposal image_sizes (List[torch.Size]): the input image sizes Returns: results (List[Instances]): a list of #images elements. """ assert len(box_cls) == len(image_sizes) results = [] if self.use_focal: scores = torch.sigmoid(box_cls) labels = torch.arange(self.num_classes, device=self.device). \ unsqueeze(0).repeat(self.num_proposals, 1).flatten(0, 1) for i, (scores_per_image, box_pred_per_image, image_size) in enumerate(zip(scores, box_pred, image_sizes)): result = Instances(image_size) scores_per_image, topk_indices = scores_per_image.flatten( 0, 1).topk(self.num_proposals, sorted=False) labels_per_image = labels[topk_indices] box_pred_per_image = box_pred_per_image.view(-1, 1, 4).repeat( 1, self.num_classes, 1).view(-1, 4) box_pred_per_image = box_pred_per_image[topk_indices] result.pred_boxes = Boxes(box_pred_per_image) result.scores = scores_per_image result.pred_classes = labels_per_image results.append(result) else: scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1) for i, (scores_per_image, labels_per_image, box_pred_per_image, image_size) \ in enumerate(zip(scores, labels, box_pred, image_sizes)): result = Instances(image_size) result.pred_boxes = Boxes(box_pred_per_image) result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0]) result.scores = scores_per_image result.pred_classes = labels_per_image results.append(result) return results
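# Small sketch (dummy tensors only) of the flattened top-k trick used in the
# focal-loss branch above: because `labels` repeats class ids per proposal and
# the boxes are repeated per class, one flat index recovers both the proposal
# and the class of every selected detection.
import torch

num_proposals, num_classes = 4, 3
scores = torch.rand(num_proposals, num_classes)

labels = torch.arange(num_classes).unsqueeze(0).repeat(num_proposals, 1).flatten(0, 1)
boxes = torch.rand(num_proposals, 4)
boxes_rep = boxes.view(-1, 1, 4).repeat(1, num_classes, 1).view(-1, 4)

topk_scores, topk_idx = scores.flatten(0, 1).topk(num_proposals, sorted=False)
print(labels[topk_idx])          # class id of each selected entry
print(topk_idx // num_classes)   # proposal index of each selected entry
print(boxes_rep[topk_idx])       # the corresponding (repeated) box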
def bbox_targets(self, candidate_bboxes, gt_bboxes, gt_labels,
                 pos_iou_thr=0.5, neg_iou_thr=0.4, gt_max_matching=True):
    """
    Target assignment via the MaxIoU rule.

    Args:
        candidate_bboxes (Tensor): candidate boxes (e.g. anchors) of shape (N, 4), XYXY format.
        gt_bboxes (Boxes): ground-truth boxes of the image.
        gt_labels (Tensor): category label of each ground-truth box.
        pos_iou_thr (float): candidates with max IoU >= this threshold are foreground.
        neg_iou_thr (float): candidates with max IoU < this threshold are background.
        gt_max_matching (bool): if True, each ground truth additionally claims the
            candidate(s) that overlap it the most, even if below `pos_iou_thr`.

    Returns:
        assigned_bboxes (Tensor): shape (N, 4), the matched gt box for each
            candidate (zeros for background candidates).
        assigned_labels (Tensor): shape (N,), the assigned category label for each
            candidate; background candidates get `self.num_classes`.
    """
    if candidate_bboxes.size(0) == 0 or gt_bboxes.tensor.size(0) == 0:
        raise ValueError('No gt or anchors')

    candidate_bboxes[:, 0].clamp_(min=0)
    candidate_bboxes[:, 1].clamp_(min=0)
    candidate_bboxes[:, 2].clamp_(min=0)
    candidate_bboxes[:, 3].clamp_(min=0)
    num_candidates = candidate_bboxes.size(0)

    overlaps = pairwise_iou(Boxes(candidate_bboxes), gt_bboxes)
    assigned_labels = overlaps.new_full((overlaps.size(0), ),
                                        self.num_classes,
                                        dtype=torch.long)

    # for each anchor, which gt best overlaps with it
    # for each anchor, the max iou of all gts
    max_overlaps, argmax_overlaps = overlaps.max(dim=1)
    # for each gt, which anchor best overlaps with it
    # for each gt, the max iou of all proposals
    gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=0)

    bg_inds = max_overlaps < neg_iou_thr
    assigned_labels[bg_inds] = self.num_classes

    fg_inds = max_overlaps >= pos_iou_thr
    assigned_labels[fg_inds] = gt_labels[argmax_overlaps[fg_inds]]

    if gt_max_matching:
        fg_inds = torch.nonzero(overlaps == gt_max_overlaps)[:, 0]
        assigned_labels[fg_inds] = gt_labels[argmax_overlaps[fg_inds]]

    assigned_bboxes = overlaps.new_zeros((num_candidates, 4))
    fg_inds = (assigned_labels >= 0) & (assigned_labels != self.num_classes)
    assigned_bboxes[fg_inds] = gt_bboxes.tensor[argmax_overlaps[fg_inds]]

    return assigned_bboxes, assigned_labels
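# Stand-alone sketch (plain torch, no cvpods imports) of the MaxIoU assignment
# rule implemented above: background is the default (mirroring the label
# initialization), anchors above `pos_thr` take the label of their
# best-overlapping ground truth, and with `gt_max_matching` every ground truth
# also claims its best anchor.
import torch

def pairwise_iou_xyxy(a, b):
    lt = torch.max(a[:, None, :2], b[None, :, :2])
    rb = torch.min(a[:, None, 2:], b[None, :, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[..., 0] * wh[..., 1]
    area_a = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
    area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
    return inter / (area_a[:, None] + area_b[None, :] - inter)

anchors = torch.tensor([[0., 0., 10., 10.], [5., 5., 15., 15.], [50., 50., 60., 60.]])
gts = torch.tensor([[0., 0., 9., 9.], [48., 48., 62., 62.]])
gt_labels = torch.tensor([3, 7])
num_classes, pos_thr = 80, 0.5

overlaps = pairwise_iou_xyxy(anchors, gts)
max_ov, argmax_ov = overlaps.max(dim=1)
labels = torch.full((len(anchors),), num_classes, dtype=torch.long)  # background by default
labels[max_ov >= pos_thr] = gt_labels[argmax_ov[max_ov >= pos_thr]]
gt_max_ov, _ = overlaps.max(dim=0)
force = torch.nonzero(overlaps == gt_max_ov)[:, 0]   # each gt keeps its best anchor
labels[force] = gt_labels[argmax_ov[force]]
print(labels)   # tensor([ 3, 80,  7])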
def create_instances(predictions, image_size): ret = Instances(image_size) score = np.asarray([x["score"] for x in predictions]) chosen = (score > args.conf_threshold).nonzero()[0] score = score[chosen] bbox = np.asarray([predictions[i]["bbox"] for i in chosen]) if score.shape[0] == 0: bbox = np.zeros((0, 4)) else: bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) labels = np.asarray( [dataset_id_map(predictions[i]["category_id"]) for i in chosen]) ret.scores = score ret.pred_boxes = Boxes(bbox) ret.pred_classes = labels try: ret.pred_masks = [predictions[i]["segmentation"] for i in chosen] except KeyError: pass return ret
def forward(self, features):
    """
    Returns:
        list[list[Boxes]]: a list of #image elements. Each is a list of
            #feature level Boxes. The Boxes contains anchors of this image
            on the specific feature level.
        list[list[Tensor]]: a list of #image elements. Each is a list of
            #feature level tensors. The tensor contains strides, or unit
            lengths for the anchors.
        list[list[Tensor]]: a list of #image elements. Each is a list of
            #feature level tensors. The Tensor contains indexes for the
            anchors, with the last dimension meaning (L, N, H, W, A), where
            L is level, N is image (not set yet), H is height, W is width,
            and A is anchor.
    """
    num_images = len(features[0])
    grid_sizes = [feature_map.shape[-2:] for feature_map in features]
    (
        anchors_list,
        lengths_list,
        indexes_list,
    ) = self.grid_anchors_with_unit_lengths_and_indexes(grid_sizes)

    # Convert anchors from Tensor to Boxes
    anchors_per_im = [Boxes(x) for x in anchors_list]

    anchors = [copy.deepcopy(anchors_per_im) for _ in range(num_images)]
    unit_lengths = [copy.deepcopy(lengths_list) for _ in range(num_images)]
    indexes = [copy.deepcopy(indexes_list) for _ in range(num_images)]

    return anchors, unit_lengths, indexes
def benchmark_paste(): S = 800 H, W = image_shape = (S, S) N = 64 torch.manual_seed(42) masks = torch.rand(N, 28, 28) center = torch.rand(N, 2) * 600 + 100 wh = torch.clamp(torch.randn(N, 2) * 40 + 200, min=50) x0y0 = torch.clamp(center - wh * 0.5, min=0.0) x1y1 = torch.clamp(center + wh * 0.5, max=S) boxes = Boxes(torch.cat([x0y0, x1y1], axis=1)) def func(device, n=3): m = masks.to(device=device) b = boxes.to(device=device) def bench(): for _ in range(n): paste_masks_in_image(m, b, image_shape) if device.type == "cuda": torch.cuda.synchronize() return bench specs = [{"device": torch.device("cpu"), "n": 3}] if torch.cuda.is_available(): specs.append({"device": torch.device("cuda"), "n": 3}) benchmark(func, "paste_masks", specs, num_iters=10, warmup_iters=2)
def _test_scriptability(self, device): pooler_resolution = 14 canonical_level = 4 canonical_scale_factor = 2**canonical_level pooler_scales = (1.0 / canonical_scale_factor, ) sampling_ratio = 0 N, C, H, W = 2, 4, 10, 8 N_rois = 10 std = 11 mean = 0 feature = (torch.rand(N, C, H, W) - 0.5) * 2 * std + mean features = [feature.to(device)] rois = [] for _ in range(N): boxes = self._rand_boxes(num_boxes=N_rois, x_max=W * canonical_scale_factor, y_max=H * canonical_scale_factor) rois.append(Boxes(boxes).to(device)) roialignv2_pooler = ROIPooler( output_size=pooler_resolution, scales=pooler_scales, sampling_ratio=sampling_ratio, pooler_type="ROIAlignV2", ) roialignv2_out = roialignv2_pooler(features, rois) scripted_roialignv2_out = torch.jit.script(roialignv2_pooler)(features, rois) self.assertTrue(torch.equal(roialignv2_out, scripted_roialignv2_out))
def get_ground_truth(self, anchors, targets): """ Args: anchors (list[list[Boxes]]): a list of N=#image elements. Each is a list of #feature level Boxes. The Boxes contains anchors of this image on the specific feature level. targets (list[Instances]): a list of N `Instances`s. The i-th `Instances` contains the ground-truth per-instance annotations for the i-th input image. Specify `targets` during training only. Returns: gt_classes (Tensor): An integer tensor of shape (N, R) storing ground-truth labels for each anchor. R is the total number of anchors, i.e. the sum of Hi x Wi x A for all levels. Anchors with an IoU with some target higher than the foreground threshold are assigned their corresponding label in the [0, K-1] range. Anchors whose IoU are below the background threshold are assigned the label "K". Anchors whose IoU are between the foreground and background thresholds are assigned a label "-1", i.e. ignore. gt_anchors_deltas (Tensor): Shape (N, R, 4). The last dimension represents ground-truth box2box transform targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box. The values in the tensor are meaningful only when the corresponding anchor is labeled as foreground. """ gt_classes = [] gt_anchors_deltas = [] anchors = [Boxes.cat(anchors_i) for anchors_i in anchors] # list[Tensor(R, 4)], one for each image for anchors_per_image, targets_per_image in zip(anchors, targets): match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, anchors_per_image) gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix) has_gt = len(targets_per_image) > 0 if has_gt: # ground truth box regression matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs] gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas( anchors_per_image.tensor, matched_gt_boxes.tensor ) gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] # Anchors with label 0 are treated as background. gt_classes_i[anchor_labels == 0] = self.num_classes # Anchors with label -1 are ignored. gt_classes_i[anchor_labels == -1] = -1 else: gt_classes_i = torch.zeros_like( gt_matched_idxs) + self.num_classes gt_anchors_reg_deltas_i = torch.zeros_like( anchors_per_image.tensor) gt_classes.append(gt_classes_i) gt_anchors_deltas.append(gt_anchors_reg_deltas_i) return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
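# Minimal round-trip sketch (plain torch, unit weights) of the box2box transform
# used above: each matched ground-truth box is encoded as (dx, dy, dw, dh)
# relative to its anchor, and decoding the deltas recovers the box.
import torch

def get_deltas(anchors, gt):  # both XYXY, shape (N, 4)
    aw, ah = anchors[:, 2] - anchors[:, 0], anchors[:, 3] - anchors[:, 1]
    ax, ay = anchors[:, 0] + 0.5 * aw, anchors[:, 1] + 0.5 * ah
    gw, gh = gt[:, 2] - gt[:, 0], gt[:, 3] - gt[:, 1]
    gx, gy = gt[:, 0] + 0.5 * gw, gt[:, 1] + 0.5 * gh
    return torch.stack([(gx - ax) / aw, (gy - ay) / ah,
                        torch.log(gw / aw), torch.log(gh / ah)], dim=1)

def apply_deltas(deltas, anchors):
    aw, ah = anchors[:, 2] - anchors[:, 0], anchors[:, 3] - anchors[:, 1]
    ax, ay = anchors[:, 0] + 0.5 * aw, anchors[:, 1] + 0.5 * ah
    px, py = ax + deltas[:, 0] * aw, ay + deltas[:, 1] * ah
    pw, ph = aw * deltas[:, 2].exp(), ah * deltas[:, 3].exp()
    return torch.stack([px - 0.5 * pw, py - 0.5 * ph,
                        px + 0.5 * pw, py + 0.5 * ph], dim=1)

anchors = torch.tensor([[10., 10., 50., 30.]])
gt = torch.tensor([[12., 8., 60., 36.]])
deltas = get_deltas(anchors, gt)
print(deltas)                         # tensor([[0.1500, 0.1000, 0.1823, 0.3365]])
print(apply_deltas(deltas, anchors))  # recovers gt up to float error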
def test_roi_heads(self): torch.manual_seed(121) cfg = RCNNConfig() # PROPOSAL_GENERATOR: "RPN" # ROI_HEADS: "StandardROIHeads" # ROI_BOX_HEAD: "FastRCNNConvFCHead" cfg.MODEL.RESNETS.DEPTH = 50 cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2 cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2" cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5) def build_box_head(cfg, input_shape): return FastRCNNConvFCHead(cfg, input_shape) cfg.build_box_head = build_box_head backbone = build_backbone(cfg) num_images = 2 images_tensor = torch.rand(num_images, 20, 30) image_sizes = [(10, 10), (20, 30)] images = ImageList(images_tensor, image_sizes) num_channels = 1024 features = {"res4": torch.rand(num_images, num_channels, 1, 2)} image_shape = (15, 15) gt_boxes0 = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) gt_instance0 = Instances(image_shape) gt_instance0.gt_boxes = Boxes(gt_boxes0) gt_instance0.gt_classes = torch.tensor([2, 1]) gt_boxes1 = torch.tensor([[1, 5, 2, 8], [7, 3, 10, 5]], dtype=torch.float32) gt_instance1 = Instances(image_shape) gt_instance1.gt_boxes = Boxes(gt_boxes1) gt_instance1.gt_classes = torch.tensor([1, 2]) gt_instances = [gt_instance0, gt_instance1] proposal_generator = RPN(cfg, backbone.output_shape()) roi_heads = StandardROIHeads(cfg, backbone.output_shape()) with EventStorage(): # capture events in a new storage to discard them proposals, proposal_losses = proposal_generator(images, features, gt_instances) _, detector_losses = roi_heads(images, features, proposals, gt_instances) expected_losses = { "loss_cls": torch.tensor(4.4236516953), "loss_box_reg": torch.tensor(0.0091214813), } for name in expected_losses.keys(): self.assertTrue(torch.allclose(detector_losses[name], expected_losses[name]))
def set_task_related():
    """
    Set some task-related information (small convenience fields and the like).
    Putting them here is more efficient; everything is stored under the root
    of `batched_inputs`.
    """
    gt = dd['annotations'][0]
    dataset_dict['gt_ins'] = Instances(
        (224, 224),
        gt_boxes=Boxes([gt['bbox'].tolist()]),
        gt_classes=torch.IntTensor([1]))
    props = [item['bbox'] for item in dd['annotations'][1:]]
    feats = [item['feat'] for item in dd['annotations'][1:]]
    classes = [item['class'] for item in dd['annotations'][1:]]
    dd['props_ins'] = Instances(
        (224, 224),
        proposal_boxes=Boxes(props),
        features=torch.from_numpy(np.array(feats)),
        classes=torch.from_numpy(np.array(classes)))
def _inference_one_image(self, inputs): augmented_inputs = self.tta_mapper(inputs) assert len({x["file_name"] for x in augmented_inputs}) == 1, "inference different images" heights = [k["height"] for k in augmented_inputs] widths = [k["width"] for k in augmented_inputs] assert ( len(set(heights)) == 1 and len(set(widths)) == 1 ), "Augmented version of the inputs should have the same original resolution!" height = heights[0] width = widths[0] # 1. Detect boxes from all augmented versions # TODO wangfeng02: use box structures instead of boxes, scores and classes all_boxes = [] all_scores = [] all_classes = [] factors = 2 if self.tta_mapper.flip else 1 if self.enable_scale_filter: assert len(augmented_inputs) == len(self.scale_ranges) * factors for i, single_input in enumerate(augmented_inputs): do_hflip = single_input.pop("horiz_flip", False) # 1.1: forward with single augmented image output = self.model._inference_for_ms_test([single_input]) # 1.2: union the results pred_boxes = output.get("pred_boxes").tensor if do_hflip: pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]] pred_scores = output.get("scores") pred_classes = output.get("pred_classes") if self.enable_scale_filter: keep = filter_boxes(pred_boxes, *self.scale_ranges[i // factors]) pred_boxes = pred_boxes[keep] pred_scores = pred_scores[keep] pred_classes = pred_classes[keep] all_boxes.append(pred_boxes) all_scores.append(pred_scores) all_classes.append(pred_classes) boxes_all = torch.cat(all_boxes, dim=0) scores_all = torch.cat(all_scores, dim=0) class_idxs_all = torch.cat(all_classes, dim=0) boxes_all, scores_all, class_idxs_all = merge_result_from_multi_scales( boxes_all, scores_all, class_idxs_all, nms_type="soft_vote", vote_thresh=0.65, max_detection=self.max_detection ) result = Instances((height, width)) result.pred_boxes = Boxes(boxes_all) result.scores = scores_all result.pred_classes = class_idxs_all return {"instances": result}
def forward(self, features, bboxes, pro_features, pooler): """ :param bboxes: (N, nr_boxes, 4) :param pro_features: (nr_boxes, N, d_model) """ N, nr_boxes = bboxes.shape[:2] # roi_feature. proposal_boxes = list() for b in range(N): proposal_boxes.append(Boxes(bboxes[b])) roi_features = pooler(features, proposal_boxes) roi_features = roi_features.view(N * nr_boxes, self.d_model, -1).permute(2, 0, 1) # self_att. pro_features = pro_features.view(N, nr_boxes, self.d_model).permute(1, 0, 2) pro_features2 = self.self_attn(pro_features, pro_features, value=pro_features)[0] pro_features = pro_features + self.dropout1(pro_features2) pro_features = self.norm1(pro_features) # inst_interact. pro_features = pro_features.view(nr_boxes, N, self.d_model). \ permute(1, 0, 2).reshape(1, N * nr_boxes, self.d_model) pro_features2 = self.inst_interact(pro_features, roi_features) pro_features = pro_features + self.dropout2(pro_features2) obj_features = self.norm2(pro_features) # obj_feature. obj_features2 = self.linear2( self.dropout(self.activation(self.linear1(obj_features)))) obj_features = obj_features + self.dropout3(obj_features2) obj_features = self.norm3(obj_features) fc_feature = obj_features.transpose(0, 1).reshape(N * nr_boxes, -1) cls_feature = fc_feature.clone() reg_feature = fc_feature.clone() for cls_layer in self.cls_module: cls_feature = cls_layer(cls_feature) for reg_layer in self.reg_module: reg_feature = reg_layer(reg_feature) class_logits = self.class_logits(cls_feature) bboxes_deltas = self.bboxes_delta(reg_feature) pred_bboxes = self.apply_deltas(bboxes_deltas, bboxes.view(-1, 4)) return class_logits.view(N, nr_boxes, -1), pred_bboxes.view(N, nr_boxes, -1), obj_features
def transform_proposals(dataset_dict, image_shape, transforms, min_box_side_len, proposal_topk): """ Apply transformations to the proposals in dataset_dict, if any. Args: dataset_dict (dict): a dict read from the dataset, possibly contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode" image_shape (tuple): height, width transforms (TransformList): min_box_side_len (int): keep proposals with at least this size proposal_topk (int): only keep top-K scoring proposals The input dict is modified in-place, with abovementioned keys removed. A new key "proposals" will be added. Its value is an `Instances` object which contains the transformed proposals in its field "proposal_boxes" and "objectness_logits". """ if "proposal_boxes" in dataset_dict: # Transform proposal boxes boxes = transforms.apply_box( BoxMode.convert( dataset_dict.pop("proposal_boxes"), dataset_dict.pop("proposal_bbox_mode"), BoxMode.XYXY_ABS, )) boxes = Boxes(boxes) objectness_logits = torch.as_tensor( dataset_dict.pop("proposal_objectness_logits").astype("float32")) boxes.clip(image_shape) keep = boxes.nonempty(threshold=min_box_side_len) boxes = boxes[keep] objectness_logits = objectness_logits[keep] proposals = Instances(image_shape) proposals.proposal_boxes = boxes[:proposal_topk] proposals.objectness_logits = objectness_logits[:proposal_topk] dataset_dict["proposals"] = proposals
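# Rough stand-alone illustration (plain torch, without BoxMode/Boxes) of what the
# conversion, clipping and filtering above amount to for XYWH_ABS proposals.
import torch

props_xywh = torch.tensor([[5., 5., 10., 10.],
                           [-3., 2., 6., 4.],
                           [30., 30., 5., 0.]])
props_xyxy = props_xywh.clone()
props_xyxy[:, 2:] = props_xywh[:, :2] + props_xywh[:, 2:]   # XYWH_ABS -> XYXY_ABS

h, w = 20., 25.                             # image_shape = (height, width)
props_xyxy[:, 0::2].clamp_(min=0, max=w)    # clip x coordinates, like Boxes.clip
props_xyxy[:, 1::2].clamp_(min=0, max=h)    # clip y coordinates

min_side = 1.0                              # analogous to min_box_side_len
wh = props_xyxy[:, 2:] - props_xyxy[:, :2]
keep = (wh > min_side).all(dim=1)           # analogous to Boxes.nonempty
print(props_xyxy[keep])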
def process_annotation(self, ann, mask_side_len=28): # Parse annotation data img_info = self.coco.loadImgs(ids=[ann["image_id"]])[0] height, width = img_info["height"], img_info["width"] gt_polygons = [ np.array(p, dtype=np.float64) for p in ann["segmentation"] ] gt_bbox = BoxMode.convert(ann["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) gt_bbox = np.array(gt_bbox) gt_bit_mask = polygons_to_bitmask(gt_polygons, height, width) # Run rasterize .. torch_gt_bbox = torch.Tensor(gt_bbox)[None, :].to(dtype=torch.float32) box_bitmasks = { "polygon": PolygonMasks([gt_polygons ]).crop_and_resize(torch_gt_bbox, mask_side_len)[0], "gridsample": rasterize_polygons_with_grid_sample(gt_bit_mask, gt_bbox, mask_side_len), "roialign": BitMasks(torch.from_numpy( gt_bit_mask[None, :, :])).crop_and_resize( torch_gt_bbox, mask_side_len)[0], } # Run paste .. results = defaultdict(dict) for k, box_bitmask in box_bitmasks.items(): padded_bitmask, scale = pad_masks(box_bitmask[None, :, :], 1) scaled_boxes = scale_boxes(torch_gt_bbox, scale) r = results[k] r["old"] = paste_mask_in_image_old(padded_bitmask[0], scaled_boxes[0], height, width, threshold=0.5) r["aligned"] = paste_masks_in_image(box_bitmask[None, :, :], Boxes(gt_bbox[None, :]), (height, width))[0] table = [] for rasterize_method, r in results.items(): for paste_method, mask in r.items(): mask = np.asarray(mask) iou = iou_between_full_image_bit_masks( gt_bit_mask.astype("uint8"), mask) table.append((rasterize_method, paste_method, iou)) return table
def test_fast_rcnn(self): torch.manual_seed(132) cfg = RCNNConfig() cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5) box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS) box_head_output_size = 8 num_classes = 5 cls_agnostic_bbox_reg = False box_predictor = FastRCNNOutputLayers( box_head_output_size, num_classes, cls_agnostic_bbox_reg, box_dim=4 ) feature_pooled = torch.rand(2, box_head_output_size) pred_class_logits, pred_proposal_deltas = box_predictor(feature_pooled) image_shape = (10, 10) proposal_boxes = torch.tensor([[0.8, 1.1, 3.2, 2.8], [2.3, 2.5, 7, 8]], dtype=torch.float32) gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) result = Instances(image_shape) result.proposal_boxes = Boxes(proposal_boxes) result.gt_boxes = Boxes(gt_boxes) result.gt_classes = torch.tensor([1, 2]) proposals = [] proposals.append(result) smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA outputs = FastRCNNOutputs( box2box_transform, pred_class_logits, pred_proposal_deltas, proposals, smooth_l1_beta ) with EventStorage(): # capture events in a new storage to discard them losses = outputs.losses() expected_losses = { "loss_cls": torch.tensor(1.7951188087), "loss_box_reg": torch.tensor(4.0357131958), } for name in expected_losses.keys(): assert torch.allclose(losses[name], expected_losses[name])
def _test_roialignv2_roialignrotated_match(self, device): pooler_resolution = 14 canonical_level = 4 canonical_scale_factor = 2**canonical_level pooler_scales = (1.0 / canonical_scale_factor, ) sampling_ratio = 0 N, C, H, W = 2, 4, 10, 8 N_rois = 10 std = 11 mean = 0 feature = (torch.rand(N, C, H, W) - 0.5) * 2 * std + mean features = [feature.to(device)] rois = [] rois_rotated = [] for _ in range(N): boxes = self._rand_boxes(num_boxes=N_rois, x_max=W * canonical_scale_factor, y_max=H * canonical_scale_factor) rotated_boxes = torch.zeros(N_rois, 5) rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0 rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0 rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0] rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1] rois.append(Boxes(boxes).to(device)) rois_rotated.append(RotatedBoxes(rotated_boxes).to(device)) roialignv2_pooler = ROIPooler( output_size=pooler_resolution, scales=pooler_scales, sampling_ratio=sampling_ratio, pooler_type="ROIAlignV2", ) roialignv2_out = roialignv2_pooler(features, rois) roialignrotated_pooler = ROIPooler( output_size=pooler_resolution, scales=pooler_scales, sampling_ratio=sampling_ratio, pooler_type="ROIAlignRotated", ) roialignrotated_out = roialignrotated_pooler(features, rois_rotated) self.assertTrue( torch.allclose(roialignv2_out, roialignrotated_out, atol=1e-4))
def test_pairwise_iou(self): boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]]) boxes2 = torch.tensor( [ [0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 0.5, 1.0], [0.0, 0.0, 1.0, 0.5], [0.0, 0.0, 0.5, 0.5], [0.5, 0.5, 1.0, 1.0], [0.5, 0.5, 1.5, 1.5], ] ) expected_ious = torch.tensor( [ [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)], [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)], ] ) ious = pairwise_iou(Boxes(boxes1), Boxes(boxes2)) self.assertTrue(torch.allclose(ious, expected_ious))
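# Worked check (independent of the Boxes class) of the last column in
# `expected_ious`: boxes [0, 0, 1, 1] and [0.5, 0.5, 1.5, 1.5] intersect in a
# 0.5 x 0.5 square, so IoU = 0.25 / (1 + 1 - 0.25).
import torch

a = torch.tensor([0.0, 0.0, 1.0, 1.0])
b = torch.tensor([0.5, 0.5, 1.5, 1.5])
inter = (torch.min(a[2:], b[2:]) - torch.max(a[:2], b[:2])).clamp(min=0).prod()
union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
print(inter.item(), union.item(), (inter / union).item())  # 0.25, 1.75, 0.142857...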
def _match_and_label_boxes(self, proposals, stage, targets): """ Match proposals with groundtruth using the matcher at the given stage. Label the proposals as foreground or background based on the match. Args: proposals (list[Instances]): One Instances for each image, with the field "proposal_boxes". stage (int): the current stage targets (list[Instances]): the ground truth instances Returns: list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes" """ num_fg_samples, num_bg_samples = [], [] for proposals_per_image, targets_per_image in zip(proposals, targets): match_quality_matrix = pairwise_iou( targets_per_image.gt_boxes, proposals_per_image.proposal_boxes) # proposal_labels are 0 or 1 matched_idxs, proposal_labels = self.proposal_matchers[stage]( match_quality_matrix) if len(targets_per_image) > 0: gt_classes = targets_per_image.gt_classes[matched_idxs] # Label unmatched proposals (0 label from matcher) as background (label=num_classes) gt_classes[proposal_labels == 0] = self.num_classes gt_boxes = targets_per_image.gt_boxes[matched_idxs] else: gt_classes = torch.zeros_like(matched_idxs) + self.num_classes gt_boxes = Boxes( targets_per_image.gt_boxes.tensor.new_zeros( (len(proposals_per_image), 4))) proposals_per_image.gt_classes = gt_classes proposals_per_image.gt_boxes = gt_boxes num_fg_samples.append((proposal_labels == 1).sum().item()) num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1]) # Log the number of fg/bg samples in each stage storage = get_event_storage() storage.put_scalar( "stage{}/roi_head/num_fg_samples".format(stage), sum(num_fg_samples) / len(num_fg_samples), ) storage.put_scalar( "stage{}/roi_head/num_bg_samples".format(stage), sum(num_bg_samples) / len(num_bg_samples), ) return proposals
def _get_ground_truth(self): """ Returns: gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is the total number of anchors in image i (i.e., len(anchors[i])). Label values are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative class; 1 = positive class. gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), 4). """ gt_objectness_logits = [] gt_anchor_deltas = [] # Concatenate anchors from all feature maps into a single Boxes per image anchors = [Boxes.cat(anchors_i) for anchors_i in self.anchors] for image_size_i, anchors_i, gt_boxes_i in zip(self.image_sizes, anchors, self.gt_boxes): """ image_size_i: (h, w) for the i-th image anchors_i: anchors for i-th image gt_boxes_i: ground-truth boxes for i-th image """ match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors_i) matched_idxs, gt_objectness_logits_i = retry_if_cuda_oom( self.anchor_matcher)(match_quality_matrix) # Matching is memory-expensive and may result in CPU tensors. But the result is small gt_objectness_logits_i = gt_objectness_logits_i.to( device=gt_boxes_i.device) del match_quality_matrix if self.boundary_threshold >= 0: # Discard anchors that go out of the boundaries of the image # NOTE: This is legacy functionality that is turned off by default in cvpods anchors_inside_image = anchors_i.inside_box( image_size_i, self.boundary_threshold) gt_objectness_logits_i[~anchors_inside_image] = -1 if len(gt_boxes_i) == 0: # These values won't be used anyway since the anchor is labeled as background gt_anchor_deltas_i = torch.zeros_like(anchors_i.tensor) else: # TODO wasted computation for ignored boxes matched_gt_boxes = gt_boxes_i[matched_idxs] gt_anchor_deltas_i = self.box2box_transform.get_deltas( anchors_i.tensor, matched_gt_boxes.tensor) gt_objectness_logits.append(gt_objectness_logits_i) gt_anchor_deltas.append(gt_anchor_deltas_i) return gt_objectness_logits, gt_anchor_deltas
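# Simplified sketch of what `self.anchor_matcher` does with a match quality
# matrix. The exact thresholds and low-quality-match rule live in the Matcher
# class; the values below are illustrative assumptions
# (bg < 0.3 <= ignore < 0.7 <= fg).
import torch

match_quality = torch.tensor([[0.10, 0.75, 0.45],
                              [0.05, 0.20, 0.65]])      # (num_gt, num_anchors)

matched_vals, matched_idxs = match_quality.max(dim=0)   # best gt per anchor
labels = torch.full_like(matched_idxs, -1)              # default: ignore
labels[matched_vals < 0.3] = 0                           # negative
labels[matched_vals >= 0.7] = 1                          # positive
# low-quality matches: each gt keeps its best anchor even below the fg threshold
labels[match_quality.argmax(dim=1)] = 1
print(matched_idxs, labels)   # tensor([0, 0, 1]) tensor([0, 1, 1])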
def __init__(self, box2box_transform, pred_class_logits, pred_proposal_deltas, proposals, smooth_l1_beta): """ Args: box2box_transform (Box2BoxTransform/Box2BoxTransformRotated): box2box transform instance for proposal-to-detection transformations. pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class logits for all R predicted object instances. Each row corresponds to a predicted object instance. pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for class-specific or class-agnostic regression. It stores the predicted deltas that transform proposals into final box detections. B is the box dimension (4 or 5). When B is 4, each row is [dx, dy, dw, dh (, ....)]. When B is 5, each row is [dx, dy, dw, dh, da (, ....)]. proposals (list[Instances]): A list of N Instances, where Instances i stores the proposals for image i, in the field "proposal_boxes". When training, each Instances must have ground-truth labels stored in the field "gt_classes" and "gt_boxes". smooth_l1_beta (float): The transition point between L1 and L2 loss in the smooth L1 loss function. When set to 0, the loss becomes L1. When set to +inf, the loss becomes constant 0. """ self.box2box_transform = box2box_transform self.num_preds_per_image = [len(p) for p in proposals] self.pred_class_logits = pred_class_logits self.pred_proposal_deltas = pred_proposal_deltas self.smooth_l1_beta = smooth_l1_beta if len(proposals): box_type = type(proposals[0].proposal_boxes) # cat(..., dim=0) concatenates over all images in the batch self.proposals = box_type.cat( [p.proposal_boxes for p in proposals]) assert (not self.proposals.tensor.requires_grad ), "Proposals should not require gradients!" self.image_shapes = [x.image_size for x in proposals] # The following fields should exist only when training. if proposals[0].has("gt_boxes"): self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals]) assert proposals[0].has("gt_classes") self.gt_classes = cat([p.gt_classes for p in proposals], dim=0) else: self.proposals = Boxes( torch.zeros(0, 4, device=self.pred_proposal_deltas.device)) self._no_instances = len(proposals) == 0 # no instances found
def point_sample_fine_grained_features(features_list, feature_scales, boxes, point_coords): """ Get features from feature maps in `features_list` that correspond to specific point coordinates inside each bounding box from `boxes`. Args: features_list (list[Tensor]): A list of feature map tensors to get features from. feature_scales (list[float]): A list of scales for tensors in `features_list`. boxes (list[Boxes]): A list of I Boxes objects that contain R_1 + ... + R_I = R boxes all together. point_coords (Tensor): A tensor of shape (R, P, 2) that contains [0, 1] x [0, 1] box-normalized coordinates of the P sampled points. Returns: point_features (Tensor): A tensor of shape (R, C, P) that contains features sampled from all features maps in feature_list for P sampled points for all R boxes in `boxes`. point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains image-level coordinates of P points. """ cat_boxes = Boxes.cat(boxes) num_boxes = [len(b) for b in boxes] point_coords_wrt_image = get_point_coords_wrt_image( cat_boxes.tensor, point_coords) split_point_coords_wrt_image = torch.split(point_coords_wrt_image, num_boxes) point_features = [] for idx_img, point_coords_wrt_image_per_image in enumerate( split_point_coords_wrt_image): point_features_per_image = [] for idx_feature, feature_map in enumerate(features_list): h, w = feature_map.shape[-2:] scale = torch.tensor( [w, h], device=feature_map.device) / feature_scales[idx_feature] point_coords_scaled = point_coords_wrt_image_per_image / scale point_features_per_image.append( point_sample( feature_map[idx_img].unsqueeze(0), point_coords_scaled.unsqueeze(0), align_corners=False, ).squeeze(0).transpose(1, 0)) point_features.append(cat(point_features_per_image, dim=1)) return cat(point_features, dim=0), point_coords_wrt_image
def _forward_box( self, features: List[torch.Tensor], proposals: List[Instances] ) -> Union[Dict[str, torch.Tensor], List[Instances]]: """ Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`, the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument. Args: features (list[Tensor]): #level input features for box prediction proposals (list[Instances]): the per-image object proposals with their matching ground truth. Each has fields "proposal_boxes", and "objectness_logits", "gt_classes", "gt_boxes". Returns: In training, a dict of losses. In inference, a list of `Instances`, the predicted instances. """ box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) box_features = self.box_head(box_features) pred_class_logits, pred_proposal_deltas = self.box_predictor( box_features) del box_features outputs = FastRCNNOutputs( self.box2box_transform, pred_class_logits, pred_proposal_deltas, proposals, self.smooth_l1_beta, ) if self.training: if self.train_on_pred_boxes: with torch.no_grad(): pred_boxes = outputs.predict_boxes_for_gt_classes() for proposals_per_image, pred_boxes_per_image in zip( proposals, pred_boxes): proposals_per_image.proposal_boxes = Boxes( pred_boxes_per_image) return outputs.losses() else: pred_instances, _ = outputs.inference(self.test_score_thresh, self.test_nms_thresh, self.test_nms_type, self.test_detections_per_img) return pred_instances
def _inference_one_image(self, inputs): augmented_inputs = self.tta_mapper(inputs) assert len({x["file_name"] for x in augmented_inputs }) == 1, "inference different images" heights = [k["height"] for k in augmented_inputs] widths = [k["width"] for k in augmented_inputs] assert ( len(set(heights)) == 1 and len(set(widths)) == 1 ), "Augmented version of the inputs should have the same original resolution!" height = heights[0] width = widths[0] # 1. Detect boxes from all augmented versions all_boxes = [] all_scores = [] all_classes = [] for single_input in augmented_inputs: do_hflip = single_input.pop("horiz_flip", False) # 1.1: forward with single augmented image output = self.model._inference_for_ms_test([single_input]) # 1.2: union the results pred_boxes = output.get("pred_boxes").tensor if do_hflip: pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]] all_boxes.append(pred_boxes) all_scores.append(output.get("scores")) all_classes.append(output.get("pred_classes")) boxes_all = torch.cat(all_boxes, dim=0) scores_all = torch.cat(all_scores, dim=0) class_idxs_all = torch.cat(all_classes, dim=0) keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all, self.model.nms_threshold, nms_type=self.model.nms_type) keep = keep[:self.model.max_detections_per_image] result = Instances((height, width)) result.pred_boxes = Boxes(boxes_all[keep]) result.scores = scores_all[keep] result.pred_classes = class_idxs_all[keep] return {"instances": result}
def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes, images): """ Arguments: pred_logits, pred_deltas, pred_masks: Same as the output of: meth:`TensorMaskHead.forward` anchors, indexes: Same as the input of meth:`TensorMask.get_ground_truth` images (ImageList): the input images Returns: results (List[Instances]): a list of #images elements. """ assert len(anchors) == len(images) results = [] pred_logits = [ permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits ] pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas] pred_logits = cat(pred_logits, dim=1) pred_deltas = cat(pred_deltas, dim=1) for img_idx, (anchors_im, indexes_im) in enumerate(zip(anchors, indexes)): # Get the size of the current image image_size = images.image_sizes[img_idx] logits_im = pred_logits[img_idx] deltas_im = pred_deltas[img_idx] if self.mask_on: masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks] else: masks_im = [None] * self.num_levels results_im = self.inference_single_image( logits_im, deltas_im, masks_im, Boxes.cat(anchors_im), cat(indexes_im), tuple(image_size), ) results.append(results_im) return results
def decode_prediction(self, pred_dict, img_info):
    r"""
    Args:
        pred_dict (dict): a dict containing all information of the prediction
        img_info (dict): a dict containing the needed information of the original image
    """
    fmap = pred_dict["cls"]
    reg = pred_dict["reg"]
    wh = pred_dict["wh"]

    boxes, scores, classes = CenterNetDecoder.decode(fmap, wh, reg)
    # boxes = Boxes(boxes.reshape(boxes.shape[-2:]))
    scores = scores.reshape(-1)
    classes = classes.reshape(-1).to(torch.int64)

    # dets = CenterNetDecoder.decode(fmap, wh, reg)
    boxes = CenterNetDecoder.transform_boxes(boxes, img_info)
    boxes = Boxes(boxes)
    return dict(pred_boxes=boxes, scores=scores, pred_classes=classes)
def to_d2_instances_list(instances_list):
    """
    Convert InstancesList to List[Instances]. The input `instances_list` can
    also be a List[Instances], in which case this method is a no-op.
    """
    if not isinstance(instances_list, InstancesList):
        assert all(isinstance(x, Instances) for x in instances_list)
        return instances_list

    ret = []
    for i, info in enumerate(instances_list.im_info):
        instances = Instances(
            torch.Size([int(info[0].item()), int(info[1].item())]))

        ids = instances_list.indices == i
        for k, v in instances_list.batch_extra_fields.items():
            if isinstance(v, torch.Tensor):
                instances.set(k, v[ids])
                continue
            elif isinstance(v, Boxes):
                instances.set(k, v[ids, -4:])
                continue

            target_type, tensor_source = v
            assert isinstance(tensor_source, torch.Tensor)
            assert tensor_source.shape[0] == instances_list.indices.shape[0]
            tensor_source = tensor_source[ids]

            if issubclass(target_type, Boxes):
                instances.set(k, Boxes(tensor_source[:, -4:]))
            elif issubclass(target_type, Keypoints):
                instances.set(k, Keypoints(tensor_source))
            elif issubclass(target_type, torch.Tensor):
                instances.set(k, tensor_source)
            else:
                raise ValueError(
                    "Can't handle target type: {}".format(target_type))

        ret.append(instances)
    return ret
def forward(self, features): """ Args: features (list[Tensor]): list of backbone feature maps on which to generate anchors. Returns: list[list[Boxes]]: a list of #image elements. Each is a list of #feature level Boxes. The Boxes contains anchors of this image on the specific feature level. """ num_images = len(features[0]) grid_sizes = [feature_map.shape[-2:] for feature_map in features] anchors_over_all_feature_maps = self.grid_anchors(grid_sizes) anchors_in_image = [] for anchors_per_feature_map in anchors_over_all_feature_maps: boxes = Boxes(anchors_per_feature_map) anchors_in_image.append(boxes) anchors = [copy.deepcopy(anchors_in_image) for _ in range(num_images)] return anchors
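# Minimal sketch (one feature level; `indexing="ij"` assumes a recent PyTorch) of
# how `grid_anchors` typically builds its per-level tensor: shift a small set of
# base anchors to every cell of the feature grid.
import torch

stride, sizes = 16, (32., 64.)           # one level, two square anchors per cell
H, W = 3, 2                               # feature map height and width
shifts_x = torch.arange(0, W * stride, step=stride, dtype=torch.float32)
shifts_y = torch.arange(0, H * stride, step=stride, dtype=torch.float32)
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij")
shifts = torch.stack((shift_x.reshape(-1), shift_y.reshape(-1),
                      shift_x.reshape(-1), shift_y.reshape(-1)), dim=1)   # (H*W, 4)

base = torch.stack([torch.tensor([-s / 2, -s / 2, s / 2, s / 2]) for s in sizes])  # (A, 4)
anchors = (shifts[:, None, :] + base[None, :, :]).reshape(-1, 4)          # (H*W*A, 4), XYXY
print(anchors.shape)   # torch.Size([12, 4]) -- this is what gets wrapped as Boxes(...)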
def fast_rcnn_inference_single_image(boxes, scores, image_shape, score_thresh, nms_thresh, nms_type, topk_per_image): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Args: Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes per image. Returns: Same as `fast_rcnn_inference`, but for only one image. """ scores = scores[:, :-1] num_bbox_reg_classes = boxes.shape[1] // 4 # Convert to Boxes to use the `clip` function ... boxes = Boxes(boxes.reshape(-1, 4)) boxes.clip(image_shape) boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 # Filter results based on detection scores filter_mask = scores > score_thresh # R x K # R' x 2. First column contains indices of the R predictions; # Second column contains indices of classes. filter_inds = filter_mask.nonzero(as_tuple=False) if num_bbox_reg_classes == 1: boxes = boxes[filter_inds[:, 0], 0] else: boxes = boxes[filter_mask] scores = scores[filter_mask] # Apply per-class NMS keep = generalized_batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh, nms_type=nms_type) if topk_per_image >= 0: keep = keep[:topk_per_image] boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep] result = Instances(image_shape) result.pred_boxes = Boxes(boxes) result.scores = scores result.pred_classes = filter_inds[:, 1] return result, filter_inds[:, 0]
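# Small sketch of the R x K mask -> (box index, class index) bookkeeping above.
import torch

scores = torch.tensor([[0.9, 0.2, 0.1],
                       [0.3, 0.6, 0.7]])            # R = 2 boxes, K = 3 classes
filter_mask = scores > 0.5                           # R x K
filter_inds = filter_mask.nonzero(as_tuple=False)    # each row: (box idx, class idx)
print(filter_inds)          # tensor([[0, 0], [1, 1], [1, 2]])
print(scores[filter_mask])  # tensor([0.9000, 0.6000, 0.7000])
# class-agnostic regression indexes boxes with filter_inds[:, 0];
# class-specific regression indexes an R x K x 4 box tensor with filter_mask.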