# Imports assumed by the snippets below (a reasonable guess for a
# detectron2-based project; adjust to the actual module layout).
import torch
from detectron2.structures import BitMasks, Boxes, Instances


def get_empty_instance(h, w):
    inst = Instances((h, w))
    inst.gt_boxes = Boxes(torch.rand(0, 4))
    inst.gt_classes = torch.tensor([]).to(dtype=torch.int64)
    inst.gt_masks = BitMasks(torch.rand(0, h, w))
    return inst
def convert_output(output):
    r = Instances(tuple(output[0]))
    r.pred_classes = output[1]
    r.pred_boxes = Boxes(output[2])
    r.scores = output[3]
    return r
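# A minimal usage sketch for the two helpers above, assuming the upstream
# model exports a (image_size, classes, boxes, scores) tuple; the shapes and
# values here are made up.
def _demo_convert_output():
    empty = get_empty_instance(480, 640)
    assert len(empty) == 0

    fake_output = (
        [480, 640],
        torch.tensor([1, 3]),
        torch.tensor([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 20.0, 20.0]]),
        torch.tensor([0.9, 0.7]),
    )
    converted = convert_output(fake_output)
    print(len(converted), converted.pred_boxes)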
def generate_pair_instances(pred_instances):
    pred_pair_instances = []
    for pred_instance in pred_instances:
        pred_pair_instance = Instances(pred_instance.image_size)
        instance_num = len(pred_instance)
        pred_classes = pred_instance.pred_classes
        pred_boxes = pred_instance.pred_boxes.tensor
        # pred_masks = pred_instance.pred_masks
        image_height, image_width = pred_instance.image_size

        # Build all subject/object pairs: subject fields repeat each instance
        # `instance_num` times consecutively, object fields tile the whole set.
        pred_pair_sub_classes = pred_classes.repeat(instance_num, 1).permute(1, 0).flatten()
        pred_pair_obj_classes = pred_classes.repeat(instance_num)
        pred_pair_sub_boxes = pred_boxes.repeat(instance_num, 1, 1).permute(
            1, 0, 2).contiguous().view(-1, 4)
        pred_pair_obj_boxes = pred_boxes.repeat(instance_num, 1)

        sub_boxes_x1 = pred_pair_sub_boxes[:, 0]
        obj_boxes_x1 = pred_pair_obj_boxes[:, 0]
        sub_boxes_y1 = pred_pair_sub_boxes[:, 1]
        obj_boxes_y1 = pred_pair_obj_boxes[:, 1]
        sub_boxes_x2 = pred_pair_sub_boxes[:, 2]
        obj_boxes_x2 = pred_pair_obj_boxes[:, 2]
        sub_boxes_y2 = pred_pair_sub_boxes[:, 3]
        obj_boxes_y2 = pred_pair_obj_boxes[:, 3]

        # Union box of each subject/object pair.
        pair_boxes_x1 = torch.min(sub_boxes_x1, obj_boxes_x1)
        pair_boxes_y1 = torch.min(sub_boxes_y1, obj_boxes_y1)
        pair_boxes_x2 = torch.max(sub_boxes_x2, obj_boxes_x2)
        pair_boxes_y2 = torch.max(sub_boxes_y2, obj_boxes_y2)
        pred_pair_boxes = torch.stack(
            [pair_boxes_x1, pair_boxes_y1, pair_boxes_x2, pair_boxes_y2], dim=1)

        # Box coordinates normalized to the image frame.
        pred_pair_locations = torch.stack(
            [(sub_boxes_x1 - 0) / image_width,
             (sub_boxes_y1 - 0) / image_height,
             (sub_boxes_x2 - image_width) / image_width,
             (sub_boxes_y2 - image_height) / image_height,
             (obj_boxes_x1 - 0) / image_width,
             (obj_boxes_y1 - 0) / image_height,
             (obj_boxes_x2 - image_width) / image_width,
             (obj_boxes_y2 - image_height) / image_height], dim=1)

        # Box coordinates normalized to the pair's union box.
        pair_width = pair_boxes_x2 - pair_boxes_x1
        pair_height = pair_boxes_y2 - pair_boxes_y1
        pred_pair_union_locations = torch.stack(
            [(sub_boxes_x1 - pair_boxes_x1) / pair_width,
             (sub_boxes_y1 - pair_boxes_y1) / pair_height,
             (sub_boxes_x2 - pair_boxes_x2) / pair_width,
             (sub_boxes_y2 - pair_boxes_y2) / pair_height,
             (obj_boxes_x1 - pair_boxes_x1) / pair_width,
             (obj_boxes_y1 - pair_boxes_y1) / pair_height,
             (obj_boxes_x2 - pair_boxes_x2) / pair_width,
             (obj_boxes_y2 - pair_boxes_y2) / pair_height], dim=1)

        pred_pair_iou = boxes_iou(pred_pair_sub_boxes, pred_pair_obj_boxes)

        pred_pair_left_boxes = pred_pair_boxes.repeat(
            instance_num * instance_num, 1, 1).permute(1, 0, 2).contiguous().view(-1, 4)
        pred_pair_right_boxes = pred_pair_boxes.repeat(
            instance_num * instance_num, 1)
        pred_union_iou = boxes_iou(pred_pair_left_boxes, pred_pair_right_boxes).view(
            instance_num * instance_num, instance_num * instance_num)

        # One-hot matrix marking which instances participate in each pair.
        left = torch.arange(0, instance_num).repeat(instance_num, 1).permute(
            1, 0).flatten().to(pred_classes.device)
        right = torch.arange(0, instance_num).repeat(instance_num).flatten().to(
            pred_classes.device)
        lr_loc = torch.stack([left, right], dim=1)
        pred_pair_instance_relate_matrix = torch.zeros(
            instance_num * instance_num,
            instance_num).to(pred_classes.device).scatter_(1, lr_loc, 1.0)

        pred_pair_instance.pred_pair_sub_classes = pred_pair_sub_classes
        pred_pair_instance.pred_pair_obj_classes = pred_pair_obj_classes
        pred_pair_instance.pred_pair_boxes = Boxes(pred_pair_boxes)
        pred_pair_instance.pred_pair_locations = pred_pair_locations
        pred_pair_instance.pred_pair_union_locations = pred_pair_union_locations
        pred_pair_instance.pred_pair_iou = pred_pair_iou
        pred_pair_instance.pred_union_iou = pred_union_iou
        pred_pair_instance.pred_pair_instance_relate_matrix = pred_pair_instance_relate_matrix
        pred_pair_instances.append(pred_pair_instance)
    return pred_pair_instances
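# `boxes_iou` is used above but defined elsewhere in this project; below is a
# plausible stand-in (named with a `_sketch` suffix to make clear it is an
# assumption): element-wise IoU between two aligned (N, 4) XYXY box tensors,
# which is the shape contract the pairwise construction above implies.
def boxes_iou_sketch(boxes1, boxes2):
    # Intersection corners for each aligned pair of boxes.
    lt = torch.max(boxes1[:, :2], boxes2[:, :2])  # (N, 2)
    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # (N, 2)
    wh = (rb - lt).clamp(min=0)                   # clip empty overlaps to 0
    inter = wh[:, 0] * wh[:, 1]
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    return inter / (area1 + area2 - inter + 1e-6)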
def _forward_box(
    self, features: Dict[str, torch.Tensor], proposals: List[Instances]
) -> Union[Dict[str, torch.Tensor], List[Instances]]:
    """
    Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`,
    the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument.

    Args:
        features (dict[str, Tensor]): mapping from feature map names to tensor.
            Same as in :meth:`ROIHeads.forward`.
        proposals (list[Instances]): the per-image object proposals with
            their matching ground truth.
            Each has fields "proposal_boxes", and "objectness_logits",
            "gt_classes", "gt_boxes".

    Returns:
        In training, a dict of losses.
        In inference, a list of `Instances`, the predicted instances.
    """
    features = [features[f] for f in self.box_in_features]
    box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
    # Weight pooled features by (shifted) objectness so low-quality proposals
    # contribute less to the box head.
    objectness_logits = torch.cat(
        [x.objectness_logits + 1 for x in proposals], dim=0)
    box_features = box_features * objectness_logits.view(-1, 1, 1, 1)
    box_features = self.box_head(box_features)
    predictions = self.box_predictor(box_features, proposals)

    if self.training:
        losses = self.box_predictor.losses(predictions, proposals,
                                           self.gt_classes_img_oh)
        self.pred_class_img_logits = (self.box_predictor.predict_probs_img(
            predictions, proposals).clone().detach())

        # Each refinement branch is supervised by the (detached) scores of
        # the previous one.
        prev_pred_scores = predictions[0].detach()
        for k in range(self.refine_K):
            predictions_k = self.box_refinery[k](box_features)
            losses_k = self.box_refinery[k].losses_pcl(
                predictions_k, proposals, prev_pred_scores,
                self.gt_classes_img_oh)
            prev_pred_scores = self.box_refinery[k].predict_probs(
                predictions_k, proposals)
            prev_pred_scores = [
                prev_pred_score.detach()
                for prev_pred_score in prev_pred_scores
            ][0]
            losses.update(losses_k)

        # proposals is modified in-place below, so losses must be computed first.
        if self.train_on_pred_boxes:
            with torch.no_grad():
                pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
                    predictions, proposals)
                for proposals_per_image, pred_boxes_per_image in zip(
                        proposals, pred_boxes):
                    proposals_per_image.proposal_boxes = Boxes(
                        pred_boxes_per_image)
        return losses
    else:
        predictions_K = []
        for k in range(self.refine_K):
            predictions_k = self.box_refinery[k](box_features)
            predictions_K.append(predictions_k)
        pred_instances, _, all_scores, all_boxes = self.box_refinery[
            -1].inference(predictions_K, proposals, pcl_bg=True)
        return pred_instances, all_scores, all_boxes
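# A stripped-down sketch of the refinement supervision pattern used above
# (not the project's actual `losses_pcl`): branch k is trained against the
# detached class probabilities of branch k-1, so gradients never flow back
# through the pseudo labels. All names here are illustrative.
import torch.nn.functional as F


def refinement_loss_sketch(branch_logits, prev_probs):
    # prev_probs: (R, C) detached probabilities from the previous branch,
    # used as soft targets for a simple cross-entropy.
    log_probs = F.log_softmax(branch_logits, dim=1)
    return -(prev_probs.detach() * log_probs).sum(dim=1).mean()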
def test_rpn(self):
    torch.manual_seed(121)
    cfg = get_cfg()
    cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
    cfg.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
    cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1)
    backbone = build_backbone(cfg)
    proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
    num_images = 2
    images_tensor = torch.rand(num_images, 20, 30)
    image_sizes = [(10, 10), (20, 30)]
    images = ImageList(images_tensor, image_sizes)
    image_shape = (15, 15)
    num_channels = 1024
    features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
    gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32)
    gt_instances = Instances(image_shape)
    gt_instances.gt_boxes = Boxes(gt_boxes)
    with EventStorage():  # capture events in a new storage to discard them
        proposals, proposal_losses = proposal_generator(
            images, features, [gt_instances[0], gt_instances[1]])

    expected_losses = {
        "loss_rpn_cls": torch.tensor(0.0804563984),
        "loss_rpn_loc": torch.tensor(0.0990132466),
    }
    for name in expected_losses.keys():
        self.assertTrue(
            torch.allclose(proposal_losses[name], expected_losses[name]))

    expected_proposal_boxes = [
        Boxes(torch.tensor([[0, 0, 10, 10], [7.3365392685, 0, 10, 10]])),
        Boxes(
            torch.tensor([
                [0, 0, 30, 20],
                [0, 0, 16.7862777710, 13.1362524033],
                [0, 0, 30, 13.3173446655],
                [0, 0, 10.8602609634, 20],
                [7.7165775299, 0, 27.3875980377, 20],
            ])),
    ]

    expected_objectness_logits = [
        torch.tensor([0.1225359365, -0.0133192837]),
        torch.tensor([
            0.1415634006, 0.0989848152, 0.0565387346, -0.0072308783,
            -0.0428492837
        ]),
    ]

    for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip(
            proposals, expected_proposal_boxes, image_sizes,
            expected_objectness_logits):
        self.assertEqual(len(proposal), len(expected_proposal_box))
        self.assertEqual(proposal.image_size, im_size)
        self.assertTrue(
            torch.allclose(proposal.proposal_boxes.tensor,
                           expected_proposal_box.tensor))
        self.assertTrue(
            torch.allclose(proposal.objectness_logits,
                           expected_objectness_logit))
def inference_single_image(self, cate_preds, kernel_preds, seg_preds,
                           cur_size, ori_size):
    def _empty_results():
        results = Instances(ori_size)
        results.scores = torch.tensor([])
        results.pred_classes = torch.tensor([])
        results.pred_masks = torch.tensor([])
        results.pred_boxes = Boxes(torch.tensor([]))
        return results

    # overall info.
    h, w = cur_size
    f_h, f_w = seg_preds.size()[-2:]
    ratio = math.ceil(h / f_h)
    upsampled_size_out = (int(f_h * ratio), int(f_w * ratio))

    # process.
    inds = (cate_preds > self.score_threshold)
    cate_scores = cate_preds[inds]
    if len(cate_scores) == 0:
        return _empty_results()

    # cate_labels & kernel_preds
    inds = inds.nonzero()
    cate_labels = inds[:, 1]
    kernel_preds = kernel_preds[inds[:, 0]]

    # trans vector: stride of the FPN level each prediction came from.
    size_trans = cate_labels.new_tensor(self.num_grids).pow(2).cumsum(0)
    strides = kernel_preds.new_ones(size_trans[-1])
    n_stage = len(self.num_grids)
    strides[:size_trans[0]] *= self.instance_strides[0]
    for ind_ in range(1, n_stage):
        strides[size_trans[ind_ - 1]:size_trans[ind_]] *= self.instance_strides[ind_]
    strides = strides[inds[:, 0]]

    # mask encoding: apply each predicted kernel as a 1x1 conv on the
    # shared mask feature.
    N, I = kernel_preds.shape
    kernel_preds = kernel_preds.view(N, I, 1, 1)
    seg_preds = F.conv2d(seg_preds, kernel_preds, stride=1).squeeze(0).sigmoid()

    # mask.
    seg_masks = seg_preds > self.mask_threshold
    sum_masks = seg_masks.sum((1, 2)).float()

    # filter out masks smaller than the stride of their level.
    keep = sum_masks > strides
    if keep.sum() == 0:
        return _empty_results()
    seg_masks = seg_masks[keep, ...]
    seg_preds = seg_preds[keep, ...]
    sum_masks = sum_masks[keep]
    cate_scores = cate_scores[keep]
    cate_labels = cate_labels[keep]

    # mask scoring.
    seg_scores = (seg_preds * seg_masks.float()).sum((1, 2)) / sum_masks
    cate_scores *= seg_scores

    # sort and keep top nms_pre
    sort_inds = torch.argsort(cate_scores, descending=True)
    if len(sort_inds) > self.max_before_nms:
        sort_inds = sort_inds[:self.max_before_nms]
    seg_masks = seg_masks[sort_inds, :, :]
    seg_preds = seg_preds[sort_inds, :, :]
    sum_masks = sum_masks[sort_inds]
    cate_scores = cate_scores[sort_inds]
    cate_labels = cate_labels[sort_inds]

    if self.nms_type == "matrix":
        # matrix nms & filter.
        cate_scores = matrix_nms(cate_labels, seg_masks, sum_masks,
                                 cate_scores, sigma=self.nms_sigma,
                                 kernel=self.nms_kernel)
        keep = cate_scores >= self.update_threshold
    elif self.nms_type == "mask":
        # original mask nms.
        keep = mask_nms(cate_labels, seg_masks, sum_masks, cate_scores,
                        nms_thr=self.mask_threshold)
    else:
        raise NotImplementedError

    if keep.sum() == 0:
        return _empty_results()
    seg_preds = seg_preds[keep, :, :]
    cate_scores = cate_scores[keep]
    cate_labels = cate_labels[keep]

    # sort and keep top_k
    sort_inds = torch.argsort(cate_scores, descending=True)
    if len(sort_inds) > self.max_per_img:
        sort_inds = sort_inds[:self.max_per_img]
    seg_preds = seg_preds[sort_inds, :, :]
    cate_scores = cate_scores[sort_inds]
    cate_labels = cate_labels[sort_inds]

    # reshape to original size.
    seg_preds = F.interpolate(seg_preds.unsqueeze(0),
                              size=upsampled_size_out,
                              mode='bilinear')[:, :, :h, :w]
    seg_masks = F.interpolate(seg_preds, size=ori_size,
                              mode='bilinear').squeeze(0)
    seg_masks = seg_masks > self.mask_threshold

    results = Instances(ori_size)
    results.pred_classes = cate_labels
    results.scores = cate_scores
    results.pred_masks = seg_masks

    # get bbox from mask: tight XYXY box around each binary mask
    # (boxes of empty masks stay all-zero).
    pred_boxes = torch.zeros(seg_masks.size(0), 4)
    for i in range(seg_masks.size(0)):
        mask = seg_masks[i].squeeze()
        ys, xs = torch.where(mask)
        if xs.numel() > 0:
            pred_boxes[i] = torch.tensor(
                [xs.min(), ys.min(), xs.max(), ys.max()]).float()
    results.pred_boxes = Boxes(pred_boxes)
    return results
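# `matrix_nms` above is defined elsewhere; below is a compact sketch of
# SOLOv2-style Matrix NMS over binary masks, assuming score-descending input
# (which the caller guarantees). Decay formulas follow the SOLOv2 paper; the
# `_sketch` name marks this as an illustrative stand-in, not the project's code.
def matrix_nms_sketch(cate_labels, seg_masks, sum_masks, cate_scores,
                      sigma=2.0, kernel="gaussian"):
    n = len(cate_labels)
    masks = seg_masks.reshape(n, -1).float()
    inter = masks @ masks.t()                     # (n, n) pairwise intersections
    areas = sum_masks.expand(n, n)
    union = areas + areas.t() - inter
    iou = (inter / union).triu(diagonal=1)        # keep higher-scored pairs only
    same_cls = (cate_labels.expand(n, n) ==
                cate_labels.expand(n, n).t()).float().triu(diagonal=1)
    decay_iou = iou * same_cls                    # IoU with higher-scored same-class masks
    compensate_iou, _ = decay_iou.max(0)
    compensate_iou = compensate_iou.expand(n, n).t()
    if kernel == "gaussian":
        decay = torch.exp(-sigma * (decay_iou ** 2 - compensate_iou ** 2))
    else:  # linear kernel
        decay = (1 - decay_iou) / (1 - compensate_iou)
    decay, _ = decay.min(0)
    return cate_scores * decay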
def convert_to_coco_dict(dataset_name):
    """
    Convert an instance detection/segmentation or keypoint detection dataset
    in detectron2's standard format into COCO json format.

    Generic dataset description can be found here:
    https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset

    COCO data format description can be found here:
    http://cocodataset.org/#format-data

    Args:
        dataset_name (str):
            name of the source dataset
            Must be registered in DatasetCatalog and in detectron2's standard format.
            Must have corresponding metadata "thing_classes"
    Returns:
        coco_dict: serializable dict in COCO json format
    """

    dataset_dicts = DatasetCatalog.get(dataset_name)
    metadata = MetadataCatalog.get(dataset_name)

    # unmap the category mapping ids for COCO
    if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
        reverse_id_mapping = {v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items()}
        reverse_id_mapper = lambda contiguous_id: reverse_id_mapping[contiguous_id]  # noqa
    else:
        reverse_id_mapper = lambda contiguous_id: contiguous_id  # noqa

    categories = [
        {"id": reverse_id_mapper(id), "name": name}
        for id, name in enumerate(metadata.thing_classes)
    ]

    logger.info("Converting dataset dicts into COCO format")
    coco_images = []
    coco_annotations = []

    for image_id, image_dict in enumerate(dataset_dicts):
        coco_image = {
            "id": image_dict.get("image_id", image_id),
            "width": image_dict["width"],
            "height": image_dict["height"],
            "file_name": image_dict["file_name"],
        }
        coco_images.append(coco_image)

        anns_per_image = image_dict.get("annotations", [])
        for annotation in anns_per_image:
            # create a new dict with only COCO fields
            coco_annotation = {}

            # COCO requirement: XYWH box format
            bbox = annotation["bbox"]
            bbox_mode = annotation["bbox_mode"]
            bbox = BoxMode.convert(bbox, bbox_mode, BoxMode.XYWH_ABS)

            # COCO requirement: instance area
            if "segmentation" in annotation:
                # Computing areas for instances by counting the pixels
                segmentation = annotation["segmentation"]
                # TODO: check segmentation type: RLE, BinaryMask or Polygon
                if isinstance(segmentation, list):
                    polygons = PolygonMasks([segmentation])
                    area = polygons.area()[0].item()
                elif isinstance(segmentation, dict):  # RLE
                    area = mask_util.area(segmentation).item()
                else:
                    raise TypeError(f"Unknown segmentation type {type(segmentation)}!")
            else:
                # Computing areas using bounding boxes
                bbox_xy = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
                area = Boxes([bbox_xy]).area()[0].item()

            if "keypoints" in annotation:
                keypoints = annotation["keypoints"]  # list[int]
                for idx, v in enumerate(keypoints):
                    if idx % 3 != 2:
                        # COCO's segmentation coordinates are floating points in [0, H or W],
                        # but keypoint coordinates are integers in [0, H-1 or W-1]
                        # For COCO format consistency we subtract 0.5
                        # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163
                        keypoints[idx] = v - 0.5
                if "num_keypoints" in annotation:
                    num_keypoints = annotation["num_keypoints"]
                else:
                    num_keypoints = sum(kp > 0 for kp in keypoints[2::3])

            # COCO requirement:
            #   linking annotations to images
            #   "id" field must start with 1
            coco_annotation["id"] = len(coco_annotations) + 1
            coco_annotation["image_id"] = coco_image["id"]
            coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
            coco_annotation["area"] = float(area)
            coco_annotation["iscrowd"] = annotation.get("iscrowd", 0)
            coco_annotation["category_id"] = reverse_id_mapper(annotation["category_id"])

            # Add optional fields
            if "keypoints" in annotation:
                coco_annotation["keypoints"] = keypoints
                coco_annotation["num_keypoints"] = num_keypoints

            if "segmentation" in annotation:
                seg = coco_annotation["segmentation"] = annotation["segmentation"]
                if isinstance(seg, dict):  # RLE
                    counts = seg["counts"]
                    if not isinstance(counts, str):
                        # make it json-serializable
                        seg["counts"] = counts.decode("ascii")

            coco_annotations.append(coco_annotation)

    logger.info(
        "Conversion finished, "
        f"#images: {len(coco_images)}, #annotations: {len(coco_annotations)}"
    )

    info = {
        "date_created": str(datetime.datetime.now()),
        "description": "Automatically generated COCO json file for Detectron2.",
    }
    coco_dict = {"info": info, "images": coco_images, "categories": categories, "licenses": None}
    if len(coco_annotations) > 0:
        coco_dict["annotations"] = coco_annotations
    return coco_dict
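# A minimal usage sketch for the converter above, assuming "my_dataset" was
# registered with DatasetCatalog/MetadataCatalog beforehand; the dataset name
# and output path are made up.
import json


def dump_coco_json_sketch():
    coco_dict = convert_to_coco_dict("my_dataset")
    with open("my_dataset_coco_format.json", "w") as f:
        json.dump(coco_dict, f)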
def match_predictions_to_groundtruth(predicted_box_means,
                                     predicted_cls_probs,
                                     predicted_box_covariances,
                                     gt_box_means,
                                     gt_cat_idxs,
                                     iou_min=0.1,
                                     iou_correct=0.7):
    true_positives = dict({
        'predicted_box_means': torch.Tensor().to(device),
        'predicted_box_covariances': torch.Tensor().to(device),
        'predicted_cls_probs': torch.Tensor().to(device),
        'gt_box_means': torch.Tensor().to(device),
        'gt_cat_idxs': torch.Tensor().to(device),
        'iou_with_ground_truth': torch.Tensor().to(device)
    })

    duplicates = dict({
        'predicted_box_means': torch.Tensor().to(device),
        'predicted_box_covariances': torch.Tensor().to(device),
        'predicted_cls_probs': torch.Tensor().to(device),
        'gt_box_means': torch.Tensor().to(device),
        'gt_cat_idxs': torch.Tensor().to(device),
        'iou_with_ground_truth': torch.Tensor().to(device)
    })

    false_positives = dict({
        'predicted_box_means': torch.Tensor().to(device),
        'predicted_box_covariances': torch.Tensor().to(device),
        'predicted_cls_probs': torch.Tensor().to(device)
    })

    false_negatives = dict({
        'gt_box_means': torch.Tensor().to(device),
        'gt_cat_idxs': torch.Tensor().to(device)
    })

    with tqdm.tqdm(total=len(predicted_box_means)) as pbar:
        for key in predicted_box_means.keys():
            pbar.update(1)

            # Check if gt available, if not all detections go to false positives
            if key not in gt_box_means.keys():
                false_positives['predicted_box_means'] = torch.cat(
                    (false_positives['predicted_box_means'],
                     predicted_box_means[key]))
                false_positives['predicted_cls_probs'] = torch.cat(
                    (false_positives['predicted_cls_probs'],
                     predicted_cls_probs[key]))
                false_positives['predicted_box_covariances'] = torch.cat(
                    (false_positives['predicted_box_covariances'],
                     predicted_box_covariances[key]))
                continue

            # Compute iou between gt boxes and all predicted boxes in frame
            frame_gt_boxes = Boxes(gt_box_means[key])
            frame_predicted_boxes = Boxes(predicted_box_means[key])
            match_iou = pairwise_iou(frame_gt_boxes, frame_predicted_boxes)

            # Get false negative ground truth, which are fully missed.
            # These can be found by looking for ground truth boxes that have an
            # iou < iou_min with every detection.
            false_negative_idxs = (match_iou <= iou_min).all(1)
            false_negatives['gt_box_means'] = torch.cat(
                (false_negatives['gt_box_means'],
                 gt_box_means[key][false_negative_idxs]))
            false_negatives['gt_cat_idxs'] = torch.cat(
                (false_negatives['gt_cat_idxs'],
                 gt_cat_idxs[key][false_negative_idxs]))

            # False positives are detections that have an iou < iou_min with
            # every ground truth object.
            false_positive_idxs = (match_iou <= iou_min).all(0)
            false_positives['predicted_box_means'] = torch.cat(
                (false_positives['predicted_box_means'],
                 predicted_box_means[key][false_positive_idxs]))
            false_positives['predicted_cls_probs'] = torch.cat(
                (false_positives['predicted_cls_probs'],
                 predicted_cls_probs[key][false_positive_idxs]))
            false_positives['predicted_box_covariances'] = torch.cat(
                (false_positives['predicted_box_covariances'],
                 predicted_box_covariances[key][false_positive_idxs]))

            # True positives are any detections with match iou >= iou_correct.
            # We need to separate these detections into the true positive and
            # duplicate sets. The true positive detection is the detection
            # assigned the highest score by the neural network.
            true_positive_idxs = torch.nonzero(match_iou >= iou_correct)

            # Setup tensors to allow assignment of detections only once.
            gt_idxs_processed = torch.tensor([]).type(
                torch.LongTensor).to(device)

            for i in torch.arange(frame_gt_boxes.tensor.shape[0]):
                # Check if true positive has been previously assigned to a
                # ground truth box and remove it if this is the case. Very rare
                # occurrence but need to handle it nevertheless.
                gt_idxs = true_positive_idxs[true_positive_idxs[:, 0] == i][:, 1]
                non_valid_idxs = torch.nonzero(
                    gt_idxs_processed[..., None] == gt_idxs)

                if non_valid_idxs.shape[0] > 0:
                    gt_idxs[non_valid_idxs[:, 1]] = -1
                    gt_idxs = gt_idxs[gt_idxs != -1]

                if gt_idxs.shape[0] > 0:
                    current_matches_predicted_cls_probs = predicted_cls_probs[
                        key][gt_idxs]
                    max_score, _ = torch.max(
                        current_matches_predicted_cls_probs, 1)
                    _, max_idxs = max_score.topk(max_score.shape[0])

                    if max_idxs.shape[0] > 1:
                        max_idx = max_idxs[0]
                        duplicate_idxs = max_idxs[1:]
                    else:
                        max_idx = max_idxs
                        duplicate_idxs = torch.empty(0).to(device)

                    current_matches_predicted_box_means = predicted_box_means[
                        key][gt_idxs]
                    current_matches_predicted_box_covariances = predicted_box_covariances[
                        key][gt_idxs]

                    # Highest scoring detection goes to true positives
                    true_positives['predicted_box_means'] = torch.cat(
                        (true_positives['predicted_box_means'],
                         current_matches_predicted_box_means[max_idx:max_idx + 1, :]))
                    true_positives['predicted_cls_probs'] = torch.cat(
                        (true_positives['predicted_cls_probs'],
                         current_matches_predicted_cls_probs[max_idx:max_idx + 1, :]))
                    true_positives['predicted_box_covariances'] = torch.cat(
                        (true_positives['predicted_box_covariances'],
                         current_matches_predicted_box_covariances[max_idx:max_idx + 1, :]))

                    true_positives['gt_box_means'] = torch.cat(
                        (true_positives['gt_box_means'],
                         gt_box_means[key][i:i + 1, :]))
                    true_positives['gt_cat_idxs'] = torch.cat(
                        (true_positives['gt_cat_idxs'],
                         gt_cat_idxs[key][i:i + 1, :]))
                    true_positives['iou_with_ground_truth'] = torch.cat(
                        (true_positives['iou_with_ground_truth'],
                         match_iou[i, gt_idxs][max_idx:max_idx + 1]))

                    # Lower scoring redundant detections go to duplicates
                    if duplicate_idxs.shape[0] > 1:
                        duplicates['predicted_box_means'] = torch.cat(
                            (duplicates['predicted_box_means'],
                             current_matches_predicted_box_means[duplicate_idxs, :]))
                        duplicates['predicted_cls_probs'] = torch.cat(
                            (duplicates['predicted_cls_probs'],
                             current_matches_predicted_cls_probs[duplicate_idxs, :]))
                        duplicates['predicted_box_covariances'] = torch.cat(
                            (duplicates['predicted_box_covariances'],
                             current_matches_predicted_box_covariances[duplicate_idxs, :]))

                        duplicates['gt_box_means'] = torch.cat(
                            (duplicates['gt_box_means'],
                             gt_box_means[key][np.repeat(i, duplicate_idxs.shape[0]), :]))
                        duplicates['gt_cat_idxs'] = torch.cat(
                            (duplicates['gt_cat_idxs'],
                             gt_cat_idxs[key][np.repeat(i, duplicate_idxs.shape[0]), :]))
                        duplicates['iou_with_ground_truth'] = torch.cat(
                            (duplicates['iou_with_ground_truth'],
                             match_iou[i, gt_idxs][duplicate_idxs]))

                    elif duplicate_idxs.shape[0] == 1:
                        # Special case when only one duplicate exists, required
                        # to index properly for torch.cat
                        duplicates['predicted_box_means'] = torch.cat(
                            (duplicates['predicted_box_means'],
                             current_matches_predicted_box_means[duplicate_idxs:duplicate_idxs + 1, :]))
                        duplicates['predicted_cls_probs'] = torch.cat(
                            (duplicates['predicted_cls_probs'],
                             current_matches_predicted_cls_probs[duplicate_idxs:duplicate_idxs + 1, :]))
                        duplicates['predicted_box_covariances'] = torch.cat(
                            (duplicates['predicted_box_covariances'],
                             current_matches_predicted_box_covariances[duplicate_idxs:duplicate_idxs + 1, :]))

                        duplicates['gt_box_means'] = torch.cat(
                            (duplicates['gt_box_means'],
                             gt_box_means[key][i:i + 1, :]))
                        duplicates['gt_cat_idxs'] = torch.cat(
                            (duplicates['gt_cat_idxs'],
                             gt_cat_idxs[key][i:i + 1, :]))
                        duplicates['iou_with_ground_truth'] = torch.cat(
                            (duplicates['iou_with_ground_truth'],
                             match_iou[i, gt_idxs][duplicate_idxs:duplicate_idxs + 1]))

    matched_results = dict()
    matched_results.update({
        "true_positives": true_positives,
        "duplicates": duplicates,
        "false_positives": false_positives,
        "false_negatives": false_negatives
    })
    return matched_results
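# A toy invocation sketch of the matcher above: one frame keyed by image id,
# with one detection matching the ground truth and one spurious detection.
# Values are made up; `device` is assumed to be the same module-level global
# the function itself uses.
def match_sketch():
    preds = {"img0": torch.tensor([[0., 0., 10., 10.],
                                   [100., 100., 120., 120.]]).to(device)}
    probs = {"img0": torch.tensor([[0.9, 0.1], [0.6, 0.4]]).to(device)}
    covs = {"img0": torch.zeros(2, 4, 4).to(device)}
    gts = {"img0": torch.tensor([[0., 0., 10., 10.]]).to(device)}
    cats = {"img0": torch.tensor([[0.]]).to(device)}
    matched = match_predictions_to_groundtruth(preds, probs, covs, gts, cats)
    # box 0 overlaps the gt (true positive), box 1 does not (false positive).
    print(len(matched["true_positives"]["predicted_box_means"]),
          len(matched["false_positives"]["predicted_box_means"]))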
def label_and_sample_proposals(self, proposals, targets):
    """
    Prepare some proposals to be used to train the ROI heads.
    It performs box matching between `proposals` and `targets`, and assigns
    training labels to the proposals.
    It returns ``self.batch_size_per_image`` random samples from proposals and groundtruth
    boxes, with a fraction of positives that is no larger than
    ``self.positive_sample_fraction``.

    Args:
        See :meth:`ROIHeads.forward`

    Returns:
        list[Instances]:
            length `N` list of `Instances`s containing the proposals
            sampled for training. Each `Instances` has the following fields:

            - proposal_boxes: the proposal boxes
            - gt_boxes: the ground-truth box that the proposal is assigned to
              (this is only meaningful if the proposal has a label > 0; if label = 0
              then the ground-truth box is random)

            Other fields such as "gt_classes" and "gt_masks" that are included
            in `targets`.
    """
    # ywlee: pass `targets` (not just gt_boxes) so that
    # add_ground_truth_to_proposals() can use targets.gt_classes.

    # Augment proposals with ground-truth boxes.
    # In the case of learned proposals (e.g., RPN), when training starts
    # the proposals will be low quality due to random initialization.
    # It's possible that none of these initial
    # proposals have high enough overlap with the gt objects to be used
    # as positive examples for the second stage components (box head,
    # cls head, mask head). Adding the gt boxes to the set of proposals
    # ensures that the second stage components will have some positive
    # examples from the start of training. For RPN, this augmentation improves
    # convergence and empirically improves box AP on COCO by about 0.5
    # points (under one tested configuration).
    if self.proposal_append_gt:
        proposals = add_ground_truth_to_proposals(targets, proposals)

    proposals_with_gt = []

    num_fg_samples = []
    num_bg_samples = []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        has_gt = len(targets_per_image) > 0
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
        matched_idxs, matched_labels = self.proposal_matcher(
            match_quality_matrix)
        sampled_idxs, gt_classes = self._sample_proposals(
            matched_idxs, matched_labels, targets_per_image.gt_classes)

        # Set target attributes of the sampled proposals:
        proposals_per_image = proposals_per_image[sampled_idxs]
        proposals_per_image.gt_classes = gt_classes

        # We index all the attributes of targets that start with "gt_"
        # and have not been added to proposals yet (="gt_classes").
        if has_gt:
            sampled_targets = matched_idxs[sampled_idxs]
            # NOTE: here the indexing wastes some compute, because heads
            # like masks, keypoints, etc, will filter the proposals again,
            # (by foreground/background, or number of keypoints in the image, etc)
            # so we essentially index the data twice.
            for (trg_name, trg_value) in targets_per_image.get_fields().items():
                if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
                    proposals_per_image.set(trg_name, trg_value[sampled_targets])
        else:
            gt_boxes = Boxes(
                targets_per_image.gt_boxes.tensor.new_zeros(
                    (len(sampled_idxs), 4)))
            proposals_per_image.gt_boxes = gt_boxes

        num_bg_samples.append(
            (gt_classes == self.num_classes).sum().item())
        num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
        proposals_with_gt.append(proposals_per_image)

    # Log the number of fg/bg samples that are selected for training ROI heads
    storage = get_event_storage()
    storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
    storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))

    return proposals_with_gt
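# A minimal sketch of the fg/bg subsampling that `_sample_proposals` performs
# internally (detectron2 implements this in `subsample_labels`); the function
# below is an illustrative standalone version, not the library's code.
def subsample_labels_sketch(labels, num_samples, positive_fraction, bg_label):
    # Foreground: labeled and not background; ignore entries marked -1.
    positive = torch.nonzero((labels != -1) & (labels != bg_label)).squeeze(1)
    negative = torch.nonzero(labels == bg_label).squeeze(1)
    num_pos = min(positive.numel(), int(num_samples * positive_fraction))
    num_neg = min(negative.numel(), num_samples - num_pos)
    # Random permutation so the sampled subset is unbiased.
    perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
    perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
    return positive[perm1], negative[perm2]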
def _evaluate_box_proposals(dataset_predictions, lvis_api, thresholds=None,
                            area="all", limit=None):
    """
    Evaluate detection proposal recall metrics. This function is a much
    faster alternative to the official LVIS API recall evaluation code. However, it
    produces slightly different results.
    """
    # Record max overlap value for each gt box
    # Return vector of overlap values
    areas = {
        "all": 0,
        "small": 1,
        "medium": 2,
        "large": 3,
        "96-128": 4,
        "128-256": 5,
        "256-512": 6,
        "512-inf": 7,
    }
    area_ranges = [
        [0**2, 1e5**2],    # all
        [0**2, 32**2],     # small
        [32**2, 96**2],    # medium
        [96**2, 1e5**2],   # large
        [96**2, 128**2],   # 96-128
        [128**2, 256**2],  # 128-256
        [256**2, 512**2],  # 256-512
        [512**2, 1e5**2],  # 512-inf
    ]
    assert area in areas, "Unknown area range: {}".format(area)
    area_range = area_ranges[areas[area]]
    gt_overlaps = []
    num_pos = 0

    for prediction_dict in dataset_predictions:
        predictions = prediction_dict["proposals"]

        # sort predictions in descending order
        # TODO maybe remove this and make it explicit in the documentation
        inds = predictions.objectness_logits.sort(descending=True)[1]
        predictions = predictions[inds]

        ann_ids = lvis_api.get_ann_ids(img_ids=[prediction_dict["image_id"]])
        anno = lvis_api.load_anns(ann_ids)
        gt_boxes = [
            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
            for obj in anno
        ]
        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
        gt_boxes = Boxes(gt_boxes)
        gt_areas = torch.as_tensor([obj["area"] for obj in anno])

        if len(gt_boxes) == 0 or len(predictions) == 0:
            continue

        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
        gt_boxes = gt_boxes[valid_gt_inds]

        num_pos += len(gt_boxes)

        if len(gt_boxes) == 0:
            continue

        if limit is not None and len(predictions) > limit:
            predictions = predictions[:limit]

        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)

        _gt_overlaps = torch.zeros(len(gt_boxes))
        for j in range(min(len(predictions), len(gt_boxes))):
            # find which proposal box maximally covers each gt box
            # and get the iou amount of coverage for each gt box
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)

            # find which gt box is 'best' covered (i.e. 'best' = most iou)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            assert gt_ovr >= 0
            # find the proposal box that covers the best covered gt box
            box_ind = argmax_overlaps[gt_ind]
            # record the iou coverage of this gt box
            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert _gt_overlaps[j] == gt_ovr
            # mark the proposal box and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1

        # append recorded iou coverage level
        gt_overlaps.append(_gt_overlaps)
    gt_overlaps = torch.cat(gt_overlaps, dim=0)
    gt_overlaps, _ = torch.sort(gt_overlaps)

    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    recalls = torch.zeros_like(thresholds)
    # compute recall for each iou threshold
    for i, t in enumerate(thresholds):
        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
    # ar = 2 * np.trapz(recalls, thresholds)
    ar = recalls.mean()
    return {
        "ar": ar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": num_pos,
    }
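# A tiny worked example of the greedy matching loop above: with two proposals
# and two gt boxes, the highest-IoU pair is consumed first, and both its row
# and column are masked out, so each proposal/gt participates at most once.
def greedy_matching_demo():
    overlaps = torch.tensor([[0.9, 0.3],
                             [0.8, 0.6]])  # proposals x gt
    recorded = []
    for _ in range(2):
        max_overlaps, argmax_overlaps = overlaps.max(dim=0)
        gt_ovr, gt_ind = max_overlaps.max(dim=0)
        box_ind = argmax_overlaps[gt_ind]
        recorded.append(overlaps[box_ind, gt_ind].item())
        overlaps[box_ind, :] = -1
        overlaps[:, gt_ind] = -1
    print(recorded)  # [0.9, 0.6]: proposal 0 -> gt 0, proposal 1 -> gt 1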
def convert_to_coco_dict(dataset_name):
    """
    Convert a dataset in detectron2's standard format into COCO json format

    Generic dataset description can be found here:
    https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset

    COCO data format description can be found here:
    http://cocodataset.org/#format-data

    Args:
        dataset_name:
            name of the source dataset
            must be registered in DatasetCatalog and in detectron2's standard format
    Returns:
        coco_dict: serializable dict in COCO json format
    """

    dataset_dicts = DatasetCatalog.get(dataset_name)
    categories = [{
        "id": id,
        "name": name
    } for id, name in enumerate(
        MetadataCatalog.get(dataset_name).thing_classes)]

    logger.info("Converting dataset dicts into COCO format")
    coco_images = []
    coco_annotations = []

    for image_id, image_dict in enumerate(dataset_dicts):
        coco_image = {
            "id": image_dict.get("image_id", image_id),
            "width": image_dict["width"],
            "height": image_dict["height"],
            "file_name": image_dict["file_name"],
        }
        coco_images.append(coco_image)

        anns_per_image = image_dict["annotations"]
        for annotation in anns_per_image:
            # create a new dict with only COCO fields
            coco_annotation = {}

            # COCO requirement: XYWH box format
            bbox = annotation["bbox"]
            bbox_mode = annotation["bbox_mode"]
            bbox = BoxMode.convert(bbox, bbox_mode, BoxMode.XYWH_ABS)

            # COCO requirement: instance area
            if "segmentation" in annotation:
                # Computing areas for instances by counting the pixels
                segmentation = annotation["segmentation"]
                # TODO: check segmentation type: RLE, BinaryMask or Polygon
                polygons = PolygonMasks([segmentation])
                area = polygons.area()[0].item()
            else:
                # Computing areas using bounding boxes
                bbox_xy = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
                area = Boxes([bbox_xy]).area()[0].item()

            if "keypoints" in annotation:
                keypoints = annotation["keypoints"]  # list[int]
                for idx, v in enumerate(keypoints):
                    if idx % 3 != 2:
                        # COCO's segmentation coordinates are floating points in [0, H or W],
                        # but keypoint coordinates are integers in [0, H-1 or W-1]
                        # For COCO format consistency we subtract 0.5
                        # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163
                        keypoints[idx] = v - 0.5
                if "num_keypoints" in annotation:
                    num_keypoints = annotation["num_keypoints"]
                else:
                    num_keypoints = sum(kp > 0 for kp in keypoints[2::3])

            # COCO requirement:
            #   linking annotations to images
            #   "id" field must start with 1
            coco_annotation["id"] = len(coco_annotations) + 1
            coco_annotation["image_id"] = coco_image["id"]
            coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
            coco_annotation["area"] = area
            coco_annotation["category_id"] = annotation["category_id"]
            coco_annotation["iscrowd"] = annotation.get("iscrowd", 0)

            # Add optional fields
            if "keypoints" in annotation:
                coco_annotation["keypoints"] = keypoints
                coco_annotation["num_keypoints"] = num_keypoints

            if "segmentation" in annotation:
                coco_annotation["segmentation"] = annotation["segmentation"]

            coco_annotations.append(coco_annotation)

    logger.info(
        "Conversion finished, "
        f"num images: {len(coco_images)}, num annotations: {len(coco_annotations)}"
    )

    info = {
        "date_created": str(datetime.datetime.now()),
        "description": "Automatically generated COCO json file for Detectron2.",
    }
    coco_dict = {
        "info": info,
        "images": coco_images,
        "annotations": coco_annotations,
        "categories": categories,
        "licenses": None,
    }
    return coco_dict
def apply_late_fusion_and_evaluate(cfg, evaluator, det_1, det_2, method):
    evaluator.reset()
    img_folder = '../../../Datasets/FLIR/val/thermal_8_bit/'
    num_img = len(det_2['image'])
    count_1 = 0
    count_2 = 0
    count_fusion = 0
    print('Method: ', method)

    for i in range(num_img):
        info_1 = {}
        info_1['img_name'] = det_1['image'][i]
        info_1['bbox'] = det_1['boxes'][i]
        info_1['score'] = det_1['scores'][i]
        info_1['class'] = det_1['classes'][i]

        info_2 = {}
        info_2['img_name'] = det_2['image'][i].split('.')[0] + '.jpeg'
        info_2['bbox'] = det_2['boxes'][i]
        info_2['score'] = det_2['scores'][i]
        info_2['class'] = det_2['classes'][i]

        # If either detector returned nothing, fall back to the other one
        # (when both are empty, out_boxes ends up empty as well).
        if len(info_1['bbox']) == 0 or len(info_2['bbox']) == 0:
            if len(info_1['bbox']) > 0:
                out_boxes = np.array(info_1['bbox'])
                out_class = torch.Tensor(info_1['class'])
                out_scores = torch.Tensor(info_1['score'])
            else:
                out_boxes = np.array(info_2['bbox'])
                out_class = torch.Tensor(info_2['class'])
                out_scores = torch.Tensor(info_2['score'])
        else:
            if method == 'nms':
                out_boxes, out_scores, out_class = nms_1(info_1, info_2)
            elif method == 'pooling':
                in_boxes, in_scores, in_class = prepare_data(info_1, info_2)
                out_boxes = in_boxes
                out_scores = torch.Tensor(in_scores)
                out_class = torch.Tensor(in_class)
            elif method in ('baysian', 'baysian_avg_bbox', 'avg_score',
                            'baysian_wt_score_box'):
                threshold = 0.5
                in_boxes, in_scores, in_class = prepare_data(info_1, info_2)
                keep, out_scores, out_boxes, out_class = nms_2(
                    in_boxes, in_scores, in_class, threshold, method)

        count_1 += len(info_1['bbox'])
        count_2 += len(info_2['bbox'])
        count_fusion += len(out_boxes)

        file_name = img_folder + info_1['img_name'].split('.')[0] + '.jpeg'
        img = cv2.imread(file_name)
        H, W, _ = img.shape

        # Handle inputs
        inputs = []
        input_info = {}
        input_info['file_name'] = file_name
        input_info['height'] = H
        input_info['width'] = W
        input_info['image_id'] = det_2['image_id'][i]
        input_info['image'] = torch.Tensor(img)
        inputs.append(input_info)

        # Handle outputs
        outputs = []
        out_info = {}
        proposals = Instances([H, W])
        proposals.pred_boxes = Boxes(out_boxes)
        proposals.scores = out_scores
        proposals.pred_classes = out_class
        out_info['instances'] = proposals
        outputs.append(out_info)
        evaluator.process(inputs, outputs)

        img = draw_box(img, out_boxes, (0, 255, 0))
        out_img_name = 'out_img_baysian_fusion/' + file_name.split(
            'thermal_8_bit/')[1].split('.')[0] + '_baysian_avg_bbox.jpg'
        # cv2.imwrite(out_img_name, img)

    results = evaluator.evaluate(out_eval_path='FLIR_pooling_.out')
    if results is None:
        results = {}

    avgRGB = count_1 / num_img
    avgThermal = count_2 / num_img
    avgNMS = count_fusion / num_img
    print('Avg bbox for RGB:', avgRGB, 'average count thermal:', avgThermal,
          'average count nms:', avgNMS)
    return results
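# `prepare_data`, `nms_1`, and `nms_2` are defined elsewhere in this project;
# below is a plausible minimal sketch of `prepare_data` (marked `_sketch` as
# an assumption), which simply pools the two detectors' boxes/scores/classes
# into single arrays, matching how the 'pooling' branch above consumes it.
def prepare_data_sketch(info_1, info_2):
    in_boxes = np.concatenate(
        (np.array(info_1['bbox']), np.array(info_2['bbox'])), axis=0)
    in_scores = np.concatenate(
        (np.array(info_1['score']), np.array(info_2['score'])), axis=0)
    in_class = np.concatenate(
        (np.array(info_1['class']), np.array(info_2['class'])), axis=0)
    return in_boxes, in_scores, in_class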
def forward_for_single_feature_map(self, locations, logits_pred, reg_pred,
                                   ctrness_pred, image_sizes, top_feat=None):
    N, C, H, W = logits_pred.shape

    # put in the same format as locations
    logits_pred = logits_pred.view(N, C, H, W).permute(0, 2, 3, 1)
    logits_pred = logits_pred.reshape(N, -1, C).sigmoid()
    box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1)
    box_regression = box_regression.reshape(N, -1, 4)
    ctrness_pred = ctrness_pred.view(N, 1, H, W).permute(0, 2, 3, 1)
    ctrness_pred = ctrness_pred.reshape(N, -1).sigmoid()
    if top_feat is not None:
        top_feat = top_feat.view(N, -1, H, W).permute(0, 2, 3, 1)
        top_feat = top_feat.reshape(N, H * W, -1)

    # if self.thresh_with_ctr is True, we multiply the classification
    # scores with centerness scores before applying the threshold.
    if self.thresh_with_ctr:
        logits_pred = logits_pred * ctrness_pred[:, :, None]
    candidate_inds = logits_pred > self.pre_nms_thresh
    pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
    pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_topk)

    if not self.thresh_with_ctr:
        logits_pred = logits_pred * ctrness_pred[:, :, None]

    results = []
    for i in range(N):
        per_box_cls = logits_pred[i]
        per_candidate_inds = candidate_inds[i]
        per_box_cls = per_box_cls[per_candidate_inds]

        per_candidate_nonzeros = per_candidate_inds.nonzero()
        per_box_loc = per_candidate_nonzeros[:, 0]
        per_class = per_candidate_nonzeros[:, 1]

        per_box_regression = box_regression[i]
        per_box_regression = per_box_regression[per_box_loc]
        per_locations = locations[per_box_loc]
        if top_feat is not None:
            per_top_feat = top_feat[i]
            per_top_feat = per_top_feat[per_box_loc]

        per_pre_nms_top_n = pre_nms_top_n[i]

        if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
            per_box_cls, top_k_indices = \
                per_box_cls.topk(per_pre_nms_top_n, sorted=False)
            per_class = per_class[top_k_indices]
            per_box_regression = per_box_regression[top_k_indices]
            per_locations = per_locations[top_k_indices]
            if top_feat is not None:
                per_top_feat = per_top_feat[top_k_indices]

        # decode (l, t, r, b) distances back to XYXY boxes at each location
        detections = torch.stack([
            per_locations[:, 0] - per_box_regression[:, 0],
            per_locations[:, 1] - per_box_regression[:, 1],
            per_locations[:, 0] + per_box_regression[:, 2],
            per_locations[:, 1] + per_box_regression[:, 3],
        ], dim=1)

        boxlist = Instances(image_sizes[i])
        boxlist.pred_boxes = Boxes(detections)
        # sqrt rescales the product of class score and centerness
        boxlist.scores = torch.sqrt(per_box_cls)
        boxlist.pred_classes = per_class
        boxlist.locations = per_locations
        if top_feat is not None:
            boxlist.top_feat = per_top_feat
        results.append(boxlist)

    return results
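# The `locations` argument above holds the (x, y) center of every feature-map
# cell in input-image coordinates; an FCOS-style sketch of how such a grid is
# built for one level (the stride/device parameters are assumptions, and the
# `_sketch` name marks this as illustrative rather than the project's code).
def compute_locations_sketch(h, w, stride, device):
    shifts_x = torch.arange(0, w * stride, step=stride,
                            dtype=torch.float32, device=device)
    shifts_y = torch.arange(0, h * stride, step=stride,
                            dtype=torch.float32, device=device)
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
    shift_x = shift_x.reshape(-1)
    shift_y = shift_y.reshape(-1)
    # offset by stride // 2 so each location sits at its cell center
    return torch.stack((shift_x, shift_y), dim=1) + stride // 2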
def fast_rcnn_inference_single_image_with_anchor(proposals, boxes, scores,
                                                 image_shape, score_thresh,
                                                 nms_thresh, topk_per_image):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Args:
        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
        per image.

    Returns:
        Same as `fast_rcnn_inference`, but for only one image.
    """
    anchors = proposals.get_fields()['anchor_boxes'].tensor
    proposals = proposals.get_fields()['proposal_boxes'].tensor

    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
    if not valid_mask.all():
        boxes = boxes[valid_mask]
        scores = scores[valid_mask]
        anchors = anchors[valid_mask]
        proposals = proposals[valid_mask]

    scores = scores[:, :-1]
    num_bbox_reg_classes = boxes.shape[1] // 4
    # Convert to Boxes to use the `clip` function ...
    boxes = Boxes(boxes.reshape(-1, 4))
    boxes.clip(image_shape)
    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4

    anchors = Boxes(anchors)
    proposals = Boxes(proposals)
    anchors.clip(image_shape)
    proposals.clip(image_shape)
    anchors = anchors.tensor
    proposals = proposals.tensor

    # Filter results based on detection scores
    filter_mask = scores > score_thresh  # R x K
    # R' x 2. First column contains indices of the R predictions;
    # Second column contains indices of classes.
    filter_inds = filter_mask.nonzero()
    if num_bbox_reg_classes == 1:
        boxes = boxes[filter_inds[:, 0], 0]
    else:
        boxes = boxes[filter_mask]
    scores = scores[filter_mask]
    anchors = anchors[filter_inds[:, 0]]
    proposals = proposals[filter_inds[:, 0]]

    # Apply per-class NMS
    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
    if topk_per_image >= 0:
        keep = keep[:topk_per_image]
    boxes, scores, filter_inds, anchors, proposals = \
        boxes[keep], scores[keep], filter_inds[keep], anchors[keep], proposals[keep]

    result = Instances(image_shape)
    result.pred_boxes = Boxes(boxes)
    result.scores = scores
    result.pred_classes = filter_inds[:, 1]
    result.anchors = Boxes(anchors)
    result.proposals = Boxes(proposals)
    return result, filter_inds[:, 0]
def get_pgt_top_k(
    self,
    prev_pred_boxes,
    prev_pred_scores,
    proposals,
    top_k=1,
    thres=0,
    need_instance=True,
    need_weight=True,
    suffix="",
):
    if isinstance(prev_pred_scores, torch.Tensor):
        num_preds_per_image = [len(p) for p in proposals]
        prev_pred_scores = prev_pred_scores.split(num_preds_per_image, dim=0)
    else:
        assert isinstance(prev_pred_scores, list)
        assert isinstance(prev_pred_scores[0], torch.Tensor)

    assert isinstance(prev_pred_boxes, (tuple, list))
    if isinstance(prev_pred_boxes[0], Boxes):
        num_preds = [len(prev_pred_box) for prev_pred_box in prev_pred_boxes]
        prev_pred_boxes = [
            prev_pred_box.tensor.unsqueeze(1).expand(num_pred, self.num_classes, 4)
            for num_pred, prev_pred_box in zip(num_preds, prev_pred_boxes)
        ]
    else:
        assert isinstance(prev_pred_boxes[0], torch.Tensor)
        if self.cls_agnostic_bbox_reg:
            num_preds = [prev_pred_box.size(0) for prev_pred_box in prev_pred_boxes]
            prev_pred_boxes = [
                prev_pred_box.unsqueeze(1).expand(num_pred, self.num_classes, 4)
                for num_pred, prev_pred_box in zip(num_preds, prev_pred_boxes)
            ]
    prev_pred_boxes = [
        prev_pred_box.view(-1, self.num_classes, 4) for prev_pred_box in prev_pred_boxes
    ]

    # keep only the columns of the image-level gt classes
    prev_pred_scores = [
        torch.index_select(prev_pred_score, 1, gt_int)
        for prev_pred_score, gt_int in zip(prev_pred_scores, self.gt_classes_img_int)
    ]
    prev_pred_boxes = [
        torch.index_select(prev_pred_box, 1, gt_int)
        for prev_pred_box, gt_int in zip(prev_pred_boxes, self.gt_classes_img_int)
    ]

    # get top k
    num_preds = [prev_pred_score.size(0) for prev_pred_score in prev_pred_scores]
    if top_k >= 1:
        top_ks = [min(num_pred, int(top_k)) for num_pred in num_preds]
    elif top_k < 1 and top_k > 0:
        top_ks = [max(int(num_pred * top_k), 1) for num_pred in num_preds]
    else:
        top_ks = [min(num_pred, 1) for num_pred in num_preds]
    pgt_scores_idxs = [
        torch.topk(prev_pred_score, top_k, dim=0)
        for prev_pred_score, top_k in zip(prev_pred_scores, top_ks)
    ]
    pgt_scores = [item[0] for item in pgt_scores_idxs]
    pgt_idxs = [item[1] for item in pgt_scores_idxs]
    pgt_idxs = [
        torch.unsqueeze(pgt_idx, 2).expand(top_k, gt_int.numel(), 4)
        for pgt_idx, top_k, gt_int in zip(pgt_idxs, top_ks, self.gt_classes_img_int)
    ]
    pgt_boxes = [
        torch.gather(prev_pred_box, 0, pgt_idx)
        for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs)
    ]
    pgt_classes = [
        torch.unsqueeze(gt_int, 0).expand(top_k, gt_int.numel())
        for gt_int, top_k in zip(self.gt_classes_img_int, top_ks)
    ]
    if need_weight:
        pgt_weights = [
            torch.index_select(pred_logits, 1, gt_int).expand(top_k, gt_int.numel())
            for pred_logits, gt_int, top_k in zip(
                self.pred_class_img_logits.split(1, dim=0),
                self.gt_classes_img_int, top_ks
            )
        ]

    if thres > 0:
        # keep only the scores above the threshold (the top-1 is always kept)
        masks = [pgt_score.ge(thres) for pgt_score in pgt_scores]
        masks = [
            torch.cat([torch.full_like(mask[0:1, :], True), mask[1:, :]], dim=0)
            for mask in masks
        ]
        pgt_scores = [
            torch.masked_select(pgt_score, mask)
            for pgt_score, mask in zip(pgt_scores, masks)
        ]
        pgt_boxes = [
            torch.masked_select(
                pgt_box, torch.unsqueeze(mask, 2).expand(top_k, gt_int.numel(), 4)
            ) for pgt_box, mask, top_k, gt_int in zip(
                pgt_boxes, masks, top_ks, self.gt_classes_img_int
            )
        ]
        pgt_classes = [
            torch.masked_select(pgt_class, mask)
            for pgt_class, mask in zip(pgt_classes, masks)
        ]
        if need_weight:
            pgt_weights = [
                torch.masked_select(pgt_weight, mask)
                for pgt_weight, mask in zip(pgt_weights, masks)
            ]

    pgt_scores = [pgt_score.reshape(-1) for pgt_score in pgt_scores]
    pgt_boxes = [pgt_box.reshape(-1, 4) for pgt_box in pgt_boxes]
    pgt_classes = [pgt_class.reshape(-1) for pgt_class in pgt_classes]
    if need_weight:
        pgt_weights = [pgt_weight.reshape(-1) for pgt_weight in pgt_weights]

    if not need_instance and need_weight:
        return pgt_scores, pgt_boxes, pgt_classes, pgt_weights
    elif not need_instance and not need_weight:
        return pgt_scores, pgt_boxes, pgt_classes

    # note: building Instances below assumes need_weight is True
    pgt_boxes = [Boxes(pgt_box) for pgt_box in pgt_boxes]
    targets = [
        Instances(
            proposals[i].image_size,
            gt_boxes=pgt_box,
            gt_classes=pgt_class,
            gt_scores=pgt_score,
            gt_weights=pgt_weight,
        ) for i, (pgt_box, pgt_class, pgt_score, pgt_weight) in enumerate(
            zip(pgt_boxes, pgt_classes, pgt_scores, pgt_weights)
        )
    ]

    self._vis_pgt(targets, "pgt_top_k", suffix)

    return targets
def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    boxes = [
        BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
        for obj in annos
    ]
    target = Instances(image_size)
    boxes = target.gt_boxes = Boxes(boxes)
    boxes.clip(image_size)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            masks = PolygonMasks(segms)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # polygon
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                        segm.ndim)
                    # mask array
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks!"
                        "Supported types are: polygons as list[list[float] or ndarray],"
                        " COCO-style RLE as a dict, or a full-image segmentation mask "
                        "as a 2D ndarray.".format(type(segm)))
            # torch.from_numpy does not support array with negative stride.
            masks = BitMasks(
                torch.stack([
                    torch.from_numpy(np.ascontiguousarray(x)) for x in masks
                ]))
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = [obj.get("keypoints", []) for obj in annos]
        target.gt_keypoints = Keypoints(kpts)

    return target
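# A minimal usage sketch for annotations_to_instances with one box-only
# annotation; the dict keys follow detectron2's standard dataset format.
def annotations_to_instances_demo():
    annos = [{
        "bbox": [10.0, 10.0, 50.0, 60.0],
        "bbox_mode": BoxMode.XYWH_ABS,
        "category_id": 3,
    }]
    inst = annotations_to_instances(annos, image_size=(480, 640))
    print(inst.gt_boxes.tensor, inst.gt_classes)  # XYXY box [10, 10, 60, 70]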
def __call__(self, values):
    return Boxes(values[0])
def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
    """
    A function to assemble caffe2 model's outputs (i.e. Dict[str, Tensor])
    to detectron2's format (i.e. list of Instances instance).
    This only works when the model follows the Caffe2 detectron's naming convention.

    Args:
        image_sizes (List[List[int, int]]): [H, W] of every image.
        tensor_outputs (Dict[str, Tensor]): external_output to its tensor.

        force_mask_on (Bool): if true, makes sure there will be pred_masks even
            if the mask is not found from tensor_outputs (usually due to model crash)
    """
    results = [Instances(image_size) for image_size in image_sizes]

    batch_splits = tensor_outputs.get("batch_splits", None)
    if batch_splits:
        raise NotImplementedError()
    assert len(image_sizes) == 1
    result = results[0]

    bbox_nms = tensor_outputs["bbox_nms"]
    score_nms = tensor_outputs["score_nms"]
    class_nms = tensor_outputs["class_nms"]
    # Detection will always succeed because Conv supports 0-batch
    assert bbox_nms is not None
    assert score_nms is not None
    assert class_nms is not None
    if bbox_nms.shape[1] == 5:
        result.pred_boxes = RotatedBoxes(bbox_nms)
    else:
        result.pred_boxes = Boxes(bbox_nms)
    result.scores = score_nms
    result.pred_classes = class_nms.to(torch.int64)

    mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
    if mask_fcn_probs is not None:
        # finish the mask pred
        mask_probs_pred = mask_fcn_probs
        num_masks = mask_probs_pred.shape[0]
        class_pred = result.pred_classes
        indices = torch.arange(num_masks, device=class_pred.device)
        mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
        result.pred_masks = mask_probs_pred
    elif force_mask_on:
        # NOTE: there's no way to know the height/width of mask here, it won't be
        # used anyway when batch size is 0, so just set them to 0.
        result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)

    keypoints_out = tensor_outputs.get("keypoints_out", None)
    kps_score = tensor_outputs.get("kps_score", None)
    if keypoints_out is not None:
        # keypoints_out: [N, 4, #kypoints], where 4 is in order of (x, y, score, prob)
        keypoints_tensor = keypoints_out
        # NOTE: it's possible that prob is not calculated if "should_output_softmax"
        # is set to False in HeatmapMaxKeypoint, so just using raw score, seems
        # it doesn't affect mAP. TODO: check more carefully.
        keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
        result.pred_keypoints = keypoint_xyp
    elif kps_score is not None:
        # keypoint heatmap to sparse data structure
        pred_keypoint_logits = kps_score
        keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])

    return results
def inference_single_image(self, box_cls, box_delta, anchors, mask_coef,
                           proto_mask, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W x A, K)
        box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
        anchors (list[Boxes]): list of #feature levels. Each entry contains
            a Boxes object, which contains all the anchors for that
            image in that feature level.
        mask_coef (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W x A, #masks)
        proto_mask (Tensor): size (M, M, #masks)
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []
    mask_coef_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, mask_coef_i, anchors_i in zip(
            box_cls, box_delta, mask_coef, anchors):
        # (HxWxAxK,)
        box_cls_i = box_cls_i.flatten().sigmoid_()

        # Keep top k top scoring indices only.
        num_topk = min(self.topk_candidates, box_reg_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        anchor_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[anchor_idxs]  # (N, 4)
        anchors_i = anchors_i[anchor_idxs]
        mask_coef_i = mask_coef_i[anchor_idxs]  # (N, #masks)
        # predict boxes
        predicted_boxes = self.box2box_transform.apply_deltas(
            box_reg_i, anchors_i.tensor)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)
        mask_coef_all.append(mask_coef_i)

    boxes_all, scores_all, class_idxs_all, mask_coef_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all, mask_coef_all]
    ]
    keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]

    pred_masks = F.sigmoid(proto_mask @ mask_coef_all[keep].t())
    # note: pred_masks shape (M, M, #keep)
    pred_masks = crop(pred_masks, boxes_all[keep])
    # shape (#keep, M, M)
    pred_masks = pred_masks.permute(2, 0, 1).contiguous()

    # mask_iou to rescore mask
    if self.rescore_mask:
        pred_maskiou = self.maskiou_net(pred_masks.unsqueeze(1))
        pred_maskiou = torch.gather(
            pred_maskiou, dim=1,
            index=class_idxs_all[keep].unsqueeze(1)).squeeze(1)
        result.scores = scores_all[keep] * pred_maskiou

    pred_masks = F.interpolate(pred_masks.unsqueeze(0), image_size,
                               mode="bilinear",
                               align_corners=False).squeeze(0)
    result.pred_masks = pred_masks.gt_(0.5)

    return result
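# `crop` above zeroes out each prototype-combined mask outside its predicted
# box (YOLACT-style); a minimal dense sketch of that behavior, assuming masks
# of shape (M, M, N) and XYXY boxes already in mask coordinates. The `_sketch`
# name marks this as an illustrative stand-in, not the project's `crop`.
def crop_sketch(masks, boxes):
    h, w, n = masks.shape
    rows = torch.arange(w, device=masks.device).view(1, -1, 1).expand(h, w, n)
    cols = torch.arange(h, device=masks.device).view(-1, 1, 1).expand(h, w, n)
    left = boxes[:, 0].view(1, 1, -1)
    right = boxes[:, 2].view(1, 1, -1)
    top = boxes[:, 1].view(1, 1, -1)
    bottom = boxes[:, 3].view(1, 1, -1)
    inside = (rows >= left) & (rows < right) & (cols >= top) & (cols < bottom)
    return masks * inside.float()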
def __call__(self, dataset_dict):
    """
    Args:
        dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

    Returns:
        dict: a format that builtin models in detectron2 accept, with keys
            "file_name", "image_id", "ref_id", "expr_id", "height", "width",
            "tokens", "cate"
    """
    dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
    # USER: Write your own image loading if it's not from a file
    image = utils.read_image(dataset_dict['file_name'], format=self.img_format)
    utils.check_image_size(dataset_dict, image)

    image_scale = dataset_dict['image_scale']
    image = cv2.resize(image, None, None, fx=image_scale, fy=image_scale,
                       interpolation=cv2.INTER_LINEAR)
    precomp_boxes = torch.as_tensor(dataset_dict['precomp_bbox'] * image_scale,
                                    dtype=torch.float32)
    gt_boxes = torch.as_tensor(dataset_dict['gt_boxes'] * image_scale,
                               dtype=torch.float32)
    h, w = image.shape[:2]
    gt_boxes = Boxes(gt_boxes, [h, w])
    precomp_boxes = Boxes(precomp_boxes, [h, w])
    image = torch.as_tensor(image.transpose(2, 0, 1), dtype=torch.float32)
    dataset_dict['det_label_embedding'] = self.vocab_embed[
        dataset_dict['precomp_det_label']]  # N x 1024
    dataset_dict['image'] = image
    dataset_dict['gt_boxes'] = gt_boxes
    dataset_dict['precomp_bbox'] = precomp_boxes
    return dataset_dict
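# Hypothetical sketch of the per-record dict this mapper expects. The key names
# come from the code above; the values (and how the mapper is constructed) are
# assumptions for illustration only.
def _sketch_mapper_record():
    import numpy as np

    record = {
        "file_name": "images/0001.jpg",
        "image_id": 1,
        "height": 480,
        "width": 640,
        "image_scale": 0.5,  # resize factor applied to the image and all boxes
        "precomp_bbox": np.zeros((10, 4), dtype=np.float32),  # detector boxes, XYXY
        "precomp_det_label": np.zeros(10, dtype=np.int64),    # detector class ids
        "gt_boxes": np.zeros((1, 4), dtype=np.float32),       # ground-truth boxes
    }
    # mapper = ...              # however this mapper is constructed elsewhere
    # model_input = mapper(record)  # adds "image", Boxes, "det_label_embedding"
    return record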
def find_top_rpn_proposals(
    proposals,
    pred_objectness_logits,
    images,
    nms_thresh,
    pre_nms_topk,
    post_nms_topk,
    min_box_side_len,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps if `training` is True,
    otherwise, return the highest `post_nms_topk` scoring proposals for each
    feature map.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape
            (N, Hi*Wi*A, 4). All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has
            shape (N, Hi*Wi*A).
        images (ImageList): Input images as an :class:`ImageList`.
        nms_thresh (float): IoU threshold to use for NMS.
        pre_nms_topk (int): number of top k scoring proposals to keep before
            applying NMS. When RPN is run on multiple feature maps (as in FPN)
            this number is per feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after
            applying NMS. When RPN is run on multiple feature maps (as in FPN)
            this number is total, over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels
            (absolute units wrt input images).
        training (bool): True if proposals are to be used in training, otherwise
            False. This arg exists only to support a legacy bug; look for the
            "NB: Legacy bug ..." comment.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i.
    """
    image_sizes = images.image_sizes  # in (h, w) order
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select top-k anchors for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, proposals_i, logits_i in zip(
        itertools.count(), proposals, pred_objectness_logits
    ):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device)
        )

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = Boxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_side_len)
        lvl = level_ids
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], level_ids[keep]

        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
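# Sketch of the per-level top-k trick used in step 1 above: a single descending
# sort plus slicing yields the same values as .topk(), which was reported slower
# on GPU (https://github.com/pytorch/pytorch/issues/22812). Toy shapes only.
def _sketch_topk_via_sort():
    import torch

    logits = torch.randn(2, 1000)          # (N images, Hi*Wi*A anchors at one level)
    num_keep = min(256, logits.shape[1])   # pre_nms_topk capped by the anchor count
    sorted_logits, idx = logits.sort(descending=True, dim=1)
    topk_scores = sorted_logits[:, :num_keep]
    assert torch.equal(topk_scores, logits.topk(num_keep, dim=1).values)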
def __init__( self, box2box_transform, pred_class_logits, pred_proposal_deltas, proposals, smooth_l1_beta=0.0, box_reg_loss_type="smooth_l1", mean_loss=False, W_pos=None, W_neg=None, PL=None, NL=None, csc_stats=None, loss_weight=1.0, prefix="", ): """ Args: box2box_transform (Box2BoxTransform/Box2BoxTransformRotated): box2box transform instance for proposal-to-detection transformations. pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class logits for all R predicted object instances. Each row corresponds to a predicted object instance. pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for class-specific or class-agnostic regression. It stores the predicted deltas that transform proposals into final box detections. B is the box dimension (4 or 5). When B is 4, each row is [dx, dy, dw, dh (, ....)]. When B is 5, each row is [dx, dy, dw, dh, da (, ....)]. proposals (list[Instances]): A list of N Instances, where Instances i stores the proposals for image i, in the field "proposal_boxes". When training, each Instances must have ground-truth labels stored in the field "gt_classes" and "gt_boxes". The total number of all instances must be equal to R. smooth_l1_beta (float): The transition point between L1 and L2 loss in the smooth L1 loss function. When set to 0, the loss becomes L1. When set to +inf, the loss becomes constant 0. box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou" """ self.box2box_transform = box2box_transform self.num_preds_per_image = [len(p) for p in proposals] self.pred_class_logits = pred_class_logits self.pred_proposal_deltas = pred_proposal_deltas self.smooth_l1_beta = smooth_l1_beta self.box_reg_loss_type = box_reg_loss_type self.image_shapes = [x.image_size for x in proposals] if len(proposals): box_type = type(proposals[0].proposal_boxes) # cat(..., dim=0) concatenates over all images in the batch self.proposals = box_type.cat([p.proposal_boxes for p in proposals]) assert ( not self.proposals.tensor.requires_grad ), "Proposals should not require gradients!" # The following fields should exist only when training. if proposals[0].has("gt_boxes"): self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals]) assert proposals[0].has("gt_classes") self.gt_classes = cat([p.gt_classes for p in proposals], dim=0) else: self.proposals = Boxes(torch.zeros(0, 4, device=self.pred_proposal_deltas.device)) self._no_instances = len(proposals) == 0 # no instances found self.mean_loss = mean_loss self.W_pos = W_pos self.W_neg = W_neg self.PL = PL self.NL = NL self.csc_stats = csc_stats self.loss_weight = loss_weight self.prefix = prefix
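# Sketch of the bookkeeping done in the constructor above: per-image proposals
# are flattened into one Boxes object, and num_preds_per_image records the
# split so flat predictions can be mapped back to images. Toy shapes only.
def _sketch_proposal_flattening():
    import torch
    from detectron2.structures import Boxes, Instances

    proposals = []
    for n, size in zip([3, 5], [(100, 100), (80, 120)]):
        inst = Instances(size)
        inst.proposal_boxes = Boxes(torch.rand(n, 4) * 50)
        proposals.append(inst)

    num_preds_per_image = [len(p) for p in proposals]        # [3, 5]
    flat = Boxes.cat([p.proposal_boxes for p in proposals])  # 8 boxes total
    per_image = flat.tensor.split(num_preds_per_image, dim=0)
    assert [t.shape[0] for t in per_image] == num_preds_per_image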
def annotations_to_instances_with_attributes(annos, image_size, mask_format="polygon", load_attributes=False, max_attr_per_ins=16): """ Extend the function annotations_to_instances() to support attributes """ boxes = [ BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos ] target = Instances(image_size) boxes = target.gt_boxes = Boxes(boxes) boxes.clip(image_size) classes = [obj["category_id"] for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes if len(annos) and "segmentation" in annos[0]: segms = [obj["segmentation"] for obj in annos] if mask_format == "polygon": masks = PolygonMasks(segms) else: assert mask_format == "bitmask", mask_format masks = [] for segm in segms: if isinstance(segm, list): # polygon masks.append(polygons_to_bitmask(segm, *image_size)) elif isinstance(segm, dict): # COCO RLE masks.append(mask_util.decode(segm)) elif isinstance(segm, np.ndarray): assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim) # mask array masks.append(segm) else: raise ValueError( "Cannot convert segmentation of type '{}' to BitMasks!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict, or a full-image segmentation mask " "as a 2D ndarray.".format(type(segm))) masks = BitMasks( torch.stack([ torch.from_numpy(np.ascontiguousarray(x)) for x in masks ])) target.gt_masks = masks if len(annos) and "keypoints" in annos[0]: kpts = [obj.get("keypoints", []) for obj in annos] target.gt_keypoints = Keypoints(kpts) if len(annos) and load_attributes: attributes = -torch.ones( (len(annos), max_attr_per_ins), dtype=torch.int64) for idx, anno in enumerate(annos): if "attribute_ids" in anno: for jdx, attr_id in enumerate(anno["attribute_ids"]): attributes[idx, jdx] = attr_id target.gt_attributes = attributes return target
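# A hedged example of the annotation format this helper consumes;
# "attribute_ids" is the extension it adds on top of detectron2's standard
# fields, padded to max_attr_per_ins with -1.
def _sketch_attribute_annotations():
    from detectron2.structures import BoxMode

    annos = [{
        "bbox": [10, 10, 40, 30],          # (x, y, w, h)
        "bbox_mode": BoxMode.XYWH_ABS,
        "category_id": 2,
        "attribute_ids": [5, 17],
    }]
    target = annotations_to_instances_with_attributes(
        annos, image_size=(100, 100), load_attributes=True)
    # gt_attributes is (N, max_attr_per_ins): [5, 17, -1, -1, ...]
    assert target.gt_attributes[0, 0].item() == 5
    assert target.gt_attributes[0, 2].item() == -1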
def fast_rcnn_inference_single_image( boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image ): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Args: Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes per image. Returns: Same as `fast_rcnn_inference`, but for only one image. """ all_scores = scores.clone() all_scores = torch.unsqueeze(all_scores, 0) all_boxes = boxes.clone() all_boxes = torch.unsqueeze(all_boxes, 0) pred_inds = torch.unsqueeze( torch.arange(scores.size(0), device=scores.device, dtype=torch.long), dim=1 ).repeat(1, scores.size(1)) valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1) if not valid_mask.all(): boxes = boxes[valid_mask] scores = scores[valid_mask] pred_inds = pred_inds[valid_mask] scores = scores[:, :-1] num_bbox_reg_classes = boxes.shape[1] // 4 # Convert to Boxes to use the `clip` function ... boxes = Boxes(boxes.reshape(-1, 4)) boxes.clip(image_shape) boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 pred_inds = pred_inds[:, :-1] # Filter results based on detection scores filter_mask = scores > score_thresh # R x K # R' x 2. First column contains indices of the R predictions; # Second column contains indices of classes. filter_inds = filter_mask.nonzero() if num_bbox_reg_classes == 1: boxes = boxes[filter_inds[:, 0], 0] else: boxes = boxes[filter_mask] scores = scores[filter_mask] pred_inds = pred_inds[filter_mask] # Apply per-class NMS keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh) if topk_per_image >= 0: keep = keep[:topk_per_image] boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep] pred_inds = pred_inds[keep] result = Instances(image_shape) result.pred_boxes = Boxes(boxes) result.scores = scores result.pred_classes = filter_inds[:, 1] result.pred_inds = pred_inds return result, filter_inds[:, 0], all_scores, all_boxes
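# Sketch of the score filtering above: filter_inds pairs (proposal index,
# class index) for every score above the threshold. Toy numbers.
def _sketch_score_filtering():
    import torch

    scores = torch.tensor([[0.9, 0.05],
                           [0.2, 0.7]])      # R=2 proposals, K=2 classes
    filter_mask = scores > 0.5               # R x K
    filter_inds = filter_mask.nonzero()      # tensor([[0, 0], [1, 1]])
    kept_scores = scores[filter_mask]        # tensor([0.9000, 0.7000])
    kept_classes = filter_inds[:, 1]         # class id per kept detection
    assert torch.equal(kept_classes, torch.tensor([0, 1]))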
# Copyright (c) Facebook, Inc. and its affiliates. import unittest import torch from detectron2.structures import Boxes, BoxMode, Instances from densepose.modeling.losses.utils import ChartBasedAnnotationsAccumulator from densepose.structures import DensePoseDataRelative, DensePoseList image_shape = (100, 100) instances = Instances(image_shape) n_instances = 3 instances.proposal_boxes = Boxes(torch.rand(n_instances, 4)) instances.gt_boxes = Boxes(torch.rand(n_instances, 4)) # instances.gt_densepose = None cannot happen because instances attributes need a length class TestChartBasedAnnotationsAccumulator(unittest.TestCase): def test_chart_based_annotations_accumulator_no_gt_densepose(self): accumulator = ChartBasedAnnotationsAccumulator() accumulator.accumulate(instances) expected_values = { "nxt_bbox_with_dp_index": 0, "nxt_bbox_index": n_instances } for key in accumulator.__dict__: self.assertEqual(getattr(accumulator, key), expected_values.get(key, [])) def test_chart_based_annotations_accumulator_gt_densepose_none(self):
def _forward_box( self, features: Dict[str, torch.Tensor], proposals: List[Instances] ) -> Union[Dict[str, torch.Tensor], List[Instances]]: """ Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`, the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument. Args: features (dict[str, Tensor]): mapping from feature map names to tensor. Same as in :meth:`ROIHeads.forward`. proposals (list[Instances]): the per-image object proposals with their matching ground truth. Each has fields "proposal_boxes", and "objectness_logits", "gt_classes", "gt_boxes". Returns: In training, a dict of losses. In inference, a list of `Instances`, the predicted instances. """ features = [features[f] for f in self.box_in_features] box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) objectness_logits = torch.cat([x.objectness_logits + 1 for x in proposals], dim=0) if self.pooler_type == "ROILoopPool": objectness_logits = torch.cat( [objectness_logits, objectness_logits, objectness_logits], dim=0 ) box_features = box_features * objectness_logits.view(-1, 1, 1, 1) if self.training: storage = get_event_storage() storage.put_scalar("proposals/objectness_logits+1 mean", objectness_logits.mean()) storage.put_scalar("proposals/objectness_logits+1 max", objectness_logits.max()) storage.put_scalar("proposals/objectness_logits+1 min", objectness_logits.min()) # torch.cuda.empty_cache() box_features = self.box_head(box_features) if self.pooler_type == "ROILoopPool": box_features, box_features_frame, box_features_context = torch.chunk( box_features, 3, dim=0 ) predictions = self.box_predictor( [box_features, box_features_frame, box_features_context], proposals, context=True ) del box_features_frame del box_features_context else: predictions = self.box_predictor(box_features, proposals) # del box_features if self.training: losses = self.box_predictor.losses(predictions, proposals, self.gt_classes_img_oh) self.pred_class_img_logits = ( self.box_predictor.predict_probs_img(predictions, proposals).clone().detach() ) prev_pred_scores = predictions[0].detach() prev_pred_boxes = [p.proposal_boxes for p in proposals] for k in range(self.refine_K): suffix = "_r" + str(k) # targets = self.get_pgt( # prev_pred_boxes, prev_pred_scores, proposals, suffix # ) targets = self.get_pgt_top_k( prev_pred_boxes, prev_pred_scores, proposals, suffix=suffix ) proposals_k = self.label_and_sample_proposals(proposals, targets, suffix=suffix) if self.roi_label: if isinstance(prev_pred_scores, list): S = cat(prev_pred_scores, dim=0).cpu() else: S = prev_pred_scores.cpu() U = cat( [pairwise_iou(p.proposal_boxes, p.proposal_boxes) for p in proposals_k], dim=0, ).cpu() L = self.gt_classes_img_oh.cpu() CW = self.pred_class_img_logits.cpu() RL, RW = self.roi_label(S, U, L, CW) RL = RL.to(self.pred_class_img_logits.device) RW = RW.to(self.pred_class_img_logits.device) num_preds_per_image = [len(p) for p in proposals_k] for p, rl, rw in zip( proposals_k, RL.split(num_preds_per_image, dim=0), RW.split(num_preds_per_image, dim=0), ): p.gt_classes = rl.to(torch.int64) p.gt_weights = rw.to(torch.float32) predictions_k = self.box_refinery[k](box_features) losses_k = self.box_refinery[k].losses(predictions_k, proposals_k) prev_pred_scores = self.box_refinery[k].predict_probs(predictions_k, proposals_k) prev_pred_boxes = self.box_refinery[k].predict_boxes(predictions_k, proposals_k) prev_pred_scores = [ prev_pred_score.detach() for prev_pred_score in prev_pred_scores ] prev_pred_boxes = 
[prev_pred_box.detach() for prev_pred_box in prev_pred_boxes] losses.update(losses_k) # proposals is modified in-place below, so losses must be computed first. if self.train_on_pred_boxes: with torch.no_grad(): pred_boxes = self.box_predictor.predict_boxes_for_gt_classes( predictions, proposals ) for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes): proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image) return losses else: if self.refine_reg[-1]: predictions_k = self.box_refinery[-1](box_features) pred_instances, _, all_scores, all_boxes = self.box_refinery[-1].inference( predictions_k, proposals ) else: predictions_K = [] for k in range(self.refine_K): predictions_k = self.box_refinery[k](box_features) predictions_K.append(predictions_k) pred_instances, _, all_scores, all_boxes = self.box_refinery[-1].inference( predictions_K, proposals ) return pred_instances, all_scores, all_boxes
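# Minimal sketch (hypothetical heads and shapes) of the refinement pattern in
# _forward_box above: each stage k is supervised with pseudo-targets built from
# stage k-1's detached predictions, so no gradient flows between stages.
def _sketch_refinement_loop():
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    box_features = torch.randn(8, 16)  # pooled per-proposal features
    box_refinery = nn.ModuleList(nn.Linear(16, 21) for _ in range(3))  # K stages

    prev_scores = torch.full((8, 21), 1.0 / 21)  # uniform before the first stage
    losses = {}
    for k, head in enumerate(box_refinery):
        logits = head(box_features)
        pseudo_labels = prev_scores.argmax(dim=1)  # stand-in for get_pgt_top_k
        losses["loss_r%d" % k] = F.cross_entropy(logits, pseudo_labels)
        prev_scores = logits.softmax(dim=1).detach()  # stop gradients across stages
    return losses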
def inference_single_image(self, box_cls, box_delta, anchors, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W x A, K)
        box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
        anchors (list[Boxes]): list of #feature levels. Each entry contains
            a Boxes object, which contains all the anchors for that
            image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors):
        # (HxWxAxK,)
        box_cls_i = box_cls_i.flatten().sigmoid_()

        # Keep only the top-k scoring indices.
        num_topk = min(self.topk_candidates, box_reg_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out proposals with low confidence scores
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        anchor_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[anchor_idxs]
        anchors_i = anchors_i[anchor_idxs]
        # predict boxes
        predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]
    keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold)
    keep = keep[: self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
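# Sketch of the index decoding above: classification scores are flattened over
# (anchor, class) pairs, so integer division recovers the anchor index and the
# remainder recovers the class index. Toy sizes.
def _sketch_index_decoding():
    import torch

    num_classes = 4
    flat_scores = torch.rand(10 * num_classes)  # 10 anchors x 4 classes, flattened
    topk_idxs = flat_scores.topk(5).indices
    anchor_idxs = topk_idxs // num_classes      # which anchor each score came from
    classes_idxs = topk_idxs % num_classes      # which class each score is for
    assert torch.equal(anchor_idxs * num_classes + classes_idxs, topk_idxs)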
def get_pgt(self, prev_pred_boxes, prev_pred_scores, proposals, suffix): if isinstance(prev_pred_scores, torch.Tensor): num_preds_per_image = [len(p) for p in proposals] prev_pred_scores = prev_pred_scores.split(num_preds_per_image, dim=0) else: assert isinstance(prev_pred_scores, list) assert isinstance(prev_pred_scores[0], torch.Tensor) prev_pred_scores = [ torch.index_select(prev_pred_score, 1, gt_int) for prev_pred_score, gt_int in zip(prev_pred_scores, self.gt_classes_img_int) ] pgt_scores_idxs = [ torch.max(prev_pred_score, dim=0) for prev_pred_score in prev_pred_scores ] pgt_scores = [item[0] for item in pgt_scores_idxs] pgt_idxs = [item[1] for item in pgt_scores_idxs] assert isinstance(prev_pred_boxes, tuple) or isinstance(prev_pred_boxes, list) if isinstance(prev_pred_boxes[0], Boxes): pgt_boxes = [ prev_pred_box[pgt_idx] for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs) ] else: assert isinstance(prev_pred_boxes[0], torch.Tensor) if self.cls_agnostic_bbox_reg: num_preds = [prev_pred_box.size(0) for prev_pred_box in prev_pred_boxes] prev_pred_boxes = [ prev_pred_box.unsqueeze(1).expand(num_pred, self.num_classes, 4) for num_pred, prev_pred_box in zip(num_preds, prev_pred_boxes) ] prev_pred_boxes = [ prev_pred_box.view(-1, self.num_classes, 4) for prev_pred_box in prev_pred_boxes ] prev_pred_boxes = [ torch.index_select(prev_pred_box, 1, gt_int) for prev_pred_box, gt_int in zip(prev_pred_boxes, self.gt_classes_img_int) ] pgt_boxes = [ torch.index_select(prev_pred_box, 0, pgt_idx) for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs) ] pgt_boxes = [pgt_box.view(-1, 4) for pgt_box in pgt_boxes] diags = [ torch.tensor( [i * gt_split.numel() + i for i in range(gt_split.numel())], dtype=torch.int64, device=pgt_boxes[0].device, ) for gt_split in self.gt_classes_img_int ] pgt_boxes = [ torch.index_select(pgt_box, 0, diag) for pgt_box, diag in zip(pgt_boxes, diags) ] pgt_boxes = [Boxes(pgt_box) for pgt_box in pgt_boxes] pgt_classes = self.gt_classes_img_int pgt_weights = [ torch.index_select(pred_logits, 1, pgt_class).reshape(-1) for pred_logits, pgt_class in zip( self.pred_class_img_logits.split(1, dim=0), pgt_classes ) ] targets = [ Instances( proposals[i].image_size, gt_boxes=pgt_box, gt_classes=pgt_class, gt_scores=pgt_score, gt_weights=pgt_weight, ) for i, (pgt_box, pgt_class, pgt_score, pgt_weight) in enumerate( zip(pgt_boxes, pgt_classes, pgt_scores, pgt_weights) ) ] self._vis_pgt(targets, "pgt", suffix) return targets
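# Sketch of the pseudo-ground-truth selection in get_pgt: for each class known
# to be present at the image level, take the highest-scoring proposal and use
# its box as a pseudo ground-truth box. Toy shapes only.
def _sketch_pgt_selection():
    import torch

    scores = torch.rand(20, 5)         # 20 proposals x 5 classes, one image
    gt_int = torch.tensor([1, 3])      # classes present at the image level
    per_class = scores.index_select(1, gt_int)   # (20, 2)
    pgt_scores, pgt_idxs = per_class.max(dim=0)  # best proposal per class
    boxes = torch.rand(20, 4)
    pgt_boxes = boxes[pgt_idxs]        # one pseudo-GT box per present class
    assert pgt_boxes.shape == (2, 4)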
def main(args):
    # retrieve the configuration file and update the weights
    cfg = get_cfg()
    cfg.merge_from_file(args.cfg)
    # update the model so that it uses the final output weights
    cfg.MODEL.WEIGHTS = str(Path(cfg.OUTPUT_DIR) / "model_final.pth")
    predictor = DefaultPredictor(cfg)

    # Load images from the validation split. This needs data from the signs
    # dataset, not the hotspots dataset.
    dset = DatasetCatalog.get(args.dataset)
    all_hotspots = []
    all_gt_aligned = []
    all_scores = []
    for example in tqdm(dset):
        img = cv2.imread(example["file_name"])
        # format is documented at
        # https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
        outputs = predictor(img)
        # get individual hotspot images, saved to an npz array at the end
        hotspots = extract_boxes(
            img[:, :, ::-1], outputs["instances"].to("cpu").pred_boxes
        )
        all_hotspots.extend(hotspots)
        # get scores
        scores = outputs["instances"].to("cpu").scores
        all_scores.extend(scores.numpy())

        # get ground-truth classes
        # these parameters can be customized.
        matcher = Matcher([0.4, 0.5], [0, -1, 1], allow_low_quality_matches=False)
        # convert the ground-truth annotations into a detectron2 Boxes object
        gt_boxes = Boxes(
            torch.tensor(
                np.vstack([annotation["bbox"] for annotation in example["annotations"]])
            )
        )
        gt_classes = np.array(
            [annotation["category_id"] for annotation in example["annotations"]]
        )
        pred_boxes = outputs["instances"].to("cpu").pred_boxes
        match_quality_matrix = pairwise_iou(gt_boxes, pred_boxes)
        matched_idxs, matched_labels = matcher(match_quality_matrix)
        # compute ground-truth classes for every box
        aligned_classes = gt_classes[matched_idxs]
        # handle edge case where only one aligned box shows up
        if not isinstance(aligned_classes, np.ndarray):
            # np.ndarray([...]) would allocate by shape, not by value;
            # np.array is what's intended here
            aligned_classes = np.array([aligned_classes])
        # handle background classes:
        aligned_classes[matched_labels == 0] = -1
        aligned_classes[matched_labels == -1] = -1
        all_gt_aligned.extend(aligned_classes)

    np.savez(
        Path(args.outpath).with_suffix(".npz"),
        hotspots=np.array(all_hotspots, dtype=object),
        scores=all_scores,
        gt_classes=all_gt_aligned,
    )
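# Sketch of the matching step above with tiny hand-made boxes: Matcher labels
# each prediction 1 (foreground match), 0 (background) or -1 (ignore) based on
# its best IoU against any ground-truth box.
def _sketch_matcher():
    import torch
    from detectron2.modeling.matcher import Matcher
    from detectron2.structures import Boxes, pairwise_iou

    gt = Boxes(torch.tensor([[0., 0., 10., 10.]]))
    pred = Boxes(torch.tensor([[0., 0., 10., 10.],           # IoU 1.0 -> match
                               [100., 100., 110., 110.]]))   # IoU 0.0 -> background
    matcher = Matcher([0.4, 0.5], [0, -1, 1], allow_low_quality_matches=False)
    matched_idxs, matched_labels = matcher(pairwise_iou(gt, pred))
    assert matched_labels.tolist() == [1, 0]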
def forward(self, x, boxes): return self.roi(x, [Boxes(boxes)])