def json_to_d2(height, width, json):
    # "json" here is expected to be a dict of per-instance fields (the name
    # shadows the stdlib json module); each key/value pair becomes a field on
    # the returned Instances.
    inst = Instances((height, width), **json)
    print(inst)
    print(type(inst))
    return inst
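# Hedged usage sketch for json_to_d2 (not from the original repo): it assumes
# detectron2 is installed and that the "json" argument is a dict of equal-length
# per-instance fields, which Instances accepts as keyword arguments.
def _example_json_to_d2():
    import torch
    fields = {"scores": torch.tensor([0.9, 0.4]), "pred_classes": torch.tensor([1, 3])}
    inst = json_to_d2(480, 640, fields)
    assert len(inst) == 2 and inst.image_size == (480, 640)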
def find_top_bua_rpn_proposals( proposals, pred_objectness_logits, images, nms_thresh, pre_nms_topk, post_nms_topk, min_box_side_len, training, ): """ For each feature map, select the `pre_nms_topk` highest scoring proposals, apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk` highest scoring proposals among all the feature maps if `training` is True, otherwise, returns the highest `post_nms_topk` scoring proposals for each feature map. Args: proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4). All proposal predictions on the feature maps. pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A). images (ImageList): Input images as an :class:`ImageList`. nms_thresh (float): IoU threshold to use for NMS pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is per feature map. post_nms_topk (int): number of top k scoring proposals to keep after applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is total, over all feature maps. min_box_side_len (float): minimum proposal box side length in pixels (absolute units wrt input images). training (bool): True if proposals are to be used in training, otherwise False. This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..." comment. Returns: proposals (list[Instances]): list of N Instances. The i-th Instances stores post_nms_topk object proposals for image i. """ image_sizes = images.image_sizes # in (h, w) order image_scales = images.image_scales device = proposals[0].device # 1. Concat all levels together all_scores = [] all_proposals = [] level_ids = [] for level_id, proposals_i, logits_i in zip(itertools.count(), proposals, pred_objectness_logits): Hi_Wi_A = logits_i.shape[1] all_proposals.append(proposals_i) all_scores.append(logits_i) level_ids.append( torch.full((Hi_Wi_A, ), level_id, dtype=torch.int64, device=device)) all_scores = cat(all_scores, dim=1) all_proposals = cat(all_proposals, dim=1) level_ids = cat(level_ids, dim=0) # 2. For each image, run a choose pre_nms_topk proposal ,per-level NMS, and choose post_nms_topk results. results = [] for n, image_size in enumerate(image_sizes): boxes = BUABoxes(all_proposals[n]) scores_per_img = all_scores[n] boxes.clip(image_size) keep = boxes.filter_boxes() boxes = boxes[keep] scores_per_img = scores_per_img[keep] lvl = level_ids[keep] # filter empty boxes keep = boxes.nonempty(threshold=min_box_side_len * image_scales[n]) if keep.sum().item() != len(boxes): boxes, scores_per_img, lvl = boxes[keep], scores_per_img[ keep], lvl[keep] # choose pre_nms_topk proposal Hi_Wi_A = scores_per_img.shape[0] num_proposals_i = min(pre_nms_topk, Hi_Wi_A) scores_per_img, idx = scores_per_img.sort(descending=True, dim=0) topk_scores_i = scores_per_img[:num_proposals_i] topk_idx = idx[:num_proposals_i] topk_boxes_i = boxes[topk_idx, :] lvl_i = lvl[topk_idx] keep = batched_nms(topk_boxes_i.tensor, topk_scores_i, lvl_i, nms_thresh) # In Detectron1, there was different behavior during training vs. testing. # (https://github.com/facebookresearch/Detectron/issues/459) # During training, topk is over the proposals from *all* images in the training batch. # During testing, it is over the proposals for each image separately. # As a result, the training behavior becomes batch-dependent, # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size. 
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = topk_boxes_i[keep]
        res.objectness_logits = topk_scores_i[keep]
        results.append(res)
    return results
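# Illustrative sketch of the per-level NMS trick used above (assumes
# torchvision is available and its batched_nms behaves like the batched_nms
# imported by this module): passing the FPN level id as the "category" index
# keeps NMS per-level, so overlapping boxes from different levels never
# suppress each other.
def _example_per_level_nms():
    import torch
    from torchvision.ops import batched_nms
    boxes = torch.tensor([[0., 0., 10., 10.], [1., 1., 10., 10.]])
    scores = torch.tensor([0.9, 0.8])
    levels = torch.tensor([0, 1])  # heavily overlapping boxes on different levels
    keep = batched_nms(boxes, scores, levels, iou_threshold=0.5)
    assert keep.numel() == 2  # neither box is suppressed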
def inference_single_image(self, box_cls, box_delta, anchors, image_size): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Arguments: box_cls (list[Tensor]): list of #feature levels. Each entry contains tensor of size (H x W x A, K) box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. anchors (list[Boxes]): list of #feature levels. Each entry contains a Boxes object, which contains all the anchors for that image in that feature level. image_size (tuple(H, W)): a tuple of the image height and width. Returns: Same as `inference`, but for only one image. """ boxes_all = [] scores_all = [] class_idxs_all = [] # Iterate over every feature level for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors): # (HxWxAxK,) box_cls_i = box_cls_i.flatten().sigmoid_() # Keep top k top scoring indices only. num_topk = min(self.topk_candidates, box_reg_i.size(0)) # torch.sort is actually faster than .topk (at least on GPUs) predicted_prob, topk_idxs = box_cls_i.sort(descending=True) predicted_prob = predicted_prob[:num_topk] topk_idxs = topk_idxs[:num_topk] # filter out the proposals with low confidence score keep_idxs = predicted_prob > self.score_threshold predicted_prob = predicted_prob[keep_idxs] topk_idxs = topk_idxs[keep_idxs] anchor_idxs = topk_idxs // self.num_classes classes_idxs = topk_idxs % self.num_classes box_reg_i = box_reg_i[anchor_idxs] anchors_i = anchors_i[anchor_idxs] # predict boxes predicted_boxes = self.box2box_transform.apply_deltas( box_reg_i, anchors_i.tensor) boxes_all.append(predicted_boxes) scores_all.append(predicted_prob) class_idxs_all.append(classes_idxs) boxes_all, scores_all, class_idxs_all = [ cat(x) for x in [boxes_all, scores_all, class_idxs_all] ] keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold) keep = keep[:self.max_detections_per_image] result = Instances(image_size) result.pred_boxes = Boxes(boxes_all[keep]) result.scores = scores_all[keep] result.pred_classes = class_idxs_all[keep] return result
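# Small worked example of the index arithmetic used above (nothing beyond
# plain PyTorch is assumed): flattened (anchor, class) score indices are split
# back into an anchor index and a class index with // and %.
def _example_topk_index_decomposition():
    import torch
    num_classes = 80
    topk_idxs = torch.tensor([0, 79, 80, 161])
    anchor_idxs = topk_idxs // num_classes
    class_idxs = topk_idxs % num_classes
    assert anchor_idxs.tolist() == [0, 0, 1, 2]
    assert class_idxs.tolist() == [0, 79, 0, 1]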
def _original_call(self, dataset_dict): """ Modified from detectron2's original __call__ in DatasetMapper """ dataset_dict = copy.deepcopy( dataset_dict) # it will be modified by code below image = self._read_image(dataset_dict, format=self.img_format) if not self.backfill_size: utils.check_image_size(dataset_dict, image) if "annotations" not in dataset_dict: image, transforms = T.apply_transform_gens( ([self.crop_gen] if self.crop_gen else []) + self.tfm_gens, image) else: # Crop around an instance if there are instances in the image. # USER: Remove if you don't use cropping if self.crop_gen: crop_tfm = utils.gen_crop_transform_with_instance( self.crop_gen.get_crop_size(image.shape[:2]), image.shape[:2], np.random.choice(dataset_dict["annotations"]), ) image = crop_tfm.apply_image(image) image, transforms = T.apply_transform_gens(self.tfm_gens, image) if self.crop_gen: transforms = crop_tfm + transforms image_shape = image.shape[:2] # h, w dataset_dict["image"] = torch.as_tensor( image.transpose(2, 0, 1).astype("float32")) # Can use uint8 if it turns out to be slow some day assert not self.load_proposals, "Not supported!" if not self.is_train: dataset_dict.pop("annotations", None) dataset_dict.pop("sem_seg_file_name", None) return dataset_dict if "annotations" in dataset_dict: for anno in dataset_dict["annotations"]: if not self.mask_on: anno.pop("segmentation", None) if not self.keypoint_on: anno.pop("keypoints", None) # Convert dataset_dict["annotations"] to dataset_dict["instances"] annotations = [ obj for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 ] # Convert either rotated box or horizontal box to XYWHA_ABS format original_boxes = [ BoxMode.convert( box=obj["bbox"], from_mode=obj["bbox_mode"], to_mode=BoxMode.XYWHA_ABS, ) for obj in annotations ] transformed_boxes = transforms.apply_rotated_box( np.array(original_boxes, dtype=np.float64)) instances = Instances(image_shape) instances.gt_classes = torch.tensor( [obj["category_id"] for obj in annotations], dtype=torch.int64) instances.gt_boxes = RotatedBoxes(transformed_boxes) instances.gt_boxes.clip(image_shape) dataset_dict["instances"] = instances[ instances.gt_boxes.nonempty()] return dataset_dict
def annotations_to_instances(annos, image_size, mask_format="polygon"): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: It will contain fields "gt_boxes", "gt_classes", "gt_masks", "gt_keypoints", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = [ BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos ] target = Instances(image_size) boxes = target.gt_boxes = Boxes(boxes) boxes.clip(image_size) classes = [obj["category_id"] for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes if len(annos) and "segmentation" in annos[0]: segms = [obj["segmentation"] for obj in annos] if mask_format == "polygon": masks = PolygonMasks(segms) else: assert mask_format == "bitmask", mask_format masks = [] for segm in segms: if isinstance(segm, list): # polygon masks.append(polygons_to_bitmask(segm, *image_size)) elif isinstance(segm, dict): # COCO RLE masks.append(mask_util.decode(segm)) elif isinstance(segm, np.ndarray): assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim) # mask array masks.append(segm) else: raise ValueError( "Cannot convert segmentation of type '{}' to BitMasks!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict, or a full-image segmentation mask " "as a 2D ndarray.".format(type(segm))) # torch.from_numpy does not support array with negative stride. masks = BitMasks( torch.stack([ torch.from_numpy(np.ascontiguousarray(x)) for x in masks ])) target.gt_masks = masks if len(annos) and "keypoints" in annos[0]: kpts = [obj.get("keypoints", []) for obj in annos] target.gt_keypoints = Keypoints(kpts) return target
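# Hedged usage sketch for annotations_to_instances (assumes detectron2's
# BoxMode and the function above being in scope): a single XYWH_ABS annotation
# is converted to XYXY_ABS, clipped, and packed into gt_boxes / gt_classes.
def _example_annotations_to_instances():
    from detectron2.structures import BoxMode
    annos = [{"bbox": [10, 10, 20, 30], "bbox_mode": BoxMode.XYWH_ABS, "category_id": 0}]
    target = annotations_to_instances(annos, image_size=(100, 100))
    assert target.gt_boxes.tensor.tolist() == [[10.0, 10.0, 30.0, 40.0]]
    assert target.gt_classes.tolist() == [0]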
def forward_for_single_feature_map(self, locations, logits_pred, reg_pred, ctrness_pred, image_sizes, top_feat=None): N, C, H, W = logits_pred.shape # put in the same format as locations logits_pred = logits_pred.view(N, C, H, W).permute(0, 2, 3, 1) logits_pred = logits_pred.reshape(N, -1, C).sigmoid() box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1) box_regression = box_regression.reshape(N, -1, 4) ctrness_pred = ctrness_pred.view(N, 1, H, W).permute(0, 2, 3, 1) ctrness_pred = ctrness_pred.reshape(N, -1).sigmoid() if top_feat is not None: top_feat = top_feat.view(N, -1, H, W).permute(0, 2, 3, 1) top_feat = top_feat.reshape(N, H * W, -1) # if self.thresh_with_ctr is True, we multiply the classification # scores with centerness scores before applying the threshold. if self.thresh_with_ctr: logits_pred = logits_pred * ctrness_pred[:, :, None] candidate_inds = logits_pred > self.pre_nms_thresh pre_nms_top_n = candidate_inds.view(N, -1).sum(1) pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_topk) if not self.thresh_with_ctr: logits_pred = logits_pred * ctrness_pred[:, :, None] results = [] for i in range(N): per_box_cls = logits_pred[i] per_candidate_inds = candidate_inds[i] per_box_cls = per_box_cls[per_candidate_inds] per_candidate_nonzeros = per_candidate_inds.nonzero() per_box_loc = per_candidate_nonzeros[:, 0] per_class = per_candidate_nonzeros[:, 1] per_box_regression = box_regression[i] per_box_regression = per_box_regression[per_box_loc] per_locations = locations[per_box_loc] if top_feat is not None: per_top_feat = top_feat[i] per_top_feat = per_top_feat[per_box_loc] per_pre_nms_top_n = pre_nms_top_n[i] if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): per_box_cls, top_k_indices = \ per_box_cls.topk(per_pre_nms_top_n, sorted=False) per_class = per_class[top_k_indices] per_box_regression = per_box_regression[top_k_indices] per_locations = per_locations[top_k_indices] if top_feat is not None: per_top_feat = per_top_feat[top_k_indices] detections = torch.stack([ per_locations[:, 0] - per_box_regression[:, 0], per_locations[:, 1] - per_box_regression[:, 1], per_locations[:, 0] + per_box_regression[:, 2], per_locations[:, 1] + per_box_regression[:, 3], ], dim=1) boxlist = Instances(image_sizes[i]) boxlist.pred_boxes = Boxes(detections) boxlist.scores = torch.sqrt(per_box_cls) boxlist.pred_classes = per_class boxlist.locations = per_locations if top_feat is not None: boxlist.top_feat = per_top_feat results.append(boxlist) return results
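# The score used above is sqrt(classification_prob * centerness), i.e. the
# geometric mean of the two probabilities; a tiny numeric check using only
# plain PyTorch:
def _example_ctrness_score():
    import torch
    cls_prob, ctrness = torch.tensor(0.64), torch.tensor(0.25)
    assert torch.isclose(torch.sqrt(cls_prob * ctrness), torch.tensor(0.4))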
def apply_late_fusion_and_evaluate(cfg, evaluator, det_1, det_2, method): evaluator.reset() img_folder = '../../../Datasets/FLIR/val/thermal_8_bit/' num_img = len(det_2['image']) count_1 = 0 count_2 = 0 count_fusion = 0 print('Method: ', method) for i in range(num_img): info_1 = {} info_1['img_name'] = det_1['image'][i] info_1['bbox'] = det_1['boxes'][i] info_1['score'] = det_1['scores'][i] info_1['class'] = det_1['classes'][i] info_2 = {} info_2['img_name'] = det_2['image'][i].split('.')[0] + '.jpeg' info_2['bbox'] = det_2['boxes'][i] info_2['score'] = det_2['scores'][i] info_2['class'] = det_2['classes'][i] #pdb.set_trace() if len(info_1['bbox']) == 0 or len(info_2['bbox']) == 0: if (len(info_1['bbox']) > 0): out_boxes = np.array(info_1['bbox']) out_class = torch.Tensor(info_1['class']) out_scores = torch.Tensor(info_1['score']) elif (len(info_2['bbox']) > 0): out_boxes = np.array(info_2['bbox']) out_class = torch.Tensor(info_2['class']) out_scores = torch.Tensor(info_2['score']) else: out_boxes = np.array(info_2['bbox']) out_class = torch.Tensor(info_2['class']) out_scores = torch.Tensor(info_2['score']) else: if method == 'nms': out_boxes, out_scores, out_class = nms_1(info_1, info_2) elif method == 'pooling': in_boxes, in_scores, in_class = prepare_data(info_1, info_2) out_boxes = in_boxes out_scores = torch.Tensor(in_scores) out_class = torch.Tensor(in_class) elif method == 'baysian' or method == 'baysian_avg_bbox' or method == 'avg_score' or method == 'baysian_wt_score_box': threshold = 0.5 in_boxes, in_scores, in_class = prepare_data(info_1, info_2) keep, out_scores, out_boxes, out_class = nms_2( in_boxes, in_scores, in_class, threshold, method) count_1 += len(info_1['bbox']) count_2 += len(info_2['bbox']) count_fusion += len(out_boxes) file_name = img_folder + info_1['img_name'].split('.')[0] + '.jpeg' img = cv2.imread(file_name) H, W, _ = img.shape # Handle inputs inputs = [] input_info = {} input_info['file_name'] = file_name input_info['height'] = H input_info['width'] = W input_info['image_id'] = det_2['image_id'][i] input_info['image'] = torch.Tensor(img) inputs.append(input_info) # Handle outputs outputs = [] out_info = {} proposals = Instances([H, W]) proposals.pred_boxes = Boxes(out_boxes) proposals.scores = out_scores proposals.pred_classes = out_class out_info['instances'] = proposals outputs.append(out_info) evaluator.process(inputs, outputs) img = draw_box(img, out_boxes, (0, 255, 0)) out_img_name = 'out_img_baysian_fusion/' + file_name.split( 'thermal_8_bit/')[1].split('.')[0] + '_baysian_avg_bbox.jpg' #cv2.imwrite(out_img_name, img) #pdb.set_trace() """ if '09115' in file_name: out_img_name = 'out_img_baysian_fusion/' + file_name.split('thermal_8_bit/')[1].split('.')[0]+'_baysian_avg_bbox.jpg' pdb.set_trace() cv2.imwrite(out_img_name, img) """ results = evaluator.evaluate(out_eval_path='FLIR_pooling_.out') if results is None: results = {} avgRGB = count_1 / num_img avgThermal = count_2 / num_img avgNMS = count_fusion / num_img print('Avg bbox for RGB:', avgRGB, "average count thermal:", avgThermal, 'average count nms:', avgNMS) return results
def forward(self, scores, proposal_boxes):
    instances = Instances((10, 10))
    instances.proposal_boxes = Boxes(proposal_boxes)
    return self._output_layer.predict_probs((scores, None), [instances])
def inference_single_image( self, anchors: List[Boxes], box_cls: List[Tensor], box_delta: List[Tensor], oks_delta: List[Tensor], image_size: Tuple[int, int], ): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Arguments: anchors (list[Boxes]): list of #feature levels. Each entry contains a Boxes object, which contains all the anchors in that feature level. box_cls (list[Tensor]): list of #feature levels. Each entry contains tensor of size (H x W x A, K) box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. image_size (tuple(H, W)): a tuple of the image height and width. Returns: Same as `inference`, but for only one image. """ boxes_all, keypoint_all, scores_all, class_idxs_all = [[] for _ in range(4)] # Iterate over every feature level for box_cls_i, box_reg_i, oks_reg_i, anchors_i in zip(box_cls, box_delta, oks_delta, anchors): # HxWxAxK, predicted_prob = box_cls_i.flatten().sigmoid_() # Apply two filtering below to make NMS faster. # 1. Keep boxes with confidence score higher than threshold keep_idxs = predicted_prob > self.test_score_thresh predicted_prob = predicted_prob[keep_idxs] topk_idxs = nonzero_tuple(keep_idxs)[0] # 2. Keep top k top scoring boxes only num_topk = min(self.test_topk_candidates, topk_idxs.size(0)) # torch.sort is actually faster than .topk (at least on GPUs) predicted_prob, idxs = predicted_prob.sort(descending=True) predicted_prob = predicted_prob[:num_topk] topk_idxs = topk_idxs[idxs[:num_topk]] anchor_idxs = topk_idxs // self.num_classes classes_idxs = topk_idxs % self.num_classes box_reg_i = box_reg_i[anchor_idxs] oks_reg_i = oks_reg_i[anchor_idxs] anchors_i = anchors_i[anchor_idxs] # predict boxes predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor) predicted_marks = self.mark2mark_transform.apply_deltas(oks_reg_i, anchors_i.tensor) boxes_all.append(predicted_boxes) keypoint_all.append(predicted_marks) scores_all.append(predicted_prob) class_idxs_all.append(classes_idxs) boxes_all, keypoint_all, scores_all, class_idxs_all = [ cat(x) for x in [boxes_all, keypoint_all, scores_all, class_idxs_all] ] keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.test_nms_thresh) keep = keep[: self.max_detections_per_image] result = Instances(image_size) result.pred_boxes = Boxes(boxes_all[keep]) keypoints_all = keypoint_all[keep].reshape(-1, self.num_landmark, 2) keypoints_all = torch.cat( (keypoints_all, 2 * torch.ones(keypoints_all.shape[0], self.num_landmark, 1).to(self.device)), dim=2) result.pred_keypoints = keypoints_all result.scores = scores_all[keep] result.pred_classes = class_idxs_all[keep] return result
def test_StandardROIHeads_scriptability(self): cfg = get_cfg() cfg.MODEL.ROI_BOX_HEAD.NAME = "FastRCNNConvFCHead" cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2 cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2" cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5) cfg.MODEL.MASK_ON = True cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.01 cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.01 num_images = 2 images_tensor = torch.rand(num_images, 20, 30) image_sizes = [(10, 10), (20, 30)] images = ImageList(images_tensor, image_sizes) num_channels = 1024 features = {"res4": torch.rand(num_images, num_channels, 1, 2)} feature_shape = {"res4": ShapeSpec(channels=num_channels, stride=16)} roi_heads = StandardROIHeads(cfg, feature_shape).eval() proposal0 = Instances(image_sizes[0]) proposal_boxes0 = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) proposal0.proposal_boxes = Boxes(proposal_boxes0) proposal0.objectness_logits = torch.tensor([0.5, 0.7], dtype=torch.float32) proposal1 = Instances(image_sizes[1]) proposal_boxes1 = torch.tensor([[1, 5, 2, 8], [7, 3, 10, 5]], dtype=torch.float32) proposal1.proposal_boxes = Boxes(proposal_boxes1) proposal1.objectness_logits = torch.tensor([0.1, 0.9], dtype=torch.float32) proposals = [proposal0, proposal1] pred_instances, _ = roi_heads(images, features, proposals) fields = { "objectness_logits": "Tensor", "proposal_boxes": "Boxes", "pred_classes": "Tensor", "scores": "Tensor", "pred_masks": "Tensor", "pred_boxes": "Boxes", "pred_keypoints": "Tensor", "pred_keypoint_heatmaps": "Tensor", } with patch_instances(fields) as new_instances: proposal0 = new_instances.from_instances(proposal0) proposal1 = new_instances.from_instances(proposal1) proposals = [proposal0, proposal1] scripted_rot_heads = torch.jit.script(roi_heads) scripted_pred_instances, _ = scripted_rot_heads( images, features, proposals) for instance, scripted_instance in zip(pred_instances, scripted_pred_instances): self.assertEqual(instance.image_size, scripted_instance.image_size) self.assertTrue( torch.equal(instance.pred_boxes.tensor, scripted_instance.pred_boxes.tensor)) self.assertTrue( torch.equal(instance.scores, scripted_instance.scores)) self.assertTrue( torch.equal(instance.pred_classes, scripted_instance.pred_classes)) self.assertTrue( torch.equal(instance.pred_masks, scripted_instance.pred_masks))
def forward(self, proposal_deltas, proposal_boxes):
    instances = Instances((10, 10))
    instances.proposal_boxes = Boxes(proposal_boxes)
    return self._output_layer.predict_boxes((None, proposal_deltas), [instances])
def evaluate(cfg, evaluator, det_1, det_2, predictor, method, bayesian=False): evaluator.reset() img_folder = '../../../Datasets/FLIR/val/thermal_8_bit/' num_img = len(det_2['image']) count_1 = 0 count_2 = 0 count_fusion = 0 print('Method: ', method) img_folder = '../../../Datasets/FLIR/val/thermal_8_bit/' num_img = len(det_2['image']) count_1 = 0 count_2 = 0 count_fusion = 0 X = None Y = np.array([]) cnt = 0 for i in range(num_img): info_1 = {} info_1['img_name'] = det_1['image'][i] info_1['bbox'] = det_1['boxes'][i] info_1['score'] = det_1['scores'][i] info_1['class'] = det_1['classes'][i] info_1['class_logits'] = det_1['class_logits'][i] if 'probs' in det_1.keys(): info_1['prob'] = det_1['probs'][i] info_2 = {} info_2['img_name'] = det_2['image'][i].split('.')[0] + '.jpeg' info_2['bbox'] = det_2['boxes'][i] info_2['score'] = det_2['scores'][i] info_2['class'] = det_2['classes'][i] info_2['class_logits'] = det_2['class_logits'][i] if 'probs' in det_2.keys(): info_2['prob'] = det_2['probs'][i] #img_id = int(info_1['img_name'].split('.')[0].split('_')[1]) - 1 img_id = det_1['image_id'][i] # If no any detection in two results if len(info_1['bbox']) == 0 and len(info_2['bbox']) == 0: continue # If no detection in 1st model: elif len(info_1['bbox']) == 0: print('model 1 miss detected') in_boxes, in_scores, in_class, in_logits, in_prob, num_det = prepare_data_gt_1_det( info_2) elif len(info_2['bbox']) == 0: print('model 2 miss detected') in_boxes, in_scores, in_class, in_logits, in_prob, num_det = prepare_data_gt_1_det( info_1) else: in_boxes, in_scores, in_class, in_logits, in_prob, num_det = prepare_data_gt( info_1, info_2) score_results, box_results, class_results = nms_multiple_box_eval( in_boxes, in_scores, in_class, in_logits, in_prob, 0.5, num_det, method) if bayesian: # summing logits sum_logits = score_results[:, :4] + score_results[:, 4:] pred_prob_multiclass = F.softmax(torch.Tensor(sum_logits)).tolist() out_scores = np.max(pred_prob_multiclass, axis=1) out_class = np.argmax(pred_prob_multiclass, axis=1) else: pred_prob_multiclass = predictor.predict_proba(score_results) out_scores = np.max(pred_prob_multiclass, axis=1) out_class = np.argmax(pred_prob_multiclass, axis=1) #pdb.set_trace() """ Send information to evaluator """ # Image info file_name = img_folder + info_1['img_name'].split('.')[0] + '.jpeg' img = cv2.imread(file_name) H, W, _ = img.shape # Handle inputs inputs = [] input_info = {} input_info['file_name'] = file_name input_info['height'] = H input_info['width'] = W input_info['image_id'] = det_1['image_id'][i] input_info['image'] = torch.Tensor(img) inputs.append(input_info) # Handle outputs outputs = [] out_info = {} proposals = Instances([H, W]) proposals.pred_boxes = Boxes(box_results) proposals.scores = torch.Tensor(out_scores) proposals.pred_classes = torch.Tensor(out_class) out_info['instances'] = proposals outputs.append(out_info) evaluator.process(inputs, outputs) results = evaluator.evaluate(out_eval_path='FLIR_pooling_.out') if results is None: results = {} avgRGB = count_1 / num_img avgThermal = count_2 / num_img avgNMS = count_fusion / num_img print('Avg bbox for RGB:', avgRGB, "average count thermal:", avgThermal, 'average count nms:', avgNMS) return results
def apply_late_fusion_and_evaluate(cfg, evaluator, det_1, det_2, method, predictor, det_3='', bayesian=False): evaluator.reset() img_folder = '../../../Datasets/FLIR/val/thermal_8_bit/' num_img = len(det_2['image']) count_1 = 0 count_2 = 0 count_fusion = 0 print('Method: ', method) for i in range(num_img): info_1 = {} info_1['img_name'] = det_1['image'][i] info_1['bbox'] = det_1['boxes'][i] info_1['score'] = det_1['scores'][i] info_1['class'] = det_1['classes'][i] info_1['class_logits'] = det_1['class_logits'][i] if 'probs' in det_1.keys(): info_1['prob'] = det_1['probs'][i] info_2 = {} info_2['img_name'] = det_2['image'][i].split('.')[0] + '.jpeg' info_2['bbox'] = det_2['boxes'][i] info_2['score'] = det_2['scores'][i] info_2['class'] = det_2['classes'][i] info_2['class_logits'] = det_2['class_logits'][i] if 'probs' in det_2.keys(): info_2['prob'] = det_2['probs'][i] if len(info_1['bbox']) > 0: num_1 = 1 else: num_1 = 0 if len(info_2['bbox']) > 0: num_2 = 1 else: num_2 = 0 num_detections = num_1 + num_2 if det_3: info_3 = {} info_3['img_name'] = det_3['image'][i].split('.')[0] + '.jpeg' info_3['bbox'] = det_3['boxes'][i] info_3['score'] = det_3['scores'][i] info_3['class'] = det_3['classes'][i] info_3['class_logits'] = det_3['class_logits'][i] if 'probs' in det_3.keys(): info_3['prob'] = det_3['probs'][i] if len(info_3['bbox']) > 0: num_3 = 1 else: num_3 = 0 num_detections += num_3 # No detections if num_detections == 0: continue # Only 1 model detection elif num_detections == 1: if len(info_1['bbox']) > 0: out_boxes = np.array(info_1['bbox']) out_class = torch.Tensor(info_1['class']) out_scores = torch.Tensor(info_1['score']) num_det_1 = len(info_1['class_logits']) out_logits = np.zeros((num_det_1, 8)) for k in range(num_det_1): out_logits[k, :4] = info_1['class_logits'][k] elif len(info_2['bbox']) > 0: out_boxes = np.array(info_2['bbox']) out_class = torch.Tensor(info_2['class']) out_scores = torch.Tensor(info_2['score']) num_det_2 = len(info_1['class_logits']) out_logits = np.zeros((num_det_2, 8)) for k in range(num_det_1): out_logits[k, 4:] = info_1['class_logits'][k] else: if det_3: out_boxes = np.array(info_3['bbox']) out_class = torch.Tensor(info_3['class']) out_scores = torch.Tensor(info_3['score']) # Only two models with detections elif num_detections == 2: #pdb.set_trace() if not det_3: if method == 'learned_fusion' or method == 'logRegression': out_boxes, out_scores, out_class, out_logits, _, _ = fusion( method, info_1, info_2, predictor=predictor) else: out_boxes, out_scores, out_class = fusion( method, info_1, info_2) else: if len(info_1['bbox']) == 0: out_boxes, out_scores, out_class = fusion( method, info_2, info_3) elif len(info_2['bbox']) == 0: out_boxes, out_scores, out_class = fusion( method, info_1, info_3) else: out_boxes, out_scores, out_class = fusion( method, info_1, info_2) # All 3 models detected things else: out_boxes, out_scores, out_class = fusion(method, info_1, info_2, info_3=info_3) if bayesian: sum_logits = out_logits[:, :4] + out_logits[:, 4:] pred_prob_multiclass = F.softmax(torch.Tensor(sum_logits)).tolist() out_scores = np.max(pred_prob_multiclass, axis=1) out_class = np.argmax(pred_prob_multiclass, axis=1) elif method == 'learned_fusion': pred_logits = predictor(torch.Tensor(out_logits).cuda(0)) pred_prob_multiclass = F.softmax(pred_logits, dim=1).tolist() out_scores = np.max(pred_prob_multiclass, axis=1) out_class = np.argmax(pred_prob_multiclass, axis=1) elif method == 'logRegression': pred_prob_multiclass = predictor.predict_proba(out_logits) 
            out_scores = np.max(pred_prob_multiclass, axis=1)
            out_class = np.argmax(pred_prob_multiclass, axis=1)

        file_name = img_folder + info_1['img_name'].split('.')[0] + '.jpeg'
        img = cv2.imread(file_name)
        if img is None:
            raise FileNotFoundError('Could not read image: ' + file_name)
        H, W, _ = img.shape

        # Handle inputs
        inputs = []
        input_info = {}
        input_info['file_name'] = file_name
        input_info['height'] = H
        input_info['width'] = W
        input_info['image_id'] = det_2['image_id'][i]
        input_info['image'] = torch.Tensor(img)
        inputs.append(input_info)

        # Handle outputs
        outputs = []
        out_info = {}
        proposals = Instances([H, W])
        proposals.pred_boxes = Boxes(out_boxes)
        proposals.scores = out_scores
        proposals.pred_classes = out_class
        out_info['instances'] = proposals
        outputs.append(out_info)
        evaluator.process(inputs, outputs)

    results = evaluator.evaluate(out_eval_path='FLIR_pooling_.out')
    if results is None:
        results = {}
    return results
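# The logit-summing fusion step used above, shown in isolation (plain PyTorch,
# dummy values): per-class logits from the two detectors are summed and a
# softmax over the sum gives the fused class probabilities, from which the
# final score and class are taken.
def _example_logit_sum_fusion():
    import torch
    import torch.nn.functional as F
    logits_rgb = torch.tensor([[2.0, 0.5, 0.1, 0.0]])
    logits_thermal = torch.tensor([[1.0, 1.5, 0.2, 0.0]])
    fused = F.softmax(logits_rgb + logits_thermal, dim=1)
    score, cls = fused.max(dim=1)
    assert cls.item() == 0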
def forward(self, features, pred_instances=None, targets=None): if self.edge_on: with timer.env("pfpn_back"): for i, f in enumerate(self.in_features): if i == 0: x = self.scale_heads[i](features[f]) else: x = x + self.scale_heads[i](features[f]) if self.edge_on: with timer.env("edge"): pred_logits = self.predictor(x) pred_edge = pred_logits.sigmoid() if self.attention: # print('pred edge', pred_edge) att_map = self.attender( 1 - pred_edge ) # regions that need evolution if self.training: edge_target = targets[0] if self.edge_in: edge_prior = targets[0].unsqueeze(1).float().clone() # (B, 1, H, W) edge_prior[edge_prior == self.ignore_value] = 0 # remove ignore value edge_prior = self.mean_filter(edge_prior) edge_prior = F.interpolate( edge_prior, scale_factor=1 / self.common_stride, mode="bilinear", align_corners=False, ) edge_prior[edge_prior > 0] = 1 if self.strong_feat: snake_input = torch.cat([edge_prior, x], dim=1) else: snake_input = torch.cat([edge_prior, features["p2"]], dim=1) else: if self.strong_feat: snake_input = x else: snake_input = features["p2"] if self.edge_on: pred_edge_full = F.interpolate( pred_edge, scale_factor=self.common_stride, mode="bilinear", align_corners=False, ) if self.selective_refine: edge_prior = targets[0].unsqueeze(1).float().clone() # (B, 1, H, W) edge_prior[edge_prior == self.ignore_value] = 0 # remove ignore value edge_prior = self.dilate_filter(edge_prior) # edge_prior = self.dilate_filter(edge_prior) # edge_target = edge_prior.clone() edge_prior[edge_prior > 0] = 1 edge_prior = F.interpolate( edge_prior, scale_factor=1 / self.common_stride, mode="bilinear", align_corners=False, ) if self.strong_feat: snake_input = torch.cat([edge_prior, x], dim=1) else: if self.pred_edge: snake_input = torch.cat( [edge_prior, pred_logits, features["p2"]], dim=1 ) else: snake_input = torch.cat([edge_prior, features["p2"]], dim=1) if self.attention: if self.strong_feat: snake_input = torch.cat([att_map, x], dim=1) else: # dont cater pred_edge option now snake_input = torch.cat([att_map, features["p2"]], dim=1) ### Quick fix for batches that do not have poly after filtering _, poly_loss = self.refine_head(snake_input, None, targets[1]) if self.edge_on: edge_loss = self.loss(pred_edge_full, edge_target) * self.loss_weight poly_loss.update( { "loss_edge_det": edge_loss, } ) return [], poly_loss, [] else: if self.edge_in or self.selective_refine: if self.edge_map_thre > 0: pred_edge = (pred_edge > self.edge_map_thre).float() if "edge" in self.gt_input: assert targets[0] is not None pred_edge = targets[0].unsqueeze(1).float().clone() pred_edge[pred_edge == self.ignore_value] = 0 # remove ignore value if self.selective_refine: pred_edge = self.dilate_filter(pred_edge) # pred_edge = self.dilate_filter(pred_edge) pred_edge = F.interpolate( pred_edge, scale_factor=1 / self.common_stride, mode="bilinear", align_corners=False, ) pred_edge[pred_edge > 0] = 1 if self.strong_feat: snake_input = torch.cat([pred_edge, x], dim=1) else: snake_input = torch.cat([pred_edge, features["p2"]], dim=1) else: if self.strong_feat: snake_input = x else: snake_input = features["p2"] if self.attention: if self.strong_feat: snake_input = torch.cat([att_map, x], dim=1) else: # dont cater pred_edge option now snake_input = torch.cat([att_map, features["p2"]], dim=1) if "instance" in self.gt_input: assert targets[1][0] is not None for im_i in range(len(targets[1][0])): gt_instances_per_im = targets[1][0][im_i] bboxes = gt_instances_per_im.gt_boxes.tensor instances_per_im = 
Instances(pred_instances[im_i]._image_size) instances_per_im.pred_boxes = Boxes(bboxes) instances_per_im.pred_classes = gt_instances_per_im.gt_classes instances_per_im.scores = torch.ones_like( gt_instances_per_im.gt_classes, device=bboxes.device ) if gt_instances_per_im.has("gt_masks"): gt_masks = gt_instances_per_im.gt_masks ext_pts_off = self.refine_head.get_simple_extreme_points( gt_masks.polygons ).to(bboxes.device) ex_t = torch.stack( [ext_pts_off[:, None, 0], bboxes[:, None, 1]], dim=2 ) ex_l = torch.stack( [bboxes[:, None, 0], ext_pts_off[:, None, 1]], dim=2 ) ex_b = torch.stack( [ext_pts_off[:, None, 2], bboxes[:, None, 3]], dim=2 ) ex_r = torch.stack( [bboxes[:, None, 2], ext_pts_off[:, None, 3]], dim=2 ) instances_per_im.ext_points = ExtremePoints( torch.cat([ex_t, ex_l, ex_b, ex_r], dim=1) ) # TODO: NOTE: Test for theoretic limit. ##### # contours = self.refine_head.get_simple_contour(gt_masks) # poly_sample_targets = [] # for i, cnt in enumerate(contours): # if cnt is None: # xmin, ymin = bboxes[:, 0], bboxes[:, 1] # (n,) # xmax, ymax = bboxes[:, 2], bboxes[:, 3] # (n,) # box = [ # xmax, ymin, xmin, ymin, xmin, ymax, xmax, ymax # ] # box = torch.stack(box, dim=1).view(-1, 4, 2) # sampled_box = self.refine_head.uniform_upsample(box[None], # self.refine_head.num_sampling) # poly_sample_targets.append(sampled_box[i]) # # print(sampled_box.shape) # continue # # # 1) uniform-sample # oct_sampled_targets = self.refine_head.uniform_sample(cnt, # len(cnt) * self.refine_head.num_sampling) # (big, 2) # tt_idx = np.random.randint(len(oct_sampled_targets)) # oct_sampled_targets = np.roll(oct_sampled_targets, -tt_idx, axis=0)[::len(cnt)] # oct_sampled_targets = torch.tensor(oct_sampled_targets, device=bboxes.device) # poly_sample_targets.append(oct_sampled_targets) # # print(oct_sampled_targets.shape) # # # 2) polar-sample # # ... # poly_sample_targets = torch.stack(poly_sample_targets, dim=0) # instances_per_im.pred_polys = PolygonPoints(poly_sample_targets) # TODO: NOTE: Test for theoretic limit. ##### pred_instances[im_i] = instances_per_im new_instances, _ = self.refine_head(snake_input, pred_instances, None) # new_instances = pred_instances if not self.edge_on: pred_edge = torch.rand(1, 1, 5, 5, device=snake_input.device) if self.attention: pred_edge = att_map return pred_edge, {}, new_instances
def fast_rcnn_inference_single_image_with_anchor(proposals, boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Args: Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes per image. Returns: Same as `fast_rcnn_inference`, but for only one image. """ anchors = proposals.get_fields()['anchor_boxes'].tensor proposals = proposals.get_fields()['proposal_boxes'].tensor valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all( dim=1) if not valid_mask.all(): boxes = boxes[valid_mask] scores = scores[valid_mask] anchors = anchors[valid_mask] proposals = proposals[valid_mask] scores = scores[:, :-1] num_bbox_reg_classes = boxes.shape[1] // 4 # Convert to Boxes to use the `clip` function ... boxes = Boxes(boxes.reshape(-1, 4)) boxes.clip(image_shape) boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 anchors = Boxes(anchors) proposals = Boxes(proposals) anchors.clip(image_shape) proposals.clip(image_shape) anchors = anchors.tensor proposals = proposals.tensor # Filter results based on detection scores filter_mask = scores > score_thresh # R x K # R' x 2. First column contains indices of the R predictions; # Second column contains indices of classes. filter_inds = filter_mask.nonzero() if num_bbox_reg_classes == 1: boxes = boxes[filter_inds[:, 0], 0] else: boxes = boxes[filter_mask] scores = scores[filter_mask] anchors = anchors[filter_inds[:, 0]] proposals = proposals[filter_inds[:, 0]] # Apply per-class NMS keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh) if topk_per_image >= 0: keep = keep[:topk_per_image] boxes, scores, filter_inds, anchors, proposals = boxes[keep], scores[keep], filter_inds[keep], anchors[keep], \ proposals[keep] result = Instances(image_shape) result.pred_boxes = Boxes(boxes) result.scores = scores result.pred_classes = filter_inds[:, 1] result.anchors = Boxes(anchors) result.proposals = Boxes(proposals) return result, filter_inds[:, 0]
def _desc_to_example(desc: Dict): # Detectron2 Model Input Format: # image: Tensor[C, H, W]; # height, width: output height and width; # instances: Instances Object to training, with the following fields: # "gt_boxes": # "gt_classes": # "gt_masks": a PolygonMasks or BitMasks object storing N masks, one for each instance. desc = copy.deepcopy(desc) # it will be modified by code below image_path = os.path.join(images_dir, f'{desc["image_id"]}.jpg') # shape: [H, W, C] origin_image = detection_utils.read_image(image_path, format="BGR") oh, ow, oc = origin_height, origin_width, origin_channels = origin_image.shape if augmentations is not None: aug_input = T.AugInput(origin_image) transforms = augmentations(aug_input) auged_image = aug_input.image else: auged_image = origin_image ah, aw, ac = auged_height, auged_width, auged_channels = auged_image.shape if not is_train: return { "image_id": desc['image_id'], # COCOEvaluator.process() need it. # expected shape: [C, H, W] "image": torch.as_tensor( np.ascontiguousarray(auged_image.transpose(2, 0, 1))), "height": auged_height, "width": auged_width, } target = Instances(image_size=(ah, aw)) if 'fill gt_boxes': # shape: n_box, 4 boxes_abs = np.array( [anno['bbox'] for anno in desc['annotations']]) if augmentations is not None: # clip transformed bbox to image size boxes_auged = transforms.apply_box( np.array(boxes_abs)).clip(min=0) boxes_auged = np.minimum( boxes_auged, np.array([aw, ah, aw, ah])[np.newaxis, :]) else: boxes_auged = boxes_abs target.gt_boxes = Boxes(boxes_auged) if 'fill gt_classes': classes = [anno['category_id'] for anno in desc['annotations']] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes if 'fill gt_masks': mask_paths = [ os.path.join(masks_dir, f'{anno["mask_id"]}.png') for anno in desc['annotations'] ] masks = np.array( list( map( lambda p: cv2.resize(cv2.imread( p, flags=cv2.IMREAD_GRAYSCALE), dsize=(ow, oh)), mask_paths))) if augmentations is not None: masks_auged = np.array( list(map(lambda x: transforms.apply_segmentation(x), masks))) else: masks_auged = masks masks_auged = masks_auged > MASK_THRESHOLD masks_auged = BitMasks( torch.stack([ torch.from_numpy(np.ascontiguousarray(x)) for x in masks_auged ])) target.gt_masks = masks_auged return { "image_id": desc['image_id'], # COCOEvaluator.process() need it. # expected shape: [C, H, W] "image": torch.as_tensor( np.ascontiguousarray(auged_image.transpose(2, 0, 1))), "height": auged_height, "width": auged_width, "instances": target, # refer: annotations_to_instances() }
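# Hedged sketch of the Detectron2 "model input format" dict that
# _desc_to_example builds (field names follow the comment block at the top of
# that function; the values here are dummies and detectron2 structures are
# assumed to be available):
def _example_model_input_dict():
    import torch
    from detectron2.structures import Boxes, Instances
    target = Instances((480, 640),
                       gt_boxes=Boxes(torch.zeros(0, 4)),
                       gt_classes=torch.zeros(0, dtype=torch.int64))
    return {
        "image_id": "0001",
        "image": torch.zeros(3, 480, 640),  # C, H, W
        "height": 480,
        "width": 640,
        "instances": target,
    }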
def losses(self, logits_pred, reg_pred, ctrness_pred, locations, gt_instances, top_feats=None): """ Return the losses from a set of FCOS predictions and their associated ground-truth. Returns: dict[loss name -> loss value]: A dict mapping from loss name to loss value. """ training_targets = self._get_ground_truth(locations, gt_instances) # Collect all logits and regression predictions over feature maps # and images to arrive at the same shape as the labels and targets # The final ordering is L, N, H, W from slowest to fastest axis. instances = Instances((0, 0)) instances.labels = cat( [ # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,) x.reshape(-1) for x in training_targets["labels"] ], dim=0) instances.gt_inds = cat( [ # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,) x.reshape(-1) for x in training_targets["target_inds"] ], dim=0) instances.im_inds = cat( [x.reshape(-1) for x in training_targets["im_inds"]], dim=0) instances.reg_targets = cat( [ # Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4) x.reshape(-1, 4) for x in training_targets["reg_targets"] ], dim=0, ) instances.locations = cat( [x.reshape(-1, 2) for x in training_targets["locations"]], dim=0) instances.fpn_levels = cat( [x.reshape(-1) for x in training_targets["fpn_levels"]], dim=0) instances.logits_pred = cat( [ # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C) x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits_pred ], dim=0, ) instances.reg_pred = cat( [ # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B) x.permute(0, 2, 3, 1).reshape(-1, 4) for x in reg_pred ], dim=0, ) instances.ctrness_pred = cat( [ # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,) x.permute(0, 2, 3, 1).reshape(-1) for x in ctrness_pred ], dim=0, ) if len(top_feats) > 0: instances.top_feats = cat( [ # Reshape: (N, -1, Hi, Wi) -> (N*Hi*Wi, -1) x.permute(0, 2, 3, 1).reshape(-1, x.size(1)) for x in top_feats ], dim=0, ) return self.fcos_losses(instances)
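# The reshaping pattern used repeatedly above, in isolation (plain PyTorch):
# per-level predictions of shape (N, C, Hi, Wi) become (N*Hi*Wi, C) rows so
# they align with the flattened per-location training targets.
def _example_flatten_predictions():
    import torch
    x = torch.arange(2 * 3 * 4 * 5, dtype=torch.float32).view(2, 3, 4, 5)  # N, C, H, W
    flat = x.permute(0, 2, 3, 1).reshape(-1, 3)
    assert flat.shape == (2 * 4 * 5, 3)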
def inference_single_image( self, pred_logits, pred_deltas, pred_masks, anchors, indexes, image_size ): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Arguments: pred_logits (list[Tensor]): list of #feature levels. Each entry contains tensor of size (AxHxW, K) pred_deltas (list[Tensor]): Same shape as 'pred_logits' except that K becomes 4. pred_masks (list[list[Tensor]]): List of #feature levels, each is a list of #anchors. Each entry contains tensor of size (M_i*M_i, H, W). `None` if mask_on=False. anchors (list[Boxes]): list of #feature levels. Each entry contains a Boxes object, which contains all the anchors for that image in that feature level. image_size (tuple(H, W)): a tuple of the image height and width. Returns: Same as `inference`, but for only one image. """ pred_logits = pred_logits.flatten().sigmoid_() # We get top locations across all levels to accelerate the inference speed, # which does not seem to affect the accuracy. # First select values above the threshold logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0] # Then get the top values num_topk = min(self.topk_candidates, logits_top_idxs.shape[0]) pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort(descending=True) # Keep top k scoring values pred_prob = pred_prob[:num_topk] # Keep top k values top_idxs = logits_top_idxs[topk_idxs[:num_topk]] # class index cls_idxs = top_idxs % self.num_classes # HWA index top_idxs //= self.num_classes # predict boxes pred_boxes = self.box2box_transform.apply_deltas( pred_deltas[top_idxs], anchors[top_idxs].tensor ) # apply caffe_nms keep = batched_nms(pred_boxes, pred_prob, cls_idxs, self.nms_threshold) # pick the top ones keep = keep[: self.detections_im] results = Instances(image_size) results.pred_boxes = Boxes(pred_boxes[keep]) results.scores = pred_prob[keep] results.pred_classes = cls_idxs[keep] # deal with masks result_masks, result_anchors = [], None if self.mask_on: # index and anchors, useful for masks top_indexes = indexes[top_idxs] top_anchors = anchors[top_idxs] result_indexes = top_indexes[keep] result_anchors = top_anchors[keep] # Get masks and do sigmoid for lvl, _, h, w, anc in result_indexes.tolist(): cur_size = self.mask_sizes[anc] * (2 ** lvl if self.bipyramid_on else 1) result_masks.append( torch.sigmoid(pred_masks[lvl][anc][:, h, w].view(1, cur_size, cur_size)) ) return results, (result_masks, result_anchors)
def _inference_one_image(self, input): """ Args: input (dict): one dataset dict Returns: dict: one output dict """ augmented_inputs = self.tta_mapper(input) do_hflip = [k.pop("horiz_flip", False) for k in augmented_inputs] heights = [k["height"] for k in augmented_inputs] widths = [k["width"] for k in augmented_inputs] assert ( len(set(heights)) == 1 and len(set(widths)) == 1 ), "Augmented version of the inputs should have the same original resolution!" height = heights[0] width = widths[0] # 1. Detect boxes from all augmented versions # 1.1: forward with all augmented images with self._turn_off_roi_head("mask_on"), self._turn_off_roi_head("keypoint_on"): # temporarily disable mask/keypoint head outputs = self._batch_inference(augmented_inputs, do_postprocess=False) # 1.2: union the results all_boxes = [] all_scores = [] all_classes = [] for idx, output in enumerate(outputs): rescaled_output = detector_postprocess(output, height, width) pred_boxes = rescaled_output.pred_boxes.tensor if do_hflip[idx]: pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]] all_boxes.append(pred_boxes) all_scores.extend(rescaled_output.scores) all_classes.extend(rescaled_output.pred_classes) all_boxes = torch.cat(all_boxes, dim=0).cpu() num_boxes = len(all_boxes) # 1.3: select from the union of all results num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES # +1 because fast_rcnn_inference expects background scores as well all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device) for idx, cls, score in zip(count(), all_classes, all_scores): all_scores_2d[idx, cls] = score merged_instances, _ = fast_rcnn_inference_single_image( all_boxes, all_scores_2d, (height, width), 1e-8, self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST, self.cfg.TEST.DETECTIONS_PER_IMAGE, ) if not self.cfg.MODEL.MASK_ON: return {"instances": merged_instances} # 2. Use the detected boxes to obtain masks # 2.1: rescale the detected boxes augmented_instances = [] for idx, input in enumerate(augmented_inputs): actual_height, actual_width = input["image"].shape[1:3] scale_x = actual_width * 1.0 / width scale_y = actual_height * 1.0 / height pred_boxes = merged_instances.pred_boxes.clone() pred_boxes.tensor[:, 0::2] *= scale_x pred_boxes.tensor[:, 1::2] *= scale_y if do_hflip[idx]: pred_boxes.tensor[:, [0, 2]] = actual_width - pred_boxes.tensor[:, [2, 0]] aug_instances = Instances( image_size=(actual_height, actual_width), pred_boxes=pred_boxes, pred_classes=merged_instances.pred_classes, scores=merged_instances.scores, ) augmented_instances.append(aug_instances) # 2.2: run forward on the detected boxes outputs = self._batch_inference(augmented_inputs, augmented_instances, do_postprocess=False) for idx, output in enumerate(outputs): if do_hflip[idx]: output.pred_masks = output.pred_masks.flip(dims=[3]) # 2.3: average the predictions all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0) avg_pred_masks = torch.mean(all_pred_masks, dim=0) output = outputs[0] output.pred_masks = avg_pred_masks output = detector_postprocess(output, height, width) return {"instances": output}
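# The horizontal-flip box un-mapping used above, shown on a single box (plain
# PyTorch): x coordinates are mirrored about the image width and swapped so
# that x1 <= x2 still holds after the flip.
def _example_hflip_boxes():
    import torch
    width = 100.0
    boxes = torch.tensor([[10.0, 20.0, 30.0, 40.0]])
    boxes[:, [0, 2]] = width - boxes[:, [2, 0]]
    assert boxes.tolist() == [[70.0, 20.0, 90.0, 40.0]]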
def get_empty_instance(h, w):
    inst = Instances((h, w))
    inst.gt_boxes = Boxes(torch.rand(0, 4))
    inst.gt_classes = torch.tensor([]).to(dtype=torch.int64)
    inst.gt_masks = BitMasks(torch.rand(0, h, w))
    return inst
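# Hedged usage sketch (assumes get_empty_instance above is in scope): an
# "empty" Instances is still a valid ground-truth target for an image with no
# objects; every field simply has length zero.
def _example_empty_instance():
    inst = get_empty_instance(32, 32)
    assert len(inst) == 0
    assert inst.gt_boxes.tensor.shape == (0, 4)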
def extract_feat_multigpu(split_idx, img_list, cfg, args, actor: ActorHandle): # NOTE ray num_images = len(img_list) print('Number of images on split{}: {}.'.format(split_idx, num_images)) model = DefaultTrainer.build_model(cfg) DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( cfg.MODEL.WEIGHTS, resume=args.resume ) model.eval() for im_file in (img_list): if os.path.exists(os.path.join(args.output_dir, im_file.split('.')[0]+'.npz')): actor.update.remote(1) # NOTE ray continue im = cv2.imread(os.path.join(args.image_dir, im_file)) if im is None: print(os.path.join(args.image_dir, im_file), "is illegal!") actor.update.remote(1) # NOTE ray continue dataset_dict = get_image_blob(im, cfg.MODEL.PIXEL_MEAN) # extract roi features if cfg.MODEL.BUA.EXTRACTOR.MODE == 1: attr_scores = None with torch.set_grad_enabled(False): if cfg.MODEL.BUA.ATTRIBUTE_ON: boxes, scores, features_pooled, attr_scores = model([dataset_dict]) else: boxes, scores, features_pooled = model([dataset_dict]) boxes = [box.tensor.cpu() for box in boxes] scores = [score.cpu() for score in scores] features_pooled = [feat.cpu() for feat in features_pooled] if not attr_scores is None: attr_scores = [attr_score.cpu() for attr_score in attr_scores] generate_npz(1, args, cfg, im_file, im, dataset_dict, boxes, scores, features_pooled, attr_scores) # extract bbox only elif cfg.MODEL.BUA.EXTRACTOR.MODE == 2: with torch.set_grad_enabled(False): boxes, scores = model([dataset_dict]) boxes = [box.cpu() for box in boxes] scores = [score.cpu() for score in scores] generate_npz(2, args, cfg, im_file, im, dataset_dict, boxes, scores) # extract roi features by bbox elif cfg.MODEL.BUA.EXTRACTOR.MODE == 3: if not os.path.exists(os.path.join(args.bbox_dir, im_file.split('.')[0]+'.npz')): actor.update.remote(1) # NOTE ray continue bbox = torch.from_numpy(np.load(os.path.join(args.bbox_dir, im_file.split('.')[0]+'.npz'))['bbox']) * dataset_dict['im_scale'] proposals = Instances(dataset_dict['image'].shape[-2:]) proposals.proposal_boxes = BUABoxes(bbox) dataset_dict['proposals'] = proposals attr_scores = None with torch.set_grad_enabled(False): if cfg.MODEL.BUA.ATTRIBUTE_ON: boxes, scores, features_pooled, attr_scores = model([dataset_dict]) else: boxes, scores, features_pooled = model([dataset_dict]) boxes = [box.tensor.cpu() for box in boxes] scores = [score.cpu() for score in scores] features_pooled = [feat.cpu() for feat in features_pooled] if not attr_scores is None: attr_scores = [attr_score.data.cpu() for attr_score in attr_scores] generate_npz(3, args, cfg, im_file, im, dataset_dict, boxes, scores, features_pooled, attr_scores) actor.update.remote(1) # NOTE ray
def find_top_rpn_proposals( proposals: List[torch.Tensor], pred_objectness_logits: List[torch.Tensor], image_sizes: List[Tuple[int, int]], nms_thresh: float, pre_nms_topk: int, post_nms_topk: int, min_box_size: int, training: bool, ): """ For each feature map, select the `pre_nms_topk` highest scoring proposals, apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk` highest scoring proposals among all the feature maps for each image. Args: proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4). All proposal predictions on the feature maps. pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A). image_sizes (list[tuple]): sizes (h, w) for each image nms_thresh (float): IoU threshold to use for NMS pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is per feature map. post_nms_topk (int): number of top k scoring proposals to keep after applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is total, over all feature maps. min_box_size (float): minimum proposal box side length in pixels (absolute units wrt input images). training (bool): True if proposals are to be used in training, otherwise False. This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..." comment. Returns: list[Instances]: list of N Instances. The i-th Instances stores post_nms_topk object proposals for image i, sorted by their objectness score in descending order. """ num_images = len(image_sizes) device = proposals[0].device # 1. Select top-k anchor for every level and every image topk_scores = [] # #lvl Tensor, each of shape N x topk topk_proposals = [] level_ids = [] # #lvl Tensor, each of shape (topk,) batch_idx = torch.arange(num_images, device=device) for level_id, proposals_i, logits_i in zip(itertools.count(), proposals, pred_objectness_logits): Hi_Wi_A = logits_i.shape[1] num_proposals_i = min(pre_nms_topk, Hi_Wi_A) # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812) # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1) logits_i, idx = logits_i.sort(descending=True, dim=1) topk_scores_i = logits_i[batch_idx, :num_proposals_i] topk_idx = idx[batch_idx, :num_proposals_i] # each is N x topk topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 4 topk_proposals.append(topk_proposals_i) topk_scores.append(topk_scores_i) level_ids.append( torch.full((num_proposals_i, ), level_id, dtype=torch.int64, device=device)) # 2. Concat all levels together topk_scores = cat(topk_scores, dim=1) topk_proposals = cat(topk_proposals, dim=1) level_ids = cat(level_ids, dim=0) # 3. For each image, run a per-level NMS, and choose topk results. results = [] for n, image_size in enumerate(image_sizes): boxes = Boxes(topk_proposals[n]) scores_per_img = topk_scores[n] lvl = level_ids valid_mask = torch.isfinite( boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img) if not valid_mask.all(): if training: raise FloatingPointError( "Predicted boxes or scores contain Inf/NaN. Training has diverged." 
                )
            boxes = boxes[valid_mask]
            scores_per_img = scores_per_img[valid_mask]
            lvl = lvl[valid_mask]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_size)
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep]

        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]  # keep is already sorted

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
def construct_hopairs(self, instances: List[Instances]) -> List[Instances]: """ Prepare person-object pairs to be used to train HOI heads. At training, it returns union regions of person-object proposals and assigns training labels. It returns ``self.hoi_batch_size_per_image`` random samples from pesron-object pairs, with a fraction of positives that is no larger than ``self.hoi_positive_sample_fraction``. At inference, it returns union regions of predicted person boxes and object boxes. Args: instances (list[Instances]): At training, proposals_with_gt. See ``self.label_and_sample_proposals`` At inference, predicted box instances. See ``self._forward_box`` Returns: list[Instances]: length `N` list of `Instances`s containing the human-object pairs. Each `Instances` has the following fields: - union_boxes: the union region of person boxes and object boxes - person_boxes: person boxes in a matched sequences with union_boxes - object_boxes: object boxes in a matched sequences with union_boxes - gt_actions: the ground-truth actions that the pair is assigned. Used for training HOI head. - person_box_scores: person box scores from box instances. Used at inference. - object_box_scores: object box scores from box instances. Used at inference. - object_box_classes: predicted box classes from box instances. Used at inference. """ hopairs = [] for instances_per_image in instances: if self.training: # Proposals generated from person branch in HORPN will be seen as person boxes; # Proposals generated from object branch in HORPN will be object boxes. boxes = instances_per_image.proposal_boxes person_idxs = (instances_per_image.is_person == 1).nonzero().squeeze(1) object_idxs = (instances_per_image.is_person == 0).nonzero().squeeze(1) else: # At inference, split person/object boxes based on predicted classes by box head boxes = instances_per_image.pred_boxes person_idxs = torch.nonzero(instances_per_image.pred_classes == 0).squeeze(1) object_idxs = torch.nonzero(instances_per_image.pred_classes > 0).squeeze(1) if self.allow_person_to_person: # Allow person to person interactions. Then all boxes will be used. object_idxs = torch.arange(len(instances_per_image), device=object_idxs.device) num_pboxes, num_oboxes = person_idxs.numel(), object_idxs.numel() union_boxes = _pairwise_union_regions(boxes[person_idxs], boxes[object_idxs]) # Indexing person/object boxes in a matched order. person_idxs = person_idxs[:, None].repeat(1, num_oboxes).flatten() object_idxs = object_idxs[None, :].repeat(num_pboxes, 1).flatten() # Remove self-to-self interaction. keep = (person_idxs != object_idxs).nonzero().squeeze(1) union_boxes = union_boxes[keep] person_idxs = person_idxs[keep] object_idxs = object_idxs[keep] hopairs_per_image = Instances(instances_per_image.image_size) hopairs_per_image.union_boxes = union_boxes hopairs_per_image.person_boxes = boxes[person_idxs] hopairs_per_image.object_boxes = boxes[object_idxs] if self.training: # `person_idxs` and `object_idxs` are used in self.label_and_sample_hopairs() hopairs_per_image.person_idxs = person_idxs hopairs_per_image.object_idxs = object_idxs else: hopairs_per_image.person_box_scores = instances_per_image.scores[person_idxs] hopairs_per_image.object_box_scores = instances_per_image.scores[object_idxs] hopairs_per_image.object_box_classes = instances_per_image.pred_classes[object_idxs] hopairs.append(hopairs_per_image) if self.training: hopairs = self.label_and_sample_hopairs(hopairs, instances) return hopairs
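# The person-object index pairing used above, on toy indices (plain PyTorch):
# every person index is matched with every object index, then self-pairs
# (a person paired with itself) are removed.
def _example_pairwise_indices():
    import torch
    person_idxs = torch.tensor([0, 1])
    object_idxs = torch.tensor([1, 2])
    p = person_idxs[:, None].repeat(1, object_idxs.numel()).flatten()
    o = object_idxs[None, :].repeat(person_idxs.numel(), 1).flatten()
    keep = (p != o).nonzero().squeeze(1)
    pairs = list(zip(p[keep].tolist(), o[keep].tolist()))
    assert pairs == [(0, 1), (0, 2), (1, 2)]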
def test_rpn(self): torch.manual_seed(121) cfg = get_cfg() cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RPN" cfg.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator" cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1) backbone = build_backbone(cfg) proposal_generator = build_proposal_generator(cfg, backbone.output_shape()) num_images = 2 images_tensor = torch.rand(num_images, 20, 30) image_sizes = [(10, 10), (20, 30)] images = ImageList(images_tensor, image_sizes) image_shape = (15, 15) num_channels = 1024 features = {"res4": torch.rand(num_images, num_channels, 1, 2)} gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) gt_instances = Instances(image_shape) gt_instances.gt_boxes = Boxes(gt_boxes) with EventStorage(): # capture events in a new storage to discard them proposals, proposal_losses = proposal_generator( images, features, [gt_instances[0], gt_instances[1]] ) expected_losses = { "loss_rpn_cls": torch.tensor(0.0804563984), "loss_rpn_loc": torch.tensor(0.0990132466), } for name in expected_losses.keys(): err_msg = "proposal_losses[{}] = {}, expected losses = {}".format( name, proposal_losses[name], expected_losses[name] ) self.assertTrue(torch.allclose(proposal_losses[name], expected_losses[name]), err_msg) expected_proposal_boxes = [ Boxes(torch.tensor([[0, 0, 10, 10], [7.3365392685, 0, 10, 10]])), Boxes( torch.tensor( [ [0, 0, 30, 20], [0, 0, 16.7862777710, 13.1362524033], [0, 0, 30, 13.3173446655], [0, 0, 10.8602609634, 20], [7.7165775299, 0, 27.3875980377, 20], ] ) ), ] expected_objectness_logits = [ torch.tensor([0.1225359365, -0.0133192837]), torch.tensor([0.1415634006, 0.0989848152, 0.0565387346, -0.0072308783, -0.0428492837]), ] for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip( proposals, expected_proposal_boxes, image_sizes, expected_objectness_logits ): self.assertEqual(len(proposal), len(expected_proposal_box)) self.assertEqual(proposal.image_size, im_size) self.assertTrue( torch.allclose(proposal.proposal_boxes.tensor, expected_proposal_box.tensor) ) self.assertTrue(torch.allclose(proposal.objectness_logits, expected_objectness_logit))
def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
    """
    A function to assemble caffe2 model's outputs (i.e. Dict[str, Tensor])
    into detectron2's format (i.e. a list of Instances).
    This only works when the model follows Caffe2 Detectron's naming convention.

    Args:
        image_sizes (List[List[int, int]]): [H, W] of every image.
        tensor_outputs (Dict[str, Tensor]): external_output to its tensor.
        force_mask_on (Bool): if true, makes sure there will be pred_masks even
            if the mask is not found in tensor_outputs (usually due to model crash)
    """
    results = [Instances(image_size) for image_size in image_sizes]

    batch_splits = tensor_outputs.get("batch_splits", None)
    if batch_splits:
        raise NotImplementedError()
    assert len(image_sizes) == 1
    result = results[0]

    bbox_nms = tensor_outputs["bbox_nms"]
    score_nms = tensor_outputs["score_nms"]
    class_nms = tensor_outputs["class_nms"]
    # Detection always succeeds because Conv supports 0-batch input.
    assert bbox_nms is not None
    assert score_nms is not None
    assert class_nms is not None
    if bbox_nms.shape[1] == 5:
        result.pred_boxes = RotatedBoxes(bbox_nms)
    else:
        result.pred_boxes = Boxes(bbox_nms)
    result.scores = score_nms
    result.pred_classes = class_nms.to(torch.int64)

    mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
    if mask_fcn_probs is not None:
        # finish the mask pred
        mask_probs_pred = mask_fcn_probs
        num_masks = mask_probs_pred.shape[0]
        class_pred = result.pred_classes
        indices = torch.arange(num_masks, device=class_pred.device)
        mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
        result.pred_masks = mask_probs_pred
    elif force_mask_on:
        # NOTE: there's no way to know the height/width of the mask here; it won't be
        # used anyway when the batch size is 0, so just set them to 0.
        result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)

    keypoints_out = tensor_outputs.get("keypoints_out", None)
    kps_score = tensor_outputs.get("kps_score", None)
    if keypoints_out is not None:
        # keypoints_out: [N, 4, #keypoints], where 4 is in order of (x, y, score, prob)
        keypoints_tensor = keypoints_out
        # NOTE: it's possible that prob is not calculated if "should_output_softmax"
        # is set to False in HeatmapMaxKeypoint, so just use the raw score; it seems
        # this doesn't affect mAP. TODO: check more carefully.
        keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
        result.pred_keypoints = keypoint_xyp
    elif kps_score is not None:
        # keypoint heatmap to sparse data structure
        pred_keypoint_logits = kps_score
        keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])

    return results
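# A small, self-contained illustration (illustrative tensors only) of the mask gathering step
# above: mask_fcn_probs has one channel per class, and the advanced indexing
# `probs[indices, class_pred]` keeps only the channel of each instance's predicted class,
# yielding an (N, 1, M, M) tensor suitable for pred_masks.
import torch

num_instances, num_classes, mask_size = 3, 5, 4
mask_fcn_probs = torch.rand(num_instances, num_classes, mask_size, mask_size)
pred_classes = torch.tensor([2, 0, 4])

indices = torch.arange(num_instances)
pred_masks = mask_fcn_probs[indices, pred_classes][:, None]
print(pred_masks.shape)                                      # torch.Size([3, 1, 4, 4])
print(torch.equal(pred_masks[0, 0], mask_fcn_probs[0, 2]))   # True: instance 0 keeps its class-2 channel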
def test_rrpn(self): torch.manual_seed(121) cfg = get_cfg() cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RRPN" cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator" cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]] cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1]] cfg.MODEL.ANCHOR_GENERATOR.ANGLES = [[0, 60]] cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1) cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead" backbone = build_backbone(cfg) proposal_generator = build_proposal_generator(cfg, backbone.output_shape()) num_images = 2 images_tensor = torch.rand(num_images, 20, 30) image_sizes = [(10, 10), (20, 30)] images = ImageList(images_tensor, image_sizes) image_shape = (15, 15) num_channels = 1024 features = {"res4": torch.rand(num_images, num_channels, 1, 2)} gt_boxes = torch.tensor([[2, 2, 2, 2, 0], [4, 4, 4, 4, 0]], dtype=torch.float32) gt_instances = Instances(image_shape) gt_instances.gt_boxes = RotatedBoxes(gt_boxes) with EventStorage(): # capture events in a new storage to discard them proposals, proposal_losses = proposal_generator( images, features, [gt_instances[0], gt_instances[1]] ) expected_losses = { "loss_rpn_cls": torch.tensor(0.043263837695121765), "loss_rpn_loc": torch.tensor(0.14432406425476074), } for name in expected_losses.keys(): err_msg = "proposal_losses[{}] = {}, expected losses = {}".format( name, proposal_losses[name], expected_losses[name] ) self.assertTrue(torch.allclose(proposal_losses[name], expected_losses[name]), err_msg) expected_proposal_boxes = [ RotatedBoxes( torch.tensor( [ [0.60189795, 1.24095452, 61.98131943, 18.03621292, -4.07244873], [15.64940453, 1.69624567, 59.59749603, 16.34339333, 2.62692475], [-3.02982378, -2.69752932, 67.90952301, 59.62455750, 59.97010040], [16.71863365, 1.98309708, 35.61507797, 32.81484985, 62.92267227], [0.49432933, -7.92979717, 67.77606201, 62.93098450, -1.85656738], [8.00880814, 1.36017394, 121.81007385, 32.74150467, 50.44297409], [16.44299889, -4.82221127, 63.39775848, 61.22503662, 54.12270737], [5.00000000, 5.00000000, 10.00000000, 10.00000000, -0.76943970], [17.64130402, -0.98095351, 61.40377808, 16.28918839, 55.53118134], [0.13016054, 4.60568953, 35.80157471, 32.30180359, 62.52872086], [-4.26460743, 0.39604485, 124.30079651, 31.84611320, -1.58203125], [7.52815342, -0.91636634, 62.39784622, 15.45565224, 60.79549789], ] ) ), RotatedBoxes( torch.tensor( [ [0.07734215, 0.81635046, 65.33510590, 17.34688377, -1.51821899], [-3.41833067, -3.11320257, 64.17595673, 60.55617905, 58.27033234], [20.67383385, -6.16561556, 63.60531998, 62.52315903, 54.85546494], [15.00000000, 10.00000000, 30.00000000, 20.00000000, -0.18218994], [9.22646523, -6.84775209, 62.09895706, 65.46472931, -2.74307251], [15.00000000, 4.93451595, 30.00000000, 9.86903191, -0.60272217], [8.88342094, 2.65560246, 120.95362854, 32.45022202, 55.75970078], [16.39088631, 2.33887148, 34.78761292, 35.61492920, 60.81977463], [9.78298569, 10.00000000, 19.56597137, 20.00000000, -0.86660767], [1.28576660, 5.49873352, 34.93610382, 33.22600174, 60.51599884], [17.58912468, -1.63270092, 62.96052551, 16.45713997, 52.91245270], [5.64749718, -1.90428460, 62.37649155, 16.19474792, 61.09543991], [0.82255805, 2.34931135, 118.83985901, 32.83671188, 56.50753784], [-5.33874989, 1.64404404, 125.28501892, 33.35424042, -2.80731201], ] ) ), ] expected_objectness_logits = [ torch.tensor( [ 0.10111768, 0.09112845, 0.08466332, 0.07589971, 0.06650183, 0.06350251, 0.04299347, 0.01864817, 0.00986163, 0.00078543, -0.04573630, -0.04799230, ] ), torch.tensor( [ 0.11373727, 0.09377633, 0.05281663, 0.05143715, 
0.04040275, 0.03250912, 0.01307789, 0.01177734, 0.00038105, -0.00540255, -0.01194804, -0.01461012, -0.03061717, -0.03599222, ] ), ] torch.set_printoptions(precision=8, sci_mode=False) for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip( proposals, expected_proposal_boxes, image_sizes, expected_objectness_logits ): self.assertEqual(len(proposal), len(expected_proposal_box)) self.assertEqual(proposal.image_size, im_size) # It seems that there's some randomness in the result across different machines: # This test can be run on a local machine for 100 times with exactly the same result, # However, a different machine might produce slightly different results, # thus the atol here. err_msg = "computed proposal boxes = {}, expected {}".format( proposal.proposal_boxes.tensor, expected_proposal_box.tensor ) self.assertTrue( torch.allclose( proposal.proposal_boxes.tensor, expected_proposal_box.tensor, atol=1e-5 ), err_msg, ) err_msg = "computed objectness logits = {}, expected {}".format( proposal.objectness_logits, expected_objectness_logit ) self.assertTrue( torch.allclose(proposal.objectness_logits, expected_objectness_logit, atol=1e-5), err_msg, )
def fast_rcnn_inference_single_image( boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image ): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Args: Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes per image. Returns: Same as `fast_rcnn_inference`, but for only one image. """ all_scores = scores.clone() all_scores = torch.unsqueeze(all_scores, 0) all_boxes = boxes.clone() all_boxes = torch.unsqueeze(all_boxes, 0) pred_inds = torch.unsqueeze( torch.arange(scores.size(0), device=scores.device, dtype=torch.long), dim=1 ).repeat(1, scores.size(1)) valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1) if not valid_mask.all(): boxes = boxes[valid_mask] scores = scores[valid_mask] pred_inds = pred_inds[valid_mask] scores = scores[:, :-1] num_bbox_reg_classes = boxes.shape[1] // 4 # Convert to Boxes to use the `clip` function ... boxes = Boxes(boxes.reshape(-1, 4)) boxes.clip(image_shape) boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 pred_inds = pred_inds[:, :-1] # Filter results based on detection scores filter_mask = scores > score_thresh # R x K # R' x 2. First column contains indices of the R predictions; # Second column contains indices of classes. filter_inds = filter_mask.nonzero() if num_bbox_reg_classes == 1: boxes = boxes[filter_inds[:, 0], 0] else: boxes = boxes[filter_mask] scores = scores[filter_mask] pred_inds = pred_inds[filter_mask] # Apply per-class NMS keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh) if topk_per_image >= 0: keep = keep[:topk_per_image] boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep] pred_inds = pred_inds[keep] result = Instances(image_shape) result.pred_boxes = Boxes(boxes) result.scores = scores result.pred_classes = filter_inds[:, 1] result.pred_inds = pred_inds return result, filter_inds[:, 0], all_scores, all_boxes
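# Illustrative-only tensors showing the score-thresholding step above: `filter_mask.nonzero()`
# returns an (R', 2) tensor whose first column indexes the prediction and whose second column
# indexes the class, and the same boolean mask flattens the (R, K) scores and the (R, K, 4)
# class-specific boxes consistently.
import torch

scores = torch.tensor([[0.9, 0.1],
                       [0.2, 0.8],
                       [0.3, 0.4]])     # R=3 predictions, K=2 classes
boxes = torch.rand(3, 2, 4)             # per-class regressed boxes
score_thresh = 0.5

filter_mask = scores > score_thresh     # (R, K) boolean
filter_inds = filter_mask.nonzero()
print(filter_inds)                # tensor([[0, 0], [1, 1]]) -> (prediction idx, class idx)
print(scores[filter_mask])        # tensor([0.9000, 0.8000])
print(boxes[filter_mask].shape)   # torch.Size([2, 4]) -- one box per surviving (prediction, class)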
def detector_postprocess(results, output_height, output_width, mask_threshold=0.5): """ Resize the output instances. The input images are often resized when entering an object detector. As a result, we often need the outputs of the detector in a different resolution from its inputs. This function will resize the raw outputs of an R-CNN detector to produce outputs according to the desired output resolution. Args: results (Instances): the raw outputs from the detector. `results.image_size` contains the input image resolution the detector sees. This object might be modified in-place. output_height, output_width: the desired output resolution. Returns: Instances: the resized output from the model, based on the output resolution """ # Converts integer tensors to float temporaries # to ensure true division is performed when # computing scale_x and scale_y. if isinstance(output_width, torch.Tensor): output_width_tmp = output_width.float() else: output_width_tmp = output_width if isinstance(output_height, torch.Tensor): output_height_tmp = output_height.float() else: output_height_tmp = output_height scale_x, scale_y = ( output_width_tmp / results.image_size[1], output_height_tmp / results.image_size[0], ) results = Instances((output_height, output_width), **results.get_fields()) if results.has("pred_boxes"): output_boxes = results.pred_boxes elif results.has("proposal_boxes"): output_boxes = results.proposal_boxes output_boxes.scale(scale_x, scale_y) output_boxes.clip(results.image_size) results = results[output_boxes.nonempty()] if results.has("pred_masks") and results.has("no_paste"): results.pred_masks = F.interpolate( results.pred_masks, size=(output_height, output_width), mode="bilinear", align_corners=False, ) results.pred_masks = (results.pred_masks[:, 0, :, :] >= mask_threshold).to(dtype=torch.bool) elif results.has("pred_masks"): results.pred_masks = retry_if_cuda_oom(paste_masks_in_image)( results.pred_masks[:, 0, :, :], # N, 1, M, M results.pred_boxes, results.image_size, threshold=mask_threshold, ) if results.has("pred_keypoints"): results.pred_keypoints[:, :, 0] *= scale_x results.pred_keypoints[:, :, 1] *= scale_y return results
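# A minimal usage sketch for detector_postprocess with illustrative values (not from this
# codebase): the detector ran on a 480x640 resized image and we want boxes back at the original
# 960x1280 resolution, so scale_x = 1280 / 640 = 2 and scale_y = 960 / 480 = 2.
import torch
from detectron2.structures import Boxes, Instances

raw = Instances((480, 640))   # resolution the detector saw
raw.pred_boxes = Boxes(torch.tensor([[10., 20., 110., 220.]]))
raw.scores = torch.tensor([0.9])
raw.pred_classes = torch.tensor([0])

out = detector_postprocess(raw, output_height=960, output_width=1280)
print(out.pred_boxes.tensor)   # tensor([[ 20.,  40., 220., 440.]]) -- scaled and clipped
print(out.image_size)          # (960, 1280)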
def get_pgt(self, prev_pred_boxes, prev_pred_scores, proposals, suffix): if isinstance(prev_pred_scores, torch.Tensor): num_preds_per_image = [len(p) for p in proposals] prev_pred_scores = prev_pred_scores.split(num_preds_per_image, dim=0) else: assert isinstance(prev_pred_scores, list) assert isinstance(prev_pred_scores[0], torch.Tensor) prev_pred_scores = [ torch.index_select(prev_pred_score, 1, gt_int) for prev_pred_score, gt_int in zip(prev_pred_scores, self.gt_classes_img_int) ] pgt_scores_idxs = [ torch.max(prev_pred_score, dim=0) for prev_pred_score in prev_pred_scores ] pgt_scores = [item[0] for item in pgt_scores_idxs] pgt_idxs = [item[1] for item in pgt_scores_idxs] assert isinstance(prev_pred_boxes, tuple) or isinstance(prev_pred_boxes, list) if isinstance(prev_pred_boxes[0], Boxes): pgt_boxes = [ prev_pred_box[pgt_idx] for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs) ] else: assert isinstance(prev_pred_boxes[0], torch.Tensor) if self.cls_agnostic_bbox_reg: num_preds = [prev_pred_box.size(0) for prev_pred_box in prev_pred_boxes] prev_pred_boxes = [ prev_pred_box.unsqueeze(1).expand(num_pred, self.num_classes, 4) for num_pred, prev_pred_box in zip(num_preds, prev_pred_boxes) ] prev_pred_boxes = [ prev_pred_box.view(-1, self.num_classes, 4) for prev_pred_box in prev_pred_boxes ] prev_pred_boxes = [ torch.index_select(prev_pred_box, 1, gt_int) for prev_pred_box, gt_int in zip(prev_pred_boxes, self.gt_classes_img_int) ] pgt_boxes = [ torch.index_select(prev_pred_box, 0, pgt_idx) for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs) ] pgt_boxes = [pgt_box.view(-1, 4) for pgt_box in pgt_boxes] diags = [ torch.tensor( [i * gt_split.numel() + i for i in range(gt_split.numel())], dtype=torch.int64, device=pgt_boxes[0].device, ) for gt_split in self.gt_classes_img_int ] pgt_boxes = [ torch.index_select(pgt_box, 0, diag) for pgt_box, diag in zip(pgt_boxes, diags) ] pgt_boxes = [Boxes(pgt_box) for pgt_box in pgt_boxes] pgt_classes = self.gt_classes_img_int pgt_weights = [ torch.index_select(pred_logits, 1, pgt_class).reshape(-1) for pred_logits, pgt_class in zip( self.pred_class_img_logits.split(1, dim=0), pgt_classes ) ] targets = [ Instances( proposals[i].image_size, gt_boxes=pgt_box, gt_classes=pgt_class, gt_scores=pgt_score, gt_weights=pgt_weight, ) for i, (pgt_box, pgt_class, pgt_score, pgt_weight) in enumerate( zip(pgt_boxes, pgt_classes, pgt_scores, pgt_weights) ) ] self._vis_pgt(targets, "pgt", suffix) return targets
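# Illustrative-only tensors for the pseudo-ground-truth selection above: the scores are first
# restricted to the image-level ground-truth classes with index_select on dim=1, then torch.max
# over dim=0 picks, for each such class, the single highest-scoring proposal as that class's
# pseudo box.
import torch

prev_pred_scores = torch.tensor([[0.1, 0.7, 0.2],
                                 [0.6, 0.2, 0.9],
                                 [0.3, 0.4, 0.1]])   # 3 proposals x 3 classes
gt_classes_img_int = torch.tensor([0, 2])            # classes present in the image

scores_for_gt = torch.index_select(prev_pred_scores, 1, gt_classes_img_int)
pgt_scores, pgt_idxs = torch.max(scores_for_gt, dim=0)
print(pgt_scores)   # tensor([0.6000, 0.9000])
print(pgt_idxs)     # tensor([1, 1]) -> proposal 1 becomes the pseudo box for both classes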
HEIGHT = 480
WIDTH = 640
PREDICTIONS = {
    'scores': [0.9974532723426819, 0.9938008189201355, 0.988240122795105, 0.9850716590881348, 0.9844247102737427, 0.9810763597488403, 0.9800938963890076, 0.9720492362976074, 0.9688801765441895, 0.9674813747406006, 0.927189290523529, 0.9221163988113403, 0.9185774326324463, 0.9142449498176575, 0.8913487792015076, 0.8826121687889099, 0.8605493903160095, 0.8423078656196594, 0.8416911363601685, 0.8005133271217346, 0.7691927552223206, 0.7283533811569214, 0.7125754356384277, 0.6947720050811768, 0.6323946118354797, 0.5554373264312744, 0.502210259437561],
    'pred_classes': [0, 2, 2, 2, 9, 0, 2, 0, 2, 0, 0, 5, 24, 9, 2, 2, 9, 5, 9, 2, 2, 0, 2, 0, 9, 9, 9],
    'pred_boxes': [
        [309.5509033203125, 163.2444610595703, 434.3181457519531, 475.4096984863281],
        [3.7802395820617676, 242.15467834472656, 537.434814453125, 432.81524658203125],
        [14.279938697814941, 212.42967224121094, 45.97575759887695, 233.4514617919922],
        [421.2859191894531, 208.60498046875, 556.5548706054688, 309.9937744140625],
        [143.9969940185547, 143.22933959960938, 152.34913635253906, 159.94985961914062],
        [582.5836181640625, 198.43775939941406, 600.1054077148438, 266.8861389160156],
        [57.001808166503906, 211.5207061767578, 77.75701141357422, 229.96949768066406],
        [134.99818420410156, 208.92091369628906, 189.38192749023438, 278.38665771484375],
        [75.46967315673828, 209.60325622558594, 103.36743927001953, 230.1490478515625],
        [545.7862548828125, 202.4793243408203, 569.5499267578125, 269.63946533203125],
        [605.6588745117188, 201.08189392089844, 626.3668823242188, 269.6811218261719],
        [183.23878479003906, 177.90478515625, 242.53317260742188, 238.60890197753906],
        [129.665771484375, 228.85916137695312, 170.4459686279297, 264.7272644042969],
        [46.01143264770508, 137.7577362060547, 54.27482604980469, 154.9816436767578],
        [121.1451416015625, 205.1159210205078, 157.94473266601562, 229.43475341796875],
        [255.93080139160156, 214.98483276367188, 284.3371887207031, 243.56529235839844],
        [110.03988647460938, 133.47642517089844, 118.6834487915039, 152.92295837402344],
        [93.23741912841797, 186.40609741210938, 128.8490447998047, 226.1478271484375],
        [183.7700653076172, 141.0837860107422, 191.97706604003906, 156.7988739013672],
        [270.9613037109375, 206.3548583984375, 346.9297790527344, 241.0618438720703],
        [387.1550598144531, 269.3006286621094, 526.6027221679688, 415.30615234375],
        [0.49352559447288513, 323.8660583496094, 49.01905822753906, 359.7412414550781],
        [145.3698272705078, 207.95762634277344, 169.30557250976562, 229.77464294433594],
        [615.1426391601562, 207.83851623535156, 639.5967407226562, 354.58331298828125],
        [142.42550659179688, 176.94520568847656, 147.10646057128906, 185.7545166015625],
        [152.2357635498047, 165.64385986328125, 158.73641967773438, 175.6268768310547],
        [105.40318298339844, 177.60287475585938, 110.80204772949219, 182.9259796142578],
    ],
}
# Model-server console output: the same 27 predictions echoed as
# Instances(num_instances=27, image_height=480, image_width=640, fields=[scores, pred_classes, pred_boxes]).
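# A hypothetical sketch (not taken from this codebase) of one way to materialize the JSON-style
# PREDICTIONS dict above as a detectron2 Instances whose fields are tensors, so that pred_boxes
# supports Boxes operations such as clip() or area(). The helper name is an assumption.
import torch
from detectron2.structures import Boxes, Instances

def predictions_to_instances(height, width, preds):
    inst = Instances((height, width))
    inst.scores = torch.tensor(preds["scores"])
    inst.pred_classes = torch.tensor(preds["pred_classes"], dtype=torch.int64)
    inst.pred_boxes = Boxes(torch.tensor(preds["pred_boxes"]))
    return inst

inst = predictions_to_instances(HEIGHT, WIDTH, PREDICTIONS)
print(len(inst), inst.image_size)   # 27 (480, 640)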