def convert_output(output):
    r = Instances(tuple(output[0]))
    r.pred_classes = output[1]
    r.pred_boxes = Boxes(output[2])
    r.scores = output[3]
    return r
def get_empty_instance(h, w):
    inst = Instances((h, w))
    inst.gt_boxes = Boxes(torch.rand(0, 4))
    inst.gt_classes = torch.tensor([]).to(dtype=torch.int64)
    inst.gt_masks = BitMasks(torch.rand(0, h, w))
    return inst
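# A minimal usage sketch (assuming detectron2's `Instances`/`Boxes` API, as the
# helpers above do) showing how these structures are typically consumed: fields
# are free-form attributes that must share the same first dimension, and
# boolean indexing returns a new `Instances`.
import torch
from detectron2.structures import Boxes, Instances

inst = Instances((480, 640))                 # (height, width) of the image
inst.pred_boxes = Boxes(torch.rand(3, 4))
inst.scores = torch.rand(3)
inst.pred_classes = torch.randint(0, 80, (3,))
assert len(inst) == 3                        # length = number of instances
confident = inst[inst.scores > 0.5]          # filtered copy, same fields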
def _inference_one_image(self, input):
    """
    Args:
        input (dict): one dataset dict

    Returns:
        dict: one output dict
    """
    augmented_inputs = self.tta_mapper(input)

    do_hflip = [k.pop("horiz_flip", False) for k in augmented_inputs]
    heights = [k["height"] for k in augmented_inputs]
    widths = [k["width"] for k in augmented_inputs]
    assert (
        len(set(heights)) == 1 and len(set(widths)) == 1
    ), "Augmented versions of the inputs should have the same original resolution!"
    height = heights[0]
    width = widths[0]

    # 1. Detect boxes from all augmented versions
    # 1.1: forward with all augmented images
    with self._turn_off_roi_head("mask_on"), self._turn_off_roi_head("keypoint_on"):
        # temporarily disable mask/keypoint head
        outputs = self._batch_inference(augmented_inputs, do_postprocess=False)
    # 1.2: union the results
    all_boxes = []
    all_scores = []
    all_classes = []
    for idx, output in enumerate(outputs):
        rescaled_output = detector_postprocess(output, height, width)
        pred_boxes = rescaled_output.pred_boxes.tensor
        if do_hflip[idx]:
            pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]]
        all_boxes.append(pred_boxes)
        all_scores.extend(rescaled_output.scores)
        all_classes.extend(rescaled_output.pred_classes)
    all_boxes = torch.cat(all_boxes, dim=0).cpu()
    num_boxes = len(all_boxes)

    # 1.3: select from the union of all results
    num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
    # +1 because fast_rcnn_inference expects background scores as well
    all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
    for idx, cls, score in zip(count(), all_classes, all_scores):
        all_scores_2d[idx, cls] = score

    merged_instances, _ = fast_rcnn_inference_single_image(
        all_boxes,
        all_scores_2d,
        (height, width),
        1e-8,
        self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
        self.cfg.TEST.DETECTIONS_PER_IMAGE,
    )

    if not self.cfg.MODEL.MASK_ON:
        return {"instances": merged_instances}

    # 2. Use the detected boxes to obtain masks
    # 2.1: rescale the detected boxes
    augmented_instances = []
    for idx, input in enumerate(augmented_inputs):
        actual_height, actual_width = input["image"].shape[1:3]
        scale_x = actual_width * 1.0 / width
        scale_y = actual_height * 1.0 / height
        pred_boxes = merged_instances.pred_boxes.clone()
        pred_boxes.tensor[:, 0::2] *= scale_x
        pred_boxes.tensor[:, 1::2] *= scale_y
        if do_hflip[idx]:
            pred_boxes.tensor[:, [0, 2]] = actual_width - pred_boxes.tensor[:, [2, 0]]

        aug_instances = Instances(
            image_size=(actual_height, actual_width),
            pred_boxes=pred_boxes,
            pred_classes=merged_instances.pred_classes,
            scores=merged_instances.scores,
        )
        augmented_instances.append(aug_instances)

    # 2.2: run forward on the detected boxes
    outputs = self._batch_inference(augmented_inputs, augmented_instances,
                                    do_postprocess=False)
    for idx, output in enumerate(outputs):
        if do_hflip[idx]:
            output.pred_masks = output.pred_masks.flip(dims=[3])
    # 2.3: average the predictions
    all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
    avg_pred_masks = torch.mean(all_pred_masks, dim=0)
    output = outputs[0]
    output.pred_masks = avg_pred_masks
    output = detector_postprocess(output, height, width)
    return {"instances": output}
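# Toy numeric check (pure torch) of the flip-back trick used in step 1.2 above:
# for an image of width W flipped horizontally, a box (x0, y0, x1, y1) maps back
# to (W - x1, y0, W - x0, y1); the fancy index below swaps and mirrors x0/x1 in
# one step.
import torch

width = 100.0
boxes = torch.tensor([[10.0, 5.0, 30.0, 25.0]])
boxes[:, [0, 2]] = width - boxes[:, [2, 0]]
assert boxes.tolist() == [[70.0, 5.0, 90.0, 25.0]]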
def inference_single_image(self, pred_logits, pred_deltas, pred_masks,
                           anchors, indexes, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        pred_logits (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (AxHxW, K)
        pred_deltas (list[Tensor]): Same shape as 'pred_logits' except that K becomes 4.
        pred_masks (list[list[Tensor]]): List of #feature levels, each is a list of
            #anchors. Each entry contains tensor of size (M_i*M_i, H, W).
            `None` if mask_on=False.
        anchors (list[Boxes]): list of #feature levels. Each entry contains
            a Boxes object, which contains all the anchors for that
            image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    pred_logits = pred_logits.flatten().sigmoid_()
    # We get top locations across all levels to accelerate the inference speed,
    # which does not seem to affect the accuracy.
    # First select values above the threshold
    logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0]
    # Then get the top values
    num_topk = min(self.topk_candidates, logits_top_idxs.shape[0])
    pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort(descending=True)
    # Keep top k scoring values
    pred_prob = pred_prob[:num_topk]
    # Keep top k values
    top_idxs = logits_top_idxs[topk_idxs[:num_topk]]

    # class index
    cls_idxs = top_idxs % self.num_classes
    # HWA index
    top_idxs //= self.num_classes
    # predict boxes
    pred_boxes = self.box2box_transform.apply_deltas(
        pred_deltas[top_idxs], anchors[top_idxs].tensor)
    # apply nms
    keep = batched_nms(pred_boxes, pred_prob, cls_idxs, self.nms_threshold)
    # pick the top ones
    keep = keep[:self.detections_im]

    results = Instances(image_size)
    results.pred_boxes = Boxes(pred_boxes[keep])
    results.scores = pred_prob[keep]
    results.pred_classes = cls_idxs[keep]

    # deal with masks
    result_masks, result_anchors = [], None
    if self.mask_on:
        # index and anchors, useful for masks
        top_indexes = indexes[top_idxs]
        top_anchors = anchors[top_idxs]
        result_indexes = top_indexes[keep]
        result_anchors = top_anchors[keep]
        # Get masks and do sigmoid
        for lvl, _, h, w, anc in result_indexes.tolist():
            cur_size = self.mask_sizes[anc] * (2 ** lvl if self.bipyramid_on else 1)
            result_masks.append(
                torch.sigmoid(pred_masks[lvl][anc][:, h, w].view(1, cur_size, cur_size)))

    return results, (result_masks, result_anchors)
def inference_single_image(self, box_cls, box_delta, anchors, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W x A, K)
        box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
        anchors (list[Boxes]): list of #feature levels. Each entry contains
            a Boxes object, which contains all the anchors for that
            image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors):
        # (HxWxAxK,)
        box_cls_i = box_cls_i.flatten().sigmoid_()

        # Keep top k scoring indices only.
        num_topk = min(self.topk_candidates, box_reg_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        anchor_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[anchor_idxs]
        anchors_i = anchors_i[anchor_idxs]
        # predict boxes
        predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]
    keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
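# Stand-alone illustration (pure torch, dummy sizes) of the per-level pattern
# above: sort once, truncate to top-k, drop low-confidence entries, then recover
# the anchor and class indices from the flattened (anchor, class) index.
import torch

num_classes, topk, thresh = 80, 50, 0.05
flat_scores = torch.rand(1000 * num_classes)       # flattened (HxWxA) x K scores
prob, idxs = flat_scores.sort(descending=True)
prob, idxs = prob[:topk], idxs[:topk]
keep = prob > thresh
prob, idxs = prob[keep], idxs[keep]
anchor_idxs = idxs // num_classes
class_idxs = idxs % num_classes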
def test_rrpn(self):
    torch.manual_seed(121)
    cfg = get_cfg()
    cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RRPN"
    cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator"
    cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
    cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1]]
    cfg.MODEL.ANCHOR_GENERATOR.ANGLES = [[0, 60]]
    cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1)
    cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead"
    backbone = build_backbone(cfg)
    proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
    num_images = 2
    images_tensor = torch.rand(num_images, 20, 30)
    image_sizes = [(10, 10), (20, 30)]
    images = ImageList(images_tensor, image_sizes)
    image_shape = (15, 15)
    num_channels = 1024
    features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
    gt_boxes = torch.tensor([[2, 2, 2, 2, 0], [4, 4, 4, 4, 0]], dtype=torch.float32)
    gt_instances = Instances(image_shape)
    gt_instances.gt_boxes = RotatedBoxes(gt_boxes)
    with EventStorage():  # capture events in a new storage to discard them
        proposals, proposal_losses = proposal_generator(
            images, features, [gt_instances[0], gt_instances[1]])

    expected_losses = {
        "loss_rpn_cls": torch.tensor(0.0432923734),
        "loss_rpn_loc": torch.tensor(0.1552739739),
    }
    for name in expected_losses.keys():
        self.assertTrue(torch.allclose(proposal_losses[name], expected_losses[name]))

    expected_proposal_boxes = [
        RotatedBoxes(
            torch.tensor([
                [0.60189795, 1.24095452, 61.98131943, 18.03621292, -4.07244873],
                [15.64940453, 1.69624567, 59.59749603, 16.34339333, 2.62692475],
                [-3.02982378, -2.69752932, 67.90952301, 59.62455750, 59.97010040],
                [16.71863365, 1.98309708, 35.61507797, 32.81484985, 62.92267227],
                [0.49432933, -7.92979717, 67.77606201, 62.93098450, -1.85656738],
                [8.00880814, 1.36017394, 121.81007385, 32.74150467, 50.44297409],
                [16.44299889, -4.82221127, 63.39775848, 61.22503662, 54.12270737],
                [5.00000000, 5.00000000, 10.00000000, 10.00000000, -0.76943970],
                [17.64130402, -0.98095351, 61.40377808, 16.28918839, 55.53118134],
                [0.13016054, 4.60568953, 35.80157471, 32.30180359, 62.52872086],
                [-4.26460743, 0.39604485, 124.30079651, 31.84611320, -1.58203125],
                [7.52815342, -0.91636634, 62.39784622, 15.45565224, 60.79549789],
            ])),
        RotatedBoxes(
            torch.tensor([
                [0.07734215, 0.81635046, 65.33510590, 17.34688377, -1.51821899],
                [-3.41833067, -3.11320257, 64.17595673, 60.55617905, 58.27033234],
                [20.67383385, -6.16561556, 63.60531998, 62.52315903, 54.85546494],
                [15.00000000, 10.00000000, 30.00000000, 20.00000000, -0.18218994],
                [9.22646523, -6.84775209, 62.09895706, 65.46472931, -2.74307251],
                [15.00000000, 4.93451595, 30.00000000, 9.86903191, -0.60272217],
                [8.88342094, 2.65560246, 120.95362854, 32.45022202, 55.75970078],
                [16.39088631, 2.33887148, 34.78761292, 35.61492920, 60.81977463],
                [9.78298569, 10.00000000, 19.56597137, 20.00000000, -0.86660767],
                [1.28576660, 5.49873352, 34.93610382, 33.22600174, 60.51599884],
                [17.58912468, -1.63270092, 62.96052551, 16.45713997, 52.91245270],
                [5.64749718, -1.90428460, 62.37649155, 16.19474792, 61.09543991],
                [0.82255805, 2.34931135, 118.83985901, 32.83671188, 56.50753784],
                [-5.33874989, 1.64404404, 125.28501892, 33.35424042, -2.80731201],
            ])),
    ]

    expected_objectness_logits = [
        torch.tensor([
            0.10111768, 0.09112845, 0.08466332, 0.07589971, 0.06650183, 0.06350251,
            0.04299347, 0.01864817, 0.00986163, 0.00078543, -0.04573630, -0.04799230,
        ]),
        torch.tensor([
            0.11373727, 0.09377633, 0.05281663, 0.05143715, 0.04040275, 0.03250912,
            0.01307789, 0.01177734, 0.00038105, -0.00540255, -0.01194804, -0.01461012,
            -0.03061717, -0.03599222,
        ]),
    ]

    torch.set_printoptions(precision=8, sci_mode=False)

    for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip(
            proposals, expected_proposal_boxes, image_sizes,
            expected_objectness_logits):
        self.assertEqual(len(proposal), len(expected_proposal_box))
        self.assertEqual(proposal.image_size, im_size)
        # There seems to be some randomness in the result across machines:
        # this test can be run 100 times on one machine with exactly the same result,
        # yet a different machine may produce slightly different results,
        # hence the atol here.
        err_msg = "computed proposal boxes = {}, expected {}".format(
            proposal.proposal_boxes.tensor, expected_proposal_box.tensor)
        self.assertTrue(
            torch.allclose(proposal.proposal_boxes.tensor,
                           expected_proposal_box.tensor, atol=1e-5),
            err_msg,
        )
        err_msg = "computed objectness logits = {}, expected {}".format(
            proposal.objectness_logits, expected_objectness_logit)
        self.assertTrue(
            torch.allclose(proposal.objectness_logits,
                           expected_objectness_logit, atol=1e-5),
            err_msg,
        )
def forward(self, outputs, target_sizes):
    """
    Perform the computation.

    Parameters:
        outputs: raw outputs of the model
        target_sizes: tensor of dimension [batch_size x 2] containing the size of
            each image of the batch. This must be the original image size
            (before any data augmentation).
    """
    out_logits, out_bboxes = outputs['pred_logits'], outputs['pred_boxes']

    assert len(out_logits) == len(target_sizes)
    target_sizes = out_bboxes.new_tensor(target_sizes)
    assert target_sizes.shape[1] == 2

    prob = F.softmax(out_logits, -1)
    scores, labels = prob[..., :-1].max(-1)

    if self.scale_normalize:
        num_ins = out_bboxes.shape[1]
        num_loc = num_ins // self.num_stages
        strides = []
        for stride in self.strides:
            strides.append(out_bboxes.new_tensor(stride).repeat(num_loc))
        strides = torch.cat(strides, dim=0)
        out_bboxes[:, :, 2:] = out_bboxes[:, :, 2:] * strides[None, :, None] * self.scale_coef

    # convert to [x0, y0, x1, y1] format
    boxes = box_cxcywh_to_xyxy(out_bboxes)
    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    boxes = boxes * scale_fct[:, None, :]

    results = [{'scores': s, 'labels': l, 'boxes': b}
               for s, l, b in zip(scores, labels, boxes)]

    # post process
    processed_results = []
    for result_per_image, image_size in zip(results, target_sizes):
        result = Instances(image_size)
        boxes = result_per_image["boxes"].float()
        scores = result_per_image["scores"].float()
        labels = result_per_image["labels"].long()
        # filter by score
        keep = scores > self.score_thr
        boxes = boxes[keep, :]
        scores = scores[keep]
        labels = labels[keep]
        # sort and keep top_k
        if len(scores) > self.max_per_img:
            sort_inds = torch.argsort(scores, descending=True)
            sort_inds = sort_inds[:self.max_per_img]
            boxes = boxes[sort_inds, :]
            scores = scores[sort_inds]
            labels = labels[sort_inds]
        # append
        result.pred_boxes = Boxes(boxes)
        result.scores = scores
        result.pred_classes = labels
        # clip boxes
        if result.has("pred_boxes"):
            output_boxes = result.pred_boxes
            output_boxes.clip(result.image_size)
            result = result[output_boxes.nonempty()]
        processed_results.append({"instances": result})
    return processed_results
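# `box_cxcywh_to_xyxy` is imported from elsewhere in this codebase; a sketch
# consistent with DETR's helper of the same name (an assumption, not this
# repository's definition):
import torch

def box_cxcywh_to_xyxy_sketch(x: torch.Tensor) -> torch.Tensor:
    # (cx, cy, w, h) -> (x0, y0, x1, y1)
    x_c, y_c, w, h = x.unbind(-1)
    return torch.stack(
        [x_c - 0.5 * w, y_c - 0.5 * h, x_c + 0.5 * w, y_c + 0.5 * h], dim=-1)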
def construct_hopairs_with_features(self, instances: List[Instances],
                                    crop_features) -> List[Instances]:
    """
    Prepare person-object pairs to be used to train HOI heads.

    At training, it returns union regions of person-object proposals and assigns
    training labels. It returns ``self.hoi_batch_size_per_image`` random samples from
    person-object pairs, with a fraction of positives that is no larger than
    ``self.hoi_positive_sample_fraction``.
    At inference, it returns union regions of predicted person boxes and object boxes.

    Args:
        instances (list[Instances]):
            At training, proposals_with_gt. See ``self.label_and_sample_proposals``
            At inference, predicted box instances. See ``self._forward_box``

    Returns:
        list[Instances]: length `N` list of `Instances`s containing the human-object
        pairs. Each `Instances` has the following fields:

        - union_boxes: the union region of person boxes and object boxes
        - person_boxes: person boxes in a matched sequence with union_boxes
        - object_boxes: object boxes in a matched sequence with union_boxes
        - gt_actions: the ground-truth actions that the pair is assigned.
          Used for training the HOI head.
        - person_box_scores: person box scores from box instances. Used at inference.
        - object_box_scores: object box scores from box instances. Used at inference.
        - object_box_classes: predicted box classes from box instances. Used at inference.
    """
    hopairs = []
    for img_idx, instances_per_image in enumerate(instances):
        with torch.no_grad():
            if self.training:
                # Proposals generated from the person branch in HORPN are treated
                # as person boxes; proposals from the object branch as object boxes.
                boxes = instances_per_image.proposal_boxes
                person_idxs = (instances_per_image.is_person == 1).nonzero().squeeze(1)
                object_idxs = (instances_per_image.is_person == 0).nonzero().squeeze(1)
            else:
                # At inference, split person/object boxes based on the classes
                # predicted by the box head.
                boxes = instances_per_image.pred_boxes
                person_idxs = torch.nonzero(
                    instances_per_image.pred_classes == 0).squeeze(1)
                object_idxs = torch.nonzero(
                    instances_per_image.pred_classes > 0).squeeze(1)
            if self.allow_person_to_person:
                # Allow person-to-person interactions; all boxes are then used
                # as candidate objects.
                object_idxs = torch.arange(len(instances_per_image),
                                           device=object_idxs.device)

            num_pboxes, num_oboxes = person_idxs.numel(), object_idxs.numel()
            union_boxes = _pairwise_union_regions(boxes[person_idxs], boxes[object_idxs])
            # Index person/object boxes in a matched order.
            person_idxs = person_idxs[:, None].repeat(1, num_oboxes).flatten()
            object_idxs = object_idxs[None, :].repeat(num_pboxes, 1).flatten()
            # Remove self-to-self interactions.
            keep = (person_idxs != object_idxs).nonzero().squeeze(1)
            union_boxes = union_boxes[keep]
            person_idxs = person_idxs[keep]
            object_idxs = object_idxs[keep]

        hopairs_per_image = Instances(instances_per_image.image_size)
        hopairs_per_image.union_boxes = union_boxes
        hopairs_per_image.person_boxes = boxes[person_idxs]
        hopairs_per_image.object_boxes = boxes[object_idxs]
        if self.training:
            # `person_idxs` and `object_idxs` are used in self.label_and_sample_hopairs()
            hopairs_per_image.person_idxs = person_idxs
            hopairs_per_image.object_idxs = object_idxs
        else:
            hopairs_per_image.person_box_scores = instances_per_image.scores[person_idxs]
            hopairs_per_image.object_box_scores = instances_per_image.scores[object_idxs]
            hopairs_per_image.object_box_classes = \
                instances_per_image.pred_classes[object_idxs]
        hopairs_per_image.person_feats = crop_features[img_idx][person_idxs]
        hopairs_per_image.object_feats = crop_features[img_idx][object_idxs]

        hopairs.append(hopairs_per_image)

    if self.training:
        hopairs = self.label_and_sample_hopairs(hopairs, instances)
    return hopairs
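# `_pairwise_union_regions` is defined elsewhere; a plausible broadcasting
# sketch (hypothetical implementation, not the source's): the union box of each
# (person, object) pair takes the min of the two top-left corners and the max
# of the two bottom-right corners.
import torch
from detectron2.structures import Boxes

def pairwise_union_regions_sketch(persons: Boxes, objects: Boxes) -> Boxes:
    p = persons.tensor[:, None, :]           # (P, 1, 4)
    o = objects.tensor[None, :, :]           # (1, O, 4)
    lt = torch.min(p[..., :2], o[..., :2])   # (P, O, 2) top-left corners
    rb = torch.max(p[..., 2:], o[..., 2:])   # (P, O, 2) bottom-right corners
    return Boxes(torch.cat([lt, rb], dim=-1).reshape(-1, 4))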
def extract_feat(split_idx, img_list, cfg, args, actor: ActorHandle):
    num_images = len(img_list)
    print('Number of images on split{}: {}.'.format(split_idx, num_images))

    model = DefaultTrainer.build_model(cfg)
    DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
        cfg.MODEL.WEIGHTS, resume=args.resume
    )
    model.eval()

    for im_file in img_list:
        if os.path.exists(os.path.join(args.output_dir, im_file.split('.')[0] + '.npz')):
            actor.update.remote(1)
            continue
        im = cv2.imread(os.path.join(args.image_dir, im_file))
        if im is None:
            print(os.path.join(args.image_dir, im_file), "is illegal!")
            actor.update.remote(1)
            continue
        dataset_dict = get_image_blob(im, cfg.MODEL.PIXEL_MEAN)
        # extract roi features
        if cfg.MODEL.BUA.EXTRACTOR.MODE == 1:
            attr_scores = None
            with torch.set_grad_enabled(False):
                if cfg.MODEL.BUA.ATTRIBUTE_ON:
                    boxes, scores, features_pooled, attr_scores = model([dataset_dict])
                else:
                    boxes, scores, features_pooled = model([dataset_dict])
            boxes = [box.tensor.cpu() for box in boxes]
            scores = [score.cpu() for score in scores]
            features_pooled = [feat.cpu() for feat in features_pooled]
            if attr_scores is not None:
                attr_scores = [attr_score.cpu() for attr_score in attr_scores]
            generate_npz(4, args, cfg, im_file, im, dataset_dict,
                         boxes, scores, features_pooled, attr_scores)
        # extract bbox only
        elif cfg.MODEL.BUA.EXTRACTOR.MODE == 2:
            with torch.set_grad_enabled(False):
                boxes, scores = model([dataset_dict])
            boxes = [box.cpu() for box in boxes]
            scores = [score.cpu() for score in scores]
            generate_npz(2, args, cfg, im_file, im, dataset_dict, boxes, scores)
        # extract roi features by bbox
        elif cfg.MODEL.BUA.EXTRACTOR.MODE == 3:
            if not os.path.exists(
                    os.path.join(args.bbox_dir, im_file.split('.')[0] + '.npz')):
                actor.update.remote(1)
                continue
            bbox = torch.from_numpy(
                np.load(os.path.join(args.bbox_dir,
                                     im_file.split('.')[0] + '.npz'))['bbox']
            ) * dataset_dict['im_scale']
            proposals = Instances(dataset_dict['image'].shape[-2:])
            proposals.proposal_boxes = BUABoxes(bbox)
            dataset_dict['proposals'] = proposals

            attr_scores = None
            with torch.set_grad_enabled(False):
                if cfg.MODEL.BUA.ATTRIBUTE_ON:
                    boxes, scores, features_pooled, attr_scores = model([dataset_dict])
                else:
                    boxes, scores, features_pooled = model([dataset_dict])
            boxes = [box.tensor.cpu() for box in boxes]
            scores = [score.cpu() for score in scores]
            features_pooled = [feat.cpu() for feat in features_pooled]
            if attr_scores is not None:
                attr_scores = [attr_score.data.cpu() for attr_score in attr_scores]
            generate_npz(3, args, cfg, im_file, im, dataset_dict,
                         boxes, scores, features_pooled, attr_scores)

        actor.update.remote(1)
def trend_rcnn_inference_single_image(boxes, scores, attributes, image_shape,
                                      score_thresh, nms_thresh, topk_per_image,
                                      attr_score_thresh, num_attr_classes,
                                      max_attr_pred):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Args:
        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
        per image.

    Returns:
        Same as `fast_rcnn_inference`, but for only one image.
    """
    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
    if not valid_mask.all():
        boxes = boxes[valid_mask]
        scores = scores[valid_mask]
        attributes = attributes[valid_mask]

    scores = scores[:, :-1]
    num_bbox_reg_classes = boxes.shape[1] // 4
    # Convert to Boxes to use the `clip` function ...
    boxes = Boxes(boxes.reshape(-1, 4))
    boxes.clip(image_shape)
    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4

    # num_attr_reg_classes is the number of object classes (46 in this setting);
    # reshape the attributes to [proposals, object classes, attr classes].
    num_attr_reg_classes = attributes.shape[1] // num_attr_classes
    attributes = attributes.view(-1, num_attr_reg_classes, num_attr_classes)

    # Filter results based on detection scores. `filter_mask` has the same
    # shape as `scores`: [proposals, object classes].
    filter_mask = scores > score_thresh  # R x K
    # R' x 2. First column contains indices of the R predictions;
    # second column contains indices of classes. Note that R' can be much larger
    # than R (e.g. R=1000 -> R'=45806), since one proposal can exceed the
    # threshold for several classes; class-agnostic attribute classification
    # might not work under this expansion.
    filter_inds = filter_mask.nonzero()
    if num_bbox_reg_classes == 1:
        boxes = boxes[filter_inds[:, 0], 0]
    else:
        boxes = boxes[filter_mask]
    # `scores` was [R, K]; after masking it becomes a flat [R'] tensor.
    scores = scores[filter_mask]
    if num_attr_reg_classes == 1:
        attributes = attributes[filter_inds[:, 0], 0]
    else:
        attributes = attributes[filter_mask]
    # Both branches produce attributes of shape [R', attr classes].

    # Apply per-class NMS
    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
    if topk_per_image >= 0:
        keep = keep[:topk_per_image]
    boxes, scores, filter_inds, attributes = (
        boxes[keep], scores[keep], filter_inds[keep], attributes[keep])

    attributes[attributes < attr_score_thresh] = 0
    attr_scores_sorted, attr_indices = torch.sort(attributes, 1, descending=True)
    # 294 is used as the fill index for attributes below the score threshold.
    attr_indices[attr_scores_sorted < attr_score_thresh] = 294
    attributes_inds = attr_indices[:, 0:max_attr_pred]

    result = Instances(image_shape)
    result.pred_boxes = Boxes(boxes)
    result.scores = scores
    result.attr_scores = attributes
    result.attr_classes = attributes_inds
    result.pred_classes = filter_inds[:, 1]
    return result, filter_inds[:, 0]
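# Small pure-torch demo of the attribute post-processing above, with dummy
# shapes: zero out sub-threshold scores, sort each row, fill sub-threshold
# indices with the same sentinel index 294 used above, then keep the top few.
import torch

attr_score_thresh, max_attr_pred = 0.3, 5
attrs = torch.rand(8, 295)                          # R' x num attr classes
attrs[attrs < attr_score_thresh] = 0
vals, idxs = torch.sort(attrs, 1, descending=True)
idxs[vals < attr_score_thresh] = 294                # sentinel fill index
top_attrs = idxs[:, 0:max_attr_pred]                # (R', max_attr_pred)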
def _forward_hoi(
    self, features: Dict[str, torch.Tensor], instances: List[Instances]
) -> Union[Dict[str, torch.Tensor], List[Instances]]:
    """
    Forward logic of the interaction prediction branch.

    Args:
        features (dict[str, Tensor]): mapping from feature map names to tensor.
            Same as in :meth:`ROIHeads.forward`.
        instances (list[Instances]):
            At training, the per-image object proposals with matching ground truth.
            Each has fields "proposal_boxes", "interactness_logits", "gt_classes",
            "gt_actions".
            At inference, the per-image predicted box instances from the box head.
            Each has fields "pred_boxes", "pred_classes", "scores".

    Returns:
        In training, a dict of losses.
        In inference, a list of `Instances`, the predicted hoi instances. Each has
        fields "person_boxes", "object_boxes", "object_classes", "action_classes",
        "scores".
    """
    if not self.hoi_on:
        return {} if self.training else []

    features = [features[f] for f in self.in_features]
    hopairs = self.construct_hopairs(instances)

    union_features = self.hoi_pooler(features, [x.union_boxes for x in hopairs])
    person_features = self.hoi_pooler(features, [x.person_boxes for x in hopairs])
    object_features = self.hoi_pooler(features, [x.object_boxes for x in hopairs])
    person_features = self.hoi_head(person_features)
    object_features = self.hoi_head(object_features)
    union_features = self.hoi_head(union_features)

    if self.compose_learning != 0 and self.training:
        gt_obj_classes = [x.gt_classes[:, 1:2] for x in hopairs]
        ohot_gt_obj_labels = []
        for gt_obj in gt_obj_classes:
            # one-hot encode the ground-truth object classes
            ohot_gt_obj = torch.zeros(len(gt_obj), 81, device=gt_obj.device)
            ohot_gt_obj.scatter_(1, gt_obj, 1)
            ohot_gt_obj_labels.append(ohot_gt_obj)
        ohot_gt_obj_labels = torch.cat(ohot_gt_obj_labels, dim=0)
        gt_verbs = torch.cat([x.gt_actions for x in hopairs], dim=0)
        new_obj_features = object_features

        union_features_cl_arr = []
        person_features_cl_arr = []
        gt_verbs_cl_arr = []
        ohot_gt_obj_labels_cl_arr = []
        obj_features_cl_arr = []

        per_img_hoi_lengths = [len(x.gt_actions) for x in hopairs]
        sum_rolls = len(per_img_hoi_lengths)
        if self.compose_learning == 4:
            sum_rolls = 3
        if self.compose_learning == 5:
            ohot_gt_obj_labels = torch.flip(ohot_gt_obj_labels, dims=[0])
            object_features = torch.flip(object_features, dims=[0])
        for ii in range(sum_rolls):
            # Roll the object list to pair verbs with objects from different HOIs.
            union_features_cl_arr.append(union_features)
            person_features_cl_arr.append(person_features)
            gt_verbs_cl_arr.append(gt_verbs)
            ohot_gt_obj_labels_cl_arr.append(
                torch.roll(ohot_gt_obj_labels, -sum(per_img_hoi_lengths[:ii + 1]), 0))
            obj_features_cl_arr.append(
                torch.roll(object_features, -sum(per_img_hoi_lengths[:ii + 1]), 0))

        union_features_cl = torch.cat(union_features_cl_arr, dim=0)
        person_features_cl = torch.cat(person_features_cl_arr, dim=0)
        gt_verbs_cl = torch.cat(gt_verbs_cl_arr, dim=0)
        ohot_gt_obj_labels_cl = torch.cat(ohot_gt_obj_labels_cl_arr, dim=0)
        obj_features_cl = torch.cat(obj_features_cl_arr, dim=0)

        # Keep only composed (verb, object) pairs that form a valid HOI class.
        HOI_labels = (
            torch.matmul(ohot_gt_obj_labels_cl,
                         self.obj_to_HO_matrix.to(ohot_gt_obj_labels_cl.device)) +
            torch.matmul(gt_verbs_cl.to(ohot_gt_obj_labels_cl.device),
                         self.verb_to_HO_matrix.to(ohot_gt_obj_labels_cl.device))) > 1.
        HOI_labels = HOI_labels.type(torch.float32)
        HOI_labels = torch.sum(HOI_labels, dim=-1) > 0
        union_features_cl = union_features_cl[HOI_labels]
        person_features_cl = person_features_cl[HOI_labels]
        new_obj_features = obj_features_cl[HOI_labels]
        gt_verbs = gt_verbs_cl[HOI_labels]
        ohot_gt_obj_cl = ohot_gt_obj_labels_cl[HOI_labels]

        cl_hopair = Instances((512, 512))
        cl_hopair.gt_actions = gt_verbs
        hopairs.append(cl_hopair)

        union_features = torch.cat([union_features, union_features_cl], dim=0)
        person_features = torch.cat([person_features, person_features_cl], dim=0)
        object_features = torch.cat([object_features, new_obj_features], dim=0)

    hoi_predictions = self.hoi_predictor(union_features, person_features,
                                         object_features)
    del union_features, person_features, object_features, features

    if self.training:
        if self.compose_learning in [1, 4, 5]:
            if len(union_features_cl) > 0:
                losses = self.hoi_predictor.losses(
                    hoi_predictions[:-len(union_features_cl)], hopairs[:-1])
                cl_losses = HoiOutputs(
                    hoi_predictions[-len(union_features_cl):], hopairs[-1:],
                    self.hoi_predictor.pos_weights).losses()
                losses['loss_action_cl'] = cl_losses['loss_action'] * self.cl_weight
            else:
                losses = self.hoi_predictor.losses(hoi_predictions, hopairs[:-1])
                losses['loss_action_cl'] = losses['loss_action'] * 0.
        elif self.compose_learning == 3:
            weights = torch.ones_like(hoi_predictions, dtype=torch.float32)
            weights[-len(union_features_cl):] = (
                weights[-len(union_features_cl):] * self.cl_weight)
            losses = self.hoi_predictor.losses(hoi_predictions, hopairs, weights)
        else:
            losses = self.hoi_predictor.losses(hoi_predictions, hopairs)
        return losses
    else:
        pred_interactions = self.hoi_predictor.inference(hoi_predictions, hopairs)
        # TODO: if self.is_hoi_prediction, convert to 117 verb classes.
        return pred_interactions
def forward(self, batched_inputs):
    # Batched inputs is a list of mapped dictionaries.
    # Note that this depends on the shape being 224x224: this is handled in the mapper.
    # Get out the images
    batched_images = [b["image"] for b in batched_inputs]
    # Normalise (required for yolo)
    # batched_images = [self.normalise(im) for im in batched_images]
    # Stack into one big tensor
    images_tensor = torch.stack(batched_images)
    # The forward pass takes in a tensor and returns some logits
    self.to(self.device)
    images_tensor = images_tensor.to(self.device)

    if self.training:
        # Get the heights and widths
        batched_images_w_h = [(b["width"], b["height"]) for b in batched_inputs]
        # Get the target classes
        target_classes = [b["classID"] for b in batched_inputs]
        # Compute the bboxes
        target_bboxes = [b["instances"] for b in batched_inputs]
        target_bboxes = [b.get("gt_boxes") for b in target_bboxes]
        target_centers = [b.get_centers().tolist()[0] for b in target_bboxes]
        target_bboxes = [b.tensor.tolist()[0] for b in target_bboxes]
        target_w_h = []
        for b in target_bboxes:
            x0, y0, x1, y1 = b
            w = abs(x0 - x1)
            h = abs(y0 - y1)
            target_w_h.append((w, h))

        target_bboxes = []
        for bbox_center, bbox_w_h, img_w_h, target_class in zip(
                target_centers, target_w_h, batched_images_w_h, target_classes):
            img_width = img_w_h[0]
            img_height = img_w_h[1]
            center_x = bbox_center[0] / img_width
            center_y = bbox_center[1] / img_height
            width = bbox_w_h[0] / img_width
            height = bbox_w_h[1] / img_height
            target_bboxes.append((0, target_class, center_x, center_y, width, height))
        targets_tensor = torch.tensor(target_bboxes).to(self.device)
        # Targets need to be in the form:
        #   (index_of_bbox_in_image, label_idx, x_center, y_center, width, height)
        # e.g. tensor([[0.0000, 0.0000, 0.4900, 0.5000, 0.1454, 0.1829]])
        # with the coordinates scaled to [0, 1].
        losses, _ = super().forward(images_tensor, targets_tensor)
        return {"total_loss": losses}
    else:
        with torch.no_grad():
            # Forward pass the model
            outputs = super().forward(images_tensor)
            # nms
            outputs = non_max_suppression(outputs,
                                          conf_thres=self.conf_threshold,
                                          nms_thres=self.nms_threshold)
            # For each output/batched input (note: only 1 image per batch in evaluation)
            for output, batched_input in zip(outputs, batched_inputs):
                # height, width
                im_height = batched_input["height"]
                im_width = batched_input["width"]
                # Get out predictions; `output` is None when NMS keeps nothing,
                # in which case return an empty Instances.
                try:
                    pred_boxes = output[:, :4]
                    pred_scores = output[:, 4]
                    pred_classes = output[:, -1].int()
                except (TypeError, IndexError):
                    new_instance = Instances((im_height, im_width))
                    new_instance.pred_boxes = Boxes(torch.tensor([]))
                    new_instance.scores = torch.tensor([])
                    new_instance.pred_classes = torch.tensor([]).int()
                    return [{"instances": new_instance}]
                # A "box" is len 4: center_x, center_y, width, height,
                # scaled between 0 and 1
                pred_boxes = Boxes(pred_boxes)
                # Add the predictions
                new_instance = Instances((im_height, im_width))
                new_instance.pred_boxes = pred_boxes
                new_instance.scores = pred_scores
                new_instance.pred_classes = pred_classes
                # Return immediately, as testing only involves 1 image per batch
                return [{"instances": new_instance}]
def forward_for_single_feature_map(self, locations, box_cls, reg_pred, ctrness,
                                   mask_regression, image_sizes):
    N, C, H, W = box_cls.shape

    # put in the same format as locations
    box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1)
    box_cls = box_cls.reshape(N, -1, C).sigmoid()
    box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1)
    box_regression = box_regression.reshape(N, -1, 4)
    ctrness = ctrness.view(N, 1, H, W).permute(0, 2, 3, 1)
    ctrness = ctrness.reshape(N, -1).sigmoid()
    mask_regression = mask_regression.view(N, self.num_codes, H, W).permute(0, 2, 3, 1)
    mask_regression = mask_regression.reshape(N, -1, self.num_codes)

    # if self.thresh_with_ctr is True, we multiply the classification
    # scores with centerness scores before applying the threshold.
    if self.thresh_with_ctr:
        box_cls = box_cls * ctrness[:, :, None]
    candidate_inds = box_cls > self.pre_nms_thresh
    # `.reshape` is used instead of `.view` because the permuted tensor
    # may not be contiguous.
    pre_nms_top_n = candidate_inds.reshape(N, -1).sum(1)
    pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n)

    if not self.thresh_with_ctr:
        box_cls = box_cls * ctrness[:, :, None]

    results = []
    for i in range(N):
        per_box_cls = box_cls[i]
        per_candidate_inds = candidate_inds[i]
        per_box_cls = per_box_cls[per_candidate_inds]

        per_candidate_nonzeros = per_candidate_inds.nonzero()
        per_box_loc = per_candidate_nonzeros[:, 0]
        per_class = per_candidate_nonzeros[:, 1]

        per_box_regression = box_regression[i]
        per_box_regression = per_box_regression[per_box_loc]
        per_locations = locations[per_box_loc]

        per_box_mask = mask_regression[i]
        per_box_mask = per_box_mask[per_box_loc]

        per_pre_nms_top_n = pre_nms_top_n[i]

        if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
            per_box_cls, top_k_indices = per_box_cls.topk(per_pre_nms_top_n,
                                                          sorted=False)
            per_class = per_class[top_k_indices]
            per_box_regression = per_box_regression[top_k_indices]
            per_locations = per_locations[top_k_indices]
            per_box_mask = per_box_mask[top_k_indices]

        detections = torch.stack([
            per_locations[:, 0] - per_box_regression[:, 0],
            per_locations[:, 1] - per_box_regression[:, 1],
            per_locations[:, 0] + per_box_regression[:, 2],
            per_locations[:, 1] + per_box_regression[:, 3],
        ], dim=1)

        boxlist = Instances(image_sizes[i])
        boxlist.pred_boxes = Boxes(detections)
        boxlist.scores = torch.sqrt(per_box_cls)
        boxlist.pred_classes = per_class
        boxlist.locations = per_locations
        boxlist.pred_masks = per_box_mask
        results.append(boxlist)

    return results
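# Minimal numeric check (pure torch) of the (l, t, r, b) decoding used above:
# a location (50, 40) with distances l=10, t=5, r=20, b=15 yields the box
# (40, 35, 70, 55).
import torch

locs = torch.tensor([[50.0, 40.0]])
ltrb = torch.tensor([[10.0, 5.0, 20.0, 15.0]])
boxes = torch.stack([
    locs[:, 0] - ltrb[:, 0],
    locs[:, 1] - ltrb[:, 1],
    locs[:, 0] + ltrb[:, 2],
    locs[:, 1] + ltrb[:, 3],
], dim=1)
assert boxes.tolist() == [[40.0, 35.0, 70.0, 55.0]]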
def losses(self, logits_pred, reg_pred, ctrness_pred, locations, gt_instances,
           top_feats=None):
    """
    Return the losses from a set of FCOS predictions and their associated
    ground-truth.

    Returns:
        dict[loss name -> loss value]: A dict mapping from loss name to loss value.
    """
    training_targets = self._get_ground_truth(locations, gt_instances)

    # Collect all logits and regression predictions over feature maps
    # and images to arrive at the same shape as the labels and targets.
    # The final ordering is L, N, H, W from slowest to fastest axis.

    instances = Instances((0, 0))
    instances.labels = cat(
        [
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.reshape(-1) for x in training_targets["labels"]
        ], dim=0)
    instances.gt_inds = cat(
        [
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.reshape(-1) for x in training_targets["target_inds"]
        ], dim=0)
    instances.im_inds = cat(
        [x.reshape(-1) for x in training_targets["im_inds"]], dim=0)
    instances.reg_targets = cat(
        [
            # Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4)
            x.reshape(-1, 4) for x in training_targets["reg_targets"]
        ], dim=0)
    instances.locations = cat(
        [x.reshape(-1, 2) for x in training_targets["locations"]], dim=0)
    instances.fpn_levels = cat(
        [x.reshape(-1) for x in training_targets["fpn_levels"]], dim=0)

    instances.logits_pred = cat(
        [
            # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
            x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits_pred
        ], dim=0)
    instances.reg_pred = cat(
        [
            # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B)
            x.permute(0, 2, 3, 1).reshape(-1, 4) for x in reg_pred
        ], dim=0)
    instances.ctrness_pred = cat(
        [
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.permute(0, 2, 3, 1).reshape(-1) for x in ctrness_pred
        ], dim=0)

    if top_feats is not None and len(top_feats) > 0:
        instances.top_feats = cat(
            [
                # Reshape: (N, -1, Hi, Wi) -> (N*Hi*Wi, -1)
                x.permute(0, 2, 3, 1).reshape(-1, x.size(1)) for x in top_feats
            ], dim=0)

    return self.fcos_losses(instances)
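# Shape sanity check (pure torch) for the flattening used above: predictions
# stored as (N, C, Hi, Wi) are permuted to (N, Hi, Wi, C) and flattened to
# (N*Hi*Wi, C), so each row lines up with the label maps flattened the same way.
import torch

N, C, H, W = 2, 80, 8, 8
logits = torch.rand(N, C, H, W)
flat = logits.permute(0, 2, 3, 1).reshape(-1, C)
assert flat.shape == (N * H * W, C)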
def find_top_st_rpn_proposals(
    proposals,
    pred_objectness_logits,
    reference_frame_idx,
    image_size,
    nms_thresh,
    pre_nms_topk,
    post_nms_topk,
    min_box_side_len,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps if `training` is True,
    otherwise, returns the highest `post_nms_topk` scoring proposals for each
    feature map.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape
            (N, Hi*Wi*A, 4). All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has
            shape (N, Hi*Wi*A).
        reference_frame_idx (int): Reference frame index used to select boxes/scores
            to execute NMS.
        image_size: Input image size in (h, w) order.
        nms_thresh (float): IoU threshold to use for NMS.
        pre_nms_topk (int): number of top k scoring proposals to keep before applying
            NMS. When RPN is run on multiple feature maps (as in FPN) this number is
            per feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying
            NMS. When RPN is run on multiple feature maps (as in FPN) this number is
            total, over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels
            (absolute units wrt input images).
        training (bool): True if proposals are to be used in training, otherwise
            False. This arg exists only to support a legacy bug; look for the
            "NB: Legacy bug ..." comment.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances stores
            post_nms_topk object proposals for frame i, sorted by the objectness
            score in the reference frame in descending order.
    """
    device = proposals[0].device

    # 1. Select top-k anchors for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape (topk,)
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    for level_id, proposals_i, logits_i in zip(itertools.count(), proposals,
                                               pred_objectness_logits):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i[reference_frame_idx].sort(descending=True, dim=0)
        topk_scores_i = logits_i[:num_proposals_i]
        topk_idx = idx[:num_proposals_i]
        topk_proposals_i = proposals_i[:, topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=0)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For the reference frame, run a per-level NMS, and choose topk results for
    # every input frame.
    # TODO: cache valid proposals mask for previous frames
    lvl = level_ids
    valid_mask = (torch.isfinite(topk_proposals).all(dim=2).all(dim=0)
                  & torch.isfinite(topk_scores))
    if not valid_mask.all():
        topk_proposals = topk_proposals[:, valid_mask]
        topk_scores = topk_scores[valid_mask]
        lvl = lvl[valid_mask]

    keep = None
    st_boxes = []
    for proposal_boxes_f in topk_proposals:
        boxes = Boxes(proposal_boxes_f)
        boxes.clip(image_size)
        # filter empty boxes
        keep_f = boxes.nonempty(threshold=min_box_side_len)
        keep = keep_f if keep is None else keep & keep_f
        st_boxes.append(boxes)

    if keep.sum().item() != len(st_boxes[0]):
        topk_scores, lvl = topk_scores[keep], lvl[keep]
    filtered_st_boxes = []
    for boxes in st_boxes:
        if keep.sum().item() != len(boxes):
            boxes = boxes[keep]
        filtered_st_boxes.append(boxes)

    keep = batched_nms(filtered_st_boxes[reference_frame_idx].tensor, topk_scores,
                       lvl, nms_thresh)
    # In Detectron1, there was different behavior during training vs. testing.
    # (https://github.com/facebookresearch/Detectron/issues/459)
    # During training, topk is over the proposals from *all* images in the training batch.
    # During testing, it is over the proposals for each image separately.
    # As a result, the training behavior becomes batch-dependent,
    # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
    # This bug is addressed in Detectron2 to make the behavior independent of batch size.
    keep = keep[:post_nms_topk]  # keep is already sorted
    scores = topk_scores[keep]

    results = []
    for boxes in filtered_st_boxes:
        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores
        results.append(res)
    return results
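# Usage sketch of torchvision's batched_nms, which behaves like the
# `batched_nms` used above: boxes are (x0, y0, x1, y1), and suppression only
# happens between boxes sharing the same "category" id (here, as above, the
# feature-level id).
import torch
from torchvision.ops import batched_nms as tv_batched_nms

boxes = torch.tensor([[0., 0., 10., 10.], [1., 1., 10., 10.], [0., 0., 10., 10.]])
scores = torch.tensor([0.9, 0.8, 0.7])
lvl = torch.tensor([0, 0, 1])
keep = tv_batched_nms(boxes, scores, lvl, iou_threshold=0.5)
# box 1 is suppressed by box 0 (same level, IoU = 0.81); box 2 survives (other level)
assert keep.tolist() == [0, 2]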
def inference_single_image(self, locations, box_cls, box_reg, center_score,
                           image_size):
    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, locs_i, center_score_i in zip(
            box_cls, box_reg, locations, center_score):
        # (HxW, C)
        box_cls_i = box_cls_i.sigmoid_()
        keep_idxs = box_cls_i > self.pre_nms_thresh

        # multiply the classification scores with center scores
        box_cls_i *= center_score_i.sigmoid_()

        box_cls_i = box_cls_i[keep_idxs]
        keep_idxs_nonzero_i = keep_idxs.nonzero()
        box_loc_i = keep_idxs_nonzero_i[:, 0]
        class_i = keep_idxs_nonzero_i[:, 1]

        box_reg_i = box_reg_i[box_loc_i]
        locs_i = locs_i[box_loc_i]

        per_pre_nms_top_n = keep_idxs.sum().clamp(max=self.pre_nms_top_n)
        if keep_idxs.sum().item() > per_pre_nms_top_n.item():
            box_cls_i, topk_idxs = box_cls_i.topk(per_pre_nms_top_n, sorted=False)
            class_i = class_i[topk_idxs]
            box_reg_i = box_reg_i[topk_idxs]
            locs_i = locs_i[topk_idxs]

        # predict boxes
        predicted_boxes = torch.stack([
            locs_i[:, 0] - box_reg_i[:, 0],
            locs_i[:, 1] - box_reg_i[:, 1],
            locs_i[:, 0] + box_reg_i[:, 2],
            locs_i[:, 1] + box_reg_i[:, 3],
        ], dim=1)
        box_cls_i = torch.sqrt(box_cls_i)

        boxes_all.append(predicted_boxes)
        scores_all.append(box_cls_i)
        class_idxs_all.append(class_i)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]

    # Apply per-class nms for each image
    keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_thresh)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
def test_rpn(self):
    torch.manual_seed(121)
    cfg = get_cfg()
    cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
    cfg.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
    cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1)
    backbone = build_backbone(cfg)
    proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
    num_images = 2
    images_tensor = torch.rand(num_images, 20, 30)
    image_sizes = [(10, 10), (20, 30)]
    images = ImageList(images_tensor, image_sizes)
    image_shape = (15, 15)
    num_channels = 1024
    features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
    gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32)
    gt_instances = Instances(image_shape)
    gt_instances.gt_boxes = Boxes(gt_boxes)
    with EventStorage():  # capture events in a new storage to discard them
        proposals, proposal_losses = proposal_generator(
            images, features, [gt_instances[0], gt_instances[1]])

    expected_losses = {
        "loss_rpn_cls": torch.tensor(0.0804563984),
        "loss_rpn_loc": torch.tensor(0.0990132466),
    }
    for name in expected_losses.keys():
        self.assertTrue(torch.allclose(proposal_losses[name], expected_losses[name]))

    expected_proposal_boxes = [
        Boxes(torch.tensor([[0, 0, 10, 10], [7.3365392685, 0, 10, 10]])),
        Boxes(
            torch.tensor([
                [0, 0, 30, 20],
                [0, 0, 16.7862777710, 13.1362524033],
                [0, 0, 30, 13.3173446655],
                [0, 0, 10.8602609634, 20],
                [7.7165775299, 0, 27.3875980377, 20],
            ])),
    ]

    expected_objectness_logits = [
        torch.tensor([0.1225359365, -0.0133192837]),
        torch.tensor([
            0.1415634006, 0.0989848152, 0.0565387346, -0.0072308783, -0.0428492837
        ]),
    ]

    for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip(
            proposals, expected_proposal_boxes, image_sizes,
            expected_objectness_logits):
        self.assertEqual(len(proposal), len(expected_proposal_box))
        self.assertEqual(proposal.image_size, im_size)
        self.assertTrue(
            torch.allclose(proposal.proposal_boxes.tensor,
                           expected_proposal_box.tensor))
        self.assertTrue(
            torch.allclose(proposal.objectness_logits, expected_objectness_logit))
def compute_targets_for_locations(self, locations, targets, size_ranges):
    labels = []
    reg_targets = []
    mask_targets = []
    mask_indices = []
    xs, ys = locations[:, 0], locations[:, 1]

    for im_i in range(len(targets)):
        targets_per_im = targets[im_i]
        bboxes = targets_per_im.gt_boxes.tensor
        labels_per_im = targets_per_im.gt_classes

        # no gt
        if bboxes.numel() == 0:
            labels.append(
                labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
            reg_targets.append(locations.new_zeros((locations.size(0), 4)))
            continue

        area = targets_per_im.gt_boxes.area()

        l = xs[:, None] - bboxes[:, 0][None]
        t = ys[:, None] - bboxes[:, 1][None]
        r = bboxes[:, 2][None] - xs[:, None]
        b = bboxes[:, 3][None] - ys[:, None]
        reg_targets_per_im = torch.stack([l, t, r, b], dim=2)

        if self.center_sample:
            is_in_boxes = self.get_sample_region(bboxes, self.strides,
                                                 self.num_loc_list, xs, ys,
                                                 radius=self.radius)
        else:
            is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0

        max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0]
        # limit the regression range for each location
        is_cared_in_the_level = \
            (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
            (max_reg_targets_per_im <= size_ranges[:, [1]])

        locations_to_gt_area = area[None].repeat(len(locations), 1)
        locations_to_gt_area[is_in_boxes == 0] = INF
        locations_to_gt_area[is_cared_in_the_level == 0] = INF

        # if there is still more than one object for a location,
        # we choose the one with minimal area
        locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)

        reg_targets_per_im = reg_targets_per_im[range(len(locations)),
                                                locations_to_gt_inds]
        labels_per_im = labels_per_im[locations_to_gt_inds]
        labels_per_im[locations_to_min_area == INF] = self.num_classes

        labels.append(labels_per_im)
        reg_targets.append(reg_targets_per_im)

        # Mask encoding.
        pos_inds = torch.nonzero(labels_per_im != self.num_classes).squeeze(1)
        pos_labels = labels_per_im[pos_inds]
        pos_reg_targets = reg_targets_per_im[pos_inds]
        pos_locations = locations[pos_inds]
        bbs = torch.stack([
            pos_locations[:, 0] - pos_reg_targets[:, 0],
            pos_locations[:, 1] - pos_reg_targets[:, 1],
            pos_locations[:, 0] + pos_reg_targets[:, 2],
            pos_locations[:, 1] + pos_reg_targets[:, 3],
        ], dim=1)
        bbs = Boxes(bbs)

        mask_targets_per_im = Instances(targets_per_im.image_size)
        mask_targets_per_im.set("pos_classes", pos_labels)
        mask_targets_per_im.set("pos_boxes", bbs)
        mask_targets.append(mask_targets_per_im)
        mask_indices.append(pos_inds)

    return {
        "labels": labels,
        "reg_targets": reg_targets,
        "mask_targets": mask_targets,
        "mask_indices": mask_indices
    }
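# Tiny pure-torch illustration of the min-area tie-breaking above: each
# location takes the smallest ground-truth box among those it falls inside
# (INF marks boxes a location is outside of, or outside the level's size range).
import torch

INF = 100000000
areas = torch.tensor([100.0, 40.0, 900.0])     # areas of 3 gt boxes
loc_to_gt = areas[None].repeat(2, 1)           # 2 locations x 3 gt boxes
loc_to_gt[0, 1] = INF                          # location 0 falls outside gt 1
min_area, gt_inds = loc_to_gt.min(dim=1)
assert gt_inds.tolist() == [0, 1]              # loc 0 -> gt 0, loc 1 -> gt 1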
def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
    """
    A function to assemble a caffe2 model's outputs (i.e. Dict[str, Tensor])
    into detectron2's format (i.e. list of Instances instances). This only works
    when the model follows the Caffe2 detectron's naming convention.

    Args:
        image_sizes (List[List[int, int]]): [H, W] of every image.
        tensor_outputs (Dict[str, Tensor]): external_output to its tensor.
        force_mask_on (Bool): if true, makes sure there will be pred_masks even
            if the masks are not found from tensor_outputs (usually due to model crash)
    """
    results = [Instances(image_size) for image_size in image_sizes]

    batch_splits = tensor_outputs.get("batch_splits", None)
    if batch_splits is not None:
        raise NotImplementedError()
    assert len(image_sizes) == 1
    result = results[0]

    bbox_nms = tensor_outputs["bbox_nms"]
    score_nms = tensor_outputs["score_nms"]
    class_nms = tensor_outputs["class_nms"]
    # Detection will always succeed because Conv supports 0-batch
    assert bbox_nms is not None
    assert score_nms is not None
    assert class_nms is not None
    if bbox_nms.shape[1] == 5:
        result.pred_boxes = RotatedBoxes(bbox_nms)
    else:
        result.pred_boxes = Boxes(bbox_nms)
    result.scores = score_nms
    result.pred_classes = class_nms.to(torch.int64)

    mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
    if mask_fcn_probs is not None:
        # finish the mask pred
        mask_probs_pred = mask_fcn_probs
        num_masks = mask_probs_pred.shape[0]
        class_pred = result.pred_classes
        indices = torch.arange(num_masks, device=class_pred.device)
        mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
        result.pred_masks = mask_probs_pred
    elif force_mask_on:
        # NOTE: there's no way to know the height/width of the mask here; it won't be
        # used anyway when batch size is 0, so just set them to 0.
        result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)

    keypoints_out = tensor_outputs.get("keypoints_out", None)
    kps_score = tensor_outputs.get("kps_score", None)
    if keypoints_out is not None:
        # keypoints_out: [N, 4, #keypoints], where 4 is in order of (x, y, score, prob)
        keypoints_tensor = keypoints_out
        # NOTE: it's possible that prob is not calculated if "should_output_softmax"
        # is set to False in HeatmapMaxKeypoint, so just use the raw score; it seems
        # not to affect mAP. TODO: check more carefully.
        keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
        result.pred_keypoints = keypoint_xyp
    elif kps_score is not None:
        # keypoint heatmap to sparse data structure
        pred_keypoint_logits = kps_score
        keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])

    return results
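# Pure-torch check of the per-class mask selection above: for each detection,
# pick the mask channel belonging to its predicted class via paired indexing.
import torch

num_masks, num_classes, M = 4, 80, 28
mask_probs = torch.rand(num_masks, num_classes, M, M)
classes = torch.randint(0, num_classes, (num_masks,))
indices = torch.arange(num_masks)
selected = mask_probs[indices, classes][:, None]   # (num_masks, 1, M, M)
assert selected.shape == (num_masks, 1, M, M)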
def get_pgt(self, prev_pred_boxes, prev_pred_scores, proposals, suffix):
    if isinstance(prev_pred_scores, torch.Tensor):
        num_preds_per_image = [len(p) for p in proposals]
        prev_pred_scores = prev_pred_scores.split(num_preds_per_image, dim=0)
    else:
        assert isinstance(prev_pred_scores, list)
        assert isinstance(prev_pred_scores[0], torch.Tensor)

    prev_pred_scores = [
        torch.index_select(prev_pred_score, 1, gt_int)
        for prev_pred_score, gt_int in zip(prev_pred_scores, self.gt_classes_img_int)
    ]
    pgt_scores_idxs = [
        torch.max(prev_pred_score, dim=0) for prev_pred_score in prev_pred_scores
    ]
    pgt_scores = [item[0] for item in pgt_scores_idxs]
    pgt_idxs = [item[1] for item in pgt_scores_idxs]

    assert isinstance(prev_pred_boxes, (tuple, list))
    if isinstance(prev_pred_boxes[0], Boxes):
        pgt_boxes = [
            prev_pred_box[pgt_idx]
            for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs)
        ]
    else:
        assert isinstance(prev_pred_boxes[0], torch.Tensor)
        if self.cls_agnostic_bbox_reg:
            num_preds = [prev_pred_box.size(0) for prev_pred_box in prev_pred_boxes]
            prev_pred_boxes = [
                prev_pred_box.unsqueeze(1).expand(num_pred, self.num_classes, 4)
                for num_pred, prev_pred_box in zip(num_preds, prev_pred_boxes)
            ]
        prev_pred_boxes = [
            prev_pred_box.view(-1, self.num_classes, 4)
            for prev_pred_box in prev_pred_boxes
        ]
        prev_pred_boxes = [
            torch.index_select(prev_pred_box, 1, gt_int)
            for prev_pred_box, gt_int in zip(prev_pred_boxes,
                                             self.gt_classes_img_int)
        ]
        pgt_boxes = [
            torch.index_select(prev_pred_box, 0, pgt_idx)
            for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs)
        ]
        pgt_boxes = [pgt_box.view(-1, 4) for pgt_box in pgt_boxes]
        diags = [
            torch.tensor(
                [i * gt_split.numel() + i for i in range(gt_split.numel())],
                dtype=torch.int64,
                device=pgt_boxes[0].device,
            ) for gt_split in self.gt_classes_img_int
        ]
        pgt_boxes = [
            torch.index_select(pgt_box, 0, diag)
            for pgt_box, diag in zip(pgt_boxes, diags)
        ]
        pgt_boxes = [Boxes(pgt_box) for pgt_box in pgt_boxes]

    pgt_classes = self.gt_classes_img_int
    pgt_weights = [
        torch.index_select(pred_logits, 1, pgt_class).reshape(-1)
        for pred_logits, pgt_class in zip(
            self.pred_class_img_logits.split(1, dim=0), pgt_classes)
    ]

    targets = [
        Instances(
            proposals[i].image_size,
            gt_boxes=pgt_box,
            gt_classes=pgt_class,
            gt_scores=pgt_score,
            gt_weights=pgt_weight,
        ) for i, (pgt_box, pgt_class, pgt_score, pgt_weight) in enumerate(
            zip(pgt_boxes, pgt_classes, pgt_scores, pgt_weights))
    ]

    self._vis_pgt(targets, "pgt", suffix)

    return targets
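# Pure-torch sketch (dummy sizes) of the argmax selection at the heart of
# get_pgt above: for each image-level class, the proposal with the highest
# predicted score becomes the pseudo ground truth.
import torch

scores = torch.rand(300, 20)                       # proposals x classes, one image
gt_int = torch.tensor([3, 7])                      # image-level classes present
per_cls = torch.index_select(scores, 1, gt_int)    # (300, 2)
pgt_scores, pgt_idxs = per_cls.max(dim=0)          # best proposal per class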
def find_top_rrpn_proposals(
    proposals,
    pred_objectness_logits,
    images,
    nms_thresh,
    pre_nms_topk,
    post_nms_topk,
    min_box_side_len,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps if `training` is True,
    otherwise, returns the highest `post_nms_topk` scoring proposals for each
    feature map.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape
            (N, Hi*Wi*A, 5). All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i
            has shape (N, Hi*Wi*A).
        images (ImageList): Input images as an :class:`ImageList`.
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before
            applying NMS. When RRPN is run on multiple feature maps (as in FPN)
            this number is per feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after
            applying NMS. When RRPN is run on multiple feature maps (as in FPN)
            this number is total, over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels
            (absolute units wrt input images).
        training (bool): True if proposals are to be used in training,
            otherwise False. This arg exists only to support a legacy bug;
            look for the "NB: Legacy bug ..." comment.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i.
    """
    image_sizes = images.image_sizes  # in (h, w) order
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select top-k anchor for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, proposals_i, logits_i in zip(
        itertools.count(), proposals, pred_objectness_logits
    ):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 5

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device)
        )

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = RotatedBoxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_side_len)
        lvl = level_ids
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = (
                boxes[keep],
                scores_per_img[keep],
                level_ids[keep],
            )

        keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh)

        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the
        # training batch. During testing, it is over the proposals for each image
        # separately. As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the
        # batch size. This bug is addressed in Detectron2 to make the behavior
        # independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
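
# Hypothetical usage sketch of the rotated NMS that find_top_rrpn_proposals
# relies on; batched_nms_rotated suppresses boxes per (class/level) group, and
# the box format is (cx, cy, w, h, angle_in_degrees). Values are made up.
def _rotated_nms_sketch():
    import torch
    from detectron2.layers import batched_nms_rotated
    boxes = torch.tensor([
        [50.0, 50.0, 20.0, 10.0, 0.0],
        [50.0, 50.0, 20.0, 10.0, 5.0],   # heavy overlap with the first box
        [150.0, 150.0, 30.0, 15.0, 45.0],
    ])
    scores = torch.tensor([0.9, 0.8, 0.7])
    lvl = torch.tensor([0, 0, 0])  # same group -> boxes compete in NMS
    keep = batched_nms_rotated(boxes, scores, lvl, 0.5)
    return keep  # the second box is suppressed by the first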
def get_pgt_top_k(
    self,
    prev_pred_boxes,
    prev_pred_scores,
    proposals,
    top_k=1,
    thres=0,
    need_instance=True,
    need_weight=True,
    suffix="",
):
    assert isinstance(prev_pred_boxes, (tuple, list))
    if isinstance(prev_pred_boxes[0], Boxes):
        num_preds = [len(prev_pred_box) for prev_pred_box in prev_pred_boxes]
        prev_pred_boxes = [
            prev_pred_box.tensor.unsqueeze(1).expand(num_pred, self.num_classes, 4)
            for num_pred, prev_pred_box in zip(num_preds, prev_pred_boxes)
        ]
    else:
        assert isinstance(prev_pred_boxes[0], torch.Tensor)
        if self.cls_agnostic_bbox_reg:
            num_preds = [prev_pred_box.size(0) for prev_pred_box in prev_pred_boxes]
            prev_pred_boxes = [
                prev_pred_box.unsqueeze(1).expand(num_pred, self.num_classes, 4)
                for num_pred, prev_pred_box in zip(num_preds, prev_pred_boxes)
            ]
        prev_pred_boxes = [
            prev_pred_box.view(-1, self.num_classes, 4)
            for prev_pred_box in prev_pred_boxes
        ]

    if isinstance(prev_pred_scores, torch.Tensor):
        num_preds_per_image = [len(p) for p in proposals]
        prev_pred_scores = prev_pred_scores.split(num_preds_per_image, dim=0)
    else:
        assert isinstance(prev_pred_scores, list)
        assert isinstance(prev_pred_scores[0], torch.Tensor)

    prev_pred_scores = [
        torch.index_select(prev_pred_score, 1, gt_int)
        for prev_pred_score, gt_int in zip(prev_pred_scores, self.gt_classes_img_int)
    ]
    prev_pred_boxes = [
        torch.index_select(prev_pred_box, 1, gt_int)
        for prev_pred_box, gt_int in zip(prev_pred_boxes, self.gt_classes_img_int)
    ]

    # get top k
    num_preds = [prev_pred_score.size(0) for prev_pred_score in prev_pred_scores]
    if top_k >= 1:
        top_ks = [min(num_pred, int(top_k)) for num_pred in num_preds]
    elif 0 < top_k < 1:
        top_ks = [max(int(num_pred * top_k), 1) for num_pred in num_preds]
    else:
        top_ks = [min(num_pred, 1) for num_pred in num_preds]
    pgt_scores_idxs = [
        torch.topk(prev_pred_score, top_k, dim=0)
        for prev_pred_score, top_k in zip(prev_pred_scores, top_ks)
    ]
    pgt_scores = [item[0] for item in pgt_scores_idxs]
    pgt_idxs = [item[1] for item in pgt_scores_idxs]
    pgt_idxs = [
        torch.unsqueeze(pgt_idx, 2).expand(top_k, gt_int.numel(), 4)
        for pgt_idx, top_k, gt_int in zip(pgt_idxs, top_ks, self.gt_classes_img_int)
    ]
    pgt_boxes = [
        torch.gather(prev_pred_box, 0, pgt_idx)
        for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs)
    ]
    pgt_classes = [
        torch.unsqueeze(gt_int, 0).expand(top_k, gt_int.numel())
        for gt_int, top_k in zip(self.gt_classes_img_int, top_ks)
    ]
    if need_weight:
        pgt_weights = [
            torch.index_select(pred_logits, 1, gt_int).expand(top_k, gt_int.numel())
            for pred_logits, gt_int, top_k in zip(
                self.pred_class_img_logits.split(1, dim=0),
                self.gt_classes_img_int,
                top_ks,
            )
        ]

    if thres > 0:
        # Keep only detections above the score threshold, but always retain
        # the top-1 proposal for each class.
        masks = [pgt_score.ge(thres) for pgt_score in pgt_scores]
        masks = [
            torch.cat([torch.full_like(mask[0:1, :], True), mask[1:, :]], dim=0)
            for mask in masks
        ]
        pgt_scores = [
            torch.masked_select(pgt_score, mask)
            for pgt_score, mask in zip(pgt_scores, masks)
        ]
        pgt_boxes = [
            torch.masked_select(
                pgt_box, torch.unsqueeze(mask, 2).expand(top_k, gt_int.numel(), 4)
            )
            for pgt_box, mask, top_k, gt_int in zip(
                pgt_boxes, masks, top_ks, self.gt_classes_img_int
            )
        ]
        pgt_classes = [
            torch.masked_select(pgt_class, mask)
            for pgt_class, mask in zip(pgt_classes, masks)
        ]
        if need_weight:
            pgt_weights = [
                torch.masked_select(pgt_weight, mask)
                for pgt_weight, mask in zip(pgt_weights, masks)
            ]

    pgt_scores = [pgt_score.reshape(-1) for pgt_score in pgt_scores]
    pgt_boxes = [pgt_box.reshape(-1, 4) for pgt_box in pgt_boxes]
    pgt_classes = [pgt_class.reshape(-1) for pgt_class in pgt_classes]
    if need_weight:
        pgt_weights = [pgt_weight.reshape(-1) for pgt_weight in pgt_weights]

    if not need_instance and need_weight:
        return pgt_scores, pgt_boxes, pgt_classes, pgt_weights
    elif not need_instance and not need_weight:
        return pgt_scores, pgt_boxes, pgt_classes

    pgt_boxes = [Boxes(pgt_box) for pgt_box in pgt_boxes]
    targets = [
        Instances(
            proposals[i].image_size,
            gt_boxes=pgt_box,
            gt_classes=pgt_class,
            gt_scores=pgt_score,
            gt_weights=pgt_weight,
        )
        for i, (pgt_box, pgt_class, pgt_score, pgt_weight) in enumerate(
            zip(pgt_boxes, pgt_classes, pgt_scores, pgt_weights)
        )
    ]

    self._vis_pgt(targets, "pgt_top_k", suffix)
    return targets
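
# A small self-contained sketch (illustrative, not from the source) of the
# selection rule in get_pgt_top_k: take the top-k proposals per class, then
# drop those below a score threshold while always retaining the top-1.
def _top_k_with_floor_sketch(scores, top_k=3, thres=0.5):
    import torch
    # scores: (num_proposals, num_gt_classes)
    k = min(top_k, scores.size(0))
    top_scores, top_idxs = torch.topk(scores, k, dim=0)
    mask = top_scores.ge(thres)
    mask[0:1, :] = True  # the best proposal per class is always kept
    return torch.masked_select(top_scores, mask), top_idxs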
def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width
        mask_format (str): "polygon" or "bitmask", the format used to store
            the masks in ``gt_masks``.

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
    target = Instances(image_size)
    target.gt_boxes = Boxes(boxes)

    classes = [int(obj["category_id"]) for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            # TODO check type and provide better error
            masks = PolygonMasks(segms)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # polygon
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                        segm.ndim
                    )
                    # mask array
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks!"
                        "Supported types are: polygons as list[list[float] or ndarray],"
                        " COCO-style RLE as a dict, or a binary segmentation mask "
                        " in a 2D numpy array of shape HxW.".format(type(segm))
                    )
            # torch.from_numpy does not support array with negative stride.
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks])
            )
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = [obj.get("keypoints", []) for obj in annos]
        target.gt_keypoints = Keypoints(kpts)

    return target
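
# Hypothetical usage of annotations_to_instances (defined above) with a single
# XYWH_ABS annotation; the values are made up for illustration.
def _annotations_to_instances_demo():
    from detectron2.structures import BoxMode
    annos = [{
        "bbox": [10.0, 10.0, 40.0, 30.0],   # x, y, w, h
        "bbox_mode": BoxMode.XYWH_ABS,
        "category_id": 2,
    }]
    target = annotations_to_instances(annos, image_size=(100, 100))
    return target.gt_boxes, target.gt_classes  # XYXY_ABS boxes, int64 classes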
def general_black_box_ensembles_post_processing(
        input_im,
        ensemble_pred_box_list,
        ensembles_class_idxs_list,
        ensemble_pred_prob_vectors_list,
        ensembles_pred_box_covariance_list,
        nms_threshold=0.5,
        max_detections_per_image=100,
        affinity_threshold=0.7,
        is_generalized_rcnn=False):
    """
    Args:
        input_im (list): an input im list generated from dataset handler.
        ensemble_pred_box_list (list): predicted box list
        ensembles_class_idxs_list (list): predicted classes list
        ensemble_pred_prob_vectors_list (list): predicted probability vector list
        ensembles_pred_box_covariance_list (list): predicted covariance matrices
        nms_threshold (float): non-maximum suppression threshold between 0-1
        max_detections_per_image (int): maximum allowed number of detections per image.
        affinity_threshold (float): cluster affinity threshold between 0-1
        is_generalized_rcnn (bool): used to handle category selection by
            removing the background class.

    Returns:
        result (Instances): final results after nms
    """
    predicted_boxes = torch.cat(ensemble_pred_box_list, 0)
    predicted_boxes_covariance = torch.cat(ensembles_pred_box_covariance_list, 0)
    predicted_prob_vectors = torch.cat(ensemble_pred_prob_vectors_list, 0)
    predicted_class_idxs = torch.cat(ensembles_class_idxs_list, 0)
    device = predicted_boxes.device

    # Compute iou between all output boxes and each other output box.
    match_quality_matrix = pairwise_iou(Boxes(predicted_boxes), Boxes(predicted_boxes))

    # Perform basic sequential clustering.
    clusters = []
    for i in range(match_quality_matrix.shape[0]):
        # Check if current box is already a member of any previous cluster.
        if i != 0:
            all_clusters = torch.cat(clusters, 0)
            if (all_clusters == i).any():
                continue
        # Only add if boxes have the same category.
        cluster_membership_test = (
            match_quality_matrix[i, :] >= affinity_threshold) & (
                predicted_class_idxs == predicted_class_idxs[i])
        inds = torch.where(cluster_membership_test)
        clusters.extend(inds)

    # Compute mean and covariance matrices for every cluster.
    predicted_boxes_list = []
    predicted_boxes_covariance_list = []
    predicted_prob_vectors_list = []
    for cluster in clusters:
        box_cluster = predicted_boxes[cluster]
        box_cluster_covariance = predicted_boxes_covariance[cluster]
        if box_cluster.shape[0] >= 2:
            cluster_mean = box_cluster.mean(0)

            # Compute epistemic covariance from the spread of the cluster members.
            residuals = (box_cluster - cluster_mean).unsqueeze(2)
            predicted_covariance = torch.sum(
                torch.matmul(residuals, torch.transpose(residuals, 2, 1)),
                0) / (box_cluster.shape[0] - 1)

            # Add the mean covariance predicted by the ensemble members.
            predicted_covariance = predicted_covariance + \
                box_cluster_covariance.mean(0)

            predicted_boxes_list.append(cluster_mean)
            predicted_boxes_covariance_list.append(predicted_covariance)
            predicted_prob_vectors_list.append(predicted_prob_vectors[cluster].mean(0))
        else:
            predicted_boxes_list.append(predicted_boxes[cluster].mean(0))
            predicted_boxes_covariance_list.append(
                predicted_boxes_covariance[cluster].mean(0))
            predicted_prob_vectors_list.append(predicted_prob_vectors[cluster].mean(0))

    result = Instances(
        (input_im[0]['image'].shape[1], input_im[0]['image'].shape[2]))

    if len(predicted_boxes_list) > 0:
        predicted_prob_vectors = torch.stack(predicted_prob_vectors_list, 0)
        # Remove the background class if the model is a generalized R-CNN.
        if is_generalized_rcnn:
            predicted_prob_vectors_no_bkg = predicted_prob_vectors[:, :-1]
        else:
            predicted_prob_vectors_no_bkg = predicted_prob_vectors
        predicted_prob, classes_idxs = torch.max(predicted_prob_vectors_no_bkg, 1)
        predicted_boxes = torch.stack(predicted_boxes_list, 0)

        # We want to keep the maximum allowed boxes per image consistent with
        # the rest of the methods. However, just sorting would leave quite a
        # lot of redundant detections, so one more NMS step is used.
        keep = batched_nms(predicted_boxes, predicted_prob, classes_idxs, nms_threshold)
        keep = keep[:max_detections_per_image]

        result.pred_boxes = Boxes(predicted_boxes[keep])
        result.scores = predicted_prob[keep]
        result.pred_classes = classes_idxs[keep]
        result.pred_cls_probs = predicted_prob_vectors[keep]
        result.pred_boxes_covariance = torch.stack(
            predicted_boxes_covariance_list, 0)[keep]
    else:
        result.pred_boxes = Boxes(predicted_boxes)
        result.scores = torch.zeros(predicted_boxes.shape[0]).to(device)
        result.pred_classes = predicted_class_idxs
        result.pred_cls_probs = predicted_prob_vectors
        result.pred_boxes_covariance = torch.empty(
            (predicted_boxes.shape + (4, ))).to(device)
    return result
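
# Illustrative sketch (not from the source) of the per-cluster statistics used
# above: the epistemic covariance is the sample covariance of the member
# boxes, computed from residual outer products. Values are made up.
def _cluster_covariance_sketch():
    import torch
    box_cluster = torch.tensor([[10.0, 10.0, 50.0, 50.0],
                                [12.0, 11.0, 49.0, 52.0],
                                [ 9.0, 10.0, 51.0, 50.0]])
    cluster_mean = box_cluster.mean(0)
    residuals = (box_cluster - cluster_mean).unsqueeze(2)  # (N, 4, 1)
    covariance = torch.sum(
        residuals @ residuals.transpose(2, 1), 0) / (box_cluster.shape[0] - 1)
    return cluster_mean, covariance  # shapes (4,) and (4, 4)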
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

               * "image": Tensor, image in (C, H, W) format.
               * "sem_seg": semantic segmentation ground truth
               * "center": center points heatmap ground truth
               * "offset": pixel offsets to center points ground truth
               * Other information that's included in the original dicts, such as:
                 "height", "width" (int): the output resolution of the model (may be
                 different from input resolution), used in inference.

    Returns:
        list[dict]:
            each dict is the results for one image. The dict contains the following keys:

            * "panoptic_seg", "sem_seg": see documentation
              :doc:`/tutorials/models` for the standard output format
            * "instances": available if ``predict_instances is True``. see documentation
              :doc:`/tutorials/models` for the standard output format
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    # To avoid error in ASPP layer when input has different size.
    size_divisibility = (
        self.size_divisibility
        if self.size_divisibility > 0
        else self.backbone.size_divisibility
    )
    images = ImageList.from_tensors(images, size_divisibility)

    features = self.backbone(images.tensor)

    losses = {}
    if "sem_seg" in batched_inputs[0]:
        targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
        targets = ImageList.from_tensors(
            targets, size_divisibility, self.sem_seg_head.ignore_value
        ).tensor
        if "sem_seg_weights" in batched_inputs[0]:
            # The default D2 DatasetMapper may not contain "sem_seg_weights"
            # Avoid error in testing when default DatasetMapper is used.
            weights = [x["sem_seg_weights"].to(self.device) for x in batched_inputs]
            weights = ImageList.from_tensors(weights, size_divisibility).tensor
        else:
            weights = None
    else:
        targets = None
        weights = None
    sem_seg_results, sem_seg_losses = self.sem_seg_head(features, targets, weights)
    losses.update(sem_seg_losses)

    if "center" in batched_inputs[0] and "offset" in batched_inputs[0]:
        center_targets = [x["center"].to(self.device) for x in batched_inputs]
        center_targets = ImageList.from_tensors(
            center_targets, size_divisibility
        ).tensor.unsqueeze(1)
        center_weights = [x["center_weights"].to(self.device) for x in batched_inputs]
        center_weights = ImageList.from_tensors(center_weights, size_divisibility).tensor

        offset_targets = [x["offset"].to(self.device) for x in batched_inputs]
        offset_targets = ImageList.from_tensors(offset_targets, size_divisibility).tensor
        offset_weights = [x["offset_weights"].to(self.device) for x in batched_inputs]
        offset_weights = ImageList.from_tensors(offset_weights, size_divisibility).tensor
    else:
        center_targets = None
        center_weights = None
        offset_targets = None
        offset_weights = None

    center_results, offset_results, center_losses, offset_losses = self.ins_embed_head(
        features, center_targets, center_weights, offset_targets, offset_weights
    )
    losses.update(center_losses)
    losses.update(offset_losses)

    if self.training:
        return losses

    if self.benchmark_network_speed:
        return []

    processed_results = []
    for sem_seg_result, center_result, offset_result, input_per_image, image_size in zip(
        sem_seg_results, center_results, offset_results, batched_inputs, images.image_sizes
    ):
        height = input_per_image.get("height")
        width = input_per_image.get("width")
        r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
        c = sem_seg_postprocess(center_result, image_size, height, width)
        o = sem_seg_postprocess(offset_result, image_size, height, width)
        # Post-processing to get panoptic segmentation.
        panoptic_image, _ = get_panoptic_segmentation(
            r.argmax(dim=0, keepdim=True),
            c,
            o,
            thing_ids=self.meta.thing_dataset_id_to_contiguous_id.values(),
            label_divisor=self.meta.label_divisor,
            stuff_area=self.stuff_area,
            void_label=-1,
            threshold=self.threshold,
            nms_kernel=self.nms_kernel,
            top_k=self.top_k,
        )
        # For semantic segmentation evaluation.
        processed_results.append({"sem_seg": r})
        panoptic_image = panoptic_image.squeeze(0)
        semantic_prob = F.softmax(r, dim=0)

        # Write panoptic visualizations to disk (debugging/inspection only).
        img = input_per_image["image"]
        from detectron2.utils.visualizer import Visualizer
        from detectron2.data.detection_utils import convert_image_to_rgb
        from PIL import Image
        import os

        img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format).astype("uint8")
        img = np.array(Image.fromarray(img).resize((width, height)))
        v_panoptic = Visualizer(img, self.meta)
        v_panoptic = v_panoptic.draw_panoptic_seg_predictions(panoptic_image.cpu(), None)
        pan_img = v_panoptic.get_image()
        image_path = input_per_image['file_name'].split(os.sep)
        image_name = os.path.splitext(image_path[-1])[0]
        Image.fromarray(pan_img).save(
            os.path.join(
                '/home/ahabbas/projects/conseg/affinityNet/output_pdl/coco/eval_vis',
                image_name + '_panoptic.png'))

        # For panoptic segmentation evaluation.
        processed_results[-1]["panoptic_seg"] = (panoptic_image, None)
        # For instance segmentation evaluation.
        if self.predict_instances:
            instances = []
            panoptic_image_cpu = panoptic_image.cpu().numpy()
            for panoptic_label in np.unique(panoptic_image_cpu):
                if panoptic_label == -1:
                    continue
                pred_class = panoptic_label // self.meta.label_divisor
                isthing = pred_class in list(
                    self.meta.thing_dataset_id_to_contiguous_id.values()
                )
                # Get instance segmentation results.
                if isthing:
                    instance = Instances((height, width))
                    # Evaluation code takes continuous id starting from 0
                    instance.pred_classes = torch.tensor(
                        [pred_class], device=panoptic_image.device
                    )
                    mask = panoptic_image == panoptic_label
                    instance.pred_masks = mask.unsqueeze(0)
                    # Average semantic probability
                    sem_scores = semantic_prob[pred_class, ...]
                    sem_scores = torch.mean(sem_scores[mask])
                    # Center point probability
                    mask_indices = torch.nonzero(mask).float()
                    center_y, center_x = (
                        torch.mean(mask_indices[:, 0]),
                        torch.mean(mask_indices[:, 1]),
                    )
                    center_scores = c[0, int(center_y.item()), int(center_x.item())]
                    # Confidence score is semantic prob * center prob.
                    instance.scores = torch.tensor(
                        [sem_scores * center_scores], device=panoptic_image.device
                    )
                    # Get bounding boxes
                    instance.pred_boxes = BitMasks(instance.pred_masks).get_bounding_boxes()
                    instances.append(instance)
            if len(instances) > 0:
                processed_results[-1]["instances"] = Instances.cat(instances)

    return processed_results
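
# A short sketch of the panoptic-label convention the instance extraction
# above relies on: each panoptic id encodes (class, instance) via the
# label_divisor. The example values are illustrative.
def _panoptic_label_sketch(panoptic_label=2003, label_divisor=1000):
    pred_class = panoptic_label // label_divisor   # semantic class id -> 2
    instance_id = panoptic_label % label_divisor   # instance index   -> 3
    return pred_class, instance_id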
def general_anchor_statistics_postprocessing(input_im,
                                             outputs,
                                             nms_threshold=0.5,
                                             max_detections_per_image=100,
                                             affinity_threshold=0.7):
    """
    Args:
        input_im (list): an input im list generated from dataset handler.
        outputs (list): output list from model specific inference function
        nms_threshold (float): non-maximum suppression threshold between 0-1
        max_detections_per_image (int): maximum allowed number of detections per image.
        affinity_threshold (float): cluster affinity threshold between 0-1

    Returns:
        result (Instances): final results after nms
    """
    predicted_boxes, predicted_boxes_covariance, predicted_prob, \
        classes_idxs, predicted_prob_vectors = outputs
    device = predicted_boxes.device

    # Get pairwise iou matrix
    match_quality_matrix = pairwise_iou(Boxes(predicted_boxes), Boxes(predicted_boxes))

    # Get cluster centers using standard nms. Much faster than sequential
    # clustering.
    keep = batched_nms(predicted_boxes, predicted_prob, classes_idxs, nms_threshold)
    keep = keep[:max_detections_per_image]

    clusters_inds = match_quality_matrix[keep, :]
    clusters_inds = clusters_inds > affinity_threshold

    # Compute mean and covariance for every cluster.
    predicted_prob_vectors_list = []
    predicted_boxes_list = []
    predicted_boxes_covariance_list = []
    for cluster_idxs, center_idx in zip(clusters_inds, keep):
        if cluster_idxs.sum(0) >= 2:
            # Make sure to only select cluster members of same class as center
            cluster_center_classes_idx = classes_idxs[center_idx]
            cluster_classes_idxs = classes_idxs[cluster_idxs]
            class_similarity_idxs = cluster_classes_idxs == cluster_center_classes_idx

            # Grab cluster
            box_cluster = predicted_boxes[cluster_idxs, :][class_similarity_idxs, :]
            cluster_mean = box_cluster.mean(0)

            residuals = (box_cluster - cluster_mean).unsqueeze(2)
            cluster_covariance = torch.sum(
                torch.matmul(residuals, torch.transpose(residuals, 2, 1)),
                0) / max((box_cluster.shape[0] - 1), 1.0)

            # Treat the final result as the mean and covariance of a gaussian
            # mixture of the cluster members, if a covariance is provided by
            # the neural network.
            if predicted_boxes_covariance is not None:
                if len(predicted_boxes_covariance) > 0:
                    cluster_covariance = cluster_covariance + \
                        predicted_boxes_covariance[cluster_idxs, :][
                            class_similarity_idxs, :].mean(0)

            # Compute average over cluster probabilities
            cluster_probs_vector = predicted_prob_vectors[cluster_idxs, :][
                class_similarity_idxs, :].mean(0)
        else:
            cluster_mean = predicted_boxes[center_idx]
            cluster_probs_vector = predicted_prob_vectors[center_idx]
            cluster_covariance = 1e-4 * torch.eye(4, 4).to(device)
            if predicted_boxes_covariance is not None:
                if len(predicted_boxes_covariance) > 0:
                    cluster_covariance = predicted_boxes_covariance[center_idx]

        predicted_boxes_list.append(cluster_mean)
        predicted_boxes_covariance_list.append(cluster_covariance)
        predicted_prob_vectors_list.append(cluster_probs_vector)

    result = Instances(
        (input_im[0]['image'].shape[1], input_im[0]['image'].shape[2]))

    if len(predicted_boxes_list) > 0:
        # We do not average the probability vectors for this post-processing
        # method: averaging mixes in low-scoring detection instances and
        # results in very low mAP.
        result.pred_boxes = Boxes(torch.stack(predicted_boxes_list, 0))
        predicted_prob_vectors = torch.stack(predicted_prob_vectors_list, 0)
        predicted_prob, classes_idxs = torch.max(predicted_prob_vectors, 1)
        result.scores = predicted_prob
        result.pred_classes = classes_idxs
        result.pred_cls_probs = predicted_prob_vectors
        result.pred_boxes_covariance = torch.stack(predicted_boxes_covariance_list, 0)
    else:
        result.pred_boxes = Boxes(predicted_boxes)
        result.scores = torch.zeros(predicted_boxes.shape[0]).to(device)
        result.pred_classes = classes_idxs
        result.pred_cls_probs = predicted_prob_vectors
        result.pred_boxes_covariance = torch.empty(
            (predicted_boxes.shape + (4, ))).to(device)
    return result
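
# Self-contained sketch (illustrative values) of using NMS survivors as
# cluster centers, as above: every surviving box collects all boxes whose IoU
# with it exceeds the affinity threshold.
def _nms_cluster_sketch():
    import torch
    from detectron2.layers import batched_nms
    from detectron2.structures import Boxes, pairwise_iou
    boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                          [1.0, 1.0, 11.0, 11.0],
                          [50.0, 50.0, 60.0, 60.0]])
    scores = torch.tensor([0.9, 0.8, 0.7])
    classes = torch.tensor([0, 0, 0])
    keep = batched_nms(boxes, scores, classes, 0.5)  # cluster centers
    iou = pairwise_iou(Boxes(boxes), Boxes(boxes))
    membership = iou[keep, :] > 0.7                  # one cluster per row
    return keep, membership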
def forward_for_single_feature_map(self, locations, logits_pred, reg_pred,
                                   ctrness_pred, image_sizes, top_feat=None):
    N, C, H, W = logits_pred.shape

    # put in the same format as locations
    logits_pred = logits_pred.view(N, C, H, W).permute(0, 2, 3, 1)
    logits_pred = logits_pred.reshape(N, -1, C).sigmoid()
    box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1)
    box_regression = box_regression.reshape(N, -1, 4)
    ctrness_pred = ctrness_pred.view(N, 1, H, W).permute(0, 2, 3, 1)
    ctrness_pred = ctrness_pred.reshape(N, -1).sigmoid()
    if top_feat is not None:
        top_feat = top_feat.view(N, -1, H, W).permute(0, 2, 3, 1)
        top_feat = top_feat.reshape(N, H * W, -1)

    # if self.thresh_with_ctr is True, we multiply the classification
    # scores with centerness scores before applying the threshold.
    if self.thresh_with_ctr:
        logits_pred = logits_pred * ctrness_pred[:, :, None]
    candidate_inds = logits_pred > self.pre_nms_thresh
    pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
    pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_topk)

    if not self.thresh_with_ctr:
        logits_pred = logits_pred * ctrness_pred[:, :, None]

    results = []
    for i in range(N):
        per_box_cls = logits_pred[i]
        per_candidate_inds = candidate_inds[i]
        per_box_cls = per_box_cls[per_candidate_inds]

        per_candidate_nonzeros = per_candidate_inds.nonzero()
        per_box_loc = per_candidate_nonzeros[:, 0]
        per_class = per_candidate_nonzeros[:, 1]

        per_box_regression = box_regression[i]
        per_box_regression = per_box_regression[per_box_loc]
        per_locations = locations[per_box_loc]
        if top_feat is not None:
            per_top_feat = top_feat[i]
            per_top_feat = per_top_feat[per_box_loc]

        per_pre_nms_top_n = pre_nms_top_n[i]

        if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
            per_box_cls, top_k_indices = \
                per_box_cls.topk(per_pre_nms_top_n, sorted=False)
            per_class = per_class[top_k_indices]
            per_box_regression = per_box_regression[top_k_indices]
            per_locations = per_locations[top_k_indices]
            if top_feat is not None:
                per_top_feat = per_top_feat[top_k_indices]

        detections = torch.stack([
            per_locations[:, 0] - per_box_regression[:, 0],
            per_locations[:, 1] - per_box_regression[:, 1],
            per_locations[:, 0] + per_box_regression[:, 2],
            per_locations[:, 1] + per_box_regression[:, 3],
        ], dim=1)

        boxlist = Instances(image_sizes[i])
        boxlist.pred_boxes = Boxes(detections)
        boxlist.scores = torch.sqrt(per_box_cls)
        boxlist.pred_classes = per_class
        boxlist.locations = per_locations
        if top_feat is not None:
            boxlist.top_feat = per_top_feat
        results.append(boxlist)

    return results
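
# Minimal sketch (assumed shapes) of the FCOS-style box decoding used above:
# each location (x, y) predicts distances (l, t, r, b) to the four box sides.
def _decode_ltrb_sketch():
    import torch
    locations = torch.tensor([[32.0, 48.0]])      # (num_locations, 2)
    ltrb = torch.tensor([[4.0, 6.0, 8.0, 10.0]])  # (num_locations, 4)
    boxes = torch.stack([
        locations[:, 0] - ltrb[:, 0],  # x1
        locations[:, 1] - ltrb[:, 1],  # y1
        locations[:, 0] + ltrb[:, 2],  # x2
        locations[:, 1] + ltrb[:, 3],  # y2
    ], dim=1)
    return boxes  # tensor([[28., 42., 40., 58.]])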
"roi_heads.box_head.cls_score.bias": "roi_heads.box_predictor.cls_score.bias", "roi_heads.box_head.bbox_pred.weight": "roi_heads.box_predictor.bbox_pred.weight", "roi_heads.box_head.bbox_pred.bias": "roi_heads.box_predictor.bbox_pred.bias", } temp = torch.load("weight.pt") temp = {state_dict_map.get(k, k): v for k, v in temp.items()} print("Problems with:\n" + "\n".join([k for k in net.state_dict() if k not in temp])) net.load_state_dict({k: temp.get(k, v) for k, v in net.state_dict().items()}) #net.eval() targets = Instances((512, 512)) targets.gt_boxes = Boxes(torch.load("targets.pt")["boxes"]) targets.gt_classes = torch.load("targets.pt")["classes"] data = [{"image": torch.load("data.pt").cuda(), "instances": targets}] storage_4del = EventStorage(0).__enter__() torch.random.manual_seed(0) torch.cuda.manual_seed(0) # with torch.no_grad(): for i in range(3): torch.cuda.synchronize() t = time.time() losses = net(data)
def detector_postprocess(results: Instances, output_height: int, output_width: int,
                         mask_threshold: float = 0.5):
    """
    Resize the output instances.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.

    This function will resize the raw outputs of an R-CNN detector
    to produce outputs according to the desired output resolution.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place.
        output_height, output_width: the desired output resolution.

    Returns:
        Instances: the resized output from the model, based on the output resolution
    """
    # Change to 'if is_tracing' after PT1.7
    if isinstance(output_height, torch.Tensor):
        # Converts integer tensors to float temporaries to ensure true
        # division is performed when computing scale_x and scale_y.
        output_width_tmp = output_width.float()
        output_height_tmp = output_height.float()
        new_size = torch.stack([output_height, output_width])
    else:
        new_size = (output_height, output_width)
        output_width_tmp = output_width
        output_height_tmp = output_height

    scale_x, scale_y = (
        output_width_tmp / results.image_size[1],
        output_height_tmp / results.image_size[0],
    )
    results = Instances(new_size, **results.get_fields())

    if results.has("pred_boxes"):
        output_boxes = results.pred_boxes
    elif results.has("proposal_boxes"):
        output_boxes = results.proposal_boxes
    else:
        output_boxes = None
    assert output_boxes is not None, "Predictions must contain boxes!"

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)

    results = results[output_boxes.nonempty()]

    if results.has("pred_masks"):
        results.pred_masks = retry_if_cuda_oom(paste_masks_in_image)(
            results.pred_masks[:, 0, :, :],  # N, 1, M, M
            results.pred_boxes,
            results.image_size,
            threshold=mask_threshold,
        )

    if results.has("pred_keypoints"):
        results.pred_keypoints[:, :, 0] *= scale_x
        results.pred_keypoints[:, :, 1] *= scale_y

    return results
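
# Hypothetical end-to-end check of detector_postprocess (defined above):
# instances predicted at a 400x600 input resolution are rescaled to an
# 800x1200 output resolution.
def _detector_postprocess_demo():
    import torch
    from detectron2.structures import Boxes, Instances
    res = Instances((400, 600))
    res.pred_boxes = Boxes(torch.tensor([[10.0, 10.0, 100.0, 100.0]]))
    res.scores = torch.tensor([0.9])
    res.pred_classes = torch.tensor([0])
    out = detector_postprocess(res, 800, 1200)
    return out.pred_boxes.tensor  # boxes scaled by 2x in both dimensions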
def f(x: Tensor):
    image_shape = (15, 15)
    # __init__ can take arguments
    inst = Instances(image_shape, a=x, proposal_boxes=Boxes(x))
    inst2 = Instances(image_shape, a=x)
    return inst.a, inst2.a
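
# A minimal eager check of the kwargs-style Instances construction above
# (values are made up; Boxes expects an Nx4 tensor).
def _f_demo():
    import torch
    x = torch.rand(3, 4)
    a, a2 = f(x)
    assert a.shape == (3, 4) and torch.equal(a, a2)
    return a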