def inference(self, box_cls, box_pred, image_sizes):
    """
    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_proposals, K).
            The tensor predicts the classification probability for each proposal.
        box_pred (Tensor): tensor of shape (batch_size, num_proposals, 4).
            The tensor predicts 4-vector (x, y, w, h) box regression values
            for every proposal.
        image_sizes (List[torch.Size]): the input image sizes.

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(box_cls) == len(image_sizes)
    results = []

    if self.use_focal:
        scores = torch.sigmoid(box_cls)
        labels = torch.arange(self.num_classes, device=self.device). \
            unsqueeze(0).repeat(self.num_proposals, 1).flatten(0, 1)

        for i, (scores_per_image, box_pred_per_image, image_size) in enumerate(
                zip(scores, box_pred, image_sizes)):
            result = Instances(image_size)
            scores_per_image, topk_indices = scores_per_image.flatten(
                0, 1).topk(self.num_proposals, sorted=False)
            labels_per_image = labels[topk_indices]
            box_pred_per_image = box_pred_per_image.view(-1, 1, 4).repeat(
                1, self.num_classes, 1).view(-1, 4)
            box_pred_per_image = box_pred_per_image[topk_indices]

            result.pred_boxes = Boxes(box_pred_per_image)
            result.scores = scores_per_image
            result.pred_classes = labels_per_image
            results.append(result)
    else:
        scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)

        for i, (scores_per_image, labels_per_image, box_pred_per_image, image_size) \
                in enumerate(zip(scores, labels, box_pred, image_sizes)):
            result = Instances(image_size)
            result.pred_boxes = Boxes(box_pred_per_image)
            result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0])
            result.scores = scores_per_image
            result.pred_classes = labels_per_image
            results.append(result)

    return results
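# A self-contained sketch (hypothetical toy values) of the index bookkeeping in
# the focal branch above: flattening a (num_proposals, num_classes) score matrix
# enumerates (proposal, class) pairs in row-major order, so a flat top-k index i
# corresponds to proposal i // K and class i % K, which is exactly what the
# repeated `labels` vector encodes.
import torch

num_proposals, num_classes = 3, 4
scores = torch.rand(num_proposals, num_classes)
labels = torch.arange(num_classes).unsqueeze(0).repeat(num_proposals, 1).flatten(0, 1)

vals, idxs = scores.flatten(0, 1).topk(num_proposals, sorted=False)
proposal_ids = idxs // num_classes
class_ids = idxs % num_classes
assert torch.equal(labels[idxs], class_ids)
assert torch.allclose(vals, scores[proposal_ids, class_ids])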
def inference_single_image(self, conf_pred_per_image, loc_pred_per_image,
                           default_boxes, image_size):
    """
    Single-image inference. Return bounding-box detection results by
    thresholding on scores and applying non-maximum suppression (NMS).

    Args:
        conf_pred_per_image (list[Tensor]): list of #feature levels. Each entry
            contains tensor of size [Hi x Wi x D, C].
        loc_pred_per_image (list[Tensor]): same shape as 'conf_pred_per_image'
            except that C becomes 4.
        default_boxes (list['Boxes']): a list of 'Boxes' elements. Each Boxes
            contains the default boxes of one image on the specific feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    # predict confidence
    conf_pred = torch.cat(conf_pred_per_image, dim=0)  # [R, C]
    conf_pred = conf_pred.softmax(dim=1)

    # predict boxes
    loc_pred = torch.cat(loc_pred_per_image, dim=0)  # [R, 4]
    default_boxes = Boxes.cat(default_boxes)  # [R, 4]
    boxes_pred = self.box2box_transform.apply_deltas(loc_pred, default_boxes.tensor)

    num_boxes, num_classes = conf_pred.shape
    boxes_pred = boxes_pred.view(num_boxes, 1, 4).expand(
        num_boxes, num_classes, 4)  # [R, C, 4]
    labels = torch.arange(num_classes, device=self.device)  # [0, ..., C - 1]
    labels = labels.view(1, num_classes).expand_as(conf_pred)  # [R, C]

    # remove predictions with the background label
    boxes_pred = boxes_pred[:, :-1]
    conf_pred = conf_pred[:, :-1]
    labels = labels[:, :-1]

    # batch everything, by making every class prediction a separate instance
    boxes_pred = boxes_pred.reshape(-1, 4)
    conf_pred = conf_pred.reshape(-1)
    labels = labels.reshape(-1)

    # remove low scoring boxes
    indices = torch.nonzero(conf_pred > self.score_threshold,
                            as_tuple=False).squeeze(1)
    boxes_pred, conf_pred, labels = boxes_pred[indices], conf_pred[indices], labels[indices]

    keep = generalized_batched_nms(boxes_pred, conf_pred, labels,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_pred[keep])
    result.scores = conf_pred[keep]
    result.pred_classes = labels[keep]
    return result
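# Hedged sketch: `box2box_transform.apply_deltas` above is assumed to follow
# the standard R-CNN (dx, dy, dw, dh) parameterization against (x1, y1, x2, y2)
# reference boxes; the repo's actual transform may add per-coordinate weights
# or clamp dw/dh before exponentiation.
import torch

def apply_deltas_sketch(deltas, boxes):
    # boxes: [R, 4] reference (default) boxes; deltas: [R, 4] predicted offsets
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    dx, dy, dw, dh = deltas.unbind(dim=1)
    pred_ctr_x = dx * widths + ctr_x
    pred_ctr_y = dy * heights + ctr_y
    pred_w = torch.exp(dw) * widths
    pred_h = torch.exp(dh) * heights

    return torch.stack([pred_ctr_x - 0.5 * pred_w,
                        pred_ctr_y - 0.5 * pred_h,
                        pred_ctr_x + 0.5 * pred_w,
                        pred_ctr_y + 0.5 * pred_h], dim=1)

# zero deltas must reproduce the reference boxes
ref = torch.tensor([[10., 10., 30., 50.]])
assert torch.allclose(apply_deltas_sketch(torch.zeros(1, 4), ref), ref)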
def create_instances(predictions, image_size):
    ret = Instances(image_size)

    score = np.asarray([x["score"] for x in predictions])
    chosen = (score > args.conf_threshold).nonzero()[0]
    score = score[chosen]
    bbox = np.asarray([predictions[i]["bbox"] for i in chosen])
    if score.shape[0] == 0:
        bbox = np.zeros((0, 4))
    else:
        bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)

    labels = np.asarray(
        [dataset_id_map(predictions[i]["category_id"]) for i in chosen])

    ret.scores = score
    ret.pred_boxes = Boxes(bbox)
    ret.pred_classes = labels

    try:
        ret.pred_masks = [predictions[i]["segmentation"] for i in chosen]
    except KeyError:
        pass
    return ret
def _inference_one_image(self, inputs):
    augmented_inputs = self.tta_mapper(inputs)
    assert len({x["file_name"] for x in augmented_inputs}) == 1, "inference different images"
    heights = [k["height"] for k in augmented_inputs]
    widths = [k["width"] for k in augmented_inputs]
    assert (
        len(set(heights)) == 1 and len(set(widths)) == 1
    ), "Augmented versions of the inputs should have the same original resolution!"
    height = heights[0]
    width = widths[0]

    # 1. Detect boxes from all augmented versions
    # TODO wangfeng02: use box structures instead of boxes, scores and classes
    all_boxes = []
    all_scores = []
    all_classes = []

    factors = 2 if self.tta_mapper.flip else 1
    if self.enable_scale_filter:
        assert len(augmented_inputs) == len(self.scale_ranges) * factors

    for i, single_input in enumerate(augmented_inputs):
        do_hflip = single_input.pop("horiz_flip", False)
        # 1.1: forward with single augmented image
        output = self.model._inference_for_ms_test([single_input])
        # 1.2: union the results
        pred_boxes = output.get("pred_boxes").tensor
        if do_hflip:
            pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]]
        pred_scores = output.get("scores")
        pred_classes = output.get("pred_classes")
        if self.enable_scale_filter:
            keep = filter_boxes(pred_boxes, *self.scale_ranges[i // factors])
            pred_boxes = pred_boxes[keep]
            pred_scores = pred_scores[keep]
            pred_classes = pred_classes[keep]

        all_boxes.append(pred_boxes)
        all_scores.append(pred_scores)
        all_classes.append(pred_classes)

    boxes_all = torch.cat(all_boxes, dim=0)
    scores_all = torch.cat(all_scores, dim=0)
    class_idxs_all = torch.cat(all_classes, dim=0)
    boxes_all, scores_all, class_idxs_all = merge_result_from_multi_scales(
        boxes_all, scores_all, class_idxs_all,
        nms_type="soft_vote", vote_thresh=0.65,
        max_detection=self.max_detection)

    result = Instances((height, width))
    result.pred_boxes = Boxes(boxes_all)
    result.scores = scores_all
    result.pred_classes = class_idxs_all
    return {"instances": result}
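# Hedged sketch of the scale filtering used above. `filter_boxes` is assumed to
# keep boxes whose sqrt-area falls inside the (min_scale, max_scale) range that
# the TTA pipeline assigns to each augmented resolution; the real helper in this
# repo may use a different size measure, so this is illustrative only.
import torch

def filter_boxes_sketch(boxes, min_scale, max_scale):
    # boxes: [N, 4] in (x1, y1, x2, y2); returns a boolean keep mask
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    size = (w * h).sqrt()
    return (size >= min_scale) & (size <= max_scale)

boxes = torch.tensor([[0., 0., 10., 10.], [0., 0., 100., 100.]])
print(filter_boxes_sketch(boxes, 32, 128))  # tensor([False,  True])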
def fast_rcnn_inference_single_image(boxes, scores, image_shape, score_thresh,
                                     nms_thresh, nms_type, topk_per_image):
    """
    Single-image inference. Return bounding-box detection results by
    thresholding on scores and applying non-maximum suppression (NMS).

    Args:
        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
        per image.

    Returns:
        Same as `fast_rcnn_inference`, but for only one image.
    """
    scores = scores[:, :-1]
    num_bbox_reg_classes = boxes.shape[1] // 4
    # Convert to Boxes to use the `clip` function ...
    boxes = Boxes(boxes.reshape(-1, 4))
    boxes.clip(image_shape)
    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4

    # Filter results based on detection scores
    filter_mask = scores > score_thresh  # R x K
    # R' x 2. First column contains indices of the R predictions;
    # Second column contains indices of classes.
    filter_inds = filter_mask.nonzero(as_tuple=False)
    if num_bbox_reg_classes == 1:
        boxes = boxes[filter_inds[:, 0], 0]
    else:
        boxes = boxes[filter_mask]
    scores = scores[filter_mask]

    # Apply per-class NMS
    keep = generalized_batched_nms(boxes, scores, filter_inds[:, 1],
                                   nms_thresh, nms_type=nms_type)
    if topk_per_image >= 0:
        keep = keep[:topk_per_image]

    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]

    result = Instances(image_shape)
    result.pred_boxes = Boxes(boxes)
    result.scores = scores
    result.pred_classes = filter_inds[:, 1]
    return result, filter_inds[:, 0]
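# Tiny worked example (toy values) of the score filtering above: `nonzero` on
# an R x K boolean mask yields rows of (prediction index, class index), so
# column 1 feeds class-aware NMS and column 0 maps survivors back to the
# original proposals.
import torch

scores = torch.tensor([[0.9, 0.02],
                       [0.03, 0.6]])
filter_mask = scores > 0.5
filter_inds = filter_mask.nonzero(as_tuple=False)
print(filter_inds)          # tensor([[0, 0], [1, 1]])
print(scores[filter_mask])  # tensor([0.9000, 0.6000])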
def _inference_one_image(self, inputs):
    augmented_inputs = self.tta_mapper(inputs)
    assert len({x["file_name"] for x in augmented_inputs}) == 1, "inference different images"
    heights = [k["height"] for k in augmented_inputs]
    widths = [k["width"] for k in augmented_inputs]
    assert (
        len(set(heights)) == 1 and len(set(widths)) == 1
    ), "Augmented versions of the inputs should have the same original resolution!"
    height = heights[0]
    width = widths[0]

    # 1. Detect boxes from all augmented versions
    all_boxes = []
    all_scores = []
    all_classes = []

    for single_input in augmented_inputs:
        do_hflip = single_input.pop("horiz_flip", False)
        # 1.1: forward with single augmented image
        output = self.model._inference_for_ms_test([single_input])
        # 1.2: union the results
        pred_boxes = output.get("pred_boxes").tensor
        if do_hflip:
            pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]]
        all_boxes.append(pred_boxes)
        all_scores.append(output.get("scores"))
        all_classes.append(output.get("pred_classes"))

    boxes_all = torch.cat(all_boxes, dim=0)
    scores_all = torch.cat(all_scores, dim=0)
    class_idxs_all = torch.cat(all_classes, dim=0)

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.model.nms_threshold,
                                   nms_type=self.model.nms_type)
    keep = keep[:self.model.max_detections_per_image]

    result = Instances((height, width))
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return {"instances": result}
def inference_single_image(self, box_cls, box_center, border_cls, border_delta,
                           bd_based_box, image_size):
    """
    Single-image inference. Return bounding-box detection results by
    thresholding on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W, K).
        box_center (list[Tensor]): Same shape as 'box_cls' except that K
            becomes 1.
        border_cls (list[Tensor]): Same shape as 'box_cls'; classification
            scores from the border head.
        border_delta (list[Tensor]): Same shape as 'box_cls' except that K
            becomes 4; regression deltas from the border head.
        bd_based_box (list[Tensor]): list of #feature levels. Each entry
            contains the coarse (x1, y1, x2, y2) boxes that the border
            predictions refine.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []
    border_bbox_std = bd_based_box[0].new_tensor(self.border_bbox_std)

    # Iterate over every feature level
    for box_cls_i, box_ctr_i, bd_box_cls_i, bd_box_reg_i, bd_based_box_i in zip(
            box_cls, box_center, border_cls, border_delta, bd_based_box):
        # (HxWxK,)
        box_cls_i = box_cls_i.sigmoid_()
        box_ctr_i = box_ctr_i.sigmoid_()
        bd_box_cls_i = bd_box_cls_i.sigmoid_()

        predicted_prob = (box_cls_i * box_ctr_i).sqrt()

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob * bd_box_cls_i
        predicted_prob = predicted_prob[keep_idxs]

        # Keep top k scoring indices only.
        num_topk = min(self.topk_candidates, predicted_prob.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = predicted_prob.sort(descending=True)
        topk_idxs = topk_idxs[:num_topk]

        keep_idxs = keep_idxs.nonzero(as_tuple=False)
        keep_idxs = keep_idxs[topk_idxs]
        keep_box_idxs = keep_idxs[:, 0]
        classes_idxs = keep_idxs[:, 1]

        predicted_prob = predicted_prob[:num_topk]
        bd_box_reg_i = bd_box_reg_i[keep_box_idxs]
        bd_based_box_i = bd_based_box_i[keep_box_idxs]

        det_wh = (bd_based_box_i[..., 2:4] - bd_based_box_i[..., :2])
        det_wh = torch.cat([det_wh, det_wh], dim=1)
        predicted_boxes = bd_based_box_i + (bd_box_reg_i * border_bbox_std * det_wh)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob.sqrt())
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)
    boxes_all = boxes_all[keep]
    scores_all = scores_all[keep]
    class_idxs_all = class_idxs_all[keep]

    number_of_detections = len(keep)
    # Limit to max_per_image detections **over all classes**
    if number_of_detections > self.max_detections_per_image > 0:
        image_thresh, _ = torch.kthvalue(
            scores_all,
            number_of_detections - self.max_detections_per_image + 1)
        keep = scores_all >= image_thresh.item()
        keep = torch.nonzero(keep).squeeze(1)
        boxes_all = boxes_all[keep]
        scores_all = scores_all[keep]
        class_idxs_all = class_idxs_all[keep]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all)
    result.scores = scores_all
    result.pred_classes = class_idxs_all
    return result
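# Worked micro-example (made-up values) of the border refinement arithmetic
# above: deltas are scaled by a fixed std and by the coarse box's width/height
# before being added to the coarse (x1, y1, x2, y2) box.
import torch

base = torch.tensor([[10., 10., 30., 50.]])  # coarse box, w=20, h=40
reg = torch.tensor([[0.1, 0.1, 0.1, 0.1]])
std = 0.5
wh = torch.cat([base[..., 2:4] - base[..., :2]] * 2, dim=1)  # [20, 40, 20, 40]
print(base + reg * std * wh)  # tensor([[11., 12., 31., 52.]])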
def inference_single_image(self, box_cls, box_delta, box_center, shifts,
                           image_size):
    """
    Single-image inference. Return bounding-box detection results by
    thresholding on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W, K).
        box_delta (list[Tensor]): Same shape as 'box_cls' except that K
            becomes 4.
        box_center (list[Tensor]): Same shape as 'box_cls' except that K
            becomes 1.
        shifts (list[Tensor]): list of #feature levels. Each entry contains
            a tensor, which contains all the shifts for that image in that
            feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, box_ctr_i, shifts_i in zip(
            box_cls, box_delta, box_center, shifts):
        # (HxWxK,)
        box_cls_i = box_cls_i.flatten().sigmoid_()

        # Keep top k scoring indices only.
        num_topk = min(self.topk_candidates, box_reg_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        shift_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[shift_idxs]
        shifts_i = shifts_i[shift_idxs]
        # predict boxes
        predicted_boxes = self.shift2box_transform.apply_deltas(
            box_reg_i, shifts_i)

        box_ctr_i = box_ctr_i.flatten().sigmoid_()[shift_idxs]
        predicted_prob = torch.sqrt(predicted_prob * box_ctr_i)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
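# Sketch (toy values) of the flattened-index bookkeeping used above: scores over
# H*W locations and K classes are flattened to one vector, so a top-k index
# decomposes into (location, class) via integer division and modulo.
import torch

num_locations, num_classes = 5, 3
flat_scores = torch.rand(num_locations * num_classes)
prob, topk_idxs = flat_scores.sort(descending=True)
shift_idxs = topk_idxs // num_classes   # which spatial location / shift
classes_idxs = topk_idxs % num_classes  # which class at that location
assert torch.equal(
    flat_scores.view(num_locations, num_classes)[shift_idxs, classes_idxs], prob)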
def inference_single_image(self, pred_logits, pred_deltas, pred_masks,
                           anchors, indexes, image_size):
    """
    Single-image inference. Return bounding-box detection results by
    thresholding on scores and applying non-maximum suppression (NMS).

    Arguments:
        pred_logits (list[Tensor]): list of #feature levels. Each entry
            contains tensor of size (AxHxW, K).
        pred_deltas (list[Tensor]): Same shape as 'pred_logits' except that
            K becomes 4.
        pred_masks (list[list[Tensor]]): List of #feature levels, each is a
            list of #anchors. Each entry contains tensor of size
            (M_i*M_i, H, W). `None` if mask_on=False.
        anchors (list[Boxes]): list of #feature levels. Each entry contains
            a Boxes object, which contains all the anchors for that image in
            that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    pred_logits = pred_logits.flatten().sigmoid_()
    # We get top locations across all levels to accelerate the inference speed,
    # which does not seem to affect the accuracy.
    # First select values above the threshold
    logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0]
    # Then get the top values
    num_topk = min(self.topk_candidates, logits_top_idxs.shape[0])
    pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort(descending=True)
    # Keep top k scoring values
    pred_prob = pred_prob[:num_topk]
    # Keep top k indices
    top_idxs = logits_top_idxs[topk_idxs[:num_topk]]

    # class index
    cls_idxs = top_idxs % self.num_classes
    # HWA index
    top_idxs //= self.num_classes

    # predict boxes
    pred_boxes = self.box2box_transform.apply_deltas(
        pred_deltas[top_idxs], anchors[top_idxs].tensor)

    # apply nms
    keep = generalized_batched_nms(pred_boxes, pred_prob, cls_idxs,
                                   self.nms_threshold, nms_type=self.nms_type)
    # pick the top ones
    keep = keep[:self.detections_im]

    results = Instances(image_size)
    results.pred_boxes = Boxes(pred_boxes[keep])
    results.scores = pred_prob[keep]
    results.pred_classes = cls_idxs[keep]

    # deal with masks
    result_masks, result_anchors = [], None
    if self.mask_on:
        # index and anchors, useful for masks
        top_indexes = indexes[top_idxs]
        top_anchors = anchors[top_idxs]
        result_indexes = top_indexes[keep]
        result_anchors = top_anchors[keep]
        # Get masks and do sigmoid
        for lvl, _, h, w, anc in result_indexes.tolist():
            cur_size = self.mask_sizes[anc] * (2**lvl if self.bipyramid_on else 1)
            result_masks.append(
                torch.sigmoid(pred_masks[lvl][anc][:, h, w].view(
                    1, cur_size, cur_size)))

    return results, (result_masks, result_anchors)
def inference_single_image(self, cate_preds, seg_preds, featmap_size,
                           img_shape, ori_shape):
    """
    Args:
        cate_preds, seg_preds: see :meth:`inference`.
        featmap_size (tuple): size (H, W) of the mask prediction feature map.
        img_shape (tuple): the size of the image fed into the model
            (height and width).
        ori_shape (tuple): original image shape (height and width).

    Returns:
        result (Instances): predicted results of single image after
            post-processing.
    """
    assert len(cate_preds) == len(seg_preds)
    result = Instances(ori_shape)

    # overall info.
    h, w = img_shape
    upsampled_size_out = (featmap_size[0] * 4, featmap_size[1] * 4)

    # process.
    inds = (cate_preds > self.score_threshold)
    # category scores.
    cate_scores = cate_preds[inds]
    if len(cate_scores) == 0:
        return result

    # category labels.
    inds = inds.nonzero(as_tuple=False)
    cate_labels = inds[:, 1]

    # strides.
    size_trans = cate_labels.new_tensor(self.seg_num_grids).pow(2).cumsum(0)
    # e.g. [1600, 2896, 3472, 3728, 3872]
    strides = cate_scores.new_ones(size_trans[-1])
    n_stage = len(self.seg_num_grids)
    strides[:size_trans[0]] *= self.feature_strides[0]
    for ind_ in range(1, n_stage):
        strides[size_trans[ind_ - 1]:size_trans[ind_]] *= self.feature_strides[ind_]
    strides = strides[inds[:, 0]]

    # masks.
    seg_preds = seg_preds[inds[:, 0]]
    seg_masks = seg_preds > self.mask_threshold
    sum_masks = seg_masks.sum((1, 2)).float()

    # filter.
    keep = sum_masks > strides
    if keep.sum() == 0:
        return result

    seg_masks = seg_masks[keep, ...]
    seg_preds = seg_preds[keep, ...]
    sum_masks = sum_masks[keep]
    cate_scores = cate_scores[keep]
    cate_labels = cate_labels[keep]

    # mask scoring.
    seg_scores = (seg_preds * seg_masks.float()).sum((1, 2)) / sum_masks
    cate_scores *= seg_scores

    # sort and keep top nms_pre
    sort_inds = torch.argsort(cate_scores, descending=True)
    if len(sort_inds) > self.nms_per_image:
        sort_inds = sort_inds[:self.nms_per_image]
    seg_masks = seg_masks[sort_inds, :, :]
    seg_preds = seg_preds[sort_inds, :, :]
    sum_masks = sum_masks[sort_inds]
    cate_scores = cate_scores[sort_inds]
    cate_labels = cate_labels[sort_inds]

    # Matrix NMS
    cate_scores = matrix_nms(seg_masks, cate_labels, cate_scores,
                             kernel=self.nms_kernel, sigma=self.nms_sigma,
                             sum_masks=sum_masks)

    # filter.
    keep = cate_scores >= self.update_threshold
    if keep.sum() == 0:
        return result
    seg_preds = seg_preds[keep, :, :]
    cate_scores = cate_scores[keep]
    cate_labels = cate_labels[keep]

    # sort and keep top_k
    sort_inds = torch.argsort(cate_scores, descending=True)
    if len(sort_inds) > self.max_detections_per_image:
        sort_inds = sort_inds[:self.max_detections_per_image]
    seg_preds = seg_preds[sort_inds, :, :]
    cate_scores = cate_scores[sort_inds]
    cate_labels = cate_labels[sort_inds]

    seg_preds = F.interpolate(seg_preds.unsqueeze(0),
                              size=upsampled_size_out,
                              mode='bilinear')[:, :, :h, :w]
    seg_masks = F.interpolate(seg_preds, size=ori_shape,
                              mode='bilinear').squeeze(0)
    seg_masks = seg_masks > self.mask_threshold
    seg_masks = BitMasks(seg_masks)

    result.pred_masks = seg_masks
    result.pred_boxes = seg_masks.get_bounding_boxes()
    result.scores = cate_scores
    result.pred_classes = cate_labels
    return result
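# The `matrix_nms` call above decays scores instead of discarding masks. Below
# is a sketch following the SOLOv2 paper's Gaussian-decay formulation; the
# local `matrix_nms` may differ in details (e.g. kernel choice). Masks are
# assumed pre-sorted by descending score, as guaranteed by the caller.
import torch

def matrix_nms_sketch(seg_masks, cate_labels, cate_scores, sigma=2.0, sum_masks=None):
    # seg_masks: [N, H, W] boolean masks, sorted by descending score
    n = len(cate_labels)
    masks = seg_masks.reshape(n, -1).float()
    if sum_masks is None:
        sum_masks = masks.sum(1)
    inter = masks @ masks.t()  # pairwise intersection areas
    expanded = sum_masks.expand(n, n)
    # upper triangle: IoU of each mask with every higher-scored mask
    iou = (inter / (expanded + expanded.t() - inter)).triu(diagonal=1)

    # only pairs with the same class label suppress each other
    labels_eq = (cate_labels.expand(n, n) == cate_labels.expand(n, n).t())
    decay_iou = iou * labels_eq.float().triu(diagonal=1)

    # IoU of each higher-scored mask with its own most-overlapping predecessor
    compensate_iou = decay_iou.max(0).values.expand(n, n).t()

    # Gaussian decay, normalized by the compensation term
    decay = (torch.exp(-sigma * decay_iou ** 2) /
             torch.exp(-sigma * compensate_iou ** 2)).min(0).values
    return cate_scores * decay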
def inference_single_image(self, cls_logits, pts_refine, pts_strides, points,
                           image_size):
    """
    Single-image inference. Return bounding-box detection results by
    thresholding on scores and applying non-maximum suppression (NMS).

    Arguments:
        cls_logits (list[Tensor]): list of #feature levels. Each entry
            contains tensor of size (H x W, K).
        pts_refine (list[Tensor]): Same shape as 'cls_logits' except that K
            becomes 2 * num_points.
        pts_strides (list[Tensor]): list of #feature levels. Each entry
            contains tensor of size (H x W, ).
        points (list[Tensor]): list of #feature levels. Each entry contains
            a tensor, which contains all the points for that image in that
            feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    assert len(cls_logits) == len(pts_refine) == len(pts_strides)

    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for cls_logits_i, pts_refine_i, points_i, pts_strides_i in zip(
            cls_logits, pts_refine, points, pts_strides):
        bbox_pos_center = torch.cat([points_i, points_i], dim=1)
        bbox_pred = self.pts_to_bbox(pts_refine_i)
        bbox_pred = bbox_pred * pts_strides_i.reshape(-1, 1) + bbox_pos_center
        bbox_pred[:, 0].clamp_(min=0, max=image_size[1])
        bbox_pred[:, 1].clamp_(min=0, max=image_size[0])
        bbox_pred[:, 2].clamp_(min=0, max=image_size[1])
        bbox_pred[:, 3].clamp_(min=0, max=image_size[0])

        # (HxWxK, )
        point_cls_i = cls_logits_i.flatten().sigmoid_()

        # keep top k scoring indices only
        num_topk = min(self.topk_candidates, point_cls_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = point_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        point_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        predicted_boxes = bbox_pred[point_idxs]

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
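# Hedged sketch of `pts_to_bbox`: RepPoints-style heads convert a refined point
# set to a pseudo box. The min-max variant below is one standard conversion
# (others use moments or partial min-max); the head's actual choice and its
# (y, x) layout are assumptions here, not taken from this repo.
import torch

def pts_to_bbox_sketch(pts):
    # pts: [N, 2 * num_points], assumed laid out as (y1, x1, y2, x2, ...),
    # offsets in stride units relative to the feature location
    pts = pts.reshape(pts.shape[0], -1, 2)
    pts_y, pts_x = pts[..., 0], pts[..., 1]
    return torch.stack([pts_x.min(dim=1).values, pts_y.min(dim=1).values,
                        pts_x.max(dim=1).values, pts_y.max(dim=1).values], dim=1)

pts = torch.tensor([[-1., -2., 3., 4., 0., 0.]])  # 3 points
print(pts_to_bbox_sketch(pts))  # tensor([[-2., -1., 4., 3.]])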
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]: mapping from a named loss to a tensor storing the
            loss. Used during training only.
    """
    images, labels = self.preprocess_image(batched_inputs, self.training)
    # batched_inputs[0]['image'] = images.tensor[0].cpu() * 255
    # self.visualize_data(batched_inputs[0])

    x = images.tensor
    img_size = x.shape[-2:]

    def _branch(_embedding, _in):
        for i, e in enumerate(_embedding):
            _in = e(_in)
            if i == 4:
                out_branch = _in
        return _in, out_branch

    # backbone
    # x2, x1, x0 = self.backbone(x)
    out_features = self.backbone(x)
    features = [out_features[f] for f in self.in_features]
    [x2, x1, x0] = features

    # yolo branch 0
    out0, out0_branch = _branch(self.out0, x0)

    # yolo branch 1
    x1_in = self.out1_cbl(out0_branch)
    x1_in = self.out1_upsample(x1_in)
    x1_in = torch.cat([x1_in, x1], 1)
    out1, out1_branch = _branch(self.out1, x1_in)

    # yolo branch 2
    x2_in = self.out2_cbl(out1_branch)
    x2_in = self.out2_upsample(x2_in)
    x2_in = torch.cat([x2_in, x2], 1)
    out2, out2_branch = _branch(self.out2, x2_in)

    outputs = [out0, out1, out2]

    if self.training:
        losses = [
            loss_evaluator(out, labels, img_size)
            for out, loss_evaluator in zip(outputs, self.loss_evaluators)
        ]
        keys = ["loss_x", "loss_y", "loss_w", "loss_h", "loss_conf", "loss_cls"]
        losses_dict = {}
        for key in keys:
            losses_dict[key] = sum([loss[key] for loss in losses])
        return losses_dict
    else:
        predictions_list = [
            loss_evaluator(out, labels, img_size)
            for out, loss_evaluator in zip(outputs, self.loss_evaluators)
        ]
        predictions = torch.cat(predictions_list, 1)
        detections = postprocess(predictions, self.num_classes,
                                 self.conf_threshold, self.nms_threshold,
                                 nms_type=self.nms_type)

        results = []
        for idx, out in enumerate(detections):
            if out is None:
                out = x.new_zeros((0, 7))
            # image_size = images.image_sizes[idx]
            image_size = img_size
            result = Instances(image_size)
            result.pred_boxes = Boxes(out[:, :4])
            result.scores = out[:, 5] * out[:, 4]
            result.pred_classes = out[:, -1]
            results.append(result)

        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]: mapping from a named loss to a tensor storing the
            loss. Used during training only.
    """
    images = self.preprocess_image(batched_inputs)

    B, C, H, W = images.tensor.shape
    device = images.tensor.device

    mask = torch.ones((B, H, W), dtype=torch.bool, device=device)
    for img_shape, m in zip(images.image_sizes, mask):
        m[:img_shape[0], :img_shape[1]] = False

    src = self.backbone(images.tensor)["res5"]
    mask = F.interpolate(mask[None].float(), size=src.shape[-2:]).bool()[0]
    pos = self.position_embedding(src, mask)
    hs = self.transformer(self.input_proj(src), mask,
                          self.query_embed.weight, pos)[0]
    outputs_class = self.class_embed(hs)
    outputs_coord = self.bbox_embed(hs).sigmoid()
    out = {
        "pred_logits": outputs_class[-1],
        "pred_boxes": outputs_coord[-1]
    }

    if self.training:
        targets = self.convert_anno_format(batched_inputs)
        if self.aux_loss:
            out["aux_outputs"] = [{
                "pred_logits": a,
                "pred_boxes": b
            } for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
        loss_dict = self.criterion(out, targets)
        for k, v in loss_dict.items():
            loss_dict[k] = v * self.weight_dict[k] if k in self.weight_dict else v
        return loss_dict
    else:
        target_sizes = torch.stack([
            torch.tensor([
                bi.get("height", img_size[0]),
                bi.get("width", img_size[1])
            ], device=self.device)
            for bi, img_size in zip(batched_inputs, images.image_sizes)
        ])
        res = self.post_processors["bbox"](out, target_sizes)

        processed_results = []
        # for results_per_image, input_per_image, image_size in zip(
        for results_per_image, _, image_size in zip(
                res, batched_inputs, images.image_sizes):
            result = Instances(image_size)
            result.pred_boxes = Boxes(results_per_image["boxes"].float())
            result.scores = results_per_image["scores"].float()
            result.pred_classes = results_per_image["labels"]
            processed_results.append({"instances": result})
        return processed_results
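# Hedged sketch of the "bbox" post-processor invoked above, following the
# reference DETR PostProcess: softmax over classes (dropping the trailing
# no-object column), cxcywh -> xyxy conversion, then scaling by the per-image
# target sizes. Function names here are sketch names, not this repo's API.
import torch
import torch.nn.functional as F

def box_cxcywh_to_xyxy(b):
    cx, cy, w, h = b.unbind(-1)
    return torch.stack([cx - 0.5 * w, cy - 0.5 * h,
                        cx + 0.5 * w, cy + 0.5 * h], dim=-1)

def post_process_bbox_sketch(out, target_sizes):
    # out["pred_logits"]: [B, Q, C + 1]; out["pred_boxes"]: [B, Q, 4] in [0, 1]
    prob = F.softmax(out["pred_logits"], dim=-1)
    scores, labels = prob[..., :-1].max(-1)
    boxes = box_cxcywh_to_xyxy(out["pred_boxes"])
    img_h, img_w = target_sizes.unbind(1)
    scale = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    boxes = boxes * scale[:, None, :]
    return [{"scores": s, "labels": l, "boxes": b}
            for s, l, b in zip(scores, labels, boxes)]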
def inference_single_image(self, cate_preds, seg_preds_x, seg_preds_y,
                           featmap_size, img_shape, ori_shape):
    result = Instances(ori_shape)

    # overall info.
    h, w = img_shape
    upsampled_size_out = (featmap_size[0] * 4, featmap_size[1] * 4)

    # trans trans_diff.
    trans_size = torch.Tensor(self.seg_num_grids).pow(2).cumsum(0).long()
    trans_diff = torch.ones(trans_size[-1].item(), device=self.device).long()
    num_grids = torch.ones(trans_size[-1].item(), device=self.device).long()
    seg_size = torch.Tensor(self.seg_num_grids).cumsum(0).long()
    seg_diff = torch.ones(trans_size[-1].item(), device=self.device).long()
    strides = torch.ones(trans_size[-1].item(), device=self.device)

    n_stage = len(self.seg_num_grids)
    trans_diff[:trans_size[0]] *= 0
    seg_diff[:trans_size[0]] *= 0
    num_grids[:trans_size[0]] *= self.seg_num_grids[0]
    strides[:trans_size[0]] *= self.feature_strides[0]

    for ind_ in range(1, n_stage):
        trans_diff[trans_size[ind_ - 1]:trans_size[ind_]] *= trans_size[ind_ - 1]
        seg_diff[trans_size[ind_ - 1]:trans_size[ind_]] *= seg_size[ind_ - 1]
        num_grids[trans_size[ind_ - 1]:trans_size[ind_]] *= self.seg_num_grids[ind_]
        strides[trans_size[ind_ - 1]:trans_size[ind_]] *= self.feature_strides[ind_]

    # process.
    inds = (cate_preds > self.score_threshold)
    # category scores.
    cate_scores = cate_preds[inds]

    # category labels.
    inds = inds.nonzero(as_tuple=False)
    trans_diff = torch.index_select(trans_diff, dim=0, index=inds[:, 0])
    seg_diff = torch.index_select(seg_diff, dim=0, index=inds[:, 0])
    num_grids = torch.index_select(num_grids, dim=0, index=inds[:, 0])
    strides = torch.index_select(strides, dim=0, index=inds[:, 0])

    y_inds = (inds[:, 0] - trans_diff) // num_grids
    x_inds = (inds[:, 0] - trans_diff) % num_grids
    y_inds += seg_diff
    x_inds += seg_diff
    cate_labels = inds[:, 1]

    seg_masks_soft = seg_preds_x[x_inds, ...] * seg_preds_y[y_inds, ...]
    seg_masks = seg_masks_soft > self.mask_threshold
    sum_masks = seg_masks.sum((1, 2)).float()

    # filter.
    keep = sum_masks > strides
    if keep.sum() == 0:
        return result

    seg_masks_soft = seg_masks_soft[keep, ...]
    seg_masks = seg_masks[keep, ...]
    cate_scores = cate_scores[keep]
    sum_masks = sum_masks[keep]
    cate_labels = cate_labels[keep]

    # mask scoring
    seg_score = (seg_masks_soft * seg_masks.float()).sum((1, 2)) / sum_masks
    cate_scores *= seg_score
    if len(cate_scores) == 0:
        return result

    # sort and keep top nms_pre
    sort_inds = torch.argsort(cate_scores, descending=True)
    if len(sort_inds) > self.nms_per_image:
        sort_inds = sort_inds[:self.nms_per_image]
    seg_masks_soft = seg_masks_soft[sort_inds, :, :]
    seg_masks = seg_masks[sort_inds, :, :]
    cate_scores = cate_scores[sort_inds]
    sum_masks = sum_masks[sort_inds]
    cate_labels = cate_labels[sort_inds]

    # Matrix NMS
    cate_scores = matrix_nms(seg_masks, cate_labels, cate_scores,
                             kernel=self.nms_kernel, sigma=self.nms_sigma,
                             sum_masks=sum_masks)

    # filter.
    keep = cate_scores >= self.update_threshold
    seg_masks_soft = seg_masks_soft[keep, :, :]
    cate_scores = cate_scores[keep]
    cate_labels = cate_labels[keep]

    # sort and keep top_k
    sort_inds = torch.argsort(cate_scores, descending=True)
    if len(sort_inds) > self.max_detections_per_image:
        sort_inds = sort_inds[:self.max_detections_per_image]
    seg_masks_soft = seg_masks_soft[sort_inds, :, :]
    cate_scores = cate_scores[sort_inds]
    cate_labels = cate_labels[sort_inds]

    seg_masks_soft = F.interpolate(seg_masks_soft.unsqueeze(0),
                                   size=upsampled_size_out,
                                   mode='bilinear')[:, :, :h, :w]
    seg_masks = F.interpolate(seg_masks_soft, size=ori_shape,
                              mode='bilinear').squeeze(0)
    seg_masks = seg_masks > self.mask_threshold
    seg_masks = BitMasks(seg_masks)

    result.pred_masks = seg_masks
    result.pred_boxes = seg_masks.get_bounding_boxes()
    result.scores = cate_scores
    result.pred_classes = cate_labels
    return result
def inference_single_image(self, box_cls, box_delta, box_center, box_param,
                           shifts, image_size, fpn_levels, img_id):
    boxes_all = []
    scores_all = []
    class_idxs_all = []
    box_params_all = []
    shifts_all = []
    fpn_levels_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, box_ctr_i, box_param_i, shifts_i, fpn_level_i in zip(
            box_cls, box_delta, box_center, box_param, shifts, fpn_levels):
        box_cls_i = box_cls_i.flatten().sigmoid_()
        if self.thresh_with_centerness:
            box_ctr_i = box_ctr_i.expand(
                (-1, self.num_classes)).flatten().sigmoid()
            box_cls_i = box_cls_i * box_ctr_i

        # Keep top k scoring indices only.
        num_topk = min(self.topk_candidates, box_reg_i.shape[0])
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score (after topk)
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        shift_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[shift_idxs]
        shifts_i = shifts_i[shift_idxs]
        fpn_level_i = fpn_level_i[shift_idxs]
        # predict boxes
        predicted_boxes = self.shift2box_transform.apply_deltas(
            box_reg_i, shifts_i)

        if not self.thresh_with_centerness:
            box_ctr_i = box_ctr_i.flatten().sigmoid_()[shift_idxs]
            predicted_prob = predicted_prob * box_ctr_i

        # instance conv params for the kept boxes
        box_params_i = box_param_i[shift_idxs]

        boxes_all.append(predicted_boxes)
        scores_all.append(torch.sqrt(predicted_prob))
        class_idxs_all.append(classes_idxs)
        box_params_all.append(box_params_i)
        shifts_all.append(shifts_i)
        fpn_levels_all.append(fpn_level_i)

    boxes_all, scores_all, class_idxs_all, box_params_all, shifts_all, fpn_levels_all = [
        cat(x) for x in [
            boxes_all, scores_all, class_idxs_all, box_params_all,
            shifts_all, fpn_levels_all
        ]
    ]

    keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all,
                                   self.nms_threshold, nms_type=self.nms_type)
    keep = keep[:self.max_detections_per_image]

    im_inds = scores_all.new_ones(len(scores_all), dtype=torch.long) * img_id

    proposals_i = Instances(image_size)
    proposals_i.pred_boxes = Boxes(boxes_all[keep])
    proposals_i.scores = scores_all[keep]
    proposals_i.pred_classes = class_idxs_all[keep]
    proposals_i.inst_parmas = box_params_all[keep]
    proposals_i.fpn_levels = fpn_levels_all[keep]
    proposals_i.shifts = shifts_all[keep]
    proposals_i.im_inds = im_inds[keep]
    return proposals_i