def inference(self, box_cls, box_pred, mask_pred, image_sizes):
    """
    Select the best predictions per image and package them as ``Instances``.

    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
            The tensor predicts the classification probability for each query.
        box_pred (Tensor): tensors of shape (batch_size, num_queries, 4).
            The tensor predicts 4-vector (x, y, w, h) box regression values
            for every query.
        mask_pred (Tensor): per-image mask logits, only consumed when
            ``self.mask_on`` is set.
        image_sizes (List[torch.Size]): the input image sizes.

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(box_cls) == len(image_sizes)
    results = []

    # For each box we assign the best class, or the second best if the best
    # one is `no_object`.
    if self.use_focal_loss:
        # Focal loss trains independent per-class sigmoids; select the
        # global top-100 (query, class) pairs per image.
        prob = box_cls.sigmoid()
        # TODO make top-100 an option for non-focal-loss as well
        scores, topk_indexes = torch.topk(
            prob.view(box_cls.shape[0], -1), 100, dim=1
        )
        # Recover (query index, class index) from the flattened index.
        topk_boxes = topk_indexes // box_cls.shape[2]
        labels = topk_indexes % box_cls.shape[2]
    else:
        # Softmax over classes; drop the trailing `no_object` column.
        scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)

    for i, (
        scores_per_image,
        labels_per_image,
        box_pred_per_image,
        image_size,
    ) in enumerate(zip(scores, labels, box_pred, image_sizes)):
        result = Instances(image_size)
        boxes = box_cxcywh_to_xyxy(box_pred_per_image)
        if self.use_focal_loss:
            # Keep only the boxes belonging to the selected top-k queries;
            # the same query may appear more than once (multiple classes).
            boxes = torch.gather(
                boxes, 0, topk_boxes[i].unsqueeze(-1).repeat(1, 4)
            )
        result.pred_boxes = Boxes(boxes)
        # Boxes are normalized; scale to absolute pixel coordinates
        # (image_size is (height, width)).
        result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0])
        if self.mask_on:
            mask = F.interpolate(
                mask_pred[i].unsqueeze(0),
                size=image_size,
                mode="bilinear",
                align_corners=False,
            )
            mask = mask[0].sigmoid() > 0.5
            # BitMasks.crop_and_resize runs on CPU tensors; move the result
            # back to the prediction device afterwards.
            mask = BitMasks(mask.cpu()).crop_and_resize(
                result.pred_boxes.tensor.cpu(), 32
            )
            result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device)
        result.scores = scores_per_image
        result.pred_classes = labels_per_image
        results.append(result)
    return results
def inference(self, box_cls, box_pred, mask_pred, image_sizes):
    """
    Select the best class per query and package predictions as ``Instances``.

    NOTE(review): this definition has the same name as the preceding
    ``inference``; if both are in one class, this one shadows it — confirm
    whether the duplicate is intentional.

    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
            The tensor predicts the classification probability for each query.
        box_pred (Tensor): tensors of shape (batch_size, num_queries, 4).
            The tensor predicts 4-vector (x, y, w, h) box regression values
            for every query.
        mask_pred (Tensor): per-image mask logits, only consumed when
            ``self.mask_on`` is set.
        image_sizes (List[torch.Size]): the input image sizes.

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(box_cls) == len(image_sizes)
    results = []
    # BUG FIX: torch.nn.functional.softmax takes `dim=`, not the numpy-style
    # `axis=` keyword, which raised a TypeError at runtime.
    # Drop the trailing `no_object` column before taking the best class.
    scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)

    for i, (
        scores_per_image,
        labels_per_image,
        box_pred_per_image,
        image_size,
    ) in enumerate(zip(scores, labels, box_pred, image_sizes)):
        result = Instances(image_size)
        result.pred_boxes = Boxes(box_cxcywh_to_xyxy(box_pred_per_image))
        # Boxes are normalized; scale to absolute pixel coordinates
        # (image_size is (height, width)).
        result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0])
        if self.mask_on:
            mask = F.interpolate(
                mask_pred[i].unsqueeze(0),
                size=image_size,
                mode="bilinear",
                align_corners=False,
            )
            mask = mask[0].sigmoid() > 0.5
            # BitMasks.crop_and_resize runs on CPU tensors; move the result
            # back to the prediction device afterwards.
            mask = BitMasks(mask.cpu()).crop_and_resize(
                result.pred_boxes.tensor.cpu(), 32
            )
            result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device)
        result.scores = scores_per_image
        result.pred_classes = labels_per_image
        results.append(result)
    return results