def loss_boxes(self, outputs, targets, indices, num_boxes):
    """Compute the losses related to the bounding boxes: the L1 regression loss and the GIoU loss.

    The targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4].
    The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
    """
    assert "pred_boxes" in outputs
    idx = self._get_src_permutation_idx(indices)
    src_boxes = outputs["pred_boxes"][idx]
    target_boxes = torch.cat(
        [t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0
    )

    loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")

    losses = {}
    losses["loss_bbox"] = loss_bbox.sum() / num_boxes

    loss_giou = 1 - torch.diag(
        box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes),
        )
    )
    losses["loss_giou"] = loss_giou.sum() / num_boxes
    return losses
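
# --- Illustrative sketch (not part of the original code) ---------------------
# A minimal, self-contained version of the two box loss terms above, computed
# on boxes that have already been matched. Assumes `box_ops` is importable as
# in this file; the dummy tensors and the helper name are assumptions.
def _box_losses_sketch():
    import torch
    import torch.nn.functional as F

    num_boxes = 8
    src_boxes = torch.rand(num_boxes, 4)     # matched predictions, (cx, cy, w, h) in [0, 1]
    target_boxes = torch.rand(num_boxes, 4)  # matched ground truth, same format

    # L1 term: mean absolute error, normalized by the number of target boxes
    loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none").sum() / num_boxes

    # GIoU term: only the diagonal pairs (i-th prediction vs i-th target) matter
    loss_giou = (1 - torch.diag(
        box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes),
        )
    )).sum() / num_boxes
    return loss_bbox, loss_giou
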
def forward(self, outputs, targets):
    """Performs the matching.

    Params:
        outputs: This is a dict that contains at least these entries:
            "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
            "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

        targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
            "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                      objects in the target) containing the class labels
            "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

    Returns:
        A list of size batch_size, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
        For each batch element, it holds:
            len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """
    bs, num_queries = outputs["pred_logits"].shape[:2]

    # We flatten to compute the cost matrices in a batch
    out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
    out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

    # Also concat the target labels and boxes
    tgt_ids = torch.cat([v["labels"] for v in targets])
    tgt_bbox = torch.cat([v["boxes"] for v in targets])

    # Compute the classification cost. Contrary to the loss, we don't use the NLL,
    # but approximate it as 1 - proba[target class].
    # The 1 is a constant that doesn't change the matching, so it can be omitted.
    cost_class = -out_prob[:, tgt_ids]

    # Compute the L1 cost between boxes
    cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

    # Compute the GIoU cost between boxes
    cost_giou = -generalized_box_iou(
        box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)
    )

    # Final cost matrix
    C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
    C = C.view(bs, num_queries, -1).cpu()

    sizes = [len(v["boxes"]) for v in targets]
    indices = [
        linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))
    ]
    return [
        (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
        for i, j in indices
    ]
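
# --- Illustrative sketch (not part of the original code) ---------------------
# Minimal usage of the matcher above on dummy data. `matcher` is assumed to be
# an instance of the class defining forward(), e.g. a HungarianMatcher built
# with the three cost weights used in the final cost matrix (names and weight
# values here are assumptions, not taken from this file).
def _matcher_usage_sketch(matcher):
    import torch
    outputs = {
        "pred_logits": torch.rand(2, 100, 92),  # [batch_size, num_queries, num_classes]
        "pred_boxes": torch.rand(2, 100, 4),    # [batch_size, num_queries, 4]
    }
    targets = [
        {"labels": torch.tensor([17, 3]), "boxes": torch.rand(2, 4)},
        {"labels": torch.tensor([5]), "boxes": torch.rand(1, 4)},
    ]
    with torch.no_grad():
        indices = matcher(outputs, targets)
    # One (pred_idx, tgt_idx) pair per image: 2 matches for image 0, 1 for image 1
    assert [len(i) for i, _ in indices] == [2, 1]
    return indices
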
def forward(self, outputs, target_sizes):
    """Perform the computation.

    Parameters:
        outputs: raw outputs of the model
        target_sizes: tensor of dimension [batch_size x 2] containing the size of each image of the batch.
                      For evaluation, this must be the original image size (before any data augmentation).
                      For visualization, this should be the image size after data augmentation, but before padding.
    """
    out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']

    assert len(out_logits) == len(target_sizes)
    assert target_sizes.shape[1] == 2

    prob = F.softmax(out_logits, -1)
    # drop the last class, which is the "no object" class
    scores, labels = prob[..., :-1].max(-1)

    # convert to [x0, y0, x1, y1] format
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    boxes = boxes * scale_fct[:, None, :]

    results = [
        {'scores': s, 'labels': l, 'boxes': b}
        for s, l, b in zip(scores, labels, boxes)
    ]

    return results
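
# --- Illustrative sketch (not part of the original code) ---------------------
# Typical use of the post-processor above: rescale normalized predictions back
# to absolute pixel coordinates. `postprocessor` is assumed to be an instance
# of the class defining forward(); `outputs` must hold predictions for two
# images to match the dummy sizes below.
def _postprocess_usage_sketch(postprocessor, outputs):
    import torch
    target_sizes = torch.tensor([[480, 640], [576, 768]])  # (height, width) per image
    results = postprocessor(outputs, target_sizes)
    # one dict of absolute-coordinate detections per image
    for r in results:
        assert set(r) == {"scores", "labels", "boxes"}
    return results
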
def inference(self, box_cls, box_pred, mask_pred, image_sizes):
    """
    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
            The tensor predicts the classification probability for each query.
        box_pred (Tensor): tensors of shape (batch_size, num_queries, 4).
            The tensor predicts 4-vector (x, y, w, h) box regression values for every query.
        mask_pred (Tensor): tensor of mask logits; only used when self.mask_on is True.
        image_sizes (List[torch.Size]): the input image sizes

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(box_cls) == len(image_sizes)
    results = []

    # For each box we assign the best class, or the second best if the best one is `no_object`.
    if self.use_focal_loss:
        prob = box_cls.sigmoid()
        # TODO: make top-100 an option for the non-focal-loss path as well
        scores, topk_indexes = torch.topk(prob.view(box_cls.shape[0], -1), 100, dim=1)
        topk_boxes = topk_indexes // box_cls.shape[2]
        labels = topk_indexes % box_cls.shape[2]
    else:
        scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)

    for i, (
        scores_per_image,
        labels_per_image,
        box_pred_per_image,
        image_size,
    ) in enumerate(zip(scores, labels, box_pred, image_sizes)):
        result = Instances(image_size)
        boxes = box_cxcywh_to_xyxy(box_pred_per_image)
        if self.use_focal_loss:
            # keep only the boxes belonging to the top-k (query, class) pairs
            boxes = torch.gather(boxes, 0, topk_boxes[i].unsqueeze(-1).repeat(1, 4))
        result.pred_boxes = Boxes(boxes)
        result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0])
        if self.mask_on:
            mask = F.interpolate(
                mask_pred[i].unsqueeze(0),
                size=image_size,
                mode="bilinear",
                align_corners=False,
            )
            mask = mask[0].sigmoid() > 0.5
            B, N, H, W = mask_pred.shape
            mask = BitMasks(mask.cpu()).crop_and_resize(
                result.pred_boxes.tensor.cpu(), 32
            )
            result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device)

        result.scores = scores_per_image
        result.pred_classes = labels_per_image
        results.append(result)
    return results
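
# --- Illustrative sketch (not part of the original code) ---------------------
# Shows how the flattened top-k indices in the focal-loss branch above decode
# into (query, class) pairs: after prob.view(batch, -1), a flat index equals
# query * K + class, so `index // K` recovers the query and `index % K` the
# class. All tensors and names here are dummy data.
def _topk_decode_sketch():
    import torch
    K = 5
    prob = torch.rand(1, 4, K)                       # (batch, num_queries=4, K)
    scores, topk = torch.topk(prob.view(1, -1), 3, dim=1)
    query_idx = topk // K
    class_idx = topk % K
    # the decoded (query, class) pairs index back to exactly the top-k scores
    assert torch.equal(prob[0, query_idx[0], class_idx[0]], scores[0])
    return query_idx, class_idx
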
def detr_probabilistic_inference(self, input_im):
    outputs = self.model(
        input_im,
        return_raw_results=True,
        is_mc_dropout=self.mc_dropout_enabled,
    )

    image_width = input_im[0]['image'].shape[2]
    image_height = input_im[0]['image'].shape[1]

    # Handle logits and classes
    predicted_logits = outputs['pred_logits'][0]
    if 'pred_logits_var' in outputs.keys():
        # Sample class logits from the predicted Gaussian, then average the
        # resulting probability vectors over the samples.
        predicted_logits_var = outputs['pred_logits_var'][0]
        box_cls_dists = torch.distributions.normal.Normal(
            predicted_logits,
            scale=torch.sqrt(torch.exp(predicted_logits_var)))
        predicted_logits = box_cls_dists.rsample(
            (self.model.cls_var_num_samples,))
        predicted_prob_vectors = F.softmax(predicted_logits, dim=-1)
        predicted_prob_vectors = predicted_prob_vectors.mean(0)
    else:
        predicted_prob_vectors = F.softmax(predicted_logits, dim=-1)

    predicted_prob, classes_idxs = predicted_prob_vectors[:, :-1].max(-1)

    # Handle boxes and covariance matrices
    predicted_boxes = outputs['pred_boxes'][0]

    # Rescale boxes to inference image size (not COCO original size)
    pred_boxes = Boxes(box_cxcywh_to_xyxy(predicted_boxes))
    pred_boxes.scale(scale_x=image_width, scale_y=image_height)
    predicted_boxes = pred_boxes.tensor

    if 'pred_boxes_cov' in outputs.keys():
        # Reconstruct the full covariance matrix from its Cholesky factor
        predicted_boxes_covariance = covariance_output_to_cholesky(
            outputs['pred_boxes_cov'][0])
        predicted_boxes_covariance = torch.matmul(
            predicted_boxes_covariance,
            predicted_boxes_covariance.transpose(1, 2))

        # Propagate the covariance from (cx, cy, w, h) to (x0, y0, x1, y1)
        # coordinates, then rescale it to the inference image size
        transform_mat = torch.tensor([[[1.0, 0.0, -0.5, 0.0],
                                       [0.0, 1.0, 0.0, -0.5],
                                       [1.0, 0.0, 0.5, 0.0],
                                       [0.0, 1.0, 0.0, 0.5]]]).to(self.model.device)
        predicted_boxes_covariance = torch.matmul(
            torch.matmul(transform_mat, predicted_boxes_covariance),
            transform_mat.transpose(1, 2))

        scale_mat = torch.diag_embed(
            torch.as_tensor(
                (image_width, image_height, image_width, image_height),
                dtype=torch.float32)).to(self.model.device).unsqueeze(0)
        predicted_boxes_covariance = torch.matmul(
            torch.matmul(scale_mat, predicted_boxes_covariance),
            torch.transpose(scale_mat, 2, 1))
    else:
        predicted_boxes_covariance = []

    return (predicted_boxes, predicted_boxes_covariance, predicted_prob,
            classes_idxs, predicted_prob_vectors)
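
# --- Illustrative sketch (not part of the original code) ---------------------
# The covariance handling above is linear uncertainty propagation: for a
# linear map y = A x, Cov(y) = A Cov(x) A^T. Here A is first the Jacobian of
# (cx, cy, w, h) -> (x0, y0, x1, y1) and then the diagonal pixel scaling. The
# dummy unit covariance and image size below are assumptions for illustration.
def _covariance_propagation_sketch():
    import torch
    cov_cxcywh = torch.eye(4).unsqueeze(0)  # dummy covariance in (cx, cy, w, h), (1, 4, 4)
    J = torch.tensor([[[1.0, 0.0, -0.5, 0.0],
                       [0.0, 1.0, 0.0, -0.5],
                       [1.0, 0.0, 0.5, 0.0],
                       [0.0, 1.0, 0.0, 0.5]]])   # Jacobian of cxcywh -> xyxy
    S = torch.diag_embed(torch.tensor([[640.0, 480.0, 640.0, 480.0]]))  # pixel scaling
    cov_xyxy = J @ cov_cxcywh @ J.transpose(1, 2)
    cov_pixels = S @ cov_xyxy @ S.transpose(1, 2)
    return cov_pixels
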
def forward(self, outputs, processed_sizes, target_sizes=None):  # noqa: C901
    """This function computes the panoptic prediction from the model's predictions.

    Parameters:
        outputs: This is a dict coming directly from the model. See the model doc for the content.
        processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were
            passed to the model, i.e. the size after data augmentation but before batching.
        target_sizes: This is a list of tuples (or torch tensors) corresponding to the requested final
            size of each prediction. If left to None, it will default to the processed_sizes.
    """
    if target_sizes is None:
        target_sizes = processed_sizes
    assert len(processed_sizes) == len(target_sizes)
    out_logits, raw_masks, raw_boxes = (
        outputs["pred_logits"],
        outputs["pred_masks"],
        outputs["pred_boxes"],
    )
    assert len(out_logits) == len(raw_masks) == len(target_sizes)
    preds = []

    def to_tuple(tup):
        if isinstance(tup, tuple):
            return tup
        return tuple(tup.cpu().tolist())

    for cur_logits, cur_masks, cur_boxes, size, target_size in zip(
        out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes
    ):
        # we filter out empty queries and detections below the threshold
        scores, labels = cur_logits.softmax(-1).max(-1)
        keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (
            scores > self.threshold
        )
        cur_scores = scores[keep]
        cur_classes = labels[keep]
        cur_masks = cur_masks[keep]
        cur_masks = interpolate(
            cur_masks[:, None], to_tuple(size), mode="bilinear"
        ).squeeze(1)
        cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep])

        h, w = cur_masks.shape[-2:]
        assert len(cur_boxes) == len(cur_classes)

        # It may be that we have several predicted masks for the same stuff class.
        # In the following, we track the list of mask ids for each stuff class (they are merged later on).
        cur_masks = cur_masks.flatten(1)
        stuff_equiv_classes = defaultdict(lambda: [])
        for k, label in enumerate(cur_classes):
            if not self.is_thing_map[label.item()]:
                stuff_equiv_classes[label.item()].append(k)

        def get_ids_area(masks, scores, dedup=False):
            # This helper function creates the final panoptic segmentation image.
            # It also returns the area of the masks that appear on the image.

            m_id = masks.transpose(0, 1).softmax(-1)

            if m_id.shape[-1] == 0:
                # We didn't detect any mask :(
                m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device)
            else:
                m_id = m_id.argmax(-1).view(h, w)

            if dedup:
                # Merge the masks corresponding to the same stuff class
                for equiv in stuff_equiv_classes.values():
                    if len(equiv) > 1:
                        for eq_id in equiv:
                            m_id.masked_fill_(m_id.eq(eq_id), equiv[0])

            final_h, final_w = to_tuple(target_size)

            seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy()))
            seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST)

            np_seg_img = (
                torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
                .view(final_h, final_w, 3)
                .numpy()
            )
            m_id = torch.from_numpy(rgb2id(np_seg_img))

            area = []
            for i in range(len(scores)):
                area.append(m_id.eq(i).sum().item())
            return area, seg_img

        area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True)
        if cur_classes.numel() > 0:
            # We now filter out tiny masks, repeating as long as we find some
            while True:
                filtered_small = torch.as_tensor(
                    [area[i] <= 4 for i, c in enumerate(cur_classes)],
                    dtype=torch.bool,
                    device=keep.device,
                )
                if filtered_small.any().item():
                    cur_scores = cur_scores[~filtered_small]
                    cur_classes = cur_classes[~filtered_small]
                    cur_masks = cur_masks[~filtered_small]
                    area, seg_img = get_ids_area(cur_masks, cur_scores)
                else:
                    break
        else:
            cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device)

        segments_info = []
        for i, a in enumerate(area):
            cat = cur_classes[i].item()
            segments_info.append(
                {
                    "id": i,
                    "isthing": self.is_thing_map[cat],
                    "category_id": cat,
                    "area": a,
                }
            )
        del cur_classes

        with io.BytesIO() as out:
            seg_img.save(out, format="PNG")
            predictions = {
                "png_string": out.getvalue(),
                "segments_info": segments_info,
            }
        preds.append(predictions)
    return preds
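
# --- Illustrative sketch (not part of the original code) ---------------------
# Minimal usage of the panoptic post-processor above. `postprocessor` is
# assumed to be an instance of the class defining forward(), constructed with
# an is_thing_map and a score threshold; `outputs` must hold predictions for a
# single image here. Sizes below are assumptions for illustration.
def _panoptic_postprocess_sketch(postprocessor, outputs):
    processed_sizes = [(800, 1066)]  # size after augmentation, before padding
    target_sizes = [(480, 640)]      # requested final size
    preds = postprocessor(outputs, processed_sizes, target_sizes)
    # each prediction is a PNG-encoded id map plus per-segment metadata
    assert set(preds[0]) == {"png_string", "segments_info"}
    return preds
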
def test_box_cxcywh_to_xyxy(self):
    t = torch.rand(10, 4)
    r = box_ops.box_xyxy_to_cxcywh(box_ops.box_cxcywh_to_xyxy(t))
    self.assertLess((t - r).abs().max(), 1e-5)
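
# --- Illustrative companion test (not part of the original code) -------------
# Checks the inverse round-trip of the conversion tested above. Valid xyxy
# boxes need x0 <= x1 and y0 <= y1, so they are built from a corner plus a
# non-negative width/height rather than from raw random values.
def test_box_xyxy_to_cxcywh_roundtrip(self):
    x0y0 = torch.rand(10, 2)
    wh = torch.rand(10, 2)
    t = torch.cat([x0y0, x0y0 + wh], dim=1)
    r = box_ops.box_cxcywh_to_xyxy(box_ops.box_xyxy_to_cxcywh(t))
    self.assertLess((t - r).abs().max(), 1e-5)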