def forward(self, outputs, target_sizes):
    """Perform the computation.

    Parameters:
        outputs: raw outputs of the model
        target_sizes: tensor of dimension [batch_size x 2] containing the size of each image of the batch.
                      For evaluation, this must be the original image size (before any data augmentation).
                      For visualization, this should be the image size after data augmentation, but before padding.
    """
    out_logits, out_bbox = outputs["pred_logits"], outputs["pred_boxes"]

    assert len(out_logits) == len(target_sizes)
    assert target_sizes.shape[1] == 2

    prob = F.softmax(out_logits, -1)
    # drop the last class (the "no object" class) before taking the per-query max
    scores, labels = prob[..., :-1].max(-1)

    # convert to [x0, y0, x1, y1] format
    boxes = box_cxcywh_to_xyxy(out_bbox)
    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    boxes = boxes * scale_fct[:, None, :]

    results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]

    return results
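A minimal usage sketch (not from the original file), assuming the forward above belongs to a module named PostProcess and using random tensors in place of real model outputs:

import torch

def postprocess_demo():
    # synthetic raw outputs: 2 images, 100 queries, 91 classes + 1 "no object" class
    outputs = {
        "pred_logits": torch.rand(2, 100, 92),
        "pred_boxes": torch.rand(2, 100, 4),  # normalized (cx, cy, w, h)
    }
    target_sizes = torch.tensor([[720, 1280], [480, 640]])  # original (height, width) per image
    results = PostProcess()(outputs, target_sizes)  # PostProcess assumed to wrap the forward above
    print(results[0]["boxes"].shape)  # torch.Size([100, 4]), absolute [x0, y0, x1, y1]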
def convert_to_coco_api(ds):
    coco_ds = COCO()
    # annotation IDs need to start at 1, not 0, see torchvision issue #1530
    ann_id = 1
    dataset = {"images": [], "categories": [], "annotations": []}
    categories = set()
    for img_idx in range(len(ds)):
        # find better way to get target
        # targets = ds.get_annotations(img_idx)
        img, targets = ds.get_in_coco_format(img_idx)
        image_id = targets["image_id"].item()
        img_dict = {}
        img_dict["id"] = image_id
        height, width = targets["orig_size"].tolist()
        img_dict["height"] = height
        img_dict["width"] = width
        dataset["images"].append(img_dict)
        bboxes = targets["boxes"]
        # the boxes are normalized to [0, 1] and in cxcywh format;
        # convert them into the absolute xywh format expected by the COCO api
        bboxes = box_cxcywh_to_xyxy(bboxes)
        bboxes = bboxes * torch.tensor([width, height, width, height], dtype=torch.float32)
        bboxes[:, 2:] -= bboxes[:, :2]
        bboxes = bboxes.tolist()
        labels = targets["labels"].tolist()
        areas = targets["area"].tolist()
        iscrowd = targets["iscrowd"].tolist()
        if "masks" in targets:
            masks = targets["masks"]
            # make masks Fortran contiguous for coco_mask
            masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1)
        if "keypoints" in targets:
            keypoints = targets["keypoints"]
            keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist()
        num_objs = len(bboxes)
        for i in range(num_objs):
            ann = {}
            ann["image_id"] = image_id
            ann["bbox"] = bboxes[i]
            ann["category_id"] = labels[i]
            categories.add(labels[i])
            ann["area"] = areas[i]
            ann["iscrowd"] = iscrowd[i]
            ann["id"] = ann_id
            if "masks" in targets:
                ann["segmentation"] = coco_mask.encode(masks[i].numpy())
            if "keypoints" in targets:
                ann["keypoints"] = keypoints[i]
                ann["num_keypoints"] = sum(k != 0 for k in keypoints[i][2::3])
            dataset["annotations"].append(ann)
            ann_id += 1
    dataset["categories"] = [{"id": i} for i in sorted(categories)]
    coco_ds.dataset = dataset
    coco_ds.createIndex()
    return coco_ds
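For reference, a worked example (illustrative only) of the three-step box conversion above, applied to a single normalized cxcywh box in a 640x480 image; it assumes box_cxcywh_to_xyxy is importable from the same codebase:

import torch

def box_conversion_demo():
    width, height = 640, 480
    boxes = torch.tensor([[0.5, 0.5, 0.25, 0.5]])   # normalized (cx, cy, w, h)
    boxes = box_cxcywh_to_xyxy(boxes)               # -> normalized (x0, y0, x1, y1)
    boxes = boxes * torch.tensor([width, height, width, height], dtype=torch.float32)
    boxes[:, 2:] -= boxes[:, :2]                    # -> COCO (x, y, w, h)
    print(boxes)  # tensor([[240., 120., 160., 240.]])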
def loss_boxes(self, outputs, targets, indices, num_boxes):
    """Compute the losses related to the bounding boxes: the L1 regression loss and the GIoU loss.

    Target dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4].
    The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
    """
    assert "pred_boxes" in outputs
    idx = self._get_src_permutation_idx(indices)
    src_boxes = outputs["pred_boxes"][idx]
    target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)

    loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")

    losses = {}
    losses["loss_bbox"] = loss_bbox.sum() / num_boxes

    loss_giou = 1 - torch.diag(
        generalized_box_iou(
            box_cxcywh_to_xyxy(src_boxes),
            box_cxcywh_to_xyxy(target_boxes)
        )
    )
    losses["loss_giou"] = loss_giou.sum() / num_boxes
    return losses
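loss_boxes relies on _get_src_permutation_idx to turn the matcher's per-image indices into a (batch, query) index pair usable on the batched predictions. That helper is not shown in this section; the following is a sketch consistent with how it is used above, so treat it as a reconstruction rather than the definitive implementation:

def _get_src_permutation_idx(self, indices):
    # permute predictions following indices: for each image i in the batch,
    # repeat i once per matched query, then concatenate all query indices
    batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
    src_idx = torch.cat([src for (src, _) in indices])
    return batch_idx, src_idx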
def test_box_cxcywh_to_xyxy(self):
    t = torch.rand(10, 4)
    r = box_ops.box_xyxy_to_cxcywh(box_ops.box_cxcywh_to_xyxy(t))
    self.assertTrue((t - r).abs().max() < 1e-5)
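For context, a sketch of the two helpers this test round-trips, using the standard center/corner formulas (the actual box_ops implementations may differ in detail):

import torch

def box_cxcywh_to_xyxy(x):
    # (cx, cy, w, h) -> (x0, y0, x1, y1)
    x_c, y_c, w, h = x.unbind(-1)
    b = [x_c - 0.5 * w, y_c - 0.5 * h, x_c + 0.5 * w, y_c + 0.5 * h]
    return torch.stack(b, dim=-1)

def box_xyxy_to_cxcywh(x):
    # (x0, y0, x1, y1) -> (cx, cy, w, h)
    x0, y0, x1, y1 = x.unbind(-1)
    b = [(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0]
    return torch.stack(b, dim=-1)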
def forward(self, outputs, targets):
    """Performs the matching.

    Args:
        outputs (dict): This is a dict that contains at least these entries:
            "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
            "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
        targets (list): This is a list of targets (len(targets) = batch_size), where each target is a dict:
            "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                      objects in the target) containing the class labels
            "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

    Returns:
        A list of size batch_size, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
        For each batch element, it holds:
            len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """
    bs, num_queries = outputs["pred_logits"].shape[:2]

    # We flatten to compute the cost matrices in a batch
    out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
    out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

    # Also concat the target labels and boxes
    tgt_ids = torch.cat([v["labels"] for v in targets])
    tgt_bbox = torch.cat([v["boxes"] for v in targets])

    # Compute the classification cost. Contrary to the loss, we don't use the NLL,
    # but approximate it as 1 - proba[target class].
    # The 1 is a constant that doesn't change the matching, so it can be omitted.
    cost_class = -out_prob[:, tgt_ids]

    # Compute the L1 cost between boxes
    cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

    # Compute the GIoU cost between boxes
    cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

    # Final cost matrix
    C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
    C = C.view(bs, num_queries, -1).cpu()

    sizes = [len(v["boxes"]) for v in targets]
    indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
    return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
            for i, j in indices]
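A usage sketch with synthetic data (the class name HungarianMatcher is assumed here, since only its forward is shown): two images with 3 and 5 ground-truth boxes respectively are matched to a subset of the 100 queries.

import torch

def matcher_demo(matcher):
    outputs = {
        "pred_logits": torch.rand(2, 100, 92),
        "pred_boxes": torch.rand(2, 100, 4),  # normalized (cx, cy, w, h)
    }
    targets = [
        {"labels": torch.randint(0, 91, (3,)), "boxes": torch.rand(3, 4)},
        {"labels": torch.randint(0, 91, (5,)), "boxes": torch.rand(5, 4)},
    ]
    indices = matcher(outputs, targets)
    # indices[0] pairs 3 query indices with the 3 targets of the first image;
    # indices[1] pairs 5 query indices with the 5 targets of the second image
    for i, j in indices:
        print(i.tolist(), "->", j.tolist())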