Example #1
    def loss_boxes(self, outputs, gt_instances: List[Instances], indices: List[tuple], num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        # We ignore the regression loss of the track-disappear slots.
        #TODO: Make this filter process more elegant.
        filtered_idx = []
        for src_per_img, tgt_per_img in indices:
            keep = tgt_per_img != -1
            filtered_idx.append((src_per_img[keep], tgt_per_img[keep]))
        indices = filtered_idx
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat([gt_per_img.boxes[i] for gt_per_img, (_, i) in zip(gt_instances, indices)], dim=0)

        # for pad target, don't calculate regression loss, judged by whether obj_id=-1
        target_obj_ids = torch.cat([gt_per_img.obj_ids[i] for gt_per_img, (_, i) in zip(gt_instances, indices)], dim=0) # size(16)
        mask = (target_obj_ids != -1)

        loss_bbox = F.l1_loss(src_boxes[mask], target_boxes[mask], reduction='none')
        loss_giou = 1 - torch.diag(box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes[mask]),
            box_ops.box_cxcywh_to_xyxy(target_boxes[mask])))

        losses = {}
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes
        losses['loss_giou'] = loss_giou.sum() / num_boxes

        return losses
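
Several of the snippets below call self._get_src_permutation_idx(indices) without showing it. In the reference DETR SetCriterion this helper is essentially the sketch below (shown here as a free function; it is an assumption that these forks keep DETR's version):

import torch

def get_src_permutation_idx(indices):
    # flatten the matcher's per-image (src, tgt) index pairs into (batch_idx, src_idx),
    # suitable for indexing outputs['pred_boxes'][batch_idx, src_idx]
    batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
    src_idx = torch.cat([src for (src, _) in indices])
    return batch_idx, src_idx
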
Example #2
    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat(
            [t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

        src_boxes_coordinates = src_boxes[:, :2]
        src_boxes_dimensions = src_boxes[:, 2:]
        target_boxes_coordinates = target_boxes[:, :2]
        target_boxes_dimensions = target_boxes[:, 2:]

        loss_bbox_coordinates = F.l1_loss(src_boxes_coordinates,
                                          target_boxes_coordinates,
                                          reduction='none')
        loss_bbox_dimensions = F.l1_loss(src_boxes_dimensions,
                                         target_boxes_dimensions,
                                         reduction='none')

        losses = {}
        losses['loss_bbox_coordinates'] = loss_bbox_coordinates.sum(
        ) / num_boxes
        losses['loss_bbox_dimensions'] = loss_bbox_dimensions.sum() / num_boxes

        loss_giou = 1 - torch.diag(
            box_ops.generalized_box_iou(
                box_ops.box_cxcywh_to_xyxy(src_boxes),
                box_ops.box_cxcywh_to_xyxy(target_boxes)))
        losses['loss_giou'] = loss_giou.sum() / num_boxes
        return losses
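
The num_boxes normalizer passed into these loss functions is computed by the caller; in DETR's SetCriterion.forward it is roughly the sketch below (the distributed averaging keeps the normalizer consistent across workers; treat this as a sketch rather than the exact code of the fork above):

import torch
import torch.distributed as dist

def compute_num_boxes(targets, device):
    # total number of ground-truth boxes across the batch, used as the loss normalizer
    num_boxes = sum(len(t["labels"]) for t in targets)
    num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=device)
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(num_boxes)                 # sum over workers
        num_boxes = num_boxes / dist.get_world_size()
    return torch.clamp(num_boxes, min=1).item()
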
Example #3
def optimization_test(tgt_boxes, src_boxes, lr=1e-3, max_iter=int(1e5)):

    img_array = list()

    optimizer = optim.Adam([src_boxes], lr=lr)
    for i in range(max_iter):
        optimizer.zero_grad()
        loss_giou = 1 - box_ops.generalized_box_iou(
            box_ops.box_center_to_corners(tgt_boxes),
            box_ops.box_center_to_corners(src_boxes))
        loss = loss_giou.sum()

        if i % 10 == 0:
            img = np.ones((256, 256, 3), np.uint8) * 255
            img = draw_boxes(src_boxes, img, (0, 0, 255))
            img = draw_boxes(tgt_boxes, img, (255, 0, 0))
            img = cv2.putText(img, f'lr={lr}, step: {i}', (10, 30),
                              cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
            img_array.append(img)

            print(loss.item())

        loss.backward()
        optimizer.step()

    out = cv2.VideoWriter('box_test.avi', cv2.VideoWriter_fourcc(*'DIVX'), 15,
                          (256, 256))
    for i in range(len(img_array)):
        out.write(img_array[i])
    out.release()
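
A hypothetical call of the routine above, assuming box_ops.box_center_to_corners takes 4-parameter (cx, cy, w, h) boxes; a later snippet using the same box_ops helper hints the boxes may actually carry 5 parameters, so the shapes here are illustrative only. Note that src_boxes must be a leaf tensor with requires_grad=True for Adam to update it:

import torch

tgt_boxes = torch.tensor([[0.50, 0.50, 0.30, 0.30]])                      # target box, assumed (cx, cy, w, h)
src_boxes = torch.tensor([[0.20, 0.70, 0.10, 0.10]], requires_grad=True)  # leaf tensor optimized by Adam
optimization_test(tgt_boxes, src_boxes, lr=1e-2, max_iter=1000)
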
Example #4
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).softmax(
            -1)  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(
            0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it in 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, it can be omitted.
        # In out_prob, each entry along dim 1 is the probability that the box belongs to that class (92 numbers for 92 classes).
        # Since there are far more candidate boxes than ground-truth boxes, we do not know in advance which candidate matches which GT box.
        # So we gather all tgt_ids and take the corresponding probabilities from out_prob; among the candidates some box must best match each GT box.
        # The minus sign measures the gap to the ideal probability of 1; whether or not the constant 1 is added makes no difference here.
        cost_class = -out_prob[:, tgt_ids]

        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the GIoU cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                         box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [
            linear_sum_assignment(c[i])
            for i, c in enumerate(C.split(sizes, -1))
        ]
        return [(torch.as_tensor(i, dtype=torch.int64),
                 torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
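
box_cxcywh_to_xyxy and generalized_box_iou are called throughout these snippets but never defined in them. The sketch below follows DETR's util/box_ops.py (it is an assumption that the forks above use the same helpers; some forks modify generalized_box_iou to also return the plain IoU):

import torch
from torchvision.ops.boxes import box_area

def box_cxcywh_to_xyxy(x):
    # (center_x, center_y, w, h) -> (x_min, y_min, x_max, y_max)
    x_c, y_c, w, h = x.unbind(-1)
    return torch.stack([x_c - 0.5 * w, y_c - 0.5 * h,
                        x_c + 0.5 * w, y_c + 0.5 * h], dim=-1)

def box_iou(boxes1, boxes2):
    # pairwise IoU between [N, 4] and [M, 4] corner-format boxes
    area1, area2 = box_area(boxes1), box_area(boxes2)
    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])   # [N, M, 2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])   # [N, M, 2]
    wh = (rb - lt).clamp(min=0)
    inter = wh[..., 0] * wh[..., 1]
    union = area1[:, None] + area2 - inter
    return inter / union, union

def generalized_box_iou(boxes1, boxes2):
    # GIoU: IoU minus the fraction of the smallest enclosing box not covered by the union
    iou, union = box_iou(boxes1, boxes2)
    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
    wh = (rb - lt).clamp(min=0)
    area = wh[..., 0] * wh[..., 1]
    return iou - (area - union) / area
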
Example #5
    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'human_pred_boxes' in outputs
        assert 'object_pred_boxes' in outputs

        idx = self._get_src_permutation_idx(indices)

        human_src_boxes = outputs['human_pred_boxes'][idx]
        human_target_boxes = torch.cat(
            [t['human_boxes'][i] for t, (_, i) in zip(targets, indices)],
            dim=0)
        object_src_boxes = outputs['object_pred_boxes'][idx]
        object_target_boxes = torch.cat(
            [t['object_boxes'][i] for t, (_, i) in zip(targets, indices)],
            dim=0)

        human_loss_bbox = F.l1_loss(human_src_boxes,
                                    human_target_boxes,
                                    reduction='none')
        object_loss_bbox = F.l1_loss(object_src_boxes,
                                     object_target_boxes,
                                     reduction='none')

        losses = dict()
        losses['human_loss_bbox'] = human_loss_bbox.sum() / num_boxes
        losses['object_loss_bbox'] = object_loss_bbox.sum() / num_boxes
        losses['loss_bbox'] = losses['human_loss_bbox'] + losses[
            'object_loss_bbox']

        human_loss_giou = 1 - torch.diag(
            box_ops.generalized_box_iou(
                box_ops.box_cxcywh_to_xyxy(human_src_boxes),
                box_ops.box_cxcywh_to_xyxy(human_target_boxes)))
        object_loss_giou = 1 - torch.diag(
            box_ops.generalized_box_iou(
                box_ops.box_cxcywh_to_xyxy(object_src_boxes),
                box_ops.box_cxcywh_to_xyxy(object_target_boxes)))
        losses['human_loss_giou'] = human_loss_giou.sum() / num_boxes
        losses['object_loss_giou'] = object_loss_giou.sum() / num_boxes

        losses['loss_giou'] = losses['human_loss_giou'] + losses[
            'object_loss_giou']
        return losses
Example #6
    def forward(self, outputs, targets):
        bs, num_queries = outputs['pred_obj_logits'].shape[:2]

        out_obj_prob = outputs['pred_obj_logits'].flatten(0, 1).softmax(-1)
        out_verb_prob = outputs['pred_verb_logits'].flatten(0, 1).sigmoid()
        out_sub_bbox = outputs['pred_sub_boxes'].flatten(0, 1)
        out_obj_bbox = outputs['pred_obj_boxes'].flatten(0, 1)

        tgt_obj_labels = torch.cat([v['obj_labels'] for v in targets])
        tgt_verb_labels = torch.cat([v['verb_labels'] for v in targets])
        tgt_verb_labels_permute = tgt_verb_labels.permute(1, 0)
        tgt_sub_boxes = torch.cat([v['sub_boxes'] for v in targets])
        tgt_obj_boxes = torch.cat([v['obj_boxes'] for v in targets])

        cost_obj_class = -out_obj_prob[:, tgt_obj_labels]

        cost_verb_class = -(out_verb_prob.matmul(tgt_verb_labels_permute) / \
                            (tgt_verb_labels_permute.sum(dim=0, keepdim=True) + 1e-4) + \
                            (1 - out_verb_prob).matmul(1 - tgt_verb_labels_permute) / \
                            ((1 - tgt_verb_labels_permute).sum(dim=0, keepdim=True) + 1e-4)) / 2

        cost_sub_bbox = torch.cdist(out_sub_bbox, tgt_sub_boxes, p=1)
        cost_obj_bbox = torch.cdist(out_obj_bbox, tgt_obj_boxes, p=1) * (tgt_obj_boxes != 0).any(dim=1).unsqueeze(0)
        if cost_sub_bbox.shape[1] == 0:
            cost_bbox = cost_sub_bbox
        else:
            cost_bbox = torch.stack((cost_sub_bbox, cost_obj_bbox)).max(dim=0)[0]

        cost_sub_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_sub_bbox), box_cxcywh_to_xyxy(tgt_sub_boxes))
        cost_obj_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_obj_bbox), box_cxcywh_to_xyxy(tgt_obj_boxes)) + \
                        cost_sub_giou * (tgt_obj_boxes == 0).all(dim=1).unsqueeze(0)
        if cost_sub_giou.shape[1] == 0:
            cost_giou = cost_sub_giou
        else:
            cost_giou = torch.stack((cost_sub_giou, cost_obj_giou)).max(dim=0)[0]

        C = self.cost_obj_class * cost_obj_class + self.cost_verb_class * cost_verb_class + \
            self.cost_bbox * cost_bbox + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v['obj_labels']) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
Example #7
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it in 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, it can be omitted.
        cost_class = -out_prob[:, tgt_ids]

        # Compute the L1 cost between boxes
        # Note: cdist with p=1 is the Manhattan (L1) distance
        # cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
        out_bbox_coordinates = out_bbox[:,:2]
        out_bbox_dimensions = out_bbox[:,2:]
        tgt_bbox_coordinates = tgt_bbox[:,:2]
        tgt_bbox_dimensions = tgt_bbox[:,2:]
        cost_bbox_coordinates = torch.cdist(out_bbox_coordinates, tgt_bbox_coordinates, p=1)
        cost_bbox_dimensions = torch.cdist(out_bbox_dimensions, tgt_bbox_dimensions, p=1)

        # Compute the GIoU cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        C = self.cost_bbox_coordinates * cost_bbox_coordinates + self.cost_bbox_dimensions * cost_bbox_dimensions + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
Example #8
    def forward(self, outputs, targets, positive_map):
        """Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = self.norm(outputs["pred_logits"].flatten(
            0, 1))  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(
            0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_bbox = torch.cat([v["boxes"] for v in targets])
        assert len(tgt_bbox) == len(positive_map)

        # Compute the soft-cross entropy between the predicted token alignment and the GT one for each box
        cost_class = -(out_prob.unsqueeze(1) *
                       positive_map.unsqueeze(0)).sum(-1)

        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
        assert cost_class.shape == cost_bbox.shape

        # Compute the GIoU cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                         box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [
            linear_sum_assignment(c[i])
            for i, c in enumerate(C.split(sizes, -1))
        ]
        return [(torch.as_tensor(i, dtype=torch.int64),
                 torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
Example #9
    def get_cls_loss(self,
                     outputs,
                     targets,
                     criterion,
                     cls_losses,
                     weights=None):
        """

        """
        # TODO: make sure we are not backpropagating from here
        # (probably from the function that will call this(with no grad)

        outputs_without_aux = {
            k: v
            for k, v in outputs.items() if k != 'aux_outputs'
        }
        indices = criterion.matcher(outputs_without_aux, targets)

        src_idx = criterion._get_src_permutation_idx(indices)

        # order the labels by the indices
        target_classes = torch.cat(
            [t["labels"][J] for t, (_, J) in zip(targets, indices)])  # BOXES
        src_logits = outputs['pred_logits'][src_idx]  # (BOXES) X C

        # order the bounding boxes by the indices
        target_boxes = torch.cat(
            [t['boxes'][i] for t, (_, i) in zip(targets, indices)],
            dim=0)  # BOXES
        src_boxes = outputs['pred_boxes'][src_idx]  # BOXES

        classes = torch.unique(target_classes)
        for cls in classes:
            idx = torch.where(target_classes == cls)[0]

            loss_ce = F.cross_entropy(src_logits[idx],
                                      target_classes[idx],
                                      reduction='sum')

            loss_bbox = F.l1_loss(src_boxes[idx],
                                  target_boxes[idx],
                                  reduction='sum')

            loss_giou = (1 - torch.diag(
                box_ops.generalized_box_iou(
                    box_ops.box_cxcywh_to_xyxy(src_boxes[idx]),
                    box_ops.box_cxcywh_to_xyxy(target_boxes[idx])))).sum()

            losses = torch.tensor([len(idx), loss_ce, loss_bbox, loss_giou])
            cls_losses[cls] += losses

        return cls_losses
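
A hypothetical driver for the helper above. Here `evaluator`, `outputs`, `targets` and `criterion` stand for whatever objects the surrounding code provides, and each class accumulates a [count, loss_ce, loss_bbox, loss_giou] vector (the dict keys end up being the 0-dim label tensors produced by torch.unique inside the helper):

import torch
from collections import defaultdict

cls_losses = defaultdict(lambda: torch.zeros(4))   # per-class [count, loss_ce, loss_bbox, loss_giou]
with torch.no_grad():                               # evaluation statistics only, no backprop
    cls_losses = evaluator.get_cls_loss(outputs, targets, criterion, cls_losses)
for cls, (count, ce, l1, giou) in cls_losses.items():
    print(int(cls), int(count), float(ce), float(l1), float(giou))
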
Example #10
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        with torch.no_grad():
            bs, num_queries = outputs["pred_logits"].shape[:2]

            # We flatten to compute the cost matrices in a batch
            out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
            out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

            # Also concat the target labels and boxes
            tgt_ids = torch.cat([v["labels"] for v in targets])
            tgt_bbox = torch.cat([v["boxes"] for v in targets])

            # Compute the classification cost.
            alpha = 0.25
            gamma = 2.0
            neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
            pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
            cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]

            # Compute the L1 cost between boxes
            cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

            # Compute the GIoU cost between boxes
            cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                             box_cxcywh_to_xyxy(tgt_bbox))

            # Final cost matrix
            C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
            C = C.view(bs, num_queries, -1).cpu()

            sizes = [len(v["boxes"]) for v in targets]
            indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
            return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
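
A small numeric check of the focal-style classification cost used above (the probabilities below are made up): a confident prediction of the matched class yields a strongly negative cost, a poor one a positive cost.

import torch

p = torch.tensor([0.9, 0.1])   # predicted probability for the matched target class
alpha, gamma = 0.25, 2.0
pos_cost = alpha * ((1 - p) ** gamma) * (-(p + 1e-8).log())
neg_cost = (1 - alpha) * (p ** gamma) * (-(1 - p + 1e-8).log())
print(pos_cost - neg_cost)     # roughly tensor([-1.40, 0.47]): the confident prediction gets a much lower cost
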
Example #11
    def loss_sub_obj_boxes(self, outputs, targets, indices, num_interactions):
        assert 'pred_sub_boxes' in outputs and 'pred_obj_boxes' in outputs
        idx = self._get_src_permutation_idx(indices)
        src_sub_boxes = outputs['pred_sub_boxes'][idx]
        src_obj_boxes = outputs['pred_obj_boxes'][idx]
        target_sub_boxes = torch.cat(
            [t['sub_boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
        target_obj_boxes = torch.cat(
            [t['obj_boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

        exist_obj_boxes = (target_obj_boxes != 0).any(dim=1)

        losses = {}
        if src_sub_boxes.shape[0] == 0:
            losses['loss_sub_bbox'] = src_sub_boxes.sum()
            losses['loss_obj_bbox'] = src_obj_boxes.sum()
            losses['loss_sub_giou'] = src_sub_boxes.sum()
            losses['loss_obj_giou'] = src_obj_boxes.sum()
        else:
            loss_sub_bbox = F.l1_loss(src_sub_boxes,
                                      target_sub_boxes,
                                      reduction='none')
            loss_obj_bbox = F.l1_loss(src_obj_boxes,
                                      target_obj_boxes,
                                      reduction='none')
            losses['loss_sub_bbox'] = loss_sub_bbox.sum() / num_interactions
            losses['loss_obj_bbox'] = (loss_obj_bbox *
                                       exist_obj_boxes.unsqueeze(1)).sum() / (
                                           exist_obj_boxes.sum() + 1e-4)
            loss_sub_giou = 1 - torch.diag(
                generalized_box_iou(box_cxcywh_to_xyxy(src_sub_boxes),
                                    box_cxcywh_to_xyxy(target_sub_boxes)))
            loss_obj_giou = 1 - torch.diag(
                generalized_box_iou(box_cxcywh_to_xyxy(src_obj_boxes),
                                    box_cxcywh_to_xyxy(target_obj_boxes)))
            losses['loss_sub_giou'] = loss_sub_giou.sum() / num_interactions
            losses['loss_obj_giou'] = (loss_obj_giou * exist_obj_boxes
                                       ).sum() / (exist_obj_boxes.sum() + 1e-4)
        return losses
Example #12
    def summarize(self):
        if dist.is_main_process():
            dataset2score = {
                "refcoco": {k: 0.0 for k in self.k},
                "refcoco+": {k: 0.0 for k in self.k},
                "refcocog": {k: 0.0 for k in self.k},
            }
            dataset2count = {"refcoco": 0.0, "refcoco+": 0.0, "refcocog": 0.0}
            for image_id in self.img_ids:
                ann_ids = self.refexp_gt.getAnnIds(imgIds=image_id)
                assert len(ann_ids) == 1
                img_info = self.refexp_gt.loadImgs(image_id)[0]

                target = self.refexp_gt.loadAnns(ann_ids[0])
                prediction = self.predictions[image_id]
                assert prediction is not None
                sorted_scores_boxes = sorted(
                    zip(prediction["scores"].tolist(), prediction["boxes"].tolist()), reverse=True
                )
                sorted_scores, sorted_boxes = zip(*sorted_scores_boxes)
                sorted_boxes = torch.cat([torch.as_tensor(x).view(1, 4) for x in sorted_boxes])
                target_bbox = target[0]["bbox"]
                converted_bbox = [
                    target_bbox[0],
                    target_bbox[1],
                    target_bbox[2] + target_bbox[0],
                    target_bbox[3] + target_bbox[1],
                ]
                giou = generalized_box_iou(sorted_boxes, torch.as_tensor(converted_bbox).view(-1, 4))
                for k in self.k:
                    if max(giou[:k]) >= self.thresh_iou:
                        dataset2score[img_info["dataset_name"]][k] += 1.0
                dataset2count[img_info["dataset_name"]] += 1.0

            for key, value in dataset2score.items():
                for k in self.k:
                    try:
                        value[k] /= dataset2count[key]
                    except:
                        pass
            results = {}
            for key, value in dataset2score.items():
                results[key] = sorted([v for k, v in value.items()])
                print(f" Dataset: {key} - Precision @ 1, 5, 10: {results[key]} \n")

            return results
        return None
Example #13
def get_matched_loss(tgt_boxes, src_boxes, cost_matcher):
    outputs = dict()
    outputs["pred_boxes"] = src_boxes.unsqueeze(0)  # (1, N, 5)

    target = dict()
    target["boxes"] = tgt_boxes  # (N, 5)
    targets = [target]

    indices = cost_matcher(outputs, targets)
    src_idx, tgt_idx = indices[0]  # batch=1

    src_boxes = src_boxes[src_idx]
    tgt_boxes = tgt_boxes[tgt_idx]

    loss_giou = 1 - box_ops.generalized_box_iou(
        box_ops.box_center_to_corners(tgt_boxes),
        box_ops.box_center_to_corners(src_boxes))
    loss = torch.diag(loss_giou).sum()
    return loss
Example #14
    def loss_boxes(self, outputs, targets, positive_map, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
        targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
        The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert "pred_boxes" in outputs
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs["pred_boxes"][idx]
        target_boxes = torch.cat(
            [t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")

        losses = {}
        losses["loss_bbox"] = loss_bbox.sum() / num_boxes

        loss_giou = 1 - torch.diag(
            box_ops.generalized_box_iou(
                box_ops.box_cxcywh_to_xyxy(src_boxes),
                box_ops.box_cxcywh_to_xyxy(target_boxes)))
        losses["loss_giou"] = loss_giou.sum() / num_boxes
        return losses
Example #15
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = outputs['pred_logits'].shape[:2]
        out_prob = outputs['pred_logits'].flatten(0, 1).softmax(-1)
        out_bbox = outputs['pred_boxes'].flatten(0, 1)
        tgt_ids = torch2paddle.concat([v['labels'] for v in targets])
        tgt_bbox = torch2paddle.concat([v['boxes'] for v in targets])
        cost_class = -out_prob[:, tgt_ids]
        cost_bbox = paddle.dist(out_bbox, tgt_bbox, p=1)
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                         box_cxcywh_to_xyxy(tgt_bbox))
        C = (self.cost_bbox * cost_bbox + self.cost_class * cost_class +
             self.cost_giou * cost_giou)
        C = C.view(bs, num_queries, -1).cpu()
        sizes = [len(v['boxes']) for v in targets]
        indices = [
            linear_sum_assignment(c[i])
            for i, c in enumerate(C.split(sizes, -1))
        ]
        return [(paddle.to_tensor(i, dtype=torch.int64),
                 paddle.to_tensor(j, dtype=torch.int64)) for i, j in indices]
Example #16
    def loss_boxes(self,
                   outputs,
                   targets,
                   indices,
                   num_boxes,
                   boxes,
                   visible,
                   rnn_weight=0.5):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        # pred_boxes needs to be associated with final target. This is largest distance between frames, for the Transformer.
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat(
            [t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {}
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes

        losses['loss_giou_circuit'] = torch.tensor(0)
        losses['loss_iou_circuit'] = torch.tensor(0)

        giou, iou = box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes))
        giou = torch.diag(giou)
        iou = torch.diag(iou)
        loss_giou = 1 - giou
        losses['loss_giou'] = loss_giou.sum() / num_boxes
        losses['iou'] = iou.sum() / num_boxes
        return losses
Example #17
    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        # (batch indices, query indices)
        # both have shape (num_matched_queries1+num_matched_queries2+...)
        idx = self._get_src_permutation_idx(indices)
        # outputs['pred_boxes'] has shape (b, num_queries=100, 4)
        # src_boxes has shape (num_matched_queries1+num_matched_queries2+..., 4)
        src_boxes = outputs['pred_boxes'][idx]
        # (num_matched_objs1+num_matched_objs2+...,4)
        # num_matched_queries1+num_matched_queries2+... equals num_matched_objs1+num_matched_objs2+...;
        # this is explained in the comments on the matcher's return value in forward.
        target_boxes = torch.cat(
            [t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

        # The loss computation follows. Note the reduction argument: if it is not set explicitly, PyTorch defaults to 'mean', i.e. the mean over all elements involved in the error.
        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {}
        # num_boxes is the number of target objects over all images in the batch
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes

        # generalized_box_iou returns the GIoU between every prediction and every GT: with N predictions and M GTs the
        # result is an NxM matrix. The matched predictions and GTs have been arranged so that the i-th prediction
        # corresponds to the i-th GT, so torch.diag() picks out exactly the GIoU of each matched pair for the loss.
        loss_giou = 1 - torch.diag(
            box_ops.generalized_box_iou(
                box_ops.box_cxcywh_to_xyxy(src_boxes),
                box_ops.box_cxcywh_to_xyxy(target_boxes)))
        losses['loss_giou'] = loss_giou.sum() / num_boxes
        return losses
Example #18
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        # First reshape the predictions and the GTs and line them up so the cost computation is convenient.
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).softmax(
            -1)  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(
            0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        # (num_targets1+num_targets2+...,)
        tgt_ids = torch.cat([v["labels"] for v in targets])
        # (num_targets1+num_targets2+...,4)
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Now the cost terms (one per loss type) can be computed. These costs are not identical to the training losses:
        # for classification the loss uses cross-entropy, whereas here, for simplicity, the cost is just 1 minus the
        # predicted probability of the target class, and since the 1 is a constant the authors drop it altogether.
        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it in 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, it can be omitted.
        # (batch_size*num_queries,num_targets1+num_targets2+...,)
        cost_class = -out_prob[:, tgt_ids]

        # Compute the L1 cost between boxes
        # For the box L1 cost, torch.cdist() is used with p=1 (the L1 norm; the default is p=2, the L2 norm). It computes
        # the error between every prediction and every GT, so with N predictions and M GTs the result has NxM entries.
        # Each row of out_bbox is compared with each row of tgt_bbox: |x - x'| + |y - y'| + |w - w'| + |h - h'|
        # (batch_size*num_queries, num_targets1+num_targets2+...,)
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the GIoU cost between boxes
        # (batch_size*num_queries, num_targets1+num_targets2+...,)
        # the constant 1 is omitted
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                         box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        # the individual cost terms are combined with their weights into a single total cost
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        # (batch_size, num_queries, num_targets1+num_targets2+...,)
        C = C.view(bs, num_queries, -1).cpu()

        # Count the number of GTs in each image of the batch; the next step shows why this is needed.
        # num_targets1+num_targets2+...
        sizes = [len(v["boxes"]) for v in targets]
        # C.split() splits the last dimension by each image's number of targets, so predictions can be matched to GTs
        # image by image. The matching uses linear_sum_assignment() from scipy.optimize, which takes the bipartite cost
        # matrix and returns the row and column indices of the minimum-cost assignment.
        indices = [
            linear_sum_assignment(c[i])
            for i, c in enumerate(C.split(sizes, -1))
        ]
        # for each image in a batch, len(i)=len(j)=min(num_queries, num_target_boxes)
        return [(torch.as_tensor(i, dtype=torch.int64),
                 torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
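
The matching step relies on scipy.optimize.linear_sum_assignment. A tiny standalone example of how it picks the minimum-cost pairing (the cost values are made up):

import numpy as np
from scipy.optimize import linear_sum_assignment

# illustrative 3-queries x 2-targets cost matrix
C = np.array([[0.9, 0.2],
              [0.1, 0.8],
              [0.5, 0.5]])
row_ind, col_ind = linear_sum_assignment(C)
print(row_ind, col_ind)            # [0 1] [1 0]: query 0 -> target 1, query 1 -> target 0
print(C[row_ind, col_ind].sum())   # ~0.3, the minimum achievable total cost
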
Example #19
    def forward(self, outputs, targets, indices_track):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        indices_final = list()

        bs = len(indices_track)
        targets_cur = targets[0]
        targets_pre = targets[1]
        out_prob = outputs["pred_logits"].softmax(-1)  # [batch_size, num_queries, num_classes]
        out_bbox = outputs["pred_boxes"]  # [batch_size, num_queries, 4]
        for i in range(bs):
            tmp_track_pre = targets_pre[i]["tracks"]
            tmp_track_pre = tmp_track_pre[indices_track[i][1]]  # pre track in detection order.
            tmp_track_cur = targets_cur[i]["tracks"]  # cur track in original order.
            # matching indices: pre_index / cur_index.
            pairs = (tmp_track_pre.unsqueeze(1) == tmp_track_cur).nonzero(as_tuple=True)
            # unmatched items in the current frame.
            target_track = torch.zeros_like(tmp_track_cur)
            target_track[pairs[1]] = 1
            targets[0][i]["binary_tracks"] = target_track.float()
            unmatched_cur = (target_track == 0).nonzero().squeeze(1)
            if len(unmatched_cur):
                # unmatched GT.
                target_cate = targets_cur[i]["labels"][unmatched_cur]
                target_bbox = targets_cur[i]["boxes"][unmatched_cur]
                # unmatched predictions.
                pred_prob = out_prob[i]
                pred_bbox = out_bbox[i]
                flag_out = torch.ones_like(pred_prob[:, 0]).bool()  # [num_queries]
                flag_out[indices_track[i][0][pairs[0]]] = False
                pred_prob = pred_prob[flag_out]
                pred_bbox = pred_bbox[flag_out]
                pred_queries = flag_out.nonzero().squeeze(1)
                # compute the cost matrix.
                cost_class = -pred_prob[:, target_cate]
                cost_bbox = torch.cdist(pred_bbox, target_bbox, p=1)
                cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(pred_bbox), box_cxcywh_to_xyxy(target_bbox))
                C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
                # match.
                unmatched_indices = linear_sum_assignment(C.cpu())
                matched_queries = torch.cat([indices_track[i][0][pairs[0]], pred_queries[unmatched_indices[0]].cpu()], dim=0)
                matched_gts = torch.cat([pairs[1], unmatched_cur[unmatched_indices[1]]], dim=0).cpu()
                indices_final.append((matched_queries, matched_gts, indices_track[i][2]))  # pos / gt / neg.
            else:
                indices_final.append((indices_track[i][0][pairs[0]], pairs[1].cpu(), indices_track[i][2]))

        return indices_final, targets[0]
Example #20
    def step(self, output_results):
        scores = output_results["scores"]
        classes = output_results["labels"]
        bboxes = output_results["boxes"]  # x1y1x2y2
        track_bboxes = output_results["track_boxes"] if "track_boxes" in output_results else None # x1y1x2y2
        
        results = list()
        results_dict = dict()

        tracks = list()
        
        for idx in range(scores.shape[0]):
            if idx in self.tracks_dict and track_bboxes is not None:
                self.tracks_dict[idx]["bbox"] = track_bboxes[idx, :].cpu().numpy().tolist()

            if scores[idx] >= self.score_thresh:
                obj = dict()
                obj["score"] = float(scores[idx])
                obj["bbox"] = bboxes[idx, :].cpu().numpy().tolist()               
                results.append(obj)        
                results_dict[idx] = obj
        
        tracks = [v for v in self.tracks_dict.values()] + self.unmatched_tracks
        N = len(results)
        M = len(tracks)
        
        ret = list()
        unmatched_tracks = [t for t in range(M)]
        unmatched_dets = [d for d in range(N)]
        if N > 0 and M > 0:
            det_box   = torch.stack([torch.tensor(obj['bbox']) for obj in results], dim=0) # N x 4        
            track_box = torch.stack([torch.tensor(obj['bbox']) for obj in tracks], dim=0) # M x 4                
            cost_bbox = 1.0 - box_ops.generalized_box_iou(det_box, track_box) # N x M

            matched_indices = linear_sum_assignment(cost_bbox)
            unmatched_dets = [d for d in range(N) if not (d in matched_indices[0])]
            unmatched_tracks = [d for d in range(M) if not (d in matched_indices[1])]

            matches = [[],[]]
            for (m0, m1) in zip(matched_indices[0], matched_indices[1]):
                if cost_bbox[m0, m1] > 1.2:
                    unmatched_dets.append(m0)
                    unmatched_tracks.append(m1)
                else:
                    matches[0].append(m0)
                    matches[1].append(m1)

            for (m0, m1) in zip(matches[0], matches[1]):
                track = results[m0]
                track['tracking_id'] = tracks[m1]['tracking_id']
                track['age'] = 1
                track['active'] = 1
                pre_box = tracks[m1]['bbox']
                cur_box = track['bbox']
    #             pre_cx, pre_cy = (pre_box[0] + pre_box[2]) / 2, (pre_box[1] + pre_box[3]) / 2
    #             cur_cx, cur_cy = (cur_box[0] + cur_box[2]) / 2, (cur_box[1] + cur_box[3]) / 2
    #             track['vxvy'] = [cur_cx - pre_cx, cur_cy - pre_cy]
                ret.append(track)

        for i in unmatched_dets:
            track = results[i]
            self.id_count += 1
            track['tracking_id'] = self.id_count
            track['age'] = 1
            track['active'] =  1
#             track['vxvy'] = [0.0, 0.0]
            ret.append(track)
        
        ret_unmatched_tracks = []
        for i in unmatched_tracks:
            track = tracks[i]
            if track['age'] < self.max_age:
                track['age'] += 1
                track['active'] = 0
#                 x1, y1, x2, y2 = track['bbox']
#                 vx, vy = track['vxvy']
#                 track['bbox'] = [x1+vx, y1+vy, x2+vx, y2+vy]
                ret.append(track)
                ret_unmatched_tracks.append(track)
    
        self.tracks = ret
        self.tracks_dict = results_dict
        self.unmatched_tracks = ret_unmatched_tracks
        return copy.deepcopy(ret)
Example #21
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = outputs["action_pred_logits"].shape[:2]  # 2, 100

        # We flatten to compute the cost matrices in a batch
        human_out_prob = outputs["human_pred_logits"].flatten(0, 1).softmax(
            -1)  # [bs * num_queries, num_classes]
        human_out_bbox = outputs["human_pred_boxes"].flatten(
            0, 1)  # [bs * num_queries, 4]
        object_out_prob = outputs["object_pred_logits"].flatten(0, 1).softmax(
            -1)  # [bs * num_queries, num_classes]
        object_out_bbox = outputs["object_pred_boxes"].flatten(
            0, 1)  # [bs * num_queries, 4]
        action_out_prob = outputs["action_pred_logits"].flatten(0, 1).softmax(
            -1)  # [bs * num_queries, num_classes]

        # Also concat the target labels and boxes
        human_tgt_ids = torch.cat([v["human_labels"] for v in targets])
        human_tgt_box = torch.cat([v["human_boxes"] for v in targets])
        object_tgt_ids = torch.cat([v["object_labels"] for v in targets])
        object_tgt_box = torch.cat([v["object_boxes"] for v in targets])
        action_tgt_ids = torch.cat([v["action_labels"] for v in targets])

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it in 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, it can be omitted.
        human_cost_class = -human_out_prob[:, human_tgt_ids]
        object_cost_class = -object_out_prob[:, object_tgt_ids]
        action_cost_class = -action_out_prob[:, action_tgt_ids]

        # Compute the L1 cost between boxes
        human_cost_bbox = torch.cdist(human_out_bbox, human_tgt_box, p=1)
        object_cost_bbox = torch.cdist(object_out_bbox, object_tgt_box, p=1)

        # Compute the GIoU cost between boxes
        human_cost_giou = -generalized_box_iou(
            box_cxcywh_to_xyxy(human_out_bbox),
            box_cxcywh_to_xyxy(human_tgt_box))
        object_cost_giou = -generalized_box_iou(
            box_cxcywh_to_xyxy(object_out_bbox),
            box_cxcywh_to_xyxy(object_tgt_box))

        beta_1, beta_2 = 1.2, 1
        alpha_h, alpha_o, alpha_r = 1, 1, 2
        l_cls_h = alpha_h * self.cost_class * human_cost_class
        l_cls_o = alpha_o * self.cost_class * object_cost_class
        l_cls_r = alpha_r * self.cost_class * action_cost_class
        l_box_h = self.cost_bbox * human_cost_bbox + self.cost_giou * human_cost_giou
        l_box_o = self.cost_bbox * object_cost_bbox + self.cost_giou * object_cost_giou
        l_cls_all = (l_cls_h + l_cls_o + l_cls_r) / (alpha_h + alpha_o +
                                                     alpha_r)
        l_box_all = (l_box_h + l_box_o) / 2
        C = beta_1 * l_cls_all + beta_2 * l_box_all

        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["human_boxes"]) for v in targets]
        indices = [
            linear_sum_assignment(c[i])
            for i, c in enumerate(C.split(sizes, -1))
        ]

        result = [(torch.as_tensor(i, dtype=torch.int64),
                   torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
        return result
Example #22
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        """
We did try a few fixed orderings, including an arbitrary fixed order (e.g. the dataset order), as well as a few lexicographical orders (sorting by position, size, class, ...), but all these attempts performed worse.

If you enforce an ordering, you are essentially adding the constraint that the network must output the "true" predictions in the first queries, and then predict "no-object" for all the remaining queries. Say there are 100 object queries, which is the default in DETR, but only 3 objects to detect: then predictions [1, 3] must match the targets (according to whichever order you chose), and predictions [4, 100] must be "no object".
By contrast, with the Hungarian matching, it doesn't matter where the "true" predictions are; they can be scattered anywhere amongst the predictions. In my example, the network can decide to use, say, queries 43, 57 and 99 to predict the objects, and fill the rest with "no-object".

Here is some intuition as to why this works better:

Since the network doesn't have to push the predictions to the beginning, it can instead let each query specialize in its own kind of objects. We show in the paper that each object query tends to predict objects in a specific region of the image, which would not be possible with a forced ordering.
Related to the previous point, it's rather clear that the fixed ordering is a worse usage of the queries. In COCO, for example, there is no image with more than ~75 objects. With a fixed ordering, the object queries in [75-100] would never be used. Similarly, there are few images with more than 50 objects, so queries [50-75] could potentially overfit to those images. By contrast, with the Hungarian matching, increasing the number of queries improves the recall and thus the AP.
Finally, Hungarian matching is more robust to noise in the annotations. Say for example there are 3 visible objects A, B and C, but for some reason only B and C are annotated. In the fixed-ordering case, if the model predicts A, B, C (which would be theoretically correct), then since the fixed-ordering loss expects B, C, "no-object", the model will be hugely penalized (none of the predictions will be correct). By contrast, with a Hungarian loss, B and C will be correctly reinforced, and the network will suffer only a small classification penalty for predicting A.
I hope this helps give more intuition about this.
I believe I have answered your question, and as such I'm closing this issue. Feel free to reach out if you have further concerns.
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)
        # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)
        # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        # [batch_size * tgt_num]
        tgt_bbox = torch.cat([v["boxes"] for v in targets])
        # [batch_size * tgt_num, 4]

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it in 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, it can be omitted.
        cost_class = -out_prob[:, tgt_ids]
        # shape: batch_size * num_queries, tgt_num

        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
        # shape: batch_size * num_queries, tgt_num

        # Compute the GIoU cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                         box_cxcywh_to_xyxy(tgt_bbox))
        # shape: batch_size * num_queries, tgt_num

        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()
        # shape: batch_size, num_queries, tgt_num

        sizes = [len(v["boxes"]) for v in targets]  # shape: batch_size
        indices = [
            linear_sum_assignment(c[i])
            for i, c in enumerate(C.split(sizes, -1))
        ]
        # the best-matching queries (the ones selected by the assignment) are the ones that get optimized
        return [(torch.as_tensor(i, dtype=torch.int64),
                 torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
Example #23
    def loss_boxes(self, outputs, targets, indices, num_boxes, boxes, visible):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        # pred_boxes needs to be associated with final target. This is largest distance between frames, for the Transformer.
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat(
            [t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {}
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes

        if 'hgru_boxes' in outputs:
            # boxes is a N-length list of T timestep boxes for aux supervision
            circuit_boxes = outputs['hgru_boxes']
            # There are rare instances of an inf popping up in the scores. Clamp to control this. Unclear what the source is.
            # circuit_boxes = torch.clamp(circuit_boxes, 0, 1)
            """
            # Prepare labels by downsampling
            labels = boxes.permute(0, 2, 1, 3, 4)
            label_shape = labels.shape
            labels = labels.view(label_shape[0] * label_shape[2], label_shape[1], label_shape[3], label_shape[4])
            labels = F.interpolate(labels, circuit_boxes.shape[3:])
            labels = labels.view(label_shape[0], label_shape[1], label_shape[2], labels.shape[2], labels.shape[3])
            loss = nn.BCEWithLogitsLoss(reduction='none')
            # if 0:  # not torch.all(torch.isfinite(circuit_boxes)):
            #     print("Boxes are infinite")
            #     cb = torch.isinf(circuit_boxes).reshape(label_shape[0], -1).float()
            #     cb = cb.mean(-1)
            #     print("Num_infs: {} numvis: {}".format(cb, visible.mean(-1)))
            #     os._exit(1)
            # if 0:  # torch.any(torch.isnan(circuit_boxes)):
            #    print("Boxes are nan")
            #      cb = torch.isnan(circuit_boxes).reshape(label_shape[0], -1)
            #     cb = cb.mean(-1)
            #     print(cb)
            #     os._exit(1)
            # if 0:  # not torch.all(torch.isfinite(labels)) or torch.any(torch.isnan(labels)):
            #     print("Labels are infinite")
            #     os._exit(1)
            # if 0:  # torch.any(torch.isnan(labels)):
            #     print("Labels are nan")
            #     os._exit(1)
            bce = loss(circuit_boxes, labels)

            # loss_bbox = F.l1_loss(circuit_boxes, boxes, reduction='none')
            bce = bce * visible[:, None, :, None, None]
            bce = bce.sum((1, 2, 3, 4))
            nans = torch.isnan(bce)
            if torch.any(nans):
                print("NaNs detected in loss: {}".format(bce))
                nan_mask = 1 - nans.float()
                bce = bce[~nans]  # Filter the NaNs
                visible = visible * nan_mask[:, None]
            bce = bce.sum() / ((visible.sum() * labels.shape[2] * labels.shape[3]) + 1)
            losses['loss_giou_circuit'] = bce
            losses['loss_iou_circuit'] = torch.tensor(0.)
            """

            # boxes is a N-length list of T timestep boxes for aux supervision
            circuit_boxes = outputs['hgru_boxes']

            # loss_bbox = F.l1_loss(circuit_boxes, boxes, reduction='none')
            # Minimize dissimilarity between current and next bounding box
            cbx, cby, cv = [], [], []
            gious, ious = [], []
            frame_diff = 0
            start_frame = 3
            for bidx in range(
                    start_frame, circuit_boxes.shape[1] - frame_diff
            ):  # cb, bx, vis in zip(circuit_boxes, boxes, visible):
                cb = circuit_boxes[:, bidx]
                bx = boxes[:, bidx + frame_diff]  # Ground-truth boxes used for aux supervision
                vis = visible[:, bidx + frame_diff]
                cb = box_ops.box_cxcywh_to_xyxy(cb)
                bx = box_ops.box_cxcywh_to_xyxy(bx)
                if (cb[:, 2:] >= cb[:, :2]).all() and (bx[:, 2:] >=
                                                       bx[:, :2]).all():
                    cbx.append(cb)
                    cby.append(bx)
                    cv.append(vis)
            if len(cbx) and len(cby) and len(cv):
                all_cb = torch.cat(cbx, 0)
                all_bx = torch.cat(cby, 0)
                all_vis = torch.cat(cv, 0)
                tgiou, tiou = box_ops.generalized_box_iou(all_cb, all_bx)
                cgiou = torch.diag(tgiou)
                ciou = torch.diag(tiou)
                # Weight the per-box GIoU / IoU by visibility so frames without a
                # visible target do not contribute, then normalize by the visible count.
                vis_sum = all_vis.sum().clamp(min=1)
                losses['loss_giou_circuit'] = ((1 - cgiou) * all_vis).sum() / vis_sum
                losses['loss_iou_circuit'] = (ciou * all_vis).sum() / vis_sum
            else:
                losses['loss_giou_circuit'] = torch.tensor(0.)
                losses['loss_iou_circuit'] = torch.tensor(0.)

        # tgiou, tiou = box_ops.generalized_box_iou(
        #     box_ops.box_cxcywh_to_xyxy(outputs['hgru_boxes'][idx]),
        #     box_ops.box_cxcywh_to_xyxy(target_boxes))
        # cgiou = torch.diag(tgiou)
        # ciou = torch.diag(tiou)
        # losses['loss_giou_circuit'] = (1 - cgiou).sum() / num_boxes
        # losses['loss_iou_circuit'] = ciou.sum() / num_boxes

        # losses['loss_giou_circuit'] = torch.tensor(0)
        # losses['loss_iou_circuit'] = torch.tensor(0)

        giou, iou = box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes))
        giou = torch.diag(giou)
        iou = torch.diag(iou)
        loss_giou = 1 - giou
        losses['loss_giou'] = loss_giou.sum() / num_boxes
        losses['iou'] = iou.sum() / num_boxes
        return losses
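The circuit-supervision branch above boils down to: convert the per-frame predicted and target boxes to xyxy, skip frames whose boxes are degenerate, and average a GIoU penalty weighted by a per-box visibility mask. Below is a stripped-down sketch of that idea written against DETR's standard util.box_ops, whose generalized_box_iou returns a single GIoU matrix (the snippet above assumes a modified version that also returns the IoU); the eps value and the helper name are my own.

import torch
from util import box_ops  # DETR's box utilities, assumed importable


def masked_giou_loss(pred_boxes, tgt_boxes, visible, eps=1e-6):
    """pred_boxes, tgt_boxes: [N, 4] in (cx, cy, w, h); visible: [N] mask in {0, 1}."""
    pred_xyxy = box_ops.box_cxcywh_to_xyxy(pred_boxes)
    tgt_xyxy = box_ops.box_cxcywh_to_xyxy(tgt_boxes)
    # generalized_box_iou asserts non-degenerate boxes (x2 >= x1, y2 >= y1),
    # which is why the snippet above filters frames before calling it.
    giou = torch.diag(box_ops.generalized_box_iou(pred_xyxy, tgt_xyxy))  # [N]
    return ((1 - giou) * visible).sum() / (visible.sum() + eps)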
Exemple #24
0
def update_with_iou_loss(losses, src_boxes, target_boxes, num_boxes):
    loss_giou = 1 - torch.diag(
        box_ops.generalized_box_iou(box_ops.box_cxcywh_to_xyxy(src_boxes),
                                    box_ops.box_cxcywh_to_xyxy(target_boxes)))
    losses['loss_giou'] = loss_giou.sum() / num_boxes
    return losses
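For reference, a minimal call of this helper (the boxes below are purely illustrative, and box_ops is assumed to be DETR's util.box_ops already imported in the surrounding module; dividing by num_boxes mirrors DETR's convention of normalizing by the total number of target boxes in the batch):

import torch

src_boxes = torch.tensor([[0.50, 0.50, 0.20, 0.20],
                          [0.30, 0.70, 0.10, 0.10]])   # (cx, cy, w, h), normalized
target_boxes = torch.tensor([[0.52, 0.48, 0.22, 0.18],
                             [0.30, 0.70, 0.10, 0.10]])

losses = update_with_iou_loss({}, src_boxes, target_boxes, num_boxes=2)
print(losses['loss_giou'])  # small, since the predictions are close to the targets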
Exemple #25
0
    def loss_boxes(self,
                   outputs,
                   targets,
                   indices,
                   num_boxes,
                   boxes,
                   visible,
                   rnn_weight=0.5):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        # pred_boxes needs to be associated with the final target. This is the largest distance between frames, for the Transformer.
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat(
            [t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {}
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes

        if 0:  # 'hgru_boxes' in outputs:
            # boxes is a N-length list of T timestep boxes for aux supervision
            circuit_boxes = outputs['hgru_boxes']

            # loss_bbox = F.l1_loss(circuit_boxes, boxes, reduction='none')
            cbx, cby, cv = [], [], []
            gious, ious = [], []
            for cb, bx, vis in zip(circuit_boxes, boxes, visible):
                cb = box_ops.box_cxcywh_to_xyxy(cb)
                bx = box_ops.box_cxcywh_to_xyxy(bx)
                if (cb[:, 2:] >= cb[:, :2]).all() and (bx[:, 2:] >=
                                                       bx[:, :2]).all():
                    cbx.append(cb)
                    cby.append(bx)
                    cv.append(vis)
            all_cb = torch.cat(cbx, 0)
            all_bx = torch.cat(cby, 0)
            all_vis = torch.cat(cv, 0)
            tgiou, tiou = box_ops.generalized_box_iou(
                box_ops.box_cxcywh_to_xyxy(
                    all_cb),  # circuit_boxes.flatten(0, 1)),
                box_ops.box_cxcywh_to_xyxy(all_bx))  # boxes.flatten(0, 1)))
            cgiou = torch.diag(tgiou)
            ciou = torch.diag(tiou)
            cgious = cgiou * all_vis
            ciou = ciou.sum() / all_vis.sum()
            # cgiou = torch.diag(giou)
            # ciou = torch.diag(iou)
            # cgiou = cgiou * visible.flatten(0).float() # 0 out positions where there isn't a box
            losses['loss_giou_circuit'] = (1 - cgiou).mean()
            losses['loss_iou_circuit'] = ciou
            # losses['loss_giou_circuit'] = loss_bbox.mean()
            # losses['loss_iou_circuit'] = losses['loss_giou_circuit']

        # import pdb;pdb.set_trace()
        # tgiou, tiou = box_ops.generalized_box_iou(
        #     box_ops.box_cxcywh_to_xyxy(outputs['hgru_boxes'][idx]),
        #     box_ops.box_cxcywh_to_xyxy(target_boxes))
        # cgiou = torch.diag(tgiou)
        # ciou = torch.diag(tiou)
        # losses['loss_giou_circuit'] = (1 - cgiou).sum() / num_boxes
        # losses['loss_iou_circuit'] = ciou.sum() / num_boxes

        losses['loss_giou_circuit'] = torch.tensor(0.)
        losses['loss_iou_circuit'] = torch.tensor(0.)

        giou, iou = box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes))
        giou = torch.diag(giou)
        iou = torch.diag(iou)
        loss_giou = 1 - giou
        losses['loss_giou'] = loss_giou.sum() / num_boxes
        losses['iou'] = iou.sum() / num_boxes
        return losses
Exemple #26
0
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        # get the size of first two output dimensions, see Params
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        # outputs["pred_logits"] gets reshaped to [batch_size * num_queries, num_classes],
        # then apply softmax on the num_classes dimension
        out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)
        # outputs["pred_boxes"] gets reshaped to [batch_size * num_queries, 4]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)

        # Also concat the target labels and boxes
        # tgt_ids is a long list of labels, one per target box across the whole batch, of shape [total_num_target_boxes]
        tgt_ids = torch.cat([img["labels"] for img in targets])
        # tgt_bbox is a long list of boxes, one per target box across the whole batch, of shape [total_num_target_boxes, 4]
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it by 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, so it can be omitted.
        # Select the class probability at each target class index: a higher out_prob means a lower cost.
        # out_prob is a long list of class probability vectors, one per predicted bounding box.
        # cost_class shape: [batch_size * num_queries, len(tgt_ids)]
        # TODO: the batched formulation below is neither intuitive nor efficient, since only the
        # per-image block (roughly 1 out of bs) of the cost matrix is actually used.
        #cost_class = -out_prob[:, tgt_ids]

        list_img_pred_probs = [logits.softmax(-1) for logits in outputs["pred_logits"]]  # per-image class probabilities
        list_img_target_classes = [img["labels"] for img in targets]
        list_img_cost_class = [
            self.cost_class * -queries_probs[:, target_classes]
            for queries_probs, target_classes in zip(list_img_pred_probs,
                                                     list_img_target_classes)
        ]

        # Compute the L1 cost between predicted and target boxes, per image
        list_img_pred_bboxes = [bboxes for bboxes in outputs["pred_boxes"]]
        list_img_target_bboxes = [img["boxes"] for img in targets]
        list_img_cost_bboxes = [
            self.cost_bbox * torch.cdist(pred_bboxes, target_bboxes, p=1)
            for pred_bboxes, target_bboxes in zip(list_img_pred_bboxes,
                                                  list_img_target_bboxes)
        ]
        #cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the GIoU cost between boxes, per image
        list_img_cost_giou = [
            self.cost_giou *
            -generalized_box_iou(box_cxcywh_to_xyxy(pred_bboxes),
                                 box_cxcywh_to_xyxy(target_bboxes))
            for pred_bboxes, target_bboxes in zip(list_img_pred_bboxes,
                                                  list_img_target_bboxes)
        ]
        #cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        list_img_cost_matrix = [
            cost_class + cost_bboxes + cost_giou
            for cost_class, cost_bboxes, cost_giou in zip(
                list_img_cost_class, list_img_cost_bboxes, list_img_cost_giou)
        ]
        # Old batched formulation, kept for reference:
        # C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        # reshape the cost of each bounding box back into shape [batch_size, num_queries, total_num_targets_in_img_batch]
        # C = C.view(bs, num_queries, -1).cpu()

        # get best matching indices
        list_img_matching_query_target = [
            linear_sum_assignment(cost_matrix.cpu())
            for cost_matrix in list_img_cost_matrix
        ]
        return [(torch.as_tensor(i, dtype=torch.int64),
                 torch.as_tensor(j, dtype=torch.int64))
                for i, j in list_img_matching_query_target]
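Finally, a minimal sketch of how a matcher with this forward is typically driven; the class count, query count, targets and the constructor call are made up, and the instantiation is left commented because the class definition is not shown in the snippet above:

import torch

# matcher = HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2)  # hypothetical construction
outputs = {
    "pred_logits": torch.randn(2, 100, 92),  # [batch_size, num_queries, num_classes]
    "pred_boxes": torch.rand(2, 100, 4),     # [batch_size, num_queries, 4] in (cx, cy, w, h)
}
targets = [
    {"labels": torch.tensor([3, 17]), "boxes": torch.rand(2, 4)},  # image 0: two objects
    {"labels": torch.tensor([7]), "boxes": torch.rand(1, 4)},      # image 1: one object
]
# indices = matcher(outputs, targets)
# indices[0] is a (query_indices, target_indices) pair of length min(num_queries, 2) for image 0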