Example no. 1
    def forward(self, outputs, targets):
        """ Performs the matching
        Params:
            outputs: dict with "pred_logits" and "pred_boxes" for the whole batch
            targets: list of per-image dicts with "boxes", "labels" and ignore boxes "iboxes"

        Per-image tensors built in the loop below:
                 "preds_prob": Tensor of dim [num_queries, num_classes] with the classification logits
                 "preds_boxes": Tensor of dim [num_queries, 4] with the predicted box coordinates
                 "gt_bboxes": Tensor of dim [num_target_boxes, 5] as [cx, cy, w, h, label]
        Returns:
            list of tensors of dim [num_queries] with the index of the corresponding GT, -1 for background, -2 for ignore
        """
        result = []

        for preds_prob, preds_boxes, t in zip(outputs['pred_logits'], outputs['pred_boxes'], targets):
            preds_prob = preds_prob.sigmoid()
            gt_bboxes = torch.cat((t['boxes'], t['labels'].unsqueeze(-1).float()), dim=-1)
            ig_bboxes = t['iboxes']

            K = preds_prob.shape[0]

            target_gt = gt_bboxes.new_full((K,), self.NEGATIVE_TARGET, dtype=torch.int64)
            target_gt_iou = gt_bboxes.new_full((K,), 0)
            pos_mask = gt_bboxes.new_zeros((K,), dtype=torch.bool)

            if gt_bboxes.numel() > 0:
                tgt_ids = gt_bboxes[:, 4].long()
                tgt_bbox = gt_bboxes[:, :4]

                # Focal-style classification cost, gathered at the ground-truth class of each target
                alpha = 0.25
                gamma = 2.0
                neg_cost_class = (1 - alpha) * (preds_prob ** gamma) * (-(1 - preds_prob + 1e-8).log())
                pos_cost_class = alpha * ((1 - preds_prob) ** gamma) * (-(preds_prob + 1e-8).log())
                cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]

                # L1 cost between predicted and ground-truth boxes
                cost_bbox = torch.cdist(preds_boxes, tgt_bbox, p=1)

                cost_giou, overlaps = generalized_box_iou_(box_cxcywh_to_xyxy(preds_boxes), box_cxcywh_to_xyxy(tgt_bbox))
                cost_giou = -cost_giou

                C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
                C = C.cpu()

                src_idx, tgt_idx = linear_sum_assignment(C)

                src_idx = torch.from_numpy(src_idx).to(device=gt_bboxes.device, dtype=torch.int64)
                tgt_idx = torch.from_numpy(tgt_idx).to(device=gt_bboxes.device, dtype=torch.int64)
                target_gt[src_idx] = tgt_idx
                target_gt_iou[src_idx] = overlaps[src_idx, tgt_idx]
                pos_mask[src_idx] = True

            if ig_bboxes.numel() > 0:
                ign_bbox = ig_bboxes[:, :4]
                overlaps = box_iof(box_cxcywh_to_xyxy(preds_boxes), box_cxcywh_to_xyxy(ign_bbox))
                dt_to_ig_max, _ = overlaps.max(dim=1)
                ignored_dt_mask = dt_to_ig_max >= self.ignore_iou_thresh
                # ignore only detections that were not already matched to a GT
                ignored_dt_mask = ignored_dt_mask & ~pos_mask
                target_gt[ignored_dt_mask] = self.IGNORE_TARGET

            result.append(target_gt)
        return result
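
The classification term of the cost above is a focal-loss-style cost computed from sigmoid probabilities. A minimal standalone sketch of that term follows; the 300/80/5 sizes and random tensors are purely illustrative assumptions, not values from the repository.

import torch

# illustrative shapes: 300 queries, 80 classes, 5 ground-truth boxes
preds_prob = torch.rand(300, 80)          # sigmoid probabilities per query and class
tgt_ids = torch.randint(0, 80, (5,))      # ground-truth class index of each target

alpha, gamma = 0.25, 2.0
# focal weighting: confident wrong predictions are penalised more than easy ones
neg_cost_class = (1 - alpha) * (preds_prob ** gamma) * (-(1 - preds_prob + 1e-8).log())
pos_cost_class = alpha * ((1 - preds_prob) ** gamma) * (-(preds_prob + 1e-8).log())
# gather the column of each ground-truth class: [num_queries, num_targets]
cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
print(cost_class.shape)  # torch.Size([300, 5])
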
Example no. 2
    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs['pred_boxes'][idx]  # [#matched query, 4] in order
        target_boxes = torch.cat(
            [t['boxes'][i] for t, (_, i) in zip(targets, indices)],
            dim=0)  # [#boxes, 4] in order
        # print(src_boxes.size())
        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {}
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes

        # take the diagonal since each src box has already been matched to its target
        loss_giou = 1 - torch.diag(
            generalized_box_iou(  # [#matched query, #boxes]
                box_cxcywh_to_xyxy(src_boxes),
                box_cxcywh_to_xyxy(target_boxes)))

        losses['loss_giou'] = loss_giou.sum() / num_boxes

        return losses
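
Because src_boxes and target_boxes are already aligned pair-by-pair by the matcher, only the diagonal of the pairwise GIoU matrix is used. A small sketch of that step, assuming torchvision.ops.generalized_box_iou as a stand-in for the repository's helper and made-up box values:

import torch
from torchvision.ops import generalized_box_iou  # stand-in for the repository helper

# two already-matched (prediction, target) pairs in xyxy format; values are made up
src = torch.tensor([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 15.0, 15.0]])
tgt = torch.tensor([[0.0, 0.0, 10.0, 10.0], [6.0, 6.0, 16.0, 16.0]])

pairwise = generalized_box_iou(src, tgt)  # [2, 2] GIoU of every (src, tgt) pair
loss_giou = 1 - torch.diag(pairwise)      # keep only the matched (i, i) entries
print(loss_giou)                          # first pair: identical boxes -> loss 0
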
Example no. 3
    def forward(self, outputs, targets):
        """ Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """

        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]

        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but a focal-style cost built from the predicted probability of the target class.
        alpha = 0.25
        gamma = 2.0
        neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
        cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]


        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the GIoU cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))


        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]

        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
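
The batched assignment at the end can be exercised in isolation: the cost matrix is split along the flattened target dimension back into per-image blocks, and the Hungarian algorithm is run on each block. A minimal sketch with a random cost matrix and assumed per-image target counts:

import torch
from scipy.optimize import linear_sum_assignment

bs, num_queries = 2, 4
sizes = [2, 3]                               # assumed number of GT boxes per image
C = torch.rand(bs, num_queries, sum(sizes))  # stand-in for the combined cost matrix

# split the flattened target dimension back per image and solve each assignment
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
for i, j in indices:
    print(i, j)  # matched query indices and matched target indices, same length
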
Example no. 4
    def inference_coco(self, box_cls, box_pred, image_sizes):
        """
        Arguments:
            box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
                The tensor predicts the classification probability for each query.
            box_pred (Tensor): tensors of shape (batch_size, num_queries, 4).
                The tensor predicts the 4-vector (cx, cy, w, h) box
                regression values for every query.
            image_sizes (List[torch.Size]): the input image sizes
        Returns:
            results (List[Instances]): a list of #images elements.
        """
        assert len(box_cls) == len(image_sizes)
        results = []

        # Flatten the per-class scores over all queries and keep the 100 best (query, class) pairs.
        prob = box_cls.sigmoid()

        topk_values, topk_indexes = torch.topk(prob.view(box_cls.shape[0], -1),
                                               100,
                                               dim=1)
        scores = topk_values
        topk_boxes = topk_indexes // box_cls.shape[2]
        labels = topk_indexes % box_cls.shape[2]
        box_pred = box_cxcywh_to_xyxy(box_pred)
        box_pred = torch.gather(box_pred, 1,
                                topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        for i, (scores_per_image, labels_per_image, box_pred_per_image,
                image_size) in enumerate(
                    zip(scores, labels, box_pred, image_sizes)):
            result = Instances(image_size)
            # result.pred_boxes = box_pred_per_image
            result.pred_boxes = Boxes(box_pred_per_image)
            result.pred_boxes.scale(scale_x=image_size[1],
                                    scale_y=image_size[0])
            result.scores = scores_per_image
            result.pred_classes = labels_per_image
            results.append({"instances": result})
            # results.append(result)
        return results
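
The top-k selection above flattens the query and class dimensions into one axis, so each selected index jointly encodes a query and a class; integer division and modulo by num_classes recover them. A standalone sketch with assumed sizes:

import torch

num_queries, num_classes, k = 6, 3, 4           # illustrative sizes
prob = torch.rand(1, num_queries, num_classes)  # sigmoid scores for one image

# flatten queries and classes together, then pick the k best (query, class) pairs
topk_values, topk_indexes = torch.topk(prob.view(1, -1), k, dim=1)
topk_boxes = topk_indexes // num_classes        # which query each score came from
labels = topk_indexes % num_classes             # which class each score refers to
print(topk_values, topk_boxes, labels)
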
Example no. 5
    def inference_ch(self, box_cls, box_pred, image_sizes):
        """
        Arguments:
            box_cls (Tensor): tensor of shape (batch_size, num_queries, K).
                The tensor predicts the classification probability for each query.
            box_pred (Tensor): tensors of shape (batch_size, num_queries, 4).
                The tensor predicts the 4-vector (cx, cy, w, h) box
                regression values for every query.
            image_sizes (Tensor): tensor of shape (batch_size, 2) with the (height, width) of each input image
        Returns:
            results (List[Instances]): a list of #images elements.
        """
        assert len(box_cls) == len(image_sizes)
        results = []

        # Single-class inference: keep the sigmoid score of every query and label them all with the foreground class.
        prob = box_cls.sigmoid()
        scores = prob  #.squeeze(-1) # [bs, num_query, 1]
        labels = torch.ones_like(
            scores, dtype=torch.int64,
            device=scores.device)  #.squeeze(-1) # [bs, num_query, 1]
        box_pred = box_cxcywh_to_xyxy(box_pred)
        img_h, img_w = image_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        box_pred = box_pred * scale_fct[:, None, :]

        for i, (scores_per_image, labels_per_image, box_pred_per_image,
                image_size) in enumerate(
                    zip(scores, labels, box_pred, image_sizes)):
            result = Instances(image_size)
            result.pred_boxes = box_pred_per_image
            # result.pred_boxes = Boxes(box_pred_per_image)
            # result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0])
            result.scores = scores_per_image
            result.pred_classes = labels_per_image
            results.append({"instances": result})
            # results.append(result)
        return results
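
This variant rescales the normalized boxes with a per-image scale factor built from a (height, width) tensor rather than using Boxes.scale. A reduced sketch of that rescaling, with an inline cxcywh-to-xyxy conversion standing in for the repository helper and made-up sizes:

import torch

def cxcywh_to_xyxy(b):
    # same conversion performed by the box_cxcywh_to_xyxy helper used above
    cx, cy, w, h = b.unbind(-1)
    return torch.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], dim=-1)

box_pred = torch.rand(1, 5, 4)            # [bs, num_queries, 4], normalized cxcywh
image_sizes = torch.tensor([[480, 640]])  # [bs, 2] as (h, w); values are made up

img_h, img_w = image_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)  # [bs, 4]
boxes_abs = cxcywh_to_xyxy(box_pred) * scale_fct[:, None, :]  # absolute xyxy coords
print(boxes_abs.shape)                    # torch.Size([1, 5, 4])
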
Example no. 6
    def forward(self,
                q,
                q_pos,
                k,
                k_pos,
                key_padding_mask=None,
                pos_centers=None,
                spatial_shape=None):

        M_ = self.dec_sampling_heads
        P_ = self.dec_sampling_points
        F_ = self.feature_levels
        N_, C_, S_ = k.shape  # memory of the encoder
        L_ = q.shape[0]

        spatial_shape_, valid_sizes, valid_scales = spatial_shape
        # [bs, #level, 2] -> [1, nhead*bs, 1, #level, 2]
        valid_sizes = valid_sizes.view(1, N_, 1, F_,
                                       2).repeat_interleave(M_, 1)
        valid_scales = 2 * valid_scales.view(1, N_, 1, F_,
                                             2).repeat_interleave(M_, 1)

        value = self.value_conv(k.unsqueeze(-1)).squeeze(-1)
        value = value.masked_fill(key_padding_mask.view(N_, 1, S_), float(0))

        spatial_splits = [H_ * W_ for H_, W_ in spatial_shape_]
        value_list = torch.split(value, spatial_splits, dim=-1)
        value_list = [
            value_.view(N_ * M_, C_ // M_, H_, W_)
            for value_, (H_, W_) in zip(value_list, spatial_shape_)
        ]

        weights = self.sampling_weight(q).view(L_, N_ * M_, 1,
                                               F_ * P_).softmax(3)
        # [L, bs, C] -> [L, nhead*bs, #key, #level, 2]
        grids = self.sampling_locs(q).view(L_, N_ * M_, P_, F_, 2)

        # [N * nhead, L, 4]
        pos_centers = pos_centers.permute(1, 0, 2).sigmoid().repeat_interleave(
            M_, 0)

        ##
        # [bs * nhead, L, 2 (wh)] -> [L, bs * nhead, 1, 1, 2]
        wh = pos_centers[:, :, 2:].permute(1, 0, 2).view(L_, N_ * M_, 1, 1, 2)
        # [L, nhead*bs, #key, #level, 2]
        grid_pts = torch.zeros((L_, M_, P_, F_, 2),
                               dtype=weights.dtype,
                               device=weights.device)

        for h_i in range(M_):
            for i in range(self.dec_sampling_points):
                grid_pts[:, h_i, i, :,
                         0] = ((i % int(self.pool_resolution[1])) +
                               0.5) / self.pool_resolution[1]
                grid_pts[:, h_i, i, :, 1] = (h_i + 0.5) / M_

        grid_pts = grid_pts.repeat(1, N_, 1, 1, 1)
        grid_pts *= wh

        # [N * nhead, L, 4] -> [L, bs*nhead, 1, 1, 2]
        boxes_xy = box_ops.box_cxcywh_to_xyxy(pos_centers)[:, :, :2].permute(
            1, 0, 2).view(L_, N_ * M_, 1, 1, -1)

        grids = ((grids * wh / P_) + boxes_xy + grid_pts) * valid_scales - 1

        # [L, bs*nhead, #key, #level, 2] -> [#level, bs*nhead, L, #key, 2]
        grids = grids.permute(3, 1, 0, 2, 4)

        samples_value_list = [
            F.grid_sample(value_,
                          grid_,
                          mode='bilinear',
                          padding_mode='zeros',
                          align_corners=False)
            for value_, grid_ in zip(value_list, grids)
        ]

        # [bs*nhead, C / nhead, L, #key*#level]
        samples_value = torch.cat(samples_value_list, -1)
        # [bs*nhead, 1, L, #level*key]
        weights = weights.permute(1, 2, 0, 3)

        # sum over all keys and levels: [bs*nhead, C / nhead, L] -> [L, N, C]
        output = torch.sum(samples_value * weights,
                           -1).permute(2, 0, 1).view(L_, N_, C_)
        output = self.output_proj(output)

        # [#level, bs*nhead, L, #key, 2] -> [#level, bs, nhead, L, #key, 2] -> [bs, L, #level, nhead, #key, 2]
        output_sample_pts = ((grids + 1.0) / 2.0).view(F_, N_, M_, L_, P_,
                                                       2).permute(
                                                           1, 3, 0, 2, 4, 5)
        # [bs*nhead, 1, L, #level*#key] -> [bs, L, #level, nhead, #key]
        output_sample_weights = weights.view(N_, M_, L_, F_,
                                             P_).permute(0, 2, 3, 1, 4)
        # concatenate the sampling weight to the sampled points on the last dim: (cx, cy, weight)
        output_sample_attn = torch.cat(
            (output_sample_pts, output_sample_weights[..., None]), -1)

        return output, output_sample_attn
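
The core of this sampling attention is F.grid_sample, which reads features bilinearly at coordinates normalized to [-1, 1] (hence the `* valid_scales - 1` when the grids are built). A standalone sketch with purely illustrative shapes:

import torch
import torch.nn.functional as F

# one feature level and a handful of sampling locations; all shapes are illustrative
value = torch.rand(2, 8, 16, 16)          # [bs*nhead, C/nhead, H, W] feature map
grids = torch.rand(2, 10, 4, 2) * 2 - 1   # [bs*nhead, L, #points, 2], coords in [-1, 1]

sampled = F.grid_sample(value, grids, mode='bilinear',
                        padding_mode='zeros', align_corners=False)
print(sampled.shape)                      # torch.Size([2, 8, 10, 4]): one feature per point
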
Example no. 7
    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None,
                pos_centers=None, spatial_shape=None):

        output = tgt

        intermediate = []
        intermediate_centers = []
        intermediate_dec_attns = []
        # intermediate_tgt = []

        for lvl, layer in enumerate(self.layers):

            # intermediate_tgt.append(output)

            if self.dense_query:  # workaround for the dense-query implementation

                outputs_coord = pos_centers.permute(1, 0, 2).sigmoid()
                nquery = outputs_coord.size(1)
                tgt_masks = []
                for pred in outputs_coord:

                    tgt_masks_ = torch.zeros((nquery, nquery), device=pos_centers.device)
                    boxes = box_cxcywh_to_xyxy(pred)
                    giou_score = 1 - generalized_box_iou(boxes, boxes)
                    score = giou_score
                    top_idx = torch.sort(score, dim=-1)[1][:, :100] # returns a longtensor
                    # _, top_idx = torch.topk(score, k=100, largest=False, sorted=True,dim=-1)#[nquery, topk] #torch.sort is faster on GPU
                    tgt_masks_.scatter_(1, top_idx, 1.)
                    tgt_masks.append(tgt_masks_)

                tgt_mask = torch.stack(tgt_masks, dim=0).repeat_interleave(8, 0)  # one copy per attention head (nhead = 8)

            output, dec_attn = layer(output, memory, tgt_mask=tgt_mask,
                                     memory_mask=memory_mask,
                                     tgt_key_padding_mask=tgt_key_padding_mask,
                                     memory_key_padding_mask=memory_key_padding_mask,
                                     pos=pos, query_pos=query_pos,
                                     pos_centers=pos_centers,
                                     spatial_shape=spatial_shape)

            if self.return_intermediate:
                intermediate.append(self.norm(output))
                intermediate_centers.append(pos_centers)
                intermediate_dec_attns.append(dec_attn)

            if self.bbox_embed is not None:
                # iterative refinement: predict an update on top of the current centers and
                # detach so gradients do not propagate across decoder layers
                tmp = self.bbox_embed[lvl](self.norm(output))
                new_pos_centers = tmp + pos_centers
                pos_centers = new_pos_centers.detach()

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()

                intermediate.append(output)

        if self.return_intermediate:

            return torch.stack(intermediate), torch.stack(intermediate_centers), torch.stack(intermediate_dec_attns)  # torch.stack(intermediate_tgt)

        return output, pos_centers, dec_attn
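
The dense-query branch above builds a query-to-query mask that flags, for each query, its k nearest neighbours in GIoU distance; how that flag is consumed depends on the custom decoder layer. A reduced sketch of the mask construction, using torchvision.ops.generalized_box_iou in place of the repository helper and a small made-up query set:

import torch
from torchvision.ops import generalized_box_iou  # stand-in for the repository helper

nquery, k = 6, 3                          # illustrative query count and neighbourhood size
xy = torch.rand(nquery, 2)
boxes = torch.cat([xy, xy + torch.rand(nquery, 2) + 0.1], dim=-1)  # valid xyxy boxes

score = 1 - generalized_box_iou(boxes, boxes)    # GIoU distance between query boxes
top_idx = torch.sort(score, dim=-1)[1][:, :k]    # k closest queries for each query
tgt_mask = torch.zeros((nquery, nquery))
tgt_mask.scatter_(1, top_idx, 1.)                # flag the k nearest queries per row
print(tgt_mask)
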