Example #1
 def _match_annotations(self, image_annotations, image_predictions):
     # TODO: Evaluate the number of detected instances.
     prediction_boxes = Boxes.cat(_extract_instances_property(image_predictions, "bbox"))
     annotation_boxes = Boxes.cat(_extract_instances_property(image_annotations, "bbox"))
     match_quality_matrix = pairwise_iou(annotation_boxes, prediction_boxes)
     matched_idxs, matched_labels = self._bbox_matcher(match_quality_matrix)
     matched_image_annotations = [image_annotations[i] for i in matched_idxs]
     return matched_image_annotations, matched_labels
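The method above leans on two detectron2 primitives: pairwise_iou, which builds a (#gt, #pred) IoU matrix, and a Matcher, which assigns each prediction to its best ground-truth box. A minimal stand-alone sketch with toy boxes (the thresholds and labels below are illustrative, not this project's settings):

import torch
from detectron2.modeling.matcher import Matcher
from detectron2.structures import Boxes, pairwise_iou

# Two ground-truth boxes and three predictions (toy values).
gt_boxes = Boxes(torch.tensor([[0., 0., 10., 10.], [20., 20., 30., 30.]]))
pred_boxes = Boxes(torch.tensor([[1., 1., 9., 9.],
                                 [21., 19., 31., 29.],
                                 [50., 50., 60., 60.]]))

iou = pairwise_iou(gt_boxes, pred_boxes)  # shape (2, 3): rows = gt, columns = predictions
matcher = Matcher(thresholds=[0.3, 0.5], labels=[0, -1, 1], allow_low_quality_matches=True)
matched_idxs, matched_labels = matcher(iou)  # one matched gt index / label per prediction
print(matched_idxs.tolist(), matched_labels.tolist())
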
Example #2
    def _detectAndMap(self, image):
        '''
        image: the frame in which we want to find the players
        result: dict((x_cut, y_cut) -> counturList)
        '''
        st = time.time()
        # 0. Preprocess image
        frame = self.preprocess(image)

        # 1. Cut the image into small grid cells
        l_cells = self._cutImageToGrids(frame)

        # 2. Feed the cells to the multi-image predictor
        l_preds = self._predictMultipleImages(l_cells)

        # 2.1 List of the instances found in each cell
        l_preds = [x['instances'].to('cpu') for x in l_preds]

        # 2.2 Map the boxes back onto the input image, then scale up to the 5K image
        # 2.3 Weight the score by the distance from the camera (yTL)
        for inst, cell in zip(l_preds, self.gridList):
            inst.remove(
                'pred_masks'
            )  # pred_masks is not used; TODO: might come in handy for TeamColor
            inst.pred_boxes.tensor[:, 0:4] += torch.Tensor(
                [cell[0], cell[1], cell[0], cell[1]])
            inst.boxes_before = inst.pred_boxes.clone(
            )  # where they are on the original image, without scaling
            inst.pred_boxes.tensor = inst.pred_boxes.tensor.divide(
                self.trans_value)
            inst.scores *= self.cameraDistWeight[cell[1]]

        # 2.4 Instances expressed in whole-image coordinates
        finalInstances = Instances(
            image_size=self.origResolution[::-1])  # (1440, 5120)
        finalInstances.pred_boxes = Boxes.cat([x.pred_boxes for x in l_preds])
        finalInstances.boxes_before = Boxes.cat(
            [x.boxes_before for x in l_preds])
        finalInstances.scores = torch.cat([x.scores for x in l_preds])
        finalInstances.pred_classes = torch.cat(
            [x.pred_classes for x in l_preds])

        # 3. Keep only the person class
        _person_class_ID = 0
        finalInstances = finalInstances[finalInstances.pred_classes ==
                                        _person_class_ID]

        # 4. Use NMS to eliminate overlapping detections
        iouIdx = torchvision.ops.nms(finalInstances.pred_boxes.tensor,
                                     finalInstances.scores, self.nmsThreshold)
        finalInstances = finalInstances[iouIdx]

        # 5. Filter out partially visible ('half') people
        finalInstances = self._filterHalfMan(finalInstances)

        return finalInstances, frame
Example #3
 def get_ground_truth(self, anchors, bbox_preds, targets):
     anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
     N = len(anchors)
     # list[Tensor(R, 4)], one for each image
     all_anchors = Boxes.cat(anchors).tensor.reshape(N, -1, 4)
     # Boxes(Tensor(N*R, 4))
     box_delta = cat(bbox_preds, dim=1)
     # box_pred: xyxy; targets: xyxy
     box_pred = self.box2box_transform.apply_deltas(box_delta, all_anchors)
     indices = self.anchor_matcher(box_pred, all_anchors, targets)
     return indices
Example #4
def setup(file):
    # get cfg
    cfg = get_cfg()
    cfg.merge_from_file(file)
    cfg.SOLVER.IMS_PER_BATCH = 2

    # get data loader iter
    data_loader = build_detection_train_loader(cfg)
    data_loader_iter = iter(data_loader)
    batched_inputs = next(data_loader_iter)

    # build anchors
    backbone = build_backbone(cfg).to(device)
    images = [x["image"].to(device) for x in batched_inputs]
    images = ImageList.from_tensors(images, backbone.size_divisibility)
    features = backbone(images.tensor.float())

    input_shape = backbone.output_shape()
    in_features = cfg.MODEL.RPN.IN_FEATURES
    anchor_generator = build_anchor_generator(
        cfg, [input_shape[f] for f in in_features])
    anchors = anchor_generator([features[f] for f in in_features])
    anchors = Boxes.cat(anchors).to(device)

    # build matcher
    raw_matcher = Matcher(cfg.MODEL.RPN.IOU_THRESHOLDS,
                          cfg.MODEL.RPN.IOU_LABELS,
                          allow_low_quality_matches=True)
    matcher = TopKMatcher(cfg.MODEL.RPN.IOU_THRESHOLDS,
                          cfg.MODEL.RPN.IOU_LABELS, 9)

    return cfg, data_loader_iter, anchors, matcher, raw_matcher
Example #5
    def label_and_sample_anchors(
        self, anchors: List[Boxes], gt_instances: List[Instances]
    ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
        """
        Args:
            anchors (list[Boxes]): anchors for each feature map.
            gt_instances: the ground-truth instances for each image.

        Returns:
            list[Tensor]:
                List of #img tensors. i-th element is a vector of labels whose length is
                the total number of anchors across all feature maps R = sum(Hi * Wi * A).
                Label values are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative
                class; 1 = positive class.
            list[Tensor]:
                i-th element is a Rx4 tensor. The values are the matched gt boxes for each
                anchor. Values are undefined for those anchors not labeled as 1.
        """
        anchors = Boxes.cat(anchors)

        gt_boxes = [x.gt_boxes for x in gt_instances]
        image_sizes = [x.image_size for x in gt_instances]
        del gt_instances

        gt_labels = []
        matched_gt_boxes = []
        for image_size_i, gt_boxes_i in zip(image_sizes, gt_boxes):
            """
            image_size_i: (h, w) for the i-th image
            gt_boxes_i: ground-truth boxes for i-th image
            """

            match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i,
                                                                   anchors)
            matched_idxs, gt_labels_i = retry_if_cuda_oom(
                self.anchor_matcher)(match_quality_matrix)
            # Matching is memory-expensive and may result in CPU tensors. But the result is small
            gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
            del match_quality_matrix

            if self.anchor_boundary_thresh >= 0:
                # Discard anchors that go out of the boundaries of the image
                # NOTE: This is legacy functionality that is turned off by default in Detectron2
                anchors_inside_image = anchors.inside_box(
                    image_size_i, self.anchor_boundary_thresh)
                gt_labels_i[~anchors_inside_image] = -1

            # A vector of labels (-1, 0, 1) for each anchor
            gt_labels_i = self._subsample_labels(gt_labels_i)

            if len(gt_boxes_i) == 0:
                # These values won't be used anyway since the anchor is labeled as background
                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
            else:
                # TODO wasted indexing computation for ignored boxes
                matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor

            gt_labels.append(gt_labels_i)  # N,AHW
            matched_gt_boxes.append(matched_gt_boxes_i)
        return gt_labels, matched_gt_boxes
Example #6
    def get_ground_truth(self,
                         anchors: List[Boxes],
                         gt_instances: List[Instances],
                         num_classes: int) -> Tuple[List[Tensor],
                                                    List[Tensor]]:
        """
        Extract the ground truth classes and boxes from a list of Instances objects.

        Args:
            anchors (List[Boxes]):          A list of #feature level Boxes. The Boxes contains
                                                anchors of this image on the specific feature
                                                level.
            gt_instances (List[Instances]): A list of N `Instances`s. The i-th `Instances`
                                                contains the ground-truth per-instance annotations
                                                for the i-th input image.
            num_classes (int):              The number of classes.

        Returns:
            gt_classes (List[Tensor]):          List of #img tensors. i-th element is a vector of
                                                    classes whose length is the total number of
                                                    anchors across all feature maps
                                                    (sum(Hi * Wi * A)).
                                                    Label values are in {-1, 0, ..., K}, with -1
                                                    means ignore, and K means background.
            matched_gt_boxes (List[Tensor]):    i-th element is a Rx4 tensor, where R is the total
                                                    number of anchors across feature maps.
                                                    The values are the matched gt boxes for each
                                                    anchor.
                                                    Values are undefined for those anchors not
                                                    labeled as foreground.
        """
        anchors_boxes: Boxes = Boxes.cat(anchors)

        gt_classes: List[Tensor] = []
        matched_gt_boxes: List[Tensor] = []

        for gt_instance in gt_instances:
            match_quality_matrix: Tensor = pairwise_iou(gt_instance.gt_boxes,
                                                        anchors_boxes)
            matched_idxs, anchor_classes = self.anchor_matcher(match_quality_matrix)
            del match_quality_matrix

            if len(gt_instance) > 0:
                matched_gt_boxes_i: Tensor = gt_instance.gt_boxes.tensor[matched_idxs]

                gt_classes_i: Tensor = gt_instance.gt_classes[matched_idxs]

                # Anchors with class 0 are treated as background.
                gt_classes_i[anchor_classes == 0] = num_classes
                # Anchors with class -1 are ignored.
                gt_classes_i[anchor_classes == -1] = -1

            else:
                matched_gt_boxes_i = torch.zeros_like(anchors_boxes.tensor)
                gt_classes_i = torch.zeros_like(matched_idxs) + num_classes

            gt_classes.append(gt_classes_i)
            matched_gt_boxes.append(matched_gt_boxes_i)

        return gt_classes, matched_gt_boxes
Example #7
    def get_proposal_clusters(self, box, proposals, label, cls_prob):
        gt_boxes = []
        gt_classes = []
        gt_scores = []
        if cls_prob.numel() > 0:
            for idx, gt_class in enumerate(label):
                curr_cls_prob = cls_prob.index_select(
                    1, index=gt_class).clone().detach()
                max_gt_score, max_index = curr_cls_prob.max(dim=0)
                gt_boxes.append(box[max_index])
                gt_classes.append(gt_class)
                gt_scores.append(max_gt_score)
                cls_prob[max_index, :] = 0.0
            gt_classes = torch.stack(gt_classes)
            gt_scores = torch.cat(gt_scores)
            new_instance = Instances(box.image_size)
            new_instance.gt_boxes = copy.deepcopy(
                Boxes.cat([x.proposal_boxes for x in gt_boxes]))
            new_instance.gt_classes = label.clone().detach()
        else:
            new_instance = Instances(box.image_size)
            new_instance.gt_boxes = Boxes(torch.zeros(0, self.box_dim)).to(
                cls_prob.device)
            new_instance.gt_classes = torch.zeros(0).to(cls_prob.device)
            gt_scores = torch.zeros(0).to(cls_prob.device)

        return new_instance, gt_scores
Example #8
    def forward(self, image_sizes, box_cls, box_regression, centerness, anchors):
        sampled_boxes = []
        # anchors = list(zip(*anchors))
        for _, (o, b, c, a) in enumerate(zip(box_cls, box_regression, centerness, anchors)):
            sampled_boxes.append(
                self.forward_for_single_feature_map(o, b, c, a)
            )

        boxlists = []
        for i, image_size in enumerate(image_sizes):
            boxlist = Instances(image_size)
            boxes = []
            scores = []
            classes = []
            for j in range(len(anchors)):
                boxes.append(sampled_boxes[j][i][0])
                scores.append(sampled_boxes[j][i][1])
                classes.append(sampled_boxes[j][i][2])
            boxes = Boxes.cat(boxes)
            boxes.clip(image_size)
            keep = boxes.nonempty(self.min_size)
            boxlist.pred_boxes = boxes[keep]
            boxlist.scores = torch.cat(scores, dim=0)[keep]
            boxlist.pred_classes = torch.cat(classes, dim=0)[keep]

            boxlists.append(boxlist)

        # boxlists = list(zip(*sampled_boxes))
        # boxlists = [cat_boxlist(boxlist) for boxlist in boxlists]
        # boxlists = Boxes.cat(boxlists)
        if not self.bbox_aug_enabled:
            boxlists = self.select_over_all_levels(boxlists)

        return boxlists
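A tiny stand-alone sketch (toy values, not tied to this model) of the per-image post-processing primitives used in the loop above: Boxes.cat merges the per-level boxes, clip() clamps them to the image, and nonempty() drops degenerate boxes.

import torch
from detectron2.structures import Boxes

level1 = Boxes(torch.tensor([[-5., -5., 20., 20.]]))   # sticks out of the image
level2 = Boxes(torch.tensor([[30., 30., 30., 45.]]))   # zero-width box

boxes = Boxes.cat([level1, level2])
boxes.clip((50, 50))                  # image size given as (height, width)
keep = boxes.nonempty(threshold=0.0)  # True where width > 0 and height > 0
print(boxes.tensor)                   # first box clamped to [0, 0, 20, 20]
print(keep)                           # tensor([True, False])
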
Example #9
    def get_ground_truth(self, anchors, targets):
        """
        Args:
            anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
                list of #feature level Boxes. The Boxes contains anchors of
                this image on the specific feature level.
            targets (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.  Specify `targets` during training only.

        Returns:
            gt_classes (Tensor):
                An integer tensor of shape (N, R) storing ground-truth
                labels for each anchor.
                R is the total number of anchors, i.e. the sum of Hi x Wi x A for all levels.
                Anchors with an IoU with some target higher than the foreground threshold
                are assigned their corresponding label in the [0, K-1] range.
                Anchors whose IoU are below the background threshold are assigned
                the label "K". Anchors whose IoU are between the foreground and background
                thresholds are assigned a label "-1", i.e. ignore.
            gt_anchors_deltas (Tensor):
                Shape (N, R, 4).
                The last dimension represents ground-truth box2box transform
                targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
                The values in the tensor are meaningful only when the corresponding
                anchor is labeled as foreground.
        """
        gt_classes = []
        gt_anchors_deltas = []
        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
        # list[Tensor(R, 4)], one for each image

        for anchors_per_image, targets_per_image in zip(anchors, targets):
            match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes,
                                                anchors_per_image)
            gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix)

            has_gt = len(targets_per_image) > 0
            if has_gt:
                # ground truth box regression
                matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs]
                gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
                    anchors_per_image.tensor, matched_gt_boxes.tensor)

                gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
                # Anchors with label 0 are treated as background.
                gt_classes_i[anchor_labels == 0] = self.num_classes
                # Anchors with label -1 are ignored.
                gt_classes_i[anchor_labels == -1] = -1
            else:
                gt_classes_i = torch.zeros_like(
                    gt_matched_idxs) + self.num_classes
                gt_anchors_reg_deltas_i = torch.zeros_like(
                    anchors_per_image.tensor)

            gt_classes.append(gt_classes_i)
            gt_anchors_deltas.append(gt_anchors_reg_deltas_i)

        return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
Example #10
 def _join_elements_pred_with_gt(pred_ele, pred_indices, gt_ele,
                                 gt_indices):
     if isinstance(pred_ele, Boxes):
         return Boxes.cat(
             [_deselect(pred_ele, pred_indices), gt_ele[gt_indices]])
     else:
         return torch.cat(
             [_deselect(pred_ele, pred_indices), gt_ele[gt_indices]])
Example #11
    def inference_single_image(self, conf_pred_per_image, loc_pred_per_image,
                               default_boxes, image_size):
        """
        Single-image inference. Return bounding-box detection results by thresholding
        on scores and applying non-maximum suppression (NMS).

        Args:
            conf_pred_per_image (list[Tensor]): list of #feature levels. Each entry contains
                tensor of size [Hi x Wi x D, C].
            loc_pred_per_image (list[Tensor]): same shape as 'conf_pred_per_image' except
                that C becomes 4.
            default_boxes (list['Boxes']):  a list of 'Boxes' elements.
                The Boxes contains default boxes of one image on the specific feature level.
            image_size (tuple(H, W)): a tuple of the image height and width.

        Returns:
            Same as `inference`, but for only one image.
        """
        # predict confidence
        conf_pred = torch.cat(conf_pred_per_image, dim=0)  # [R, C]
        conf_pred = conf_pred.softmax(dim=1)

        # predict boxes
        loc_pred = torch.cat(loc_pred_per_image, dim=0)  # [R, 4]
        default_boxes = Boxes.cat(default_boxes)  # [R, 4]
        boxes_pred = self.box2box_transform.apply_deltas(
            loc_pred, default_boxes.tensor)

        num_boxes, num_classes = conf_pred.shape
        boxes_pred = boxes_pred.view(num_boxes, 1,
                                     4).expand(num_boxes, num_classes,
                                               4)  # [R, C, 4]
        labels = torch.arange(num_classes, device=self.device)  # [0, ..., C-1]
        labels = labels.view(1, num_classes).expand_as(conf_pred)  # [R, C]

        # remove predictions with the background label
        boxes_pred = boxes_pred[:, :-1]
        conf_pred = conf_pred[:, :-1]
        labels = labels[:, :-1]

        # batch everything, by making every class prediction be a separate instance
        boxes_pred = boxes_pred.reshape(-1, 4)
        conf_pred = conf_pred.reshape(-1)
        labels = labels.reshape(-1)

        # remove low scoring boxes
        indices = torch.nonzero(conf_pred > self.score_threshold).squeeze(1)
        boxes_pred, conf_pred, labels = boxes_pred[indices], conf_pred[
            indices], labels[indices]

        keep = batched_nms(boxes_pred, conf_pred, labels, self.nms_threshold)
        keep = keep[:self.max_detections_per_image]

        result = Instances(image_size)
        result.pred_boxes = Boxes(boxes_pred[keep])
        result.scores = conf_pred[keep]
        result.pred_classes = labels[keep]
        return result
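For reference, a small toy illustration (made-up values) of the class-aware NMS call above, assuming batched_nms here is detectron2.layers.batched_nms (same behavior as torchvision.ops.batched_nms): a box is only suppressed by a higher-scoring box of the same class.

import torch
from detectron2.layers import batched_nms

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [1., 1., 11., 11.]])
scores = torch.tensor([0.9, 0.8, 0.7])
classes = torch.tensor([0, 0, 1])   # the third box belongs to a different class

keep = batched_nms(boxes, scores, classes, iou_threshold=0.5)
# Box 1 (same class as box 0, IoU ~ 0.68) is suppressed; box 2 survives because
# NMS is applied per class. `keep` is ordered by descending score: [0, 2].
print(keep.tolist())
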
Example #12
    def label_anchors(self, anchors, gt_instances):
        """
        Args:
            anchors (list[Boxes]): A list of #feature level Boxes.
                The Boxes contains anchors of this image on the specific feature level.
            gt_instances (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.

        Returns:
            list[Tensor]:
                List of #img tensors. i-th element is a vector of labels whose length is
                the total number of anchors across all feature maps (sum(Hi * Wi * A)).
                Label values are in {-1, 0, ..., K}, with -1 means ignore, and K means background.
            list[Tensor]:
                i-th element is a Rx4 tensor, where R is the total number of anchors across
                feature maps. The values are the matched gt boxes for each anchor.
                Values are undefined for those anchors not labeled as foreground.
        """
        anchors = Boxes.cat(anchors)  # Rx4

        gt_labels, gt_labels_1, gt_labels_2 = [], [], []  #change

        matched_gt_boxes = []
        for gt_per_image in gt_instances:
            match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
            matched_idxs, anchor_labels = self.anchor_matcher(
                match_quality_matrix)
            del match_quality_matrix

            if len(gt_per_image) > 0:
                matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]

                gt_labels_i = gt_per_image.gt_classes[matched_idxs]
                gt_labels_i_1 = gt_per_image.gt_classes_1[matched_idxs]
                gt_labels_i_2 = gt_per_image.gt_classes_2[matched_idxs]
                # Anchors with label 0 are treated as background.
                gt_labels_i[anchor_labels == 0] = self.num_classes
                gt_labels_i_1[anchor_labels == 0] = 3
                gt_labels_i_2[anchor_labels == 0] = 3
                # Anchors with label -1 are ignored.
                gt_labels_i[anchor_labels == -1] = -1
                gt_labels_i_1[anchor_labels == -1] = -1
                gt_labels_i_2[anchor_labels == -1] = -1
            else:
                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
                gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes
                gt_labels_i_1 = torch.zeros_like(matched_idxs) + 3
                gt_labels_i_2 = torch.zeros_like(matched_idxs) + 3

            gt_labels.append(gt_labels_i)
            gt_labels_1.append(gt_labels_i_1)
            gt_labels_2.append(gt_labels_i_2)
            matched_gt_boxes.append(matched_gt_boxes_i)

        return gt_labels, gt_labels_1, gt_labels_2, matched_gt_boxes
Example #13
    def match_anchors(self, anchors: List[Boxes],
                      gt_instances: List[Instances]):
        """
        Match anchors with ground truth boxes.

        Args:
            anchors: #level boxes, from the highest resolution to lower resolution
            gt_instances: ground truth instances per image

        Returns:
            List[Tensor]:
                #image tensors, each is a vector of matched gt
                indices (or -1 for unmatched anchors) for all anchors.
        """
        num_anchors_per_level = [len(x) for x in anchors]
        anchors = Boxes.cat(anchors)  # Rx4
        anchor_centers = anchors.get_centers()  # Rx2
        anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0]  # R

        lower_bound = anchor_sizes * 4
        lower_bound[:num_anchors_per_level[0]] = 0
        upper_bound = anchor_sizes * 8
        upper_bound[-num_anchors_per_level[-1]:] = float("inf")

        matched_indices = []
        for gt_per_image in gt_instances:
            gt_centers = gt_per_image.gt_boxes.get_centers()  # Nx2
            # FCOS with center sampling: anchor point must be close enough to gt center.
            pairwise_match = (
                anchor_centers[:, None, :] -
                gt_centers[None, :, :]).abs_().max(
                    dim=2
                ).values < self.center_sampling_radius * anchor_sizes[:, None]
            pairwise_dist = pairwise_point_box_distance(
                anchor_centers, gt_per_image.gt_boxes)

            # The original FCOS anchor matching rule: anchor point must be inside gt
            pairwise_match &= pairwise_dist.min(dim=2).values > 0

            # Multilevel anchor matching in FCOS: each anchor is only responsible
            # for certain scale range.
            pairwise_dist = pairwise_dist.max(dim=2).values
            pairwise_match &= (pairwise_dist > lower_bound[:, None]) & (
                pairwise_dist < upper_bound[:, None])

            # Match the GT box with minimum area, if there are multiple GT matches
            gt_areas = gt_per_image.gt_boxes.area()  # N
            pairwise_match = pairwise_match.to(
                torch.float32) * (1e8 - gt_areas[None, :])
            min_values, matched_idx = pairwise_match.max(
                dim=1)  # R, per-anchor match
            matched_idx[
                min_values < 1e-5] = -1  # Unmatched anchors are assigned -1

            matched_indices.append(matched_idx)
        return matched_indices
Example #14
    def _match_anchors(self, gt_boxes: Boxes, anchors: List[Boxes]):
        """
        Match ground-truth boxes to a set of multi-level anchors.

        Args:
            gt_boxes: Ground-truth boxes from instances of an image.
            anchors: List of anchors for each feature map (of different scales).

        Returns:
            torch.Tensor
                A tensor of shape `(M, R)`, given `M` ground-truth boxes and total
                `R` anchor points from all feature levels, indicating the quality
                of match between m-th box and r-th anchor. Higher value indicates
                better match.
        """
        # Naming convention: (M = ground-truth boxes, R = anchor points)
        # Anchor points are represented as square boxes of size = stride.
        num_anchors_per_level = [len(x) for x in anchors]
        anchors = Boxes.cat(anchors)  # (R, 4)
        anchor_centers = anchors.get_centers()  # (R, 2)
        anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0]  # (R, )

        lower_bound = anchor_sizes * 4
        lower_bound[:num_anchors_per_level[0]] = 0
        upper_bound = anchor_sizes * 8
        upper_bound[-num_anchors_per_level[-1]:] = float("inf")

        gt_centers = gt_boxes.get_centers()

        # FCOS with center sampling: anchor point must be close enough to
        # ground-truth box center.
        center_dists = (anchor_centers[None, :, :] -
                        gt_centers[:, None, :]).abs_()
        sampling_regions = self.center_sampling_radius * anchor_sizes[None, :]

        match_quality_matrix = center_dists.max(
            dim=2).values < sampling_regions

        pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_boxes)
        pairwise_dist = pairwise_dist.permute(1, 0, 2)  # (M, R, 4)

        # The original FCOS anchor matching rule: anchor point must be inside GT.
        match_quality_matrix &= pairwise_dist.min(dim=2).values > 0

        # Multilevel anchor matching in FCOS: each anchor is only responsible
        # for certain scale range.
        pairwise_dist = pairwise_dist.max(dim=2).values
        match_quality_matrix &= (pairwise_dist > lower_bound[None, :]) & (
            pairwise_dist < upper_bound[None, :])
        # Match the GT box with minimum area, if there are multiple GT matches.
        gt_areas = gt_boxes.area()  # (M, )

        match_quality_matrix = match_quality_matrix.to(torch.float32)
        match_quality_matrix *= 1e8 - gt_areas[:, None]
        return match_quality_matrix  # (M, R)
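A toy sketch (made-up quality values) of how such an (M, R) match-quality matrix is usually reduced to per-anchor assignments: take the best ground-truth box for each anchor and mark anchors whose best quality is essentially zero as unmatched (-1). Example #27 further down applies exactly this rule.

import torch

# M = 2 ground-truth boxes, R = 3 anchors; larger value = better match.
match_quality_matrix = torch.tensor([[0.0, 2.0e7, 0.0],
                                     [9.0e7, 0.0, 0.0]])

match_quality, matched_idxs = match_quality_matrix.max(dim=0)  # best gt per anchor
matched_idxs[match_quality < 1e-5] = -1                        # unmatched anchors
print(matched_idxs.tolist())  # [1, 0, -1]
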
Example #15
    def get_graph_centers(self, box, cls_prob, label):
        gt_boxes = []
        gt_classes = []
        gt_scores = []
        for idx, gt_class in enumerate(label):
            curr_cls_prob = cls_prob.index_select(1, index=gt_class)
            non_zero_idxs = torch.where(curr_cls_prob >= 0)[0]
            top_ranking_idxs = self.get_top_ranking_proposals(
                curr_cls_prob[non_zero_idxs])
            non_zero_idxs = non_zero_idxs[top_ranking_idxs]
            curr_box = box[non_zero_idxs]
            curr_cls_prob = curr_cls_prob[non_zero_idxs]

            graph = self.build_graph(curr_box)
            count = curr_cls_prob.size(0)
            keep_idxs = []
            curr_gt_scores = []
            while True:
                order = torch.sum(graph, 1).argsort(descending=True)
                keep_idxs.append(order[0])

                graph_idx = torch.where(graph[order[0], :] > 0)[0]
                curr_gt_scores.append(torch.max(curr_cls_prob[graph_idx]))

                graph[:, graph_idx] = 0
                graph[graph_idx, :] = 0
                count = count - len(graph_idx)
                if count <= 5:
                    break
            keep_idxs = torch.stack(keep_idxs, 0)
            curr_gt_scores = torch.stack(curr_gt_scores, 0)
            curr_gt_boxes = curr_box[keep_idxs]

            keep_idxs_selected = curr_gt_scores.argsort().flip([
                0
            ])[:min(len(curr_gt_scores), self.max_pc_num)].clone().detach()
            gt_boxes.append(curr_gt_boxes[keep_idxs_selected])
            gt_scores.append(curr_gt_scores[keep_idxs_selected])
            gt_classes.append(
                (torch.zeros_like(keep_idxs_selected) + gt_class).long())

            # Delete selected proposals
            ids_to_remove = non_zero_idxs[keep_idxs][keep_idxs_selected]
            indexer = torch.ones(cls_prob.size(0)).to(cls_prob.device)
            indexer[ids_to_remove] = 0.
            indexer_mask = indexer == 1.
            cls_prob = cls_prob.clone().detach()[indexer_mask]
            box = copy.deepcopy(box)[indexer_mask]
        new_instance = Instances(box.image_size)
        new_instance.gt_boxes = copy.deepcopy(
            Boxes.cat([x.proposal_boxes for x in gt_boxes]))
        new_instance.gt_classes = torch.cat(gt_classes)
        gt_scores = torch.cat(gt_scores)
        return new_instance, gt_scores
Example #16
    def label_anchors(self, anchors, gt_instances):
        """
        Args:
            anchors (list[Boxes]): A list of #feature level Boxes.
                The Boxes contains anchors of this image on the specific feature level.
            gt_instances (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.

        Returns:
            list[Tensor]:
                List of #img tensors. i-th element is a vector of labels whose length is
                the total number of anchors across all feature maps (sum(Hi * Wi * A)).
                Label values are in {-1, 0, ..., K}, with -1 means ignore, and K means background.
            list[Tensor]:
                i-th element is a Rx4 tensor, where R is the total number of anchors across
                feature maps. The values are the matched gt boxes for each anchor.
                Values are undefined for those anchors not labeled as foreground.
        """
        anchors = Boxes.cat(anchors)  # Rx4
        num_anchors = anchors.tensor.shape[0]

        gt_labels, matched_gt_boxes, matched_gt_marks, matched_gt_marks_labels = [[] for _ in range(4)]

        for gt_per_image in gt_instances:
            match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
            matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix)
            del match_quality_matrix

            if len(gt_per_image) > 0:
                matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]

                matched_gt_marks_iv = gt_per_image.gt_keypoints.tensor[matched_idxs]
                matched_gt_marks_i = matched_gt_marks_iv[:, :, :2].flatten(1)
                matched_gt_marks_labels_i = matched_gt_marks_iv[:, :, 2].flatten(1)
                matched_gt_marks_labels_i, _ = torch.min(matched_gt_marks_labels_i, dim=1)

                gt_labels_i = gt_per_image.gt_classes[matched_idxs]
                # Anchors with label 0 are treated as background.
                gt_labels_i[anchor_labels == 0] = self.num_classes
                # Anchors with label -1 are ignored.
                gt_labels_i[anchor_labels == -1] = -1
            else:
                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
                gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes
                matched_gt_marks_i = torch.zeros(num_anchors, self.num_landmark * 2).to(self.device)
                matched_gt_marks_labels_i = torch.zeros(num_anchors).to(self.device)

            gt_labels.append(gt_labels_i)
            matched_gt_boxes.append(matched_gt_boxes_i)
            matched_gt_marks.append(matched_gt_marks_i)
            matched_gt_marks_labels.append(matched_gt_marks_labels_i)

        return gt_labels, matched_gt_boxes, matched_gt_marks, matched_gt_marks_labels
Example #17
 def compute_ctrness_targets(self, anchors: List[Boxes], gt_boxes: List[torch.Tensor]):
     anchors = Boxes.cat(anchors).tensor  # Rx4
     reg_targets = [self.box2box_transform.get_deltas(anchors, m) for m in gt_boxes]
     reg_targets = torch.stack(reg_targets, dim=0)  # NxRx4
     if len(reg_targets) == 0:
         return reg_targets.new_zeros(len(reg_targets))
     left_right = reg_targets[:, :, [0, 2]]
     top_bottom = reg_targets[:, :, [1, 3]]
     ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
         top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]
     )
     return torch.sqrt(ctrness)
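A worked numeric check of the centerness formula above (independent of the model, and assuming the box2box transform yields (left, top, right, bottom) distances from the anchor point to the box sides, as in FCOS):

import torch

reg_targets = torch.tensor([[4.0, 1.0, 4.0, 9.0]])  # (l, t, r, b) for one anchor
left_right = reg_targets[:, [0, 2]]
top_bottom = reg_targets[:, [1, 3]]
ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
    top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]
)
print(torch.sqrt(ctrness))  # sqrt((4/4) * (1/9)) = 0.333..., i.e. vertically off-center
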
Example #18
 def _uniform_sample_train_points(self, instances):
     assert self.training
     proposal_boxes = [x.proposal_boxes for x in instances]
     cat_boxes = Boxes.cat(proposal_boxes)
     # uniform sample
     point_coords = torch.rand(
         len(cat_boxes), self.mask_point_train_num_points, 2, device=cat_boxes.tensor.device
     )
     # sample point_labels
     point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords)
     point_labels = sample_point_labels(instances, point_coords_wrt_image)
     return point_coords, point_labels
Example #19
    def write_priors(self, images: Tensor, output_priors: str):
        features = self.backbone(images)
        features = [features[f] for f in self.head_in_features]
        anchors = Boxes.cat(self.anchor_generator(features)).tensor.detach().cpu().numpy()

        with open(output_priors, "wb") as f:
            import struct

            shape = anchors.shape
            f.write(struct.pack("=i", len(shape)))
            f.write(struct.pack("={}".format("i" * len(shape)), *shape))
            data = anchors.reshape([-1])
            for d in data:
                f.write(struct.pack("=f", d))
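A companion sketch (not part of the original code; the function name is illustrative) that reads back the binary layout written above: an int32 rank, then `rank` int32 dimensions, then the float32 prior boxes in row-major order.

import struct
import numpy as np

def read_priors(path: str) -> np.ndarray:
    with open(path, "rb") as f:
        (rank,) = struct.unpack("=i", f.read(4))
        shape = struct.unpack("=" + "i" * rank, f.read(4 * rank))
        count = int(np.prod(shape))
        data = struct.unpack("=" + "f" * count, f.read(4 * count))
    return np.asarray(data, dtype=np.float32).reshape(shape)
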
Example #20
    def _get_ground_truth(self):
        """
        Returns:
            gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is the
                total number of anchors in image i (i.e., len(anchors[i])). Label values are
                in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
            gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), 4).
        """
        gt_objectness_logits = []
        gt_anchor_deltas = []
        # Concatenate anchors from all feature maps into a single Boxes per image
        # anchors_i: list(Boxes), the anchors of all feature maps for the i-th image
        anchors = [Boxes.cat(anchors_i) for anchors_i in self.anchors]
        for image_size_i, anchors_i, gt_boxes_i in zip(self.image_sizes, anchors, self.gt_boxes):
            """
            image_size_i: (h, w) for the i-th image
            anchors_i: anchors for i-th image
            gt_boxes_i: ground-truth boxes for i-th image
            """
            match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors_i)
            # [N, ]
            matched_idxs, gt_objectness_logits_i = retry_if_cuda_oom(self.anchor_matcher)(
                match_quality_matrix
            )
            # Matching is memory-expensive and may result in CPU tensors. But the result is small
            gt_objectness_logits_i = gt_objectness_logits_i.to(device=gt_boxes_i.device)
            del match_quality_matrix

            if self.boundary_threshold >= 0:
                # Discard anchors that go out of the boundaries of the image
                # NOTE: This is legacy functionality that is turned off by default in Detectron2
                anchors_inside_image = anchors_i.inside_box(image_size_i, self.boundary_threshold)
                gt_objectness_logits_i[~anchors_inside_image] = -1

            if len(gt_boxes_i) == 0:
                # These values won't be used anyway since the anchor is labeled as background
                gt_anchor_deltas_i = torch.zeros_like(anchors_i.tensor)
            else:
                # TODO wasted computation for ignored boxes
                matched_gt_boxes = gt_boxes_i[matched_idxs]
                # [N, 4]
                gt_anchor_deltas_i = self.box2box_transform.get_deltas(
                    anchors_i.tensor, matched_gt_boxes.tensor
                )

            gt_objectness_logits.append(gt_objectness_logits_i)
            gt_anchor_deltas.append(gt_anchor_deltas_i)

        return gt_objectness_logits, gt_anchor_deltas
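The delta computation at the end of the loop uses detectron2's Box2BoxTransform. A stand-alone sketch with toy boxes and unit weights (the project's actual weights may differ): get_deltas encodes each (anchor, matched gt) pair as (dx, dy, dw, dh), and apply_deltas inverts the encoding.

import torch
from detectron2.modeling.box_regression import Box2BoxTransform

box2box_transform = Box2BoxTransform(weights=(1.0, 1.0, 1.0, 1.0))
anchors = torch.tensor([[0., 0., 10., 10.]])
gt_boxes = torch.tensor([[2., 2., 14., 14.]])

deltas = box2box_transform.get_deltas(anchors, gt_boxes)
recovered = box2box_transform.apply_deltas(deltas, anchors)
print(deltas)      # (dx, dy, dw, dh) for the single anchor
print(recovered)   # matches gt_boxes up to floating-point error
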
Example #21
 def ga_sampled_approxs(self, images, features, gt_instances):
     approxs = self.approx_anchor_generator(features)
     approxs = Boxes.cat(approxs)
     image_sizes = [x.image_size for x in gt_instances]
     inside_flags_list = []
     for image_size_i in image_sizes:
         if self.boundary_threshold >= 0:
             inside_flags = approxs.inside_box(image_size_i, self.boundary_threshold)
             inside_flags_list.append(inside_flags)
     inside_flags = torch.stack(inside_flags_list, 0).sum(dim=0) > 0

     return approxs, inside_flags_list
Example #22
    def __call__(self, box_cls, box_regression, centerness, gt_instances, anchors):
        labels, reg_targets = self.prepare_targets(gt_instances, anchors)

        N = len(labels)
        box_cls_flatten, box_regression_flatten = concat_box_prediction_layers(box_cls, box_regression)
        centerness_flatten = [ct.permute(0, 2, 3, 1).reshape(N, -1, 1) for ct in centerness]
        centerness_flatten = torch.cat(centerness_flatten, dim=1).reshape(-1)

        labels_flatten = torch.cat(labels, dim=0)
        reg_targets_flatten = torch.cat(reg_targets, dim=0)
        # anchors_flatten = torch.cat([cat_boxlist(anchors_per_image).bbox for anchors_per_image in anchors], dim=0)
        anchors_flatten = torch.cat([Boxes.cat(anchors).tensor for _ in range(N)], dim=0)

        pos_inds = torch.nonzero(labels_flatten != self.num_classes).squeeze(1)

        num_gpus = get_num_gpus()
        total_num_pos = reduce_sum(pos_inds.new_tensor([pos_inds.numel()])).item()
        num_pos_avg_per_gpu = max(total_num_pos / float(num_gpus), 1.0)

        # one hot label for focal loss
        class_target = torch.zeros_like(box_cls_flatten)
        class_target[pos_inds, labels_flatten[pos_inds]] = 1

        cls_loss = sigmoid_focal_loss_jit(
            box_cls_flatten,
            class_target,
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum"
        ) / num_pos_avg_per_gpu

        box_regression_flatten = box_regression_flatten[pos_inds]
        reg_targets_flatten = reg_targets_flatten[pos_inds]
        anchors_flatten = anchors_flatten[pos_inds]
        centerness_flatten = centerness_flatten[pos_inds]
        centerness_targets = self.compute_centerness_targets(reg_targets_flatten, anchors_flatten)
        sum_centerness_targets_avg_per_gpu = reduce_sum(centerness_targets.sum()).item() / float(num_gpus)

        if pos_inds.numel() > 0:
            reg_loss = self.DIoULoss(box_regression_flatten, reg_targets_flatten, anchors_flatten,
                                     weight=centerness_targets) / sum_centerness_targets_avg_per_gpu
            centerness_loss = self.centerness_loss_func(centerness_flatten, centerness_targets) / num_pos_avg_per_gpu
        else:
            reg_loss = box_regression_flatten.sum()
            centerness_loss = centerness_flatten.sum()

        return cls_loss, reg_loss * self.cfg.MODEL.ATSS.REG_LOSS_WEIGHT, centerness_loss
Example #23
def point_sample_fine_grained_features(features_list, feature_scales, boxes,
                                       point_coords):
    """
    Get features from feature maps in `features_list` that correspond to specific point coordinates
        inside each bounding box from `boxes`.

    Args:
        features_list (list[Tensor]): A list of feature map tensors to get features from.
        feature_scales (list[float]): A list of scales for tensors in `features_list`.
        boxes (list[Boxes]): A list of I Boxes  objects that contain R_1 + ... + R_I = R boxes all
            together.
        point_coords (Tensor): A tensor of shape (R, P, 2) that contains
            [0, 1] x [0, 1] box-normalized coordinates of the P sampled points.

    Returns:
        point_features (Tensor): A tensor of shape (R, C, P) that contains features sampled
            from all features maps in feature_list for P sampled points for all R boxes in `boxes`.
        point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains image-level
            coordinates of P points.
    """
    cat_boxes = Boxes.cat(boxes)
    num_boxes = [len(b) for b in boxes]

    point_coords_wrt_image = get_point_coords_wrt_image(
        cat_boxes.tensor, point_coords)
    split_point_coords_wrt_image = torch.split(point_coords_wrt_image,
                                               num_boxes)

    point_features = []
    for idx_img, point_coords_wrt_image_per_image in enumerate(
            split_point_coords_wrt_image):
        point_features_per_image = []
        for idx_feature, feature_map in enumerate(features_list):
            h, w = feature_map.shape[-2:]
            scale = torch.tensor(
                [w, h],
                device=feature_map.device) / feature_scales[idx_feature]
            point_coords_scaled = point_coords_wrt_image_per_image / scale
            point_features_per_image.append(
                point_sample(
                    feature_map[idx_img].unsqueeze(0),
                    point_coords_scaled.unsqueeze(0),
                    align_corners=False,
                ).squeeze(0).transpose(1, 0))
        point_features.append(cat(point_features_per_image, dim=1))

    return cat(point_features, dim=0), point_coords_wrt_image
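The box-normalized coordinates described in the docstring are mapped to image coordinates by `get_point_coords_wrt_image`. A toy illustration of that mapping for a single box, assuming the PointRend convention that (u, v) in [0, 1] x [0, 1] scales with the box width/height and offsets by the box corner:

import torch

box = torch.tensor([10., 20., 30., 60.])      # (x0, y0, x1, y1)
point_coords = torch.tensor([[0.5, 0.5],      # box center
                             [0.0, 1.0]])     # bottom-left corner of the box

wh = box[2:] - box[:2]                        # box width and height: (20, 40)
xy = point_coords * wh + box[:2]              # absolute (x, y) in the image
print(xy)  # tensor([[20., 40.], [10., 60.]])
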
Example #24
    def label_anchors(self, anchors, gt_instances):
        """
        Args:
            anchors (list[Boxes]): A list of #feature level Boxes.
                The Boxes contains anchors of this image on the specific feature level.
            gt_instances (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.

        Returns:
            list[Tensor]:
                List of #img tensors. i-th element is a vector of labels whose length is
                the total number of anchors across all feature maps (sum(Hi * Wi * A)).
                Label values are in {-1, 0, ..., K}, with -1 means ignore, and K means background.
            list[Tensor]:
                i-th element is a Rx4 tensor, where R is the total number of anchors across
                feature maps. The values are the matched gt boxes for each anchor.
                Values are undefined for those anchors not labeled as foreground.
        """

        # generate strides: [R]
        strides = []
        backbone_shape = self.backbone.output_shape()
        feature_shapes = [backbone_shape[f] for f in self.in_features]
        for i in range(len(feature_shapes)):
            stride = feature_shapes[i].stride
            anchor_num_i = anchors[i].tensor.shape[0]
            stride = torch.full((anchor_num_i,), stride, device=anchors[i].tensor.device)
            strides.append(stride)
        anchors = Boxes.cat(anchors).tensor
        centers = torch.stack(((anchors[:, 0] + anchors[:, 2]) // 2, (anchors[:, 1] + anchors[:, 3]) // 2), dim=1)
        strides = torch.cat(strides, 0)

        gt_labels = []
        matched_gt_boxes = []
        for gt_per_image in gt_instances:
            image_size = gt_per_image.image_size
            centers_invalid = (centers[:, 0] >= image_size[1]).logical_or(
                centers[:, 1] >= image_size[0])

            objectness_label_i, bbox_label_i = rep_points_match_with_classes(
                centers, strides, gt_per_image.gt_boxes, gt_per_image.gt_classes)
            objectness_label_i[centers_invalid] = -1
            gt_labels.append(objectness_label_i)
            matched_gt_boxes.append(bbox_label_i)
        return gt_labels, matched_gt_boxes
Example #25
    def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes,
                  images):
        """
        Arguments:
            pred_logits, pred_deltas, pred_masks: Same as the output of:
                meth:`TensorMaskHead.forward`
            anchors, indexes: Same as the input of meth:`TensorMask.get_ground_truth`
            images (ImageList): the input images

        Returns:
            results (List[Instances]): a list of #images elements.
        """
        assert len(anchors) == len(images)
        results = []

        pred_logits = [
            permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits
        ]
        pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas]

        pred_logits = cat(pred_logits, dim=1)
        pred_deltas = cat(pred_deltas, dim=1)

        for img_idx, (anchors_im,
                      indexes_im) in enumerate(zip(anchors, indexes)):
            # Get the size of the current image
            image_size = images.image_sizes[img_idx]

            logits_im = pred_logits[img_idx]
            deltas_im = pred_deltas[img_idx]

            if self.mask_on:
                masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks]
            else:
                masks_im = [None] * self.num_levels
            results_im = self.inference_single_image(
                logits_im,
                deltas_im,
                masks_im,
                Boxes.cat(anchors_im),
                cat(indexes_im),
                tuple(image_size),
            )
            results.append(results_im)
        return results
Example #26
 def _sample_train_points(self, coarse_mask, instances):
     assert self.training
     gt_classes = cat([x.gt_classes for x in instances])
     with torch.no_grad():
         # sample point_coords
         point_coords = get_uncertain_point_coords_with_randomness(
             coarse_mask,
             lambda logits: calculate_uncertainty(logits, gt_classes),
             self.mask_point_train_num_points,
             self.mask_point_oversample_ratio,
             self.mask_point_importance_sample_ratio,
         )
         # sample point_labels
         proposal_boxes = [x.proposal_boxes for x in instances]
         cat_boxes = Boxes.cat(proposal_boxes)
         point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords)
         point_labels = sample_point_labels(instances, point_coords_wrt_image)
     return point_coords, point_labels
Example #27
    def label_anchors(self, anchors: List[Boxes],
                      gt_instances: List[Instances]):
        """
        Same interface as :meth:`RetinaNet.label_anchors`, but implemented with FCOS
        anchor matching rule.

        Unlike RetinaNet, there are no ignored anchors.
        """

        gt_labels, matched_gt_boxes = [], []

        for inst in gt_instances:
            if len(inst) > 0:
                match_quality_matrix = self._match_anchors(
                    inst.gt_boxes, anchors)

                # Find matched ground-truth box per anchor. Un-matched anchors are
                # assigned -1. This is equivalent to using an anchor matcher as used
                # in R-CNN/RetinaNet: `Matcher(thresholds=[1e-5], labels=[0, 1])`
                match_quality, matched_idxs = match_quality_matrix.max(dim=0)
                matched_idxs[match_quality < 1e-5] = -1

                matched_gt_boxes_i = inst.gt_boxes.tensor[matched_idxs.clip(
                    min=0)]
                gt_labels_i = inst.gt_classes[matched_idxs.clip(min=0)]

                # Anchors with matched_idxs = -1 are labeled background.
                gt_labels_i[matched_idxs < 0] = self.num_classes
            else:
                matched_gt_boxes_i = torch.zeros_like(
                    Boxes.cat(anchors).tensor)
                gt_labels_i = torch.full(
                    (len(matched_gt_boxes_i), ),
                    fill_value=self.num_classes,
                    dtype=torch.long,
                    device=matched_gt_boxes_i.device,
                )

            gt_labels.append(gt_labels_i)
            matched_gt_boxes.append(matched_gt_boxes_i)

        return gt_labels, matched_gt_boxes
Example #28
def _get_boxes_from_image(image, scale_xy=None):
    """Extract boxes from image created by `_get_image_with_box()`"""
    cur_img_int = ((image / 10.0 + 0.5).int().float() * 10.0).int()
    values = torch.unique(cur_img_int)
    gt_values = [x * 10 for x in range(len(values))]
    assert set(values.tolist()) == set(gt_values)
    boxes = []
    for idx in range(cur_img_int.shape[0]):
        val = torch.unique(cur_img_int[idx]).tolist()
        val = max(val)
        if val == 0:
            continue
        # mask = (cur_img_int[idx, :, :] == val).int()
        mask = (cur_img_int[idx, :, :] > 0).int()
        box_xywh = bu.get_box_from_mask(mask.numpy())
        boxes.append(bu.to_boxes_from_xywh(box_xywh))
    ret = Boxes.cat(boxes)
    if scale_xy is not None:
        ret.scale(*scale_xy)
    return ret
Example #29
    def prepare_iou_based_targets(self, targets, anchors):
        """Compute IoU-based targets"""

        cls_labels = []
        reg_targets = []
        matched_idx_all = []
        for im_i in range(len(targets)):
            targets_per_im = targets[im_i]
            assert targets_per_im.mode == "xyxy"
            bboxes_per_im = targets_per_im.bbox
            labels_per_im = targets_per_im.get_field("labels")
            anchors_per_im = Boxes.cat(anchors[im_i])
            num_gt = bboxes_per_im.shape[0]

            match_quality_matrix = boxlist_iou(targets_per_im, anchors_per_im)
            matched_idxs = self.matcher(match_quality_matrix)
            targets_per_im = targets_per_im.copy_with_fields(['labels'])
            matched_targets = targets_per_im[matched_idxs.clamp(min=0)]

            cls_labels_per_im = matched_targets.get_field("labels")
            cls_labels_per_im = cls_labels_per_im.to(dtype=torch.float32)

            # Background (negative examples)
            bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD
            cls_labels_per_im[bg_indices] = 0

            # discard indices that are between thresholds
            inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS
            cls_labels_per_im[inds_to_discard] = -1

            matched_gts = matched_targets.bbox
            matched_idx_all.append(matched_idxs.view(1, -1))

            reg_targets_per_im = self.box_coder.encode(matched_gts,
                                                       anchors_per_im.tensor)
            cls_labels.append(cls_labels_per_im)
            reg_targets.append(reg_targets_per_im)

        return cls_labels, reg_targets, matched_idx_all
Example #30
 def func_cat(x: torch.Tensor):
     boxes1 = Boxes(x)
     boxes2 = Boxes(x)
     # boxes3 = Boxes.cat([boxes1, boxes2])  # this is not supported by torchscript for now.
     boxes3 = boxes1.cat([boxes1, boxes2])
     return boxes3
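A quick eager-mode usage check of func_cat as defined above (illustrative only; whether the Boxes.cat classmethod can be scripted directly depends on the torchscript/detectron2 version):

import torch
from detectron2.structures import Boxes

x = torch.tensor([[0., 0., 5., 5.], [1., 1., 4., 4.]])
boxes3 = func_cat(x)
assert isinstance(boxes3, Boxes) and len(boxes3) == 2 * len(x)
print(boxes3.tensor)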