from detectron2.structures import Boxes, Instances


def convert_output(output):
    r = Instances(tuple(output[0]))
    r.pred_classes = output[1]
    r.pred_boxes = Boxes(output[2])
    r.scores = output[3]
    return r
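A quick sanity check for convert_output, using hand-made dummy tensors (the values below are hypothetical; only the tuple layout of image size, classes, boxes and scores is taken from the function above):

import torch

dummy_output = (
    (480, 640),                          # (height, width)
    torch.tensor([1, 3]),                # predicted class ids
    torch.tensor([[0., 0., 10., 10.],
                  [5., 5., 20., 20.]]),  # XYXY boxes
    torch.tensor([0.9, 0.75]),           # confidence scores
)
instances = convert_output(dummy_output)
assert len(instances) == 2
assert instances.image_size == (480, 640)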
Example #2
import torch

from detectron2.structures import BitMasks, Boxes, Instances


def get_empty_instance(h, w):
    inst = Instances((h, w))
    inst.gt_boxes = Boxes(torch.rand(0, 4))
    inst.gt_classes = torch.tensor([]).to(dtype=torch.int64)
    inst.gt_masks = BitMasks(torch.rand(0, h, w))
    return inst
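A small sketch of how such an empty Instances behaves: its fields are typed but zero-length, so it concatenates cleanly with real ground truth via Instances.cat (sizes and labels below are made up):

import torch
from detectron2.structures import BitMasks, Boxes, Instances

empty = get_empty_instance(32, 32)
assert len(empty) == 0

real = Instances((32, 32))
real.gt_boxes = Boxes(torch.tensor([[1., 1., 8., 8.]]))
real.gt_classes = torch.tensor([2], dtype=torch.int64)
real.gt_masks = BitMasks(torch.zeros(1, 32, 32, dtype=torch.bool))

merged = Instances.cat([empty, real])
assert len(merged) == 1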
Example #3
    def _inference_one_image(self, input):
        """
        Args:
            input (dict): one dataset dict

        Returns:
            dict: one output dict
        """
        augmented_inputs = self.tta_mapper(input)

        do_hflip = [k.pop("horiz_flip", False) for k in augmented_inputs]
        heights = [k["height"] for k in augmented_inputs]
        widths = [k["width"] for k in augmented_inputs]
        assert (
            len(set(heights)) == 1 and len(set(widths)) == 1
        ), "Augmented version of the inputs should have the same original resolution!"
        height = heights[0]
        width = widths[0]

        # 1. Detect boxes from all augmented versions
        # 1.1: forward with all augmented images
        with self._turn_off_roi_head("mask_on"), self._turn_off_roi_head(
                "keypoint_on"):
            # temporarily disable mask/keypoint head
            outputs = self._batch_inference(augmented_inputs,
                                            do_postprocess=False)
        # 1.2: union the results
        all_boxes = []
        all_scores = []
        all_classes = []
        for idx, output in enumerate(outputs):
            rescaled_output = detector_postprocess(output, height, width)
            pred_boxes = rescaled_output.pred_boxes.tensor
            if do_hflip[idx]:
                pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]]
            all_boxes.append(pred_boxes)
            all_scores.extend(rescaled_output.scores)
            all_classes.extend(rescaled_output.pred_classes)
        all_boxes = torch.cat(all_boxes, dim=0).cpu()
        num_boxes = len(all_boxes)

        # 1.3: select from the union of all results
        num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
        # +1 because fast_rcnn_inference expects background scores as well
        all_scores_2d = torch.zeros(num_boxes,
                                    num_classes + 1,
                                    device=all_boxes.device)
        for idx, cls, score in zip(count(), all_classes, all_scores):
            all_scores_2d[idx, cls] = score

        merged_instances, _ = fast_rcnn_inference_single_image(
            all_boxes,
            all_scores_2d,
            (height, width),
            1e-8,
            self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
            self.cfg.TEST.DETECTIONS_PER_IMAGE,
        )

        if not self.cfg.MODEL.MASK_ON:
            return {"instances": merged_instances}

        # 2. Use the detected boxes to obtain masks
        # 2.1: rescale the detected boxes
        augmented_instances = []
        for idx, input in enumerate(augmented_inputs):
            actual_height, actual_width = input["image"].shape[1:3]
            scale_x = actual_width * 1.0 / width
            scale_y = actual_height * 1.0 / height
            pred_boxes = merged_instances.pred_boxes.clone()
            pred_boxes.tensor[:, 0::2] *= scale_x
            pred_boxes.tensor[:, 1::2] *= scale_y
            if do_hflip[idx]:
                pred_boxes.tensor[:, [0, 2]] = actual_width - pred_boxes.tensor[:, [2, 0]]

            aug_instances = Instances(
                image_size=(actual_height, actual_width),
                pred_boxes=pred_boxes,
                pred_classes=merged_instances.pred_classes,
                scores=merged_instances.scores,
            )
            augmented_instances.append(aug_instances)
        # 2.2: run forward on the detected boxes
        outputs = self._batch_inference(augmented_inputs,
                                        augmented_instances,
                                        do_postprocess=False)
        for idx, output in enumerate(outputs):
            if do_hflip[idx]:
                output.pred_masks = output.pred_masks.flip(dims=[3])
        # 2.3: average the predictions
        all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
        avg_pred_masks = torch.mean(all_pred_masks, dim=0)
        output = outputs[0]
        output.pred_masks = avg_pred_masks
        output = detector_postprocess(output, height, width)
        return {"instances": output}
Example #4
    def inference_single_image(self, pred_logits, pred_deltas, pred_masks,
                               anchors, indexes, image_size):
        """
        Single-image inference. Return bounding-box detection results by thresholding
        on scores and applying non-maximum suppression (NMS).

        Arguments:
            pred_logits (list[Tensor]): list of #feature levels. Each entry contains
                tensor of size (AxHxW, K)
            pred_deltas (list[Tensor]): Same shape as 'pred_logits' except that K becomes 4.
            pred_masks (list[list[Tensor]]): List of #feature levels, each is a list of #anchors.
                Each entry contains tensor of size (M_i*M_i, H, W). `None` if mask_on=False.
            anchors (list[Boxes]): list of #feature levels. Each entry contains
                a Boxes object, which contains all the anchors for that
                image in that feature level.
            image_size (tuple(H, W)): a tuple of the image height and width.

        Returns:
            Same as `inference`, but for only one image.
        """
        pred_logits = pred_logits.flatten().sigmoid_()
        # We get top locations across all levels to accelerate the inference speed,
        # which does not seem to affect the accuracy.
        # First select values above the threshold
        logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0]
        # Then get the top values
        num_topk = min(self.topk_candidates, logits_top_idxs.shape[0])
        pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort(
            descending=True)
        # Keep top k scoring values
        pred_prob = pred_prob[:num_topk]
        # Keep top k values
        top_idxs = logits_top_idxs[topk_idxs[:num_topk]]

        # class index
        cls_idxs = top_idxs % self.num_classes
        # HWA index
        top_idxs //= self.num_classes
        # predict boxes
        pred_boxes = self.box2box_transform.apply_deltas(
            pred_deltas[top_idxs], anchors[top_idxs].tensor)
        # apply nms
        keep = batched_nms(pred_boxes, pred_prob, cls_idxs, self.nms_threshold)
        # pick the top ones
        keep = keep[:self.detections_im]

        results = Instances(image_size)
        results.pred_boxes = Boxes(pred_boxes[keep])
        results.scores = pred_prob[keep]
        results.pred_classes = cls_idxs[keep]

        # deal with masks
        result_masks, result_anchors = [], None
        if self.mask_on:
            # index and anchors, useful for masks
            top_indexes = indexes[top_idxs]
            top_anchors = anchors[top_idxs]
            result_indexes = top_indexes[keep]
            result_anchors = top_anchors[keep]
            # Get masks and do sigmoid
            for lvl, _, h, w, anc in result_indexes.tolist():
                cur_size = self.mask_sizes[anc] * (2**lvl
                                                   if self.bipyramid_on else 1)
                result_masks.append(
                    torch.sigmoid(pred_masks[lvl][anc][:, h, w].view(
                        1, cur_size, cur_size)))

        return results, (result_masks, result_anchors)
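Both this example and the next decode a flattened (location, class) index with modulo and floor division. A small sketch, assuming the K class scores of each location are laid out contiguously:

import torch

num_classes = 80
flat_idxs = torch.tensor([0, 79, 80, 161])
cls_idxs = flat_idxs % num_classes   # class index within each location
loc_idxs = flat_idxs // num_classes  # location (HWA) index
assert cls_idxs.tolist() == [0, 79, 0, 1]
assert loc_idxs.tolist() == [0, 0, 1, 2]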
Example #5
    def inference_single_image(self, box_cls, box_delta, anchors, image_size):
        """
        Single-image inference. Return bounding-box detection results by thresholding
        on scores and applying non-maximum suppression (NMS).

        Arguments:
            box_cls (list[Tensor]): list of #feature levels. Each entry contains
                tensor of size (H x W x A, K)
            box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
            anchors (list[Boxes]): list of #feature levels. Each entry contains
                a Boxes object, which contains all the anchors for that
                image in that feature level.
            image_size (tuple(H, W)): a tuple of the image height and width.

        Returns:
            Same as `inference`, but for only one image.
        """
        boxes_all = []
        scores_all = []
        class_idxs_all = []

        # Iterate over every feature level
        for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta,
                                                   anchors):
            # (HxWxAxK,)
            box_cls_i = box_cls_i.flatten().sigmoid_()

            # Keep top k top scoring indices only.
            num_topk = min(self.topk_candidates, box_reg_i.size(0))
            # torch.sort is actually faster than .topk (at least on GPUs)
            predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
            predicted_prob = predicted_prob[:num_topk]
            topk_idxs = topk_idxs[:num_topk]

            # filter out the proposals with low confidence score
            keep_idxs = predicted_prob > self.score_threshold
            predicted_prob = predicted_prob[keep_idxs]
            topk_idxs = topk_idxs[keep_idxs]

            anchor_idxs = topk_idxs // self.num_classes
            classes_idxs = topk_idxs % self.num_classes

            box_reg_i = box_reg_i[anchor_idxs]
            anchors_i = anchors_i[anchor_idxs]
            # predict boxes
            predicted_boxes = self.box2box_transform.apply_deltas(
                box_reg_i, anchors_i.tensor)

            boxes_all.append(predicted_boxes)
            scores_all.append(predicted_prob)
            class_idxs_all.append(classes_idxs)

        boxes_all, scores_all, class_idxs_all = [
            cat(x) for x in [boxes_all, scores_all, class_idxs_all]
        ]
        keep = batched_nms(boxes_all, scores_all, class_idxs_all,
                           self.nms_threshold)
        keep = keep[:self.max_detections_per_image]

        result = Instances(image_size)
        result.pred_boxes = Boxes(boxes_all[keep])
        result.scores = scores_all[keep]
        result.pred_classes = class_idxs_all[keep]
        return result
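The per-class NMS used in these examples is typically detectron2.layers.batched_nms, which is class-aware: boxes with different class ids never suppress each other. A minimal sketch with hand-made boxes and a 0.5 IoU threshold:

import torch
from detectron2.layers import batched_nms
from detectron2.structures import Boxes, Instances

boxes = torch.tensor([
    [0., 0., 10., 10.],  # class 0, score 0.9
    [1., 1., 11., 11.],  # class 0, score 0.8, overlaps the first -> suppressed
    [0., 0., 10., 10.],  # class 1, score 0.7, same box but other class -> kept
])
scores = torch.tensor([0.9, 0.8, 0.7])
classes = torch.tensor([0, 0, 1])

keep = batched_nms(boxes, scores, classes, iou_threshold=0.5)

result = Instances((20, 20))
result.pred_boxes = Boxes(boxes[keep])
result.scores = scores[keep]
result.pred_classes = classes[keep]
assert len(result) == 2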
    def test_rrpn(self):
        torch.manual_seed(121)
        cfg = get_cfg()
        cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RRPN"
        cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator"
        cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
        cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1]]
        cfg.MODEL.ANCHOR_GENERATOR.ANGLES = [[0, 60]]
        cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1)
        cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead"
        backbone = build_backbone(cfg)
        proposal_generator = build_proposal_generator(cfg,
                                                      backbone.output_shape())
        num_images = 2
        images_tensor = torch.rand(num_images, 20, 30)
        image_sizes = [(10, 10), (20, 30)]
        images = ImageList(images_tensor, image_sizes)
        image_shape = (15, 15)
        num_channels = 1024
        features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
        gt_boxes = torch.tensor([[2, 2, 2, 2, 0], [4, 4, 4, 4, 0]],
                                dtype=torch.float32)
        gt_instances = Instances(image_shape)
        gt_instances.gt_boxes = RotatedBoxes(gt_boxes)
        with EventStorage():  # capture events in a new storage to discard them
            proposals, proposal_losses = proposal_generator(
                images, features, [gt_instances[0], gt_instances[1]])

        expected_losses = {
            "loss_rpn_cls": torch.tensor(0.0432923734),
            "loss_rpn_loc": torch.tensor(0.1552739739),
        }
        for name in expected_losses.keys():
            self.assertTrue(
                torch.allclose(proposal_losses[name], expected_losses[name]))

        expected_proposal_boxes = [
            RotatedBoxes(
                torch.tensor([
                    [
                        0.60189795, 1.24095452, 61.98131943, 18.03621292,
                        -4.07244873
                    ],
                    [
                        15.64940453, 1.69624567, 59.59749603, 16.34339333,
                        2.62692475
                    ],
                    [
                        -3.02982378, -2.69752932, 67.90952301, 59.62455750,
                        59.97010040
                    ],
                    [
                        16.71863365, 1.98309708, 35.61507797, 32.81484985,
                        62.92267227
                    ],
                    [
                        0.49432933, -7.92979717, 67.77606201, 62.93098450,
                        -1.85656738
                    ],
                    [
                        8.00880814, 1.36017394, 121.81007385, 32.74150467,
                        50.44297409
                    ],
                    [
                        16.44299889, -4.82221127, 63.39775848, 61.22503662,
                        54.12270737
                    ],
                    [
                        5.00000000, 5.00000000, 10.00000000, 10.00000000,
                        -0.76943970
                    ],
                    [
                        17.64130402, -0.98095351, 61.40377808, 16.28918839,
                        55.53118134
                    ],
                    [
                        0.13016054, 4.60568953, 35.80157471, 32.30180359,
                        62.52872086
                    ],
                    [
                        -4.26460743, 0.39604485, 124.30079651, 31.84611320,
                        -1.58203125
                    ],
                    [
                        7.52815342, -0.91636634, 62.39784622, 15.45565224,
                        60.79549789
                    ],
                ])),
            RotatedBoxes(
                torch.tensor([
                    [
                        0.07734215, 0.81635046, 65.33510590, 17.34688377,
                        -1.51821899
                    ],
                    [
                        -3.41833067, -3.11320257, 64.17595673, 60.55617905,
                        58.27033234
                    ],
                    [
                        20.67383385, -6.16561556, 63.60531998, 62.52315903,
                        54.85546494
                    ],
                    [
                        15.00000000, 10.00000000, 30.00000000, 20.00000000,
                        -0.18218994
                    ],
                    [
                        9.22646523, -6.84775209, 62.09895706, 65.46472931,
                        -2.74307251
                    ],
                    [
                        15.00000000, 4.93451595, 30.00000000, 9.86903191,
                        -0.60272217
                    ],
                    [
                        8.88342094, 2.65560246, 120.95362854, 32.45022202,
                        55.75970078
                    ],
                    [
                        16.39088631, 2.33887148, 34.78761292, 35.61492920,
                        60.81977463
                    ],
                    [
                        9.78298569, 10.00000000, 19.56597137, 20.00000000,
                        -0.86660767
                    ],
                    [
                        1.28576660, 5.49873352, 34.93610382, 33.22600174,
                        60.51599884
                    ],
                    [
                        17.58912468, -1.63270092, 62.96052551, 16.45713997,
                        52.91245270
                    ],
                    [
                        5.64749718, -1.90428460, 62.37649155, 16.19474792,
                        61.09543991
                    ],
                    [
                        0.82255805, 2.34931135, 118.83985901, 32.83671188,
                        56.50753784
                    ],
                    [
                        -5.33874989, 1.64404404, 125.28501892, 33.35424042,
                        -2.80731201
                    ],
                ])),
        ]

        expected_objectness_logits = [
            torch.tensor([
                0.10111768,
                0.09112845,
                0.08466332,
                0.07589971,
                0.06650183,
                0.06350251,
                0.04299347,
                0.01864817,
                0.00986163,
                0.00078543,
                -0.04573630,
                -0.04799230,
            ]),
            torch.tensor([
                0.11373727,
                0.09377633,
                0.05281663,
                0.05143715,
                0.04040275,
                0.03250912,
                0.01307789,
                0.01177734,
                0.00038105,
                -0.00540255,
                -0.01194804,
                -0.01461012,
                -0.03061717,
                -0.03599222,
            ]),
        ]

        torch.set_printoptions(precision=8, sci_mode=False)

        for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip(
                proposals, expected_proposal_boxes, image_sizes,
                expected_objectness_logits):
            self.assertEqual(len(proposal), len(expected_proposal_box))
            self.assertEqual(proposal.image_size, im_size)
            # It seems that there's some randomness in the result across different machines:
            # This test can be run on a local machine for 100 times with exactly the same result,
            # However, a different machine might produce slightly different results,
            # thus the atol here.
            err_msg = "computed proposal boxes = {}, expected {}".format(
                proposal.proposal_boxes.tensor, expected_proposal_box.tensor)
            self.assertTrue(
                torch.allclose(proposal.proposal_boxes.tensor,
                               expected_proposal_box.tensor,
                               atol=1e-5),
                err_msg,
            )

            err_msg = "computed objectness logits = {}, expected {}".format(
                proposal.objectness_logits, expected_objectness_logit)
            self.assertTrue(
                torch.allclose(proposal.objectness_logits,
                               expected_objectness_logit,
                               atol=1e-5),
                err_msg,
            )
Example #7
    def forward(self, outputs, target_sizes):
        """ Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each image in the batch
                          this must be the original image size (before any data augmentation)
        """
        out_logits, out_bboxes = outputs['pred_logits'], outputs['pred_boxes']

        assert len(out_logits) == len(target_sizes)
        target_sizes = out_bboxes.new_tensor(target_sizes)
        assert target_sizes.shape[1] == 2

        prob = F.softmax(out_logits, -1)
        scores, labels = prob[..., :-1].max(-1)

        if self.scale_normalize:
            num_ins = out_bboxes.shape[1]
            num_loc = num_ins // self.num_stages
            strides = []
            for stride in self.strides:
                strides.append(out_bboxes.new_tensor(stride).repeat(num_loc))
            strides = torch.cat(strides, dim=0)
            out_bboxes[:, :, 2:] = (
                out_bboxes[:, :, 2:] * strides[None, :, None] * self.scale_coef
            )

        # convert to [x0, y0, x1, y1] format
        boxes = box_cxcywh_to_xyxy(out_bboxes)
        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        results = [{
            'scores': s,
            'labels': l,
            'boxes': b
        } for s, l, b in zip(scores, labels, boxes)]

        # post process.
        processed_results = []
        for result_per_image, image_size in zip(results, target_sizes):
            result = Instances(image_size)
            boxes = result_per_image["boxes"].float()
            scores = result_per_image["scores"].float()
            labels = result_per_image["labels"].long()

            # filter.
            keep = scores > self.score_thr
            boxes = boxes[keep, :]
            scores = scores[keep]
            labels = labels[keep]

            # sort and keep top_k.
            if len(scores) > self.max_per_img:
                sort_inds = torch.argsort(scores, descending=True)
                sort_inds = sort_inds[:self.max_per_img]
                boxes = boxes[sort_inds, :]
                scores = scores[sort_inds]
                labels = labels[sort_inds]

            # append.
            result.pred_boxes = Boxes(boxes)
            result.scores = scores
            result.pred_classes = labels
            # clip boxes.
            if result.has("pred_boxes"):
                output_boxes = result.pred_boxes
            output_boxes.clip(result.image_size)
            result = result[output_boxes.nonempty()]

            processed_results.append({"instances": result})

        return processed_results
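box_cxcywh_to_xyxy followed by the scale_fct multiplication converts normalized center-format boxes to absolute corner coordinates. A minimal re-implementation of that conversion (the helper name below is illustrative, not the imported one):

import torch

def cxcywh_to_xyxy_abs(boxes_cxcywh, img_h, img_w):
    """Normalized (cx, cy, w, h) boxes -> absolute (x0, y0, x1, y1)."""
    cx, cy, w, h = boxes_cxcywh.unbind(-1)
    boxes = torch.stack([cx - 0.5 * w, cy - 0.5 * h,
                         cx + 0.5 * w, cy + 0.5 * h], dim=-1)
    scale = torch.tensor([img_w, img_h, img_w, img_h], dtype=boxes.dtype)
    return boxes * scale

abs_boxes = cxcywh_to_xyxy_abs(torch.tensor([[0.5, 0.5, 0.2, 0.4]]), img_h=100, img_w=200)
assert torch.allclose(abs_boxes, torch.tensor([[80., 30., 120., 70.]]))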
Example #8
    def construct_hopairs_with_features(self, instances: List[Instances],
                                        crop_features) -> List[Instances]:
        """
        Prepare person-object pairs to be used to train HOI heads.
        At training, it returns union regions of person-object proposals and assigns
            training labels. It returns ``self.hoi_batch_size_per_image`` random samples
            from person-object pairs, with a fraction of positives that is no larger than
            ``self.hoi_positive_sample_fraction``.
        At inference, it returns union regions of predicted person boxes and object boxes.

        Args:
            instances (list[Instances]):
                At training, proposals_with_gt. See ``self.label_and_sample_proposals``
                At inference, predicted box instances. See ``self._forward_box``

        Returns:
            list[Instances]:
                length `N` list of `Instances`s containing the human-object pairs.
                Each `Instances` has the following fields:

                - union_boxes: the union region of person boxes and object boxes
                - person_boxes: person boxes in a matched sequence with union_boxes
                - object_boxes: object boxes in a matched sequence with union_boxes
                - gt_actions: the ground-truth actions that the pair is assigned.
                    Used for training HOI head.
                - person_box_scores: person box scores from box instances. Used at inference.
                - object_box_scores: object box scores from box instances. Used at inference.
                - object_box_classes: predicted box classes from box instances. Used at inference.
        """
        hopairs = []
        for img_idx, instances_per_image in enumerate(instances):
            with torch.no_grad():
                if self.training:
                    # Proposals generated from person branch in HORPN will be seen as person boxes;
                    # Proposals generated from object branch in HORPN will be object boxes.
                    boxes = instances_per_image.proposal_boxes
                    person_idxs = (instances_per_image.is_person == 1
                                   ).nonzero().squeeze(1)
                    object_idxs = (instances_per_image.is_person == 0
                                   ).nonzero().squeeze(1)
                else:
                    # At inference, split person/object boxes based on predicted classes by box head
                    boxes = instances_per_image.pred_boxes
                    person_idxs = torch.nonzero(
                        instances_per_image.pred_classes == 0).squeeze(1)
                    object_idxs = torch.nonzero(
                        instances_per_image.pred_classes > 0).squeeze(1)

                if self.allow_person_to_person:
                    # Allow person to person interactions. Then all boxes will be used.
                    object_idxs = torch.arange(len(instances_per_image),
                                               device=object_idxs.device)

                num_pboxes, num_oboxes = person_idxs.numel(), object_idxs.numel()

                union_boxes = _pairwise_union_regions(boxes[person_idxs],
                                                      boxes[object_idxs])
                # Indexing person/object boxes in a matched order.
                person_idxs = person_idxs[:, None].repeat(1, num_oboxes).flatten()
                object_idxs = object_idxs[None, :].repeat(num_pboxes, 1).flatten()
                # Remove self-to-self interaction.
                keep = (person_idxs != object_idxs).nonzero().squeeze(1)
                union_boxes = union_boxes[keep]
                person_idxs = person_idxs[keep]
                object_idxs = object_idxs[keep]

                hopairs_per_image = Instances(instances_per_image.image_size)
                hopairs_per_image.union_boxes = union_boxes
                hopairs_per_image.person_boxes = boxes[person_idxs]
                hopairs_per_image.object_boxes = boxes[object_idxs]
                if self.training:
                    # `person_idxs` and `object_idxs` are used in self.label_and_sample_hopairs()
                    hopairs_per_image.person_idxs = person_idxs
                    hopairs_per_image.object_idxs = object_idxs
                else:
                    hopairs_per_image.person_box_scores = instances_per_image.scores[
                        person_idxs]
                    hopairs_per_image.object_box_scores = instances_per_image.scores[
                        object_idxs]
                    hopairs_per_image.object_box_classes = instances_per_image.pred_classes[
                        object_idxs]

            hopairs_per_image.person_feats = crop_features[img_idx][person_idxs]
            hopairs_per_image.object_feats = crop_features[img_idx][object_idxs]
            hopairs.append(hopairs_per_image)

        if self.training:
            hopairs = self.label_and_sample_hopairs(hopairs, instances)

        return hopairs
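The person-object pairing above is a Cartesian product of two index sets built with repeat/flatten, then pruned of self-pairs. A small sketch with made-up indices:

import torch

person_idxs = torch.tensor([0, 3])
object_idxs = torch.tensor([0, 1, 2])
num_p, num_o = person_idxs.numel(), object_idxs.numel()

# Cartesian product in matched order: (p0, o0), (p0, o1), (p0, o2), (p1, o0), ...
p = person_idxs[:, None].repeat(1, num_o).flatten()
o = object_idxs[None, :].repeat(num_p, 1).flatten()

# Drop self-to-self pairs (here only (0, 0)).
keep = (p != o).nonzero().squeeze(1)
pairs = list(zip(p[keep].tolist(), o[keep].tolist()))
assert pairs == [(0, 1), (0, 2), (3, 0), (3, 1), (3, 2)]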
Example #9
def extract_feat(split_idx, img_list, cfg, args, actor: ActorHandle):
    num_images = len(img_list)
    print('Number of images on split{}: {}.'.format(split_idx, num_images))

    model = DefaultTrainer.build_model(cfg)
    DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
        cfg.MODEL.WEIGHTS, resume=args.resume
    )
    model.eval()

    for im_file in img_list:
        if os.path.exists(os.path.join(args.output_dir, im_file.split('.')[0]+'.npz')):
            actor.update.remote(1)
            continue
        im = cv2.imread(os.path.join(args.image_dir, im_file))
        if im is None:
            print(os.path.join(args.image_dir, im_file), "is illegal!")
            actor.update.remote(1)
            continue
        dataset_dict = get_image_blob(im, cfg.MODEL.PIXEL_MEAN)
        # extract roi features
        if cfg.MODEL.BUA.EXTRACTOR.MODE == 1:
            attr_scores = None
            with torch.set_grad_enabled(False):
                if cfg.MODEL.BUA.ATTRIBUTE_ON:
                    boxes, scores, features_pooled, attr_scores = model([dataset_dict])
                else:
                    boxes, scores, features_pooled = model([dataset_dict])
            boxes = [box.tensor.cpu() for box in boxes]
            scores = [score.cpu() for score in scores]
            features_pooled = [feat.cpu() for feat in features_pooled]
            if attr_scores is not None:
                attr_scores = [attr_score.cpu() for attr_score in attr_scores]
            generate_npz(4, 
                args, cfg, im_file, im, dataset_dict, 
                boxes, scores, features_pooled, attr_scores)
        # extract bbox only
        elif cfg.MODEL.BUA.EXTRACTOR.MODE == 2:
            with torch.set_grad_enabled(False):
                boxes, scores = model([dataset_dict])
            boxes = [box.cpu() for box in boxes]
            scores = [score.cpu() for score in scores]
            generate_npz(2,
                args, cfg, im_file, im, dataset_dict, 
                boxes, scores)
        # extract roi features by bbox
        elif cfg.MODEL.BUA.EXTRACTOR.MODE == 3:
            if not os.path.exists(os.path.join(args.bbox_dir, im_file.split('.')[0]+'.npz')):
                actor.update.remote(1)
                continue
            bbox = torch.from_numpy(np.load(os.path.join(args.bbox_dir, im_file.split('.')[0]+'.npz'))['bbox']) * dataset_dict['im_scale']
            proposals = Instances(dataset_dict['image'].shape[-2:])
            proposals.proposal_boxes = BUABoxes(bbox)
            dataset_dict['proposals'] = proposals

            attr_scores = None
            with torch.set_grad_enabled(False):
                if cfg.MODEL.BUA.ATTRIBUTE_ON:
                    boxes, scores, features_pooled, attr_scores = model([dataset_dict])
                else:
                    boxes, scores, features_pooled = model([dataset_dict])
            boxes = [box.tensor.cpu() for box in boxes]
            scores = [score.cpu() for score in scores]
            features_pooled = [feat.cpu() for feat in features_pooled]
            if attr_scores is not None:
                attr_scores = [attr_score.data.cpu() for attr_score in attr_scores]
            generate_npz(3, 
                args, cfg, im_file, im, dataset_dict, 
                boxes, scores, features_pooled, attr_scores)

        actor.update.remote(1)
Example #10
def trend_rcnn_inference_single_image(boxes, scores, attributes, image_shape,
                                      score_thresh, nms_thresh, topk_per_image,
                                      attr_score_thresh, num_attr_classes,
                                      max_attr_pred):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Args:
        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
        per image.

    Returns:
        Same as `fast_rcnn_inference`, but for only one image.
    """
    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(
        dim=1)
    if not valid_mask.all():
        boxes = boxes[valid_mask]
        scores = scores[valid_mask]
        attributes = attributes[valid_mask]

    scores = scores[:, :-1]
    num_bbox_reg_classes = boxes.shape[1] // 4
    #print("Printing the number of classes in the box: ", num_bbox_reg_classes)
    # Convert to Boxes to use the `clip` function ...
    boxes = Boxes(boxes.reshape(-1, 4))
    boxes.clip(image_shape)
    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4

    num_attr_reg_classes = attributes.shape[1] // num_attr_classes
    # [ANMOL] this is just the number of object classes we have... here it's 46
    attributes = attributes.view(-1, num_attr_reg_classes, num_attr_classes)
    # [ANMOL] reshaped the attributes [proposals, objectclass, attrclass]

    # Filter results based on detection scores
    filter_mask = scores > score_thresh  # R x K
    # filter mask shape is same as score shape: [proposals, obj classes]
    # R' x 2. First column contains indices of the R predictions;
    # Second column contains indices of classes.
    filter_inds = filter_mask.nonzero()
    # There may be more indices/proposals after this than before, since more than one class
    # score per proposal can exceed the threshold. It would be interesting to check how
    # class-agnostic attribute classification would behave here; it might fail.
    # In the current example: R=1000, but R'=45806
    #print("filter ind shape: ", filter_inds.shape)

    if num_bbox_reg_classes == 1:
        boxes = boxes[filter_inds[:, 0], 0]
    else:
        boxes = boxes[filter_mask]
    scores = scores[filter_mask]
    # Before this, scores had shape [R, num_classes]; after applying filter_mask it becomes [R']

    if num_attr_reg_classes == 1:
        attributes = attributes[filter_inds[:, 0], 0]
    else:
        attributes = attributes[filter_mask]
    #BOTH of these should produce attribute of shape [R', attr_classes]

    # Apply per-class NMS
    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
    if topk_per_image >= 0:
        keep = keep[:topk_per_image]
    boxes, scores, filter_inds, attributes = (
        boxes[keep], scores[keep], filter_inds[keep], attributes[keep])

    attributes[attributes < attr_score_thresh] = 0
    attr_scores_sorted, attr_indices = torch.sort(attributes,
                                                  1,
                                                  descending=True)
    attr_indices[attr_scores_sorted < attr_score_thresh] = 294
    attributes_inds = attr_indices[:, 0:max_attr_pred]
    #del attr_indices

    result = Instances(image_shape)
    result.pred_boxes = Boxes(boxes)
    result.scores = scores
    result.attr_scores = attributes
    result.attr_classes = attributes_inds
    result.pred_classes = filter_inds[:, 1]
    return result, filter_inds[:, 0]
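The filter_mask.nonzero() step yields one row per (proposal, class) pair whose score clears the threshold, which is why R' can be larger than R. A small sketch:

import torch

scores = torch.tensor([[0.9, 0.6, 0.1],
                       [0.2, 0.7, 0.8]])  # R=2 proposals, K=3 classes
filter_mask = scores > 0.5                # R x K boolean mask
filter_inds = filter_mask.nonzero()       # R' x 2: (proposal index, class index)
assert filter_inds.tolist() == [[0, 0], [0, 1], [1, 1], [1, 2]]
kept_scores = scores[filter_mask]         # shape (R',)
assert torch.allclose(kept_scores, torch.tensor([0.9, 0.6, 0.7, 0.8]))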
Example #11
    def _forward_hoi(
        self, features: Dict[str, torch.Tensor], instances: List[Instances]
    ) -> Union[Dict[str, torch.Tensor], List[Instances]]:
        """
        Forward logic of the interaction prediction branch.

        Args:
            features (dict[str, Tensor]): mapping from feature map names to tensor.
                Same as in :meth:`ROIHeads.forward`.
            instances (list[Instances]): 
                At training, the per-image object proposals with matching ground truth. Each has
                    fields "proposal_boxes", and "interactness_logits", "gt_classes", "gt_actions".
                At inference, the per-image predicted box instances from box head. Each has fields
                    "pred_boxes", "pred_classes", "scores"

        Returns:
            In training, a dict of losses.
            In inference, a list of `Instances`, the predicted hoi instances. Each has fields
                "person_boxes", "object_boxes", "object_classes", "action_classes", "scores"
        """
        if not self.hoi_on:
            return {} if self.training else []

        features = [features[f] for f in self.in_features]
        hopairs = self.construct_hopairs(instances)
        union_features = self.hoi_pooler(features,
                                         [x.union_boxes for x in hopairs])
        person_features = self.hoi_pooler(features,
                                          [x.person_boxes for x in hopairs])
        object_features = self.hoi_pooler(features,
                                          [x.object_boxes for x in hopairs])
        person_features = self.hoi_head(person_features)
        object_features = self.hoi_head(object_features)
        union_features = self.hoi_head(union_features)

        if self.compose_learning != 0 and self.training:
            #
            # pass

            gt_obj_classes = [x.gt_classes[:, 1:2] for x in hopairs]
            ohot_gt_obj_labels = []
            for gt_obj in gt_obj_classes:
                ohot_gt_obj = torch.FloatTensor(len(gt_obj), 81)
                ohot_gt_obj.zero_()
                ohot_gt_obj = ohot_gt_obj.to(gt_obj.device)
                ohot_gt_obj.scatter_(1, gt_obj, 1)
                torch.scatter(ohot_gt_obj, 1, gt_obj, 1)
                ohot_gt_obj_labels.append(ohot_gt_obj)
            ohot_gt_obj_labels = torch.cat(ohot_gt_obj_labels, dim=0)
            gt_verbs = torch.cat([x.gt_actions for x in hopairs], dim=0)
            new_obj_features = object_features
            ohot_gt_obj_labels = ohot_gt_obj_labels[:, :]
            union_features_cl_arr = []
            person_features_cl_arr = []
            gt_verbs_cl_arr = []
            ohot_gt_obj_labels_cl_arr = []
            obj_features_cl_arr = []
            # import ipdb;ipdb.set_trace()

            per_img_hoi_lengths = [len(x.gt_actions) for x in hopairs]
            sum_rolls = len(per_img_hoi_lengths)
            if self.compose_learning == 4:
                sum_rolls = 3
            if self.compose_learning == 5:
                ohot_gt_obj_labels = torch.flip(ohot_gt_obj_labels, dims=[0])
                object_features = torch.flip(object_features, dims=[0])
            for ii in range(sum_rolls):
                """
                here, we roll the object list, to match different HOIs.
                """
                union_features_cl_arr.append(union_features)
                person_features_cl_arr.append(person_features)
                gt_verbs_cl_arr.append(gt_verbs)
                ohot_gt_obj_labels_cl_arr.append(
                    torch.roll(ohot_gt_obj_labels,
                               -sum(per_img_hoi_lengths[:ii + 1]), 0))
                obj_features_cl_arr.append(
                    torch.roll(object_features,
                               -sum(per_img_hoi_lengths[:ii + 1]), 0))
            # ohot_gt_obj_labels = torch.flip(ohot_gt_obj_labels, [0])
            # new_obj_features = torch.flip(object_features, [0])
            ohot_gt_obj_labels = torch.cat(ohot_gt_obj_labels_cl_arr, dim=0)
            union_features_cl = torch.cat(union_features_cl_arr, dim=0)
            person_features_cl = torch.cat(person_features_cl_arr, dim=0)
            gt_verbs_cl = torch.cat(gt_verbs_cl_arr, dim=0)
            ohot_gt_obj_labels_cl = torch.cat(ohot_gt_obj_labels_cl_arr, dim=0)
            obj_features_cl = torch.cat(obj_features_cl_arr, dim=0)

            HOI_labels = (
                torch.matmul(
                    ohot_gt_obj_labels,
                    self.obj_to_HO_matrix.to(ohot_gt_obj_labels.device)) +
                torch.matmul(
                    gt_verbs_cl.to(ohot_gt_obj_labels.device),
                    self.verb_to_HO_matrix.to(ohot_gt_obj_labels.device))) > 1.
            HOI_labels = HOI_labels.type(torch.float32)
            HOI_labels = torch.sum(HOI_labels, dim=-1) > 0

            union_features_cl = union_features_cl[HOI_labels]
            person_features_cl = person_features_cl[HOI_labels]
            new_obj_features = obj_features_cl[HOI_labels]
            gt_verbs = gt_verbs_cl[HOI_labels]
            ohot_gt_obj_cl = ohot_gt_obj_labels_cl[HOI_labels]
            cl_hopair = Instances((512, 512))
            cl_hopair.gt_actions = gt_verbs
            # gt_classes
            hopairs.append(cl_hopair)
            # print(len(gt_verbs), len(new_obj_features))
            union_features = torch.cat([union_features, union_features_cl],
                                       dim=0)
            person_features = torch.cat([person_features, person_features_cl],
                                        dim=0)
            object_features = torch.cat([object_features, new_obj_features],
                                        dim=0)

        hoi_predictions = self.hoi_predictor(union_features, person_features,
                                             object_features)

        del union_features, person_features, object_features, features

        if self.training:

            if self.compose_learning in [1, 4, 5]:
                if len(union_features_cl) > 0:
                    losses = self.hoi_predictor.losses(
                        hoi_predictions[:-len(union_features_cl)],
                        hopairs[:-1])
                    cl_losses = HoiOutputs(
                        hoi_predictions[-len(union_features_cl):],
                        hopairs[-1:], self.hoi_predictor.pos_weights).losses()
                    losses['loss_action_cl'] = cl_losses[
                        'loss_action'] * self.cl_weight
                else:
                    losses = self.hoi_predictor.losses(hoi_predictions,
                                                       hopairs[:-1])
                    losses['loss_action_cl'] = losses['loss_action'] * 0.
            elif self.compose_learning == 3:
                weights = torch.ones_like(hoi_predictions, dtype=torch.float32)
                weights[-len(union_features_cl):] = weights[
                    -len(union_features_cl):] * self.cl_weight

                losses = self.hoi_predictor.losses(hoi_predictions, hopairs,
                                                   weights)
            else:
                losses = self.hoi_predictor.losses(hoi_predictions, hopairs)
            return losses
        else:
            # if self.is_hoi_prediction:
            #     hoi_predictions = torch.matmul(hoi_predictions,
            #                                    torch.transpose(self.verb_to_HO_matrix, 1, 0).to(hoi_predictions.device))
            pred_interactions = self.hoi_predictor.inference(
                hoi_predictions, hopairs)
            if self.is_hoi_prediction:
                # convert to 117 classes
                pass
            return pred_interactions
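The one-hot object labels in the compose-learning branch are built with in-place scatter_. A minimal sketch (81 object classes is the assumption used in the code above):

import torch

num_obj_classes = 81
gt_obj = torch.tensor([[3], [0]])                 # one class index per pair, shape (N, 1)
ohot = torch.zeros(len(gt_obj), num_obj_classes)  # (N, 81)
ohot.scatter_(1, gt_obj, 1)                       # write a 1 at each (row, class) position
assert ohot[0, 3] == 1 and ohot[1, 0] == 1 and ohot.sum() == 2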
Example #12
    def forward(self, batched_inputs):
        # Batched inputs is a list of mapped dictionaries
        # Note that this depends on the shape being 224x224: this is handled in the mapper
        # print(batched_inputs[0].keys())

        # Get out the images
        batched_images = [b["image"] for b in batched_inputs]

        # Normalise (required for yolo)
        # batched_images = [self.normalise(im) for im in batched_images]

        # Stack into one big tensor
        images_tensor = torch.stack(batched_images)

        # Vgg forward takes in a tensor, get out some logits
        # self.yolov3_model = self.yolov3_model.to(device)

        self.to(self.device)
        images_tensor = images_tensor.to(self.device)

        # print(type(images_tensor))
        # print(images_tensor.shape)
        # print(images_tensor)

        if self.training:
            # Get the height and widths
            batched_images_w_h = [(b["width"], b["height"])
                                  for b in batched_inputs]

            # batched_images_w_h = [ (b["image"].shape[1],b["image"].shape[1]) for b in batched_inputs]
            # Get the target classes
            target_classes = [b["classID"] for b in batched_inputs]
            # Compute the bboxes
            target_bboxes = [b["instances"] for b in batched_inputs]
            # print(target_bboxes)
            target_bboxes = [b.get("gt_boxes") for b in target_bboxes]
            target_centers = [
                b.get_centers().tolist()[0] for b in target_bboxes
            ]
            target_bboxes = [b.tensor.tolist()[0] for b in target_bboxes]
            target_w_h = []
            for b in target_bboxes:
                x0, y0, x1, y1 = b
                w = abs(x0 - x1)
                h = abs(y0 - y1)
                target_w_h.append((w, h))

            target_bboxes = []
            for bbox_center, bbox_w_h, img_w_h, target_class in zip(
                    target_centers, target_w_h, batched_images_w_h,
                    target_classes):
                img_width = img_w_h[0]
                img_height = img_w_h[1]
                center_x = bbox_center[0] / img_width
                center_y = bbox_center[1] / img_height
                width = bbox_w_h[0] / img_width
                height = bbox_w_h[1] / img_height
                target_bboxes.append(
                    (0, target_class, center_x, center_y, width, height))

            targets_tensor = torch.tensor(target_bboxes).to(self.device)

            # targets need to be in form:
            # tensor([[0.0000, 0.0000, 0.4900, 0.5000, 0.1454, 0.1829]])
            # index_of_bbox_in_image?, label_idx x_center y_center width height
            # The coordinates should be scaled [0, 1]

            # print(images_tensor.shape)
            # print(targets_tensor.shape)

            losses, _ = super().forward(images_tensor, targets_tensor)

            return {"total_loss": losses}
        else:
            with torch.no_grad():
                # Forward pass the model
                outputs = super().forward(images_tensor)
                # nms
                outputs = non_max_suppression(outputs,
                                              conf_thres=self.conf_threshold,
                                              nms_thres=self.nms_threshold)

                # For each (output, batched input) pair (note: only 1 image per batch in evaluation)
                for output, batched_input in zip(outputs, batched_inputs):
                    # height,width
                    im_height = batched_input["height"]
                    im_width = batched_input["width"]

                    # Get out predictions
                    try:
                        pred_boxes = output[:, :4]
                        pred_scores = output[:, 4]
                        pred_classes = output[:, -1].int()
                    except (TypeError, IndexError):
                        # output is None when NMS finds no detections; fall back to empty Instances
                        new_instance = Instances((im_height, im_width))
                        new_instance.pred_boxes = Boxes(torch.tensor([]))
                        new_instance.scores = torch.tensor([])
                        new_instance.pred_classes = torch.tensor([]).int()
                        return [{"instances": new_instance}]

                    # a "box" is len 4: center_x,center_y,width,height
                    # scaled between 0 and 1

                    pred_boxes = Boxes(pred_boxes)

                    # Add the predictions
                    new_instance = Instances((im_height, im_width))
                    new_instance.pred_boxes = pred_boxes
                    new_instance.scores = pred_scores
                    new_instance.pred_classes = pred_classes

                    # Return immediately inside the loop, since evaluation uses only one image per batch
                    return [{"instances": new_instance}]
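The training branch above converts absolute XYXY ground-truth boxes into the normalized (batch_idx, class, cx, cy, w, h) rows expected by the YOLO loss. A compact sketch of that arithmetic (values and the helper name are hypothetical):

import torch

def to_yolo_target(box_xyxy, class_id, img_w, img_h, batch_idx=0):
    """Absolute (x0, y0, x1, y1) -> (batch_idx, class, cx, cy, w, h), coordinates in [0, 1]."""
    x0, y0, x1, y1 = box_xyxy
    cx = (x0 + x1) / 2.0 / img_w
    cy = (y0 + y1) / 2.0 / img_h
    w = abs(x1 - x0) / img_w
    h = abs(y1 - y0) / img_h
    return torch.tensor([batch_idx, class_id, cx, cy, w, h])

target = to_yolo_target((50., 100., 150., 200.), class_id=7, img_w=200, img_h=400)
assert torch.allclose(target, torch.tensor([0., 7., 0.5, 0.375, 0.5, 0.25]))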
    def forward_for_single_feature_map(self, locations, box_cls, reg_pred,
                                       ctrness, mask_regression, image_sizes):
        N, C, H, W = box_cls.shape

        # put in the same format as locations
        box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1)
        box_cls = box_cls.reshape(N, -1, C).sigmoid()
        box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1)
        box_regression = box_regression.reshape(N, -1, 4)
        ctrness = ctrness.view(N, 1, H, W).permute(0, 2, 3, 1)
        ctrness = ctrness.reshape(N, -1).sigmoid()
        mask_regression = mask_regression.view(N, self.num_codes, H,
                                               W).permute(0, 2, 3, 1)
        mask_regression = mask_regression.reshape(N, -1, self.num_codes)

        # if self.thresh_with_ctr is True, we multiply the classification
        # scores with centerness scores before applying the threshold.
        if self.thresh_with_ctr:
            box_cls = box_cls * ctrness[:, :, None]
        candidate_inds = box_cls > self.pre_nms_thresh
        # pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
        pre_nms_top_n = candidate_inds.reshape(N, -1).sum(
            1)  # this is suggested when running on my machine
        pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n)

        if not self.thresh_with_ctr:
            box_cls = box_cls * ctrness[:, :, None]

        results = []
        for i in range(N):
            per_box_cls = box_cls[i]
            per_candidate_inds = candidate_inds[i]
            per_box_cls = per_box_cls[per_candidate_inds]

            per_candidate_nonzeros = per_candidate_inds.nonzero()
            per_box_loc = per_candidate_nonzeros[:, 0]
            per_class = per_candidate_nonzeros[:, 1]

            per_box_regression = box_regression[i]
            per_box_regression = per_box_regression[per_box_loc]
            per_locations = locations[per_box_loc]

            per_box_mask = mask_regression[i]
            per_box_mask = per_box_mask[per_box_loc]

            per_pre_nms_top_n = pre_nms_top_n[i]

            if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
                per_box_cls, top_k_indices = \
                    per_box_cls.topk(per_pre_nms_top_n, sorted=False)
                per_class = per_class[top_k_indices]
                per_box_regression = per_box_regression[top_k_indices]
                per_locations = per_locations[top_k_indices]
                per_box_mask = per_box_mask[top_k_indices]

            detections = torch.stack([
                per_locations[:, 0] - per_box_regression[:, 0],
                per_locations[:, 1] - per_box_regression[:, 1],
                per_locations[:, 0] + per_box_regression[:, 2],
                per_locations[:, 1] + per_box_regression[:, 3],
            ],
                                     dim=1)

            boxlist = Instances(image_sizes[i])
            boxlist.pred_boxes = Boxes(detections)
            boxlist.scores = torch.sqrt(per_box_cls)
            boxlist.pred_classes = per_class
            boxlist.locations = per_locations
            boxlist.pred_masks = per_box_mask

            results.append(boxlist)

        return results
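The box decoding at the end follows the FCOS convention: the four regressed values are distances from a location to the left, top, right and bottom box edges. A small sketch:

import torch

locations = torch.tensor([[50., 60.]])       # (x, y) of one feature-map location
ltrb = torch.tensor([[10., 20., 30., 40.]])  # distances to left, top, right, bottom edges

boxes = torch.stack([
    locations[:, 0] - ltrb[:, 0],  # x0
    locations[:, 1] - ltrb[:, 1],  # y0
    locations[:, 0] + ltrb[:, 2],  # x1
    locations[:, 1] + ltrb[:, 3],  # y1
], dim=1)
assert boxes.tolist() == [[40., 40., 80., 100.]]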
Example #14
    def losses(self,
               logits_pred,
               reg_pred,
               ctrness_pred,
               locations,
               gt_instances,
               top_feats=None):
        """
        Return the losses from a set of FCOS predictions and their associated ground-truth.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
        """

        training_targets = self._get_ground_truth(locations, gt_instances)

        # Collect all logits and regression predictions over feature maps
        # and images to arrive at the same shape as the labels and targets
        # The final ordering is L, N, H, W from slowest to fastest axis.

        instances = Instances((0, 0))
        instances.labels = cat(
            [
                # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
                x.reshape(-1) for x in training_targets["labels"]
            ],
            dim=0)
        instances.gt_inds = cat(
            [
                # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
                x.reshape(-1) for x in training_targets["target_inds"]
            ],
            dim=0)
        instances.im_inds = cat(
            [x.reshape(-1) for x in training_targets["im_inds"]], dim=0)
        instances.reg_targets = cat(
            [
                # Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4)
                x.reshape(-1, 4) for x in training_targets["reg_targets"]
            ],
            dim=0,
        )
        instances.locations = cat(
            [x.reshape(-1, 2) for x in training_targets["locations"]], dim=0)
        instances.fpn_levels = cat(
            [x.reshape(-1) for x in training_targets["fpn_levels"]], dim=0)

        instances.logits_pred = cat(
            [
                # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
                x.permute(0, 2, 3, 1).reshape(-1, self.num_classes)
                for x in logits_pred
            ],
            dim=0,
        )
        instances.reg_pred = cat(
            [
                # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B)
                x.permute(0, 2, 3, 1).reshape(-1, 4) for x in reg_pred
            ],
            dim=0,
        )
        instances.ctrness_pred = cat(
            [
                # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
                x.permute(0, 2, 3, 1).reshape(-1) for x in ctrness_pred
            ],
            dim=0,
        )

        if len(top_feats) > 0:
            instances.top_feats = cat(
                [
                    # Reshape: (N, -1, Hi, Wi) -> (N*Hi*Wi, -1)
                    x.permute(0, 2, 3, 1).reshape(-1, x.size(1))
                    for x in top_feats
                ],
                dim=0,
            )

        return self.fcos_losses(instances)
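The repeated permute(0, 2, 3, 1).reshape(-1, C) pattern flattens each per-level prediction map so that every spatial location becomes one row, matching the flattened label tensors. A shape-only sketch:

import torch

N, C, H, W = 2, 80, 8, 8
logits = torch.randn(N, C, H, W)
flat = logits.permute(0, 2, 3, 1).reshape(-1, C)  # (N*H*W, C), location-major order
assert flat.shape == (N * H * W, C)
# Row 0 corresponds to image 0, pixel (0, 0):
assert torch.equal(flat[0], logits[0, :, 0, 0])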
Example #15
def find_top_st_rpn_proposals(
    proposals,
    pred_objectness_logits,
    reference_frame_idx,
    image_size,
    nms_thresh,
    pre_nms_topk,
    post_nms_topk,
    min_box_side_len,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps if `training` is True,
    otherwise, returns the highest `post_nms_topk` scoring proposals for each
    feature map.
    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
            All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
        reference_frame_idx (int): Reference frame index used to select boxes/scores to execute
            NMS. 
        image_size: Input images size in (h, w) order.
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is per
            feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is total,
            over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels (absolute units
            wrt input images).
        training (bool): True if proposals are to be used in training, otherwise False.
            This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
            comment.
    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for frame i, sorted by the
            objectness score in the reference frame in descending order.
    """
    device = proposals[0].device

    # 1. Select top-k anchor for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    for level_id, proposals_i, logits_i in zip(itertools.count(), proposals,
                                               pred_objectness_logits):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i[reference_frame_idx].sort(descending=True,
                                                           dim=0)
        topk_scores_i = logits_i[:num_proposals_i]
        topk_idx = idx[:num_proposals_i]

        topk_proposals_i = proposals_i[:, topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i, ),
                       level_id,
                       dtype=torch.int64,
                       device=device))

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=0)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For the reference frame, run a per-level NMS, and choose topk results for
    # every input frame.
    st_boxes = []

    # TODO: cache valid proposals mask for previous frames
    lvl = level_ids
    valid_mask = torch.isfinite(topk_proposals).all(dim=2).all(
        dim=0) & torch.isfinite(topk_scores)
    if not valid_mask.all():
        topk_proposals = topk_proposals[:, valid_mask]
        topk_scores = topk_scores[valid_mask]
        lvl = lvl[valid_mask]

    keep = None
    st_boxes = []
    for proposal_boxes_f in topk_proposals:
        boxes = Boxes(proposal_boxes_f)
        boxes.clip(image_size)

        # filter empty boxes
        keep_f = boxes.nonempty(threshold=min_box_side_len)
        keep = keep_f if keep is None else keep & keep_f

        st_boxes.append(boxes)

    if keep.sum().item() != len(st_boxes[0]):
        topk_scores, lvl = topk_scores[keep], lvl[keep]

    filtered_st_boxes = []
    for boxes in st_boxes:
        if keep.sum().item() != len(boxes):
            boxes = boxes[keep]

        filtered_st_boxes.append(boxes)

    keep = batched_nms(filtered_st_boxes[reference_frame_idx].tensor,
                       topk_scores, lvl, nms_thresh)
    # In Detectron1, there was different behavior during training vs. testing.
    # (https://github.com/facebookresearch/Detectron/issues/459)
    # During training, topk is over the proposals from *all* images in the training batch.
    # During testing, it is over the proposals for each image separately.
    # As a result, the training behavior becomes batch-dependent,
    # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
    # This bug is addressed in Detectron2 to make the behavior independent of batch size.
    keep = keep[:post_nms_topk]  # keep is already sorted

    scores = topk_scores[keep]
    results = []
    for boxes in filtered_st_boxes:
        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores
        results.append(res)

    return results
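
The selection above reduces to four detectron2 primitives: clip the boxes to the image, drop degenerate boxes with `nonempty`, run `batched_nms` with the feature-level index as the grouping key, and keep the first `post_nms_topk` survivors. Below is a minimal sketch of that pipeline on dummy data (the threshold values are arbitrary, not taken from any config):

import torch
from detectron2.layers import batched_nms
from detectron2.structures import Boxes

image_size = (480, 640)                          # (h, w)
xy = torch.rand(100, 2) * 500
wh = torch.rand(100, 2) * 100
boxes = Boxes(torch.cat([xy, xy + wh], dim=1))   # dummy xyxy proposals
scores = torch.rand(100)
level_ids = torch.randint(0, 3, (100,))          # which FPN level each box came from

boxes.clip(image_size)                           # clamp coordinates to the image
keep = boxes.nonempty(threshold=0.0)             # drop boxes with non-positive sides
boxes, scores, level_ids = boxes[keep], scores[keep], level_ids[keep]

# NMS runs independently per level because level_ids is passed as the idxs argument.
keep = batched_nms(boxes.tensor, scores, level_ids, 0.7)
keep = keep[:1000]                               # post_nms_topk; keep is already sorted by score
print(len(keep), "proposals kept")
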
    def inference_single_image(self, locations, box_cls, box_reg, center_score,
                               image_size):
        boxes_all = []
        scores_all = []
        class_idxs_all = []

        # Iterate over every feature level
        for box_cls_i, box_reg_i, locs_i, center_score_i in zip(
                box_cls, box_reg, locations, center_score):
            # (HxW, C)
            box_cls_i = box_cls_i.sigmoid_()
            keep_idxs = box_cls_i > self.pre_nms_thresh

            # multiply the classification scores with center scores
            box_cls_i *= center_score_i.sigmoid_()

            box_cls_i = box_cls_i[keep_idxs]
            keep_idxs_nonzero_i = keep_idxs.nonzero()

            box_loc_i = keep_idxs_nonzero_i[:, 0]
            class_i = keep_idxs_nonzero_i[:, 1]

            box_reg_i = box_reg_i[box_loc_i]
            locs_i = locs_i[box_loc_i]

            per_pre_nms_top_n = keep_idxs.sum().clamp(max=self.pre_nms_top_n)
            if keep_idxs.sum().item() > per_pre_nms_top_n.item():
                box_cls_i, topk_idxs = box_cls_i.topk(per_pre_nms_top_n,
                                                      sorted=False)

                class_i = class_i[topk_idxs]
                box_reg_i = box_reg_i[topk_idxs]
                locs_i = locs_i[topk_idxs]

            # predict boxes
            predicted_boxes = torch.stack([
                locs_i[:, 0] - box_reg_i[:, 0],
                locs_i[:, 1] - box_reg_i[:, 1],
                locs_i[:, 0] + box_reg_i[:, 2],
                locs_i[:, 1] + box_reg_i[:, 3],
            ],
                                          dim=1)
            box_cls_i = torch.sqrt(box_cls_i)

            boxes_all.append(predicted_boxes)
            scores_all.append(box_cls_i)
            class_idxs_all.append(class_i)

        boxes_all, scores_all, class_idxs_all = [
            cat(x) for x in [boxes_all, scores_all, class_idxs_all]
        ]

        # Apply per-class nms for each image
        keep = batched_nms(boxes_all, scores_all, class_idxs_all,
                           self.nms_thresh)
        keep = keep[:self.max_detections_per_image]

        result = Instances(image_size)
        result.pred_boxes = Boxes(boxes_all[keep])
        result.scores = scores_all[keep]
        result.pred_classes = class_idxs_all[keep]

        return result
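
The decoding step above is the FCOS-style parameterization: a location `(x, y)` with predicted distances `(l, t, r, b)` becomes the box `(x - l, y - t, x + r, y + b)`, and the final score is the geometric mean of the classification and centerness probabilities (hence the `sqrt`). A tiny numeric sketch of the same decoding:

import torch

locs = torch.tensor([[100.0, 80.0]])             # one location (x, y)
reg = torch.tensor([[10.0, 20.0, 30.0, 40.0]])   # predicted (l, t, r, b) distances

boxes = torch.stack([
    locs[:, 0] - reg[:, 0],   # x1 = x - l
    locs[:, 1] - reg[:, 1],   # y1 = y - t
    locs[:, 0] + reg[:, 2],   # x2 = x + r
    locs[:, 1] + reg[:, 3],   # y2 = y + b
], dim=1)
print(boxes)                  # tensor([[ 90.,  60., 130., 120.]])

cls_prob, ctr_prob = torch.tensor(0.81), torch.tensor(0.64)
print(torch.sqrt(cls_prob * ctr_prob))  # tensor(0.7200), the combined score
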
    def test_rpn(self):
        torch.manual_seed(121)
        cfg = get_cfg()
        cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
        cfg.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
        cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1)
        backbone = build_backbone(cfg)
        proposal_generator = build_proposal_generator(cfg,
                                                      backbone.output_shape())
        num_images = 2
        images_tensor = torch.rand(num_images, 20, 30)
        image_sizes = [(10, 10), (20, 30)]
        images = ImageList(images_tensor, image_sizes)
        image_shape = (15, 15)
        num_channels = 1024
        features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
        gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]],
                                dtype=torch.float32)
        gt_instances = Instances(image_shape)
        gt_instances.gt_boxes = Boxes(gt_boxes)
        with EventStorage():  # capture events in a new storage to discard them
            proposals, proposal_losses = proposal_generator(
                images, features, [gt_instances[0], gt_instances[1]])

        expected_losses = {
            "loss_rpn_cls": torch.tensor(0.0804563984),
            "loss_rpn_loc": torch.tensor(0.0990132466),
        }
        for name in expected_losses.keys():
            self.assertTrue(
                torch.allclose(proposal_losses[name], expected_losses[name]))

        expected_proposal_boxes = [
            Boxes(torch.tensor([[0, 0, 10, 10], [7.3365392685, 0, 10, 10]])),
            Boxes(
                torch.tensor([
                    [0, 0, 30, 20],
                    [0, 0, 16.7862777710, 13.1362524033],
                    [0, 0, 30, 13.3173446655],
                    [0, 0, 10.8602609634, 20],
                    [7.7165775299, 0, 27.3875980377, 20],
                ])),
        ]

        expected_objectness_logits = [
            torch.tensor([0.1225359365, -0.0133192837]),
            torch.tensor([
                0.1415634006, 0.0989848152, 0.0565387346, -0.0072308783,
                -0.0428492837
            ]),
        ]

        for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip(
                proposals, expected_proposal_boxes, image_sizes,
                expected_objectness_logits):
            self.assertEqual(len(proposal), len(expected_proposal_box))
            self.assertEqual(proposal.image_size, im_size)
            self.assertTrue(
                torch.allclose(proposal.proposal_boxes.tensor,
                               expected_proposal_box.tensor))
            self.assertTrue(
                torch.allclose(proposal.objectness_logits,
                               expected_objectness_logit))
Exemple #18
    def compute_targets_for_locations(self, locations, targets, size_ranges):
        labels = []
        reg_targets = []
        mask_targets = []
        mask_indices = []
        xs, ys = locations[:, 0], locations[:, 1]

        for im_i in range(len(targets)):
            targets_per_im = targets[im_i]
            bboxes = targets_per_im.gt_boxes.tensor
            labels_per_im = targets_per_im.gt_classes

            # no gt
            if bboxes.numel() == 0:
                labels.append(
                    labels_per_im.new_zeros(locations.size(0)) +
                    self.num_classes)
                reg_targets.append(locations.new_zeros((locations.size(0), 4)))
                continue

            area = targets_per_im.gt_boxes.area()

            l = xs[:, None] - bboxes[:, 0][None]
            t = ys[:, None] - bboxes[:, 1][None]
            r = bboxes[:, 2][None] - xs[:, None]
            b = bboxes[:, 3][None] - ys[:, None]
            reg_targets_per_im = torch.stack([l, t, r, b], dim=2)

            if self.center_sample:
                is_in_boxes = self.get_sample_region(bboxes,
                                                     self.strides,
                                                     self.num_loc_list,
                                                     xs,
                                                     ys,
                                                     radius=self.radius)
            else:
                is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0

            max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0]
            # limit the regression range for each location
            is_cared_in_the_level = \
                (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
                (max_reg_targets_per_im <= size_ranges[:, [1]])

            locations_to_gt_area = area[None].repeat(len(locations), 1)
            locations_to_gt_area[is_in_boxes == 0] = INF
            locations_to_gt_area[is_cared_in_the_level == 0] = INF

            # if there is still more than one object for a location,
            # we choose the one with the minimal area
            locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(
                dim=1)

            reg_targets_per_im = reg_targets_per_im[range(len(locations)),
                                                    locations_to_gt_inds]

            labels_per_im = labels_per_im[locations_to_gt_inds]
            labels_per_im[locations_to_min_area == INF] = self.num_classes

            labels.append(labels_per_im)
            reg_targets.append(reg_targets_per_im)

            # Mask Encoding.
            pos_inds = torch.nonzero(
                labels_per_im != self.num_classes).squeeze(1)
            pos_labels = labels_per_im[pos_inds]
            pos_reg_targets = reg_targets_per_im[pos_inds]
            pos_locations = locations[pos_inds]
            bbs = torch.stack([
                pos_locations[:, 0] - pos_reg_targets[:, 0],
                pos_locations[:, 1] - pos_reg_targets[:, 1],
                pos_locations[:, 0] + pos_reg_targets[:, 2],
                pos_locations[:, 1] + pos_reg_targets[:, 3],
            ],
                              dim=1)
            bbs = Boxes(bbs)

            mask_targets_per_im = Instances(targets_per_im.image_size)
            mask_targets_per_im.set("pos_classes", pos_labels)
            mask_targets_per_im.set("pos_boxes", bbs)

            mask_targets.append(mask_targets_per_im)
            mask_indices.append(pos_inds)

        return {
            "labels": labels,
            "reg_targets": reg_targets,
            "mask_targets": mask_targets,
            "mask_indices": mask_indices
        }
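
The assignment above follows FCOS: every location gets `(l, t, r, b)` distances to every ground-truth box, is positive only if it lies inside a box (or inside the sampled center region) and its largest distance falls within the level's size range, and ties are resolved by taking the ground truth with the smallest area. A compact sketch of that assignment for one image, with hypothetical size ranges and no center sampling:

import torch

INF = 100000000
num_classes = 80

locations = torch.tensor([[50.0, 50.0], [300.0, 300.0]])      # (x, y) per location
gt_boxes = torch.tensor([[0.0, 0.0, 100.0, 100.0],            # small box
                         [0.0, 0.0, 400.0, 400.0]])           # large box
gt_classes = torch.tensor([3, 7])
areas = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])
size_ranges = torch.tensor([[-1.0, INF], [-1.0, INF]])        # one (min, max) range per location

xs, ys = locations[:, 0], locations[:, 1]
l = xs[:, None] - gt_boxes[:, 0][None]
t = ys[:, None] - gt_boxes[:, 1][None]
r = gt_boxes[:, 2][None] - xs[:, None]
b = gt_boxes[:, 3][None] - ys[:, None]
reg_targets = torch.stack([l, t, r, b], dim=2)                # (#locations, #gt, 4)

is_in_boxes = reg_targets.min(dim=2)[0] > 0
max_reg = reg_targets.max(dim=2)[0]
fits_level = (max_reg >= size_ranges[:, [0]]) & (max_reg <= size_ranges[:, [1]])

loc_to_gt_area = areas[None].repeat(len(locations), 1)
loc_to_gt_area[~is_in_boxes] = INF
loc_to_gt_area[~fits_level] = INF
min_area, gt_inds = loc_to_gt_area.min(dim=1)

labels = gt_classes[gt_inds]
labels[min_area == INF] = num_classes                          # background
print(labels)  # location 0 -> class 3 (the smaller box wins), location 1 -> class 7
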
Exemple #19
def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
    """
    A function to assemble caffe2 model's outputs (i.e. Dict[str, Tensor])
    to detectron2's format (i.e. list of Instances instance).
    This only works when the model follows the Caffe2 detectron's naming convention.

    Args:
        image_sizes (List[List[int, int]]): [H, W] of every image.
        tensor_outputs (Dict[str, Tensor]): external_output to its tensor.

        force_mask_on (bool): if true, makes sure there will be pred_masks even
            if masks are not found in tensor_outputs (usually due to a model crash)
    """

    results = [Instances(image_size) for image_size in image_sizes]

    batch_splits = tensor_outputs.get("batch_splits", None)
    if batch_splits:
        raise NotImplementedError()
    assert len(image_sizes) == 1
    result = results[0]

    bbox_nms = tensor_outputs["bbox_nms"]
    score_nms = tensor_outputs["score_nms"]
    class_nms = tensor_outputs["class_nms"]
    # Detection always succeeds because Conv supports 0-batch inputs
    assert bbox_nms is not None
    assert score_nms is not None
    assert class_nms is not None
    if bbox_nms.shape[1] == 5:
        result.pred_boxes = RotatedBoxes(bbox_nms)
    else:
        result.pred_boxes = Boxes(bbox_nms)
    result.scores = score_nms
    result.pred_classes = class_nms.to(torch.int64)

    mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
    if mask_fcn_probs is not None:
        # finish the mask pred
        mask_probs_pred = mask_fcn_probs
        num_masks = mask_probs_pred.shape[0]
        class_pred = result.pred_classes
        indices = torch.arange(num_masks, device=class_pred.device)
        mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
        result.pred_masks = mask_probs_pred
    elif force_mask_on:
        # NOTE: there's no way to know the mask height/width here; it won't be
        # used anyway when the batch size is 0, so just set them to 0.
        result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)

    keypoints_out = tensor_outputs.get("keypoints_out", None)
    kps_score = tensor_outputs.get("kps_score", None)
    if keypoints_out is not None:
        # keypoints_out: [N, 4, #keypoints], where 4 is in order of (x, y, score, prob)
        keypoints_tensor = keypoints_out
        # NOTE: prob may not be calculated if "should_output_softmax" is set to
        # False in HeatmapMaxKeypoint, so we just use the raw score; this doesn't
        # seem to affect mAP. TODO: check more carefully.
        keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]]
        result.pred_keypoints = keypoint_xyp
    elif kps_score is not None:
        # keypoint heatmap to sparse data structure
        pred_keypoint_logits = kps_score
        keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result])

    return results
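
A hedged usage sketch for the assembler above: hand it a minimal `tensor_outputs` dict with the Caffe2-style names it expects (`bbox_nms`, `score_nms`, `class_nms`) for a single image. It assumes `assemble_rcnn_outputs_by_name`, the function defined above, is in scope:

import torch

# Two fake XYXY detections for a single 480x640 image.
tensor_outputs = {
    "bbox_nms": torch.tensor([[10.0, 20.0, 110.0, 220.0],
                              [30.0, 40.0, 130.0, 240.0]]),
    "score_nms": torch.tensor([0.9, 0.8]),
    "class_nms": torch.tensor([1.0, 3.0]),
}
# assemble_rcnn_outputs_by_name is the function defined above.
result = assemble_rcnn_outputs_by_name([(480, 640)], tensor_outputs)[0]
print(result.pred_boxes, result.scores, result.pred_classes)
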
Exemple #20
    def get_pgt(self, prev_pred_boxes, prev_pred_scores, proposals, suffix):
        if isinstance(prev_pred_scores, torch.Tensor):
            num_preds_per_image = [len(p) for p in proposals]
            prev_pred_scores = prev_pred_scores.split(num_preds_per_image,
                                                      dim=0)
        else:
            assert isinstance(prev_pred_scores, list)
            assert isinstance(prev_pred_scores[0], torch.Tensor)

        prev_pred_scores = [
            torch.index_select(prev_pred_score, 1,
                               gt_int) for prev_pred_score, gt_int in zip(
                                   prev_pred_scores, self.gt_classes_img_int)
        ]
        pgt_scores_idxs = [
            torch.max(prev_pred_score, dim=0)
            for prev_pred_score in prev_pred_scores
        ]
        pgt_scores = [item[0] for item in pgt_scores_idxs]
        pgt_idxs = [item[1] for item in pgt_scores_idxs]

        assert isinstance(prev_pred_boxes, tuple) or isinstance(
            prev_pred_boxes, list)
        if isinstance(prev_pred_boxes[0], Boxes):
            pgt_boxes = [
                prev_pred_box[pgt_idx]
                for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs)
            ]
        else:
            assert isinstance(prev_pred_boxes[0], torch.Tensor)
            if self.cls_agnostic_bbox_reg:
                num_preds = [
                    prev_pred_box.size(0) for prev_pred_box in prev_pred_boxes
                ]
                prev_pred_boxes = [
                    prev_pred_box.unsqueeze(1).expand(num_pred,
                                                      self.num_classes, 4) for
                    num_pred, prev_pred_box in zip(num_preds, prev_pred_boxes)
                ]
            prev_pred_boxes = [
                prev_pred_box.view(-1, self.num_classes, 4)
                for prev_pred_box in prev_pred_boxes
            ]
            prev_pred_boxes = [
                torch.index_select(prev_pred_box, 1, gt_int) for prev_pred_box,
                gt_int in zip(prev_pred_boxes, self.gt_classes_img_int)
            ]
            pgt_boxes = [
                torch.index_select(prev_pred_box, 0, pgt_idx)
                for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs)
            ]
            pgt_boxes = [pgt_box.view(-1, 4) for pgt_box in pgt_boxes]
            diags = [
                torch.tensor(
                    [
                        i * gt_split.numel() + i
                        for i in range(gt_split.numel())
                    ],
                    dtype=torch.int64,
                    device=pgt_boxes[0].device,
                ) for gt_split in self.gt_classes_img_int
            ]
            pgt_boxes = [
                torch.index_select(pgt_box, 0, diag)
                for pgt_box, diag in zip(pgt_boxes, diags)
            ]
            pgt_boxes = [Boxes(pgt_box) for pgt_box in pgt_boxes]

        pgt_classes = self.gt_classes_img_int
        pgt_weights = [
            torch.index_select(pred_logits, 1, pgt_class).reshape(-1)
            for pred_logits, pgt_class in zip(
                self.pred_class_img_logits.split(1, dim=0), pgt_classes)
        ]

        targets = [
            Instances(
                proposals[i].image_size,
                gt_boxes=pgt_box,
                gt_classes=pgt_class,
                gt_scores=pgt_score,
                gt_weights=pgt_weight,
            ) for i, (pgt_box, pgt_class, pgt_score, pgt_weight) in enumerate(
                zip(pgt_boxes, pgt_classes, pgt_scores, pgt_weights))
        ]

        self._vis_pgt(targets, "pgt", suffix)

        return targets
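
In essence, `get_pgt` builds pseudo ground truth for weakly supervised training: for every image-level class known to be present, it takes the proposal with the highest predicted score for that class and promotes its box to a `gt_box`. A stripped-down sketch of that argmax selection for one image (the dummy tensors stand in for the class's real attributes):

import torch
from detectron2.structures import Boxes, Instances

num_props, num_classes = 6, 20
scores = torch.rand(num_props, num_classes)          # per-proposal class scores
xy = torch.rand(num_props, 2) * 100
wh = torch.rand(num_props, 2) * 28
boxes = Boxes(torch.cat([xy, xy + wh], dim=1))       # dummy proposal boxes (xyxy)
gt_classes_img = torch.tensor([2, 5])                # classes present in the image

scores_for_gt = scores.index_select(1, gt_classes_img)   # (#props, #gt classes)
pgt_scores, pgt_idxs = scores_for_gt.max(dim=0)          # best proposal per class

pgt = Instances(
    (128, 128),
    gt_boxes=boxes[pgt_idxs],
    gt_classes=gt_classes_img,
    gt_scores=pgt_scores,
)
print(pgt)
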
Exemple #21
def find_top_rrpn_proposals(
    proposals,
    pred_objectness_logits,
    images,
    nms_thresh,
    pre_nms_topk,
    post_nms_topk,
    min_box_side_len,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps if `training` is True,
    otherwise, return the `post_nms_topk` highest scoring proposals for each
    feature map.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5).
            All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
        images (ImageList): Input images as an :class:`ImageList`.
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
            When RRPN is run on multiple feature maps (as in FPN) this number is per
            feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
            When RRPN is run on multiple feature maps (as in FPN) this number is total,
            over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels (absolute units
            wrt input images).
        training (bool): True if proposals are to be used in training, otherwise False.
            This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
            comment.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i.
    """
    image_sizes = images.image_sizes  # in (h, w) order
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select the top-k anchors for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, proposals_i, logits_i in zip(
        itertools.count(), proposals, pred_objectness_logits
    ):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 5

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = RotatedBoxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_side_len)
        lvl = level_ids
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = (boxes[keep], scores_per_img[keep], level_ids[keep])

        keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)
    return results
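
The rotated variant differs from the axis-aligned RPN version only in the box parameterization, `(cx, cy, w, h, angle)`, and in using `RotatedBoxes` with `batched_nms_rotated`. A small self-contained sketch of step 3 on dummy rotated boxes (thresholds chosen arbitrarily):

import torch
from detectron2.layers import batched_nms_rotated
from detectron2.structures import RotatedBoxes

image_size = (480, 640)  # (h, w)
# Each row is (cx, cy, w, h, angle in degrees).
boxes = RotatedBoxes(torch.tensor([
    [100.0, 100.0, 60.0, 30.0, 0.0],
    [101.0, 100.0, 60.0, 30.0, 2.0],     # heavy overlap with the first box
    [400.0, 300.0, 80.0, 40.0, 45.0],
]))
scores = torch.tensor([0.9, 0.8, 0.7])
lvl = torch.zeros(3, dtype=torch.int64)  # all proposals from the same level here

boxes.clip(image_size)                   # only nearly-horizontal boxes get clipped
keep = boxes.nonempty(threshold=0.0)
keep = batched_nms_rotated(boxes.tensor[keep], scores[keep], lvl[keep], 0.5)
print(keep)                              # the second box is suppressed by the first
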
Exemple #22
    def get_pgt_top_k(
        self,
        prev_pred_boxes,
        prev_pred_scores,
        proposals,
        top_k=1,
        thres=0,
        need_instance=True,
        need_weight=True,
        suffix="",
    ):
        assert isinstance(prev_pred_boxes, tuple) or isinstance(
            prev_pred_boxes, list)
        if isinstance(prev_pred_boxes[0], Boxes):
            num_preds = [
                len(prev_pred_box) for prev_pred_box in prev_pred_boxes
            ]
            prev_pred_boxes = [
                prev_pred_box.tensor.unsqueeze(1).expand(
                    num_pred, self.num_classes, 4)
                for num_pred, prev_pred_box in zip(num_preds, prev_pred_boxes)
            ]
        else:
            assert isinstance(prev_pred_boxes[0], torch.Tensor)
            if self.cls_agnostic_bbox_reg:
                num_preds = [
                    prev_pred_box.size(0) for prev_pred_box in prev_pred_boxes
                ]
                prev_pred_boxes = [
                    prev_pred_box.unsqueeze(1).expand(num_pred,
                                                      self.num_classes, 4) for
                    num_pred, prev_pred_box in zip(num_preds, prev_pred_boxes)
                ]
        prev_pred_boxes = [
            prev_pred_box.view(-1, self.num_classes, 4)
            for prev_pred_box in prev_pred_boxes
        ]

        if isinstance(prev_pred_scores, torch.Tensor):
            num_preds_per_image = [len(p) for p in proposals]
            prev_pred_scores = prev_pred_scores.split(num_preds_per_image,
                                                      dim=0)
        else:
            assert isinstance(prev_pred_scores, list)
            assert isinstance(prev_pred_scores[0], torch.Tensor)

        prev_pred_scores = [
            torch.index_select(prev_pred_score, 1,
                               gt_int) for prev_pred_score, gt_int in zip(
                                   prev_pred_scores, self.gt_classes_img_int)
        ]
        prev_pred_boxes = [
            torch.index_select(prev_pred_box, 1,
                               gt_int) for prev_pred_box, gt_int in zip(
                                   prev_pred_boxes, self.gt_classes_img_int)
        ]

        # get top k
        num_preds = [
            prev_pred_score.size(0) for prev_pred_score in prev_pred_scores
        ]
        if top_k >= 1:
            top_ks = [min(num_pred, int(top_k)) for num_pred in num_preds]
        elif top_k < 1 and top_k > 0:
            top_ks = [max(int(num_pred * top_k), 1) for num_pred in num_preds]
        else:
            top_ks = [min(num_pred, 1) for num_pred in num_preds]
        pgt_scores_idxs = [
            torch.topk(prev_pred_score, top_k, dim=0)
            for prev_pred_score, top_k in zip(prev_pred_scores, top_ks)
        ]
        pgt_scores = [item[0] for item in pgt_scores_idxs]
        pgt_idxs = [item[1] for item in pgt_scores_idxs]
        pgt_idxs = [
            torch.unsqueeze(pgt_idx, 2).expand(top_k, gt_int.numel(), 4)
            for pgt_idx, top_k, gt_int in zip(pgt_idxs, top_ks,
                                              self.gt_classes_img_int)
        ]
        pgt_boxes = [
            torch.gather(prev_pred_box, 0, pgt_idx)
            for prev_pred_box, pgt_idx in zip(prev_pred_boxes, pgt_idxs)
        ]
        pgt_classes = [
            torch.unsqueeze(gt_int, 0).expand(top_k, gt_int.numel())
            for gt_int, top_k in zip(self.gt_classes_img_int, top_ks)
        ]
        if need_weight:
            pgt_weights = [
                torch.index_select(pred_logits, 1,
                                   gt_int).expand(top_k, gt_int.numel())
                for pred_logits, gt_int, top_k in zip(
                    self.pred_class_img_logits.split(1, dim=0),
                    self.gt_classes_img_int, top_ks)
            ]

        if thres > 0:
            # get large scores
            masks = [pgt_score.ge(thres) for pgt_score in pgt_scores]
            masks = [
                torch.cat([torch.full_like(mask[0:1, :], True), mask[1:, :]],
                          dim=0) for mask in masks
            ]
            pgt_scores = [
                torch.masked_select(pgt_score, mask)
                for pgt_score, mask in zip(pgt_scores, masks)
            ]
            pgt_boxes = [
                torch.masked_select(
                    pgt_box,
                    torch.unsqueeze(mask, 2).expand(top_k, gt_int.numel(), 4))
                for pgt_box, mask, top_k, gt_int in zip(
                    pgt_boxes, masks, top_ks, self.gt_classes_img_int)
            ]
            pgt_classes = [
                torch.masked_select(pgt_class, mask)
                for pgt_class, mask in zip(pgt_classes, masks)
            ]
            if need_weight:
                pgt_weights = [
                    torch.masked_select(pgt_weight, mask)
                    for pgt_weight, mask in zip(pgt_weights, masks)
                ]

        pgt_scores = [pgt_score.reshape(-1) for pgt_score in pgt_scores]
        pgt_boxes = [pgt_box.reshape(-1, 4) for pgt_box in pgt_boxes]
        pgt_classes = [pgt_class.reshape(-1) for pgt_class in pgt_classes]
        if need_weight:
            pgt_weights = [
                pgt_weight.reshape(-1) for pgt_weight in pgt_weights
            ]

        if not need_instance and need_weight:
            return pgt_scores, pgt_boxes, pgt_classes, pgt_weights
        elif not need_instance and not need_weight:
            return pgt_scores, pgt_boxes, pgt_classes

        pgt_boxes = [Boxes(pgt_box) for pgt_box in pgt_boxes]

        targets = [
            Instances(
                proposals[i].image_size,
                gt_boxes=pgt_box,
                gt_classes=pgt_class,
                gt_scores=pgt_score,
                gt_weights=pgt_weight,
            ) for i, (pgt_box, pgt_class, pgt_score, pgt_weight) in enumerate(
                zip(pgt_boxes, pgt_classes, pgt_scores, pgt_weights))
        ]

        self._vis_pgt(targets, "pgt_top_k", suffix)

        return targets
Exemple #23
def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
    target = Instances(image_size)
    target.gt_boxes = Boxes(boxes)

    classes = [int(obj["category_id"]) for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            # TODO check type and provide better error
            masks = PolygonMasks(segms)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # polygon
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                        segm.ndim
                    )
                    # mask array
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks!"
                        "Supported types are: polygons as list[list[float] or ndarray],"
                        " COCO-style RLE as a dict, or a binary segmentation mask "
                        " in a 2D numpy array of shape HxW.".format(type(segm))
                    )
            # torch.from_numpy does not support array with negative stride.
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks])
            )
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = [obj.get("keypoints", []) for obj in annos]
        target.gt_keypoints = Keypoints(kpts)

    return target
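
A hedged usage example for `annotations_to_instances`, with two hand-written annotations in the standard detectron2 dataset-dict format (XYWH boxes plus one polygon segmentation each):

import torch
from detectron2.data.detection_utils import annotations_to_instances
from detectron2.structures import BoxMode

annos = [
    {
        "bbox": [10, 10, 50, 40],            # XYWH
        "bbox_mode": BoxMode.XYWH_ABS,
        "category_id": 0,
        "segmentation": [[10, 10, 60, 10, 60, 50, 10, 50]],  # one polygon (x1, y1, x2, y2, ...)
    },
    {
        "bbox": [5, 5, 20, 20],
        "bbox_mode": BoxMode.XYWH_ABS,
        "category_id": 2,
        "segmentation": [[5, 5, 25, 5, 25, 25, 5, 25]],
    },
]
target = annotations_to_instances(annos, image_size=(100, 100))
print(target.gt_boxes)       # converted to XYXY_ABS
print(target.gt_classes)     # tensor([0, 2])
print(len(target.gt_masks))  # 2 (PolygonMasks by default)
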
def general_black_box_ensembles_post_processing(
        input_im,
        ensemble_pred_box_list,
        ensembles_class_idxs_list,
        ensemble_pred_prob_vectors_list,
        ensembles_pred_box_covariance_list,
        nms_threshold=0.5,
        max_detections_per_image=100,
        affinity_threshold=0.7,
        is_generalized_rcnn=False):
    """

    Args:
        input_im (list): an input image list generated from the dataset handler.
        ensemble_pred_box_list (list): predicted box list
        ensembles_class_idxs_list (list): predicted classes list
        ensemble_pred_prob_vectors_list (list): predicted probability vector list
        ensembles_pred_box_covariance_list (list): predicted covariance matrices
        nms_threshold (float): non-maximum suppression threshold between 0-1
        max_detections_per_image (int): maximum allowed number of detections per image.
        affinity_threshold (float): cluster affinity threshold between 0-1
        is_generalized_rcnn (bool): used to handle category selection by removing the background class.
    Returns:
        result (Instances): final results after nms

    """

    predicted_boxes = torch.cat(ensemble_pred_box_list, 0)
    predicted_boxes_covariance = torch.cat(ensembles_pred_box_covariance_list,
                                           0)
    predicted_prob_vectors = torch.cat(ensemble_pred_prob_vectors_list, 0)
    predicted_class_idxs = torch.cat(ensembles_class_idxs_list, 0)

    # Compute iou between all output boxes and each other output box.
    match_quality_matrix = pairwise_iou(Boxes(predicted_boxes),
                                        Boxes(predicted_boxes))

    # Perform basic sequential clustering.
    clusters = []
    for i in range(match_quality_matrix.shape[0]):
        # Check if current box is already a member of any previous cluster.
        if i != 0:
            all_clusters = torch.cat(clusters, 0)
            if (all_clusters == i).any():
                continue
        # Only add if boxes have the same category.
        cluster_membership_test = (
            match_quality_matrix[i, :] >= affinity_threshold) & (
                predicted_class_idxs == predicted_class_idxs[i])
        inds = torch.where(cluster_membership_test)
        clusters.extend(inds)

    # Compute mean and covariance for every cluster.
    predicted_boxes_list = []
    predicted_boxes_covariance_list = []
    predicted_prob_vectors_list = []

    # Compute cluster mean and covariance matrices.
    for cluster in clusters:
        box_cluster = predicted_boxes[cluster]
        box_cluster_covariance = predicted_boxes_covariance[cluster]
        if box_cluster.shape[0] >= 2:
            cluster_mean = box_cluster.mean(0)

            # Compute epistemic covariance
            residuals = (box_cluster - cluster_mean).unsqueeze(2)
            predicted_covariance = torch.sum(
                torch.matmul(residuals, torch.transpose(residuals, 2, 1)),
                0) / (box_cluster.shape[0] - 1)

            # Add epistemic covariance
            predicted_covariance = predicted_covariance + \
                box_cluster_covariance.mean(0)

            predicted_boxes_list.append(cluster_mean)
            predicted_boxes_covariance_list.append(predicted_covariance)
            predicted_prob_vectors_list.append(
                predicted_prob_vectors[cluster].mean(0))
        else:
            predicted_boxes_list.append(predicted_boxes[cluster].mean(0))
            predicted_boxes_covariance_list.append(
                predicted_boxes_covariance[cluster].mean(0))
            predicted_prob_vectors_list.append(
                predicted_prob_vectors[cluster].mean(0))

    result = Instances(
        (input_im[0]['image'].shape[1], input_im[0]['image'].shape[2]))

    if len(predicted_boxes_list) > 0:
        predicted_prob_vectors = torch.stack(predicted_prob_vectors_list, 0)

        # Remove background class if generalized rcnn
        if is_generalized_rcnn:
            predicted_prob_vectors_no_bkg = predicted_prob_vectors[:, :-1]
        else:
            predicted_prob_vectors_no_bkg = predicted_prob_vectors

        predicted_prob, classes_idxs = torch.max(predicted_prob_vectors_no_bkg,
                                                 1)
        predicted_boxes = torch.stack(predicted_boxes_list, 0)

        # We want to keep the maximum allowed boxes per image consistent with the
        # rest of the methods. However, just sorting leads to quite a lot of
        # redundant detections, so we have to use one more NMS step.
        keep = batched_nms(predicted_boxes, predicted_prob, classes_idxs,
                           nms_threshold)
        keep = keep[:max_detections_per_image]

        result.pred_boxes = Boxes(predicted_boxes[keep])
        result.scores = predicted_prob[keep]
        result.pred_classes = classes_idxs[keep]
        result.pred_cls_probs = predicted_prob_vectors[keep]
        result.pred_boxes_covariance = torch.stack(
            predicted_boxes_covariance_list, 0)[keep]
    else:
        result.pred_boxes = Boxes(predicted_boxes)
        result.scores = torch.zeros(predicted_boxes.shape[0]).to(device)
        result.pred_classes = predicted_class_idxs
        result.pred_cls_probs = predicted_prob_vectors
        result.pred_boxes_covariance = torch.empty(
            (predicted_boxes.shape + (4, ))).to(device)
    return result
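
The clustering above hinges on `pairwise_iou`: a detection joins box i's cluster when its IoU with box i exceeds `affinity_threshold` and it has the same class. A small self-contained sketch of that affinity test:

import torch
from detectron2.structures import Boxes, pairwise_iou

boxes = Boxes(torch.tensor([
    [0.0, 0.0, 10.0, 10.0],
    [0.5, 0.5, 10.5, 10.5],     # overlaps the first box heavily
    [50.0, 50.0, 60.0, 60.0],   # far away
]))
classes = torch.tensor([0, 0, 0])
affinity_threshold = 0.7

iou = pairwise_iou(boxes, boxes)    # (3, 3) IoU matrix
members = (iou[0] >= affinity_threshold) & (classes == classes[0])
print(iou[0])      # approx. [1.00, 0.82, 0.00]
print(members)     # tensor([ True,  True, False]) -> boxes 0 and 1 form one cluster
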
Exemple #25
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:
                   * "image": Tensor, image in (C, H, W) format.
                   * "sem_seg": semantic segmentation ground truth
                   * "center": center points heatmap ground truth
                   * "offset": pixel offsets to center points ground truth
                   * Other information that's included in the original dicts, such as:
                     "height", "width" (int): the output resolution of the model (may be different
                     from input resolution), used in inference.
        Returns:
            list[dict]:
                each dict is the results for one image. The dict contains the following keys:

                * "panoptic_seg", "sem_seg": see documentation
                    :doc:`/tutorials/models` for the standard output format
                * "instances": available if ``predict_instances is True``. see documentation
                    :doc:`/tutorials/models` for the standard output format
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        # To avoid error in ASPP layer when input has different size.
        size_divisibility = (
            self.size_divisibility
            if self.size_divisibility > 0
            else self.backbone.size_divisibility
        )
        images = ImageList.from_tensors(images, size_divisibility)

        features = self.backbone(images.tensor)

        losses = {}
        if "sem_seg" in batched_inputs[0]:
            targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
            targets = ImageList.from_tensors(
                targets, size_divisibility, self.sem_seg_head.ignore_value
            ).tensor
            if "sem_seg_weights" in batched_inputs[0]:
                # The default D2 DatasetMapper may not contain "sem_seg_weights"
                # Avoid error in testing when default DatasetMapper is used.
                weights = [x["sem_seg_weights"].to(self.device) for x in batched_inputs]
                weights = ImageList.from_tensors(weights, size_divisibility).tensor
            else:
                weights = None
        else:
            targets = None
            weights = None
        sem_seg_results, sem_seg_losses = self.sem_seg_head(features, targets, weights)
        losses.update(sem_seg_losses)

        if "center" in batched_inputs[0] and "offset" in batched_inputs[0]:
            center_targets = [x["center"].to(self.device) for x in batched_inputs]
            center_targets = ImageList.from_tensors(
                center_targets, size_divisibility
            ).tensor.unsqueeze(1)
            center_weights = [x["center_weights"].to(self.device) for x in batched_inputs]
            center_weights = ImageList.from_tensors(center_weights, size_divisibility).tensor

            offset_targets = [x["offset"].to(self.device) for x in batched_inputs]
            offset_targets = ImageList.from_tensors(offset_targets, size_divisibility).tensor
            offset_weights = [x["offset_weights"].to(self.device) for x in batched_inputs]
            offset_weights = ImageList.from_tensors(offset_weights, size_divisibility).tensor
        else:
            center_targets = None
            center_weights = None

            offset_targets = None
            offset_weights = None

        center_results, offset_results, center_losses, offset_losses = self.ins_embed_head(
            features, center_targets, center_weights, offset_targets, offset_weights
        )
        losses.update(center_losses)
        losses.update(offset_losses)

        if self.training:
            return losses

        if self.benchmark_network_speed:
            return []

        processed_results = []
        for sem_seg_result, center_result, offset_result, input_per_image, image_size in zip(
            sem_seg_results, center_results, offset_results, batched_inputs, images.image_sizes
        ):
            height = input_per_image.get("height")
            width = input_per_image.get("width")
            r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
            c = sem_seg_postprocess(center_result, image_size, height, width)
            o = sem_seg_postprocess(offset_result, image_size, height, width)
            # Post-processing to get panoptic segmentation.
            panoptic_image, _ = get_panoptic_segmentation(
                r.argmax(dim=0, keepdim=True),
                c,
                o,
                thing_ids=self.meta.thing_dataset_id_to_contiguous_id.values(),
                label_divisor=self.meta.label_divisor,
                stuff_area=self.stuff_area,
                void_label=-1,
                threshold=self.threshold,
                nms_kernel=self.nms_kernel,
                top_k=self.top_k,
            )
            # For semantic segmentation evaluation.
            processed_results.append({"sem_seg": r})
            panoptic_image = panoptic_image.squeeze(0)
            semantic_prob = F.softmax(r, dim=0)

            # Write results to disk:
            img = input_per_image["image"]
            from detectron2.utils.visualizer import Visualizer
            from detectron2.data.detection_utils import convert_image_to_rgb
            from PIL import Image 
            import os

            img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format).astype("uint8")
            img = np.array(Image.fromarray(img).resize((width, height)))
            v_panoptic = Visualizer(img, self.meta)
            v_panoptic = v_panoptic.draw_panoptic_seg_predictions(panoptic_image.cpu(), None)
            pan_img = v_panoptic.get_image()
            image_path = input_per_image['file_name'].split(os.sep)
            image_name = os.path.splitext(image_path[-1])[0] 
            Image.fromarray(pan_img).save(os.path.join('/home/ahabbas/projects/conseg/affinityNet/output_pdl/coco/eval_vis', image_name + '_panoptic.png'))

            # For panoptic segmentation evaluation.
            processed_results[-1]["panoptic_seg"] = (panoptic_image, None)
            # For instance segmentation evaluation.
            if self.predict_instances:
                instances = []
                panoptic_image_cpu = panoptic_image.cpu().numpy()
                for panoptic_label in np.unique(panoptic_image_cpu):
                    if panoptic_label == -1:
                        continue
                    pred_class = panoptic_label // self.meta.label_divisor
                    isthing = pred_class in list(
                        self.meta.thing_dataset_id_to_contiguous_id.values()
                    )
                    # Get instance segmentation results.
                    if isthing:
                        instance = Instances((height, width))
                        # Evaluation code takes continuous id starting from 0
                        instance.pred_classes = torch.tensor(
                            [pred_class], device=panoptic_image.device
                        )
                        mask = panoptic_image == panoptic_label
                        instance.pred_masks = mask.unsqueeze(0)
                        # Average semantic probability
                        sem_scores = semantic_prob[pred_class, ...]
                        sem_scores = torch.mean(sem_scores[mask])
                        # Center point probability
                        mask_indices = torch.nonzero(mask).float()
                        center_y, center_x = (
                            torch.mean(mask_indices[:, 0]),
                            torch.mean(mask_indices[:, 1]),
                        )
                        center_scores = c[0, int(center_y.item()), int(center_x.item())]
                        # Confidence score is semantic prob * center prob.
                        instance.scores = torch.tensor(
                            [sem_scores * center_scores], device=panoptic_image.device
                        )
                        # Get bounding boxes
                        instance.pred_boxes = BitMasks(instance.pred_masks).get_bounding_boxes()
                        instances.append(instance)
                if len(instances) > 0:
                    processed_results[-1]["instances"] = Instances.cat(instances)

        return processed_results
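
The instance-extraction loop above converts the panoptic label map back into per-instance predictions: each unique label encodes its class as `label // label_divisor`, its mask is an equality test against the label map, and a box is recovered from the mask. A minimal sketch with a toy label map and a hypothetical `label_divisor` of 1000:

import torch
from detectron2.structures import BitMasks, Instances

label_divisor = 1000
# Toy 6x6 panoptic map: 0 is background, plus two segments of "thing" class 2.
panoptic = torch.zeros(6, 6, dtype=torch.int64)
panoptic[1:3, 1:3] = 2 * label_divisor + 1   # class 2, instance id 1
panoptic[4:6, 3:6] = 2 * label_divisor + 2   # class 2, instance id 2

instances = []
for label in torch.unique(panoptic):
    if label == 0:                           # skip background / void
        continue
    inst = Instances((6, 6))
    inst.pred_classes = (label // label_divisor).reshape(1)
    mask = panoptic == label
    inst.pred_masks = mask.unsqueeze(0)
    inst.pred_boxes = BitMasks(inst.pred_masks).get_bounding_boxes()
    instances.append(inst)

result = Instances.cat(instances)
print(result.pred_classes, result.pred_boxes)
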
def general_anchor_statistics_postprocessing(input_im,
                                             outputs,
                                             nms_threshold=0.5,
                                             max_detections_per_image=100,
                                             affinity_threshold=0.7):
    """

    Args:
        input_im (list): an input image list generated from the dataset handler.
        outputs (list): output list from the model-specific inference function
        nms_threshold (float): non-maximum suppression threshold between 0-1
        max_detections_per_image (int): maximum allowed number of detections per image.
        affinity_threshold (float): cluster affinity threshold between 0-1
    Returns:
        result (Instances): final results after nms

    """

    predicted_boxes, predicted_boxes_covariance, predicted_prob, classes_idxs, predicted_prob_vectors = outputs

    # Get pairwise iou matrix
    match_quality_matrix = pairwise_iou(Boxes(predicted_boxes),
                                        Boxes(predicted_boxes))

    # Get cluster centers using standard nms. Much faster than sequential
    # clustering.
    keep = batched_nms(predicted_boxes, predicted_prob, classes_idxs,
                       nms_threshold)

    keep = keep[:max_detections_per_image]

    clusters_inds = match_quality_matrix[keep, :]
    clusters_inds = clusters_inds > affinity_threshold

    # Compute mean and covariance for every cluster.
    predicted_prob_vectors_list = []
    predicted_boxes_list = []
    predicted_boxes_covariance_list = []

    for cluster_idxs, center_idx in zip(clusters_inds, keep):

        if cluster_idxs.sum(0) >= 2:
            # Make sure to only select cluster members of same class as center
            cluster_center_classes_idx = classes_idxs[center_idx]
            cluster_classes_idxs = classes_idxs[cluster_idxs]
            class_similarity_idxs = cluster_classes_idxs == cluster_center_classes_idx

            # Grab cluster
            box_cluster = predicted_boxes[cluster_idxs, :][
                class_similarity_idxs, :]

            cluster_mean = box_cluster.mean(0)

            residuals = (box_cluster - cluster_mean).unsqueeze(2)
            cluster_covariance = torch.sum(
                torch.matmul(residuals, torch.transpose(residuals, 2, 1)),
                0) / max((box_cluster.shape[0] - 1), 1.0)

            # Take the final result to be the mean and covariance of the Gaussian mixture
            # of cluster members, if a covariance is provided by the neural network.
            if predicted_boxes_covariance is not None:
                if len(predicted_boxes_covariance) > 0:
                    cluster_covariance = cluster_covariance + \
                        predicted_boxes_covariance[cluster_idxs, :][class_similarity_idxs, :].mean(0)

            # Compute average over cluster probabilities
            cluster_probs_vector = predicted_prob_vectors[cluster_idxs, :][
                class_similarity_idxs, :].mean(0)
        else:
            cluster_mean = predicted_boxes[center_idx]
            cluster_probs_vector = predicted_prob_vectors[center_idx]
            cluster_covariance = 1e-4 * torch.eye(4, 4).to(device)
            if predicted_boxes_covariance is not None:
                if len(predicted_boxes_covariance) > 0:
                    cluster_covariance = predicted_boxes_covariance[center_idx]

        predicted_boxes_list.append(cluster_mean)
        predicted_boxes_covariance_list.append(cluster_covariance)
        predicted_prob_vectors_list.append(cluster_probs_vector)

    result = Instances(
        (input_im[0]['image'].shape[1], input_im[0]['image'].shape[2]))

    if len(predicted_boxes_list) > 0:
        # We do not average the probability vectors for this post-processing method. Averaging
        # results in very low mAP due to mixing with low-scoring detection instances.
        result.pred_boxes = Boxes(torch.stack(predicted_boxes_list, 0))
        predicted_prob_vectors = torch.stack(predicted_prob_vectors_list, 0)
        predicted_prob, classes_idxs = torch.max(predicted_prob_vectors, 1)
        result.scores = predicted_prob
        result.pred_classes = classes_idxs
        result.pred_cls_probs = predicted_prob_vectors
        result.pred_boxes_covariance = torch.stack(
            predicted_boxes_covariance_list, 0)
    else:
        result.pred_boxes = Boxes(predicted_boxes)
        result.scores = torch.zeros(predicted_boxes.shape[0]).to(device)
        result.pred_classes = classes_idxs
        result.pred_cls_probs = predicted_prob_vectors
        result.pred_boxes_covariance = torch.empty(
            (predicted_boxes.shape + (4, ))).to(device)
    return result
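
Both post-processing variants estimate the per-cluster box covariance as the sum of residual outer products divided by n - 1, i.e. the ordinary sample covariance. A quick sketch checking that the residual formulation matches `torch.cov` (available in PyTorch 1.10+):

import torch

box_cluster = torch.tensor([
    [10.0, 10.0, 50.0, 60.0],
    [12.0, 11.0, 52.0, 58.0],
    [ 9.0, 12.0, 49.0, 61.0],
])                                         # 3 boxes in one cluster, xyxy

cluster_mean = box_cluster.mean(0)
residuals = (box_cluster - cluster_mean).unsqueeze(2)                  # (n, 4, 1)
cov = torch.sum(residuals @ residuals.transpose(2, 1), 0) / (box_cluster.shape[0] - 1)

# Same result with torch.cov, which expects variables in rows.
print(torch.allclose(cov, torch.cov(box_cluster.T)))  # True
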
Exemple #27
    def forward_for_single_feature_map(self,
                                       locations,
                                       logits_pred,
                                       reg_pred,
                                       ctrness_pred,
                                       image_sizes,
                                       top_feat=None):
        N, C, H, W = logits_pred.shape

        # put in the same format as locations
        logits_pred = logits_pred.view(N, C, H, W).permute(0, 2, 3, 1)
        logits_pred = logits_pred.reshape(N, -1, C).sigmoid()
        box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1)
        box_regression = box_regression.reshape(N, -1, 4)
        ctrness_pred = ctrness_pred.view(N, 1, H, W).permute(0, 2, 3, 1)
        ctrness_pred = ctrness_pred.reshape(N, -1).sigmoid()
        if top_feat is not None:
            top_feat = top_feat.view(N, -1, H, W).permute(0, 2, 3, 1)
            top_feat = top_feat.reshape(N, H * W, -1)

        # if self.thresh_with_ctr is True, we multiply the classification
        # scores with centerness scores before applying the threshold.
        if self.thresh_with_ctr:
            logits_pred = logits_pred * ctrness_pred[:, :, None]
        candidate_inds = logits_pred > self.pre_nms_thresh
        pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
        pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_topk)

        if not self.thresh_with_ctr:
            logits_pred = logits_pred * ctrness_pred[:, :, None]

        results = []
        for i in range(N):
            per_box_cls = logits_pred[i]
            per_candidate_inds = candidate_inds[i]
            per_box_cls = per_box_cls[per_candidate_inds]

            per_candidate_nonzeros = per_candidate_inds.nonzero()
            per_box_loc = per_candidate_nonzeros[:, 0]
            per_class = per_candidate_nonzeros[:, 1]

            per_box_regression = box_regression[i]
            per_box_regression = per_box_regression[per_box_loc]
            per_locations = locations[per_box_loc]
            if top_feat is not None:
                per_top_feat = top_feat[i]
                per_top_feat = per_top_feat[per_box_loc]

            per_pre_nms_top_n = pre_nms_top_n[i]

            if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
                per_box_cls, top_k_indices = \
                    per_box_cls.topk(per_pre_nms_top_n, sorted=False)
                per_class = per_class[top_k_indices]
                per_box_regression = per_box_regression[top_k_indices]
                per_locations = per_locations[top_k_indices]
                if top_feat is not None:
                    per_top_feat = per_top_feat[top_k_indices]

            detections = torch.stack([
                per_locations[:, 0] - per_box_regression[:, 0],
                per_locations[:, 1] - per_box_regression[:, 1],
                per_locations[:, 0] + per_box_regression[:, 2],
                per_locations[:, 1] + per_box_regression[:, 3],
            ],
                                     dim=1)

            boxlist = Instances(image_sizes[i])
            boxlist.pred_boxes = Boxes(detections)
            boxlist.scores = torch.sqrt(per_box_cls)
            boxlist.pred_classes = per_class
            boxlist.locations = per_locations
            if top_feat is not None:
                boxlist.top_feat = per_top_feat
            results.append(boxlist)

        return results
Exemple #28
    "roi_heads.box_head.cls_score.bias":
    "roi_heads.box_predictor.cls_score.bias",
    "roi_heads.box_head.bbox_pred.weight":
    "roi_heads.box_predictor.bbox_pred.weight",
    "roi_heads.box_head.bbox_pred.bias":
    "roi_heads.box_predictor.bbox_pred.bias",
}

temp = torch.load("weight.pt")
temp = {state_dict_map.get(k, k): v for k, v in temp.items()}
print("Problems with:\n" +
      "\n".join([k for k in net.state_dict() if k not in temp]))
net.load_state_dict({k: temp.get(k, v) for k, v in net.state_dict().items()})
#net.eval()

targets = Instances((512, 512))
targets.gt_boxes = Boxes(torch.load("targets.pt")["boxes"])
targets.gt_classes = torch.load("targets.pt")["classes"]

data = [{"image": torch.load("data.pt").cuda(), "instances": targets}]

storage_4del = EventStorage(0).__enter__()

torch.random.manual_seed(0)
torch.cuda.manual_seed(0)

# with torch.no_grad():
for i in range(3):
    torch.cuda.synchronize()
    t = time.time()
    losses = net(data)
Exemple #29
def detector_postprocess(results: Instances,
                         output_height: int,
                         output_width: int,
                         mask_threshold: float = 0.5):
    """
    Resize the output instances.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.

    This function will resize the raw outputs of an R-CNN detector
    to produce outputs according to the desired output resolution.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place.
        output_height, output_width: the desired output resolution.

    Returns:
        Instances: the resized output from the model, based on the output resolution
    """
    # Change to 'if is_tracing' after PT1.7
    if isinstance(output_height, torch.Tensor):
        # Converts integer tensors to float temporaries to ensure true
        # division is performed when computing scale_x and scale_y.
        output_width_tmp = output_width.float()
        output_height_tmp = output_height.float()
        new_size = torch.stack([output_height, output_width])
    else:
        new_size = (output_height, output_width)
        output_width_tmp = output_width
        output_height_tmp = output_height

    scale_x, scale_y = (
        output_width_tmp / results.image_size[1],
        output_height_tmp / results.image_size[0],
    )
    results = Instances(new_size, **results.get_fields())

    if results.has("pred_boxes"):
        output_boxes = results.pred_boxes
    elif results.has("proposal_boxes"):
        output_boxes = results.proposal_boxes
    else:
        output_boxes = None
    assert output_boxes is not None, "Predictions must contain boxes!"

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)

    results = results[output_boxes.nonempty()]

    if results.has("pred_masks"):
        results.pred_masks = retry_if_cuda_oom(paste_masks_in_image)(
            results.pred_masks[:, 0, :, :],  # N, 1, M, M
            results.pred_boxes,
            results.image_size,
            threshold=mask_threshold,
        )

    if results.has("pred_keypoints"):
        results.pred_keypoints[:, :, 0] *= scale_x
        results.pred_keypoints[:, :, 1] *= scale_y

    return results
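
A hedged usage example for `detector_postprocess`: build an `Instances` at the network's input resolution and rescale it to the desired output resolution (the import path below is where the function lives in upstream detectron2; adjust it if the local copy above is the one in scope):

import torch
from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.structures import Boxes, Instances

# Raw detector output at the resized input resolution (e.g. 400x600) ...
raw = Instances((400, 600))
raw.pred_boxes = Boxes(torch.tensor([[30.0, 30.0, 300.0, 200.0]]))
raw.scores = torch.tensor([0.9])
raw.pred_classes = torch.tensor([0])

# ... rescaled to the original 800x1200 image: boxes are scaled by 2x.
out = detector_postprocess(raw, output_height=800, output_width=1200)
print(out.pred_boxes)  # Boxes(tensor([[ 60.,  60., 600., 400.]]))
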
Exemple #30
def f(x: Tensor):
     image_shape = (15, 15)
     # __init__ can take arguments
     inst = Instances(image_shape, a=x, proposal_boxes=Boxes(x))
     inst2 = Instances(image_shape, a=x)
     return inst.a, inst2.a