Example 1
    def preprocess_training_image(self, batched_inputs):
        images = [x["image"].to(self.device) for x in batched_inputs]
        images_norm = [self.normalizer(x) for x in images]
        images_norm = ImageList.from_tensors(images_norm,
                                             self.backbone.size_divisibility)
        # get shape information
        unpadded_im_shape = [x.shape[1] for x in images]
        ori_im_shape = [x["height"] for x in batched_inputs]
        # build padded_mask to ignore the bottom edge and the extra padded region
        padded_mask = [
            images[0].new_ones(x.shape[1], x.shape[2], dtype=torch.float)
            for x in images
        ]
        # pad, downsample the images, and transform them to LAB color space
        images = ImageList.from_tensors(images,
                                        self.backbone.size_divisibility)
        downsampled_images = F.avg_pool2d(images.tensor.float(),
                                          kernel_size=self.mask_out_stride,
                                          stride=self.mask_out_stride,
                                          padding=0)
        lab_images = torch.stack(
            [self.bgr_to_lab(x) for x in downsampled_images])

        # Mask out the bottom area, where the COCO dataset likely has incorrect annotations.
        # This trick is adopted from AdelaiDet's BoxInst codebase.
        # In practice, it has no influence on the final result.
        for i in range(len(padded_mask)):
            pixels_removed = int(self.bottom_pixels_removed *
                                 unpadded_im_shape[i] / ori_im_shape[i])
            if pixels_removed > 0:
                padded_mask[i][-pixels_removed:, :] = 0
        padded_mask = ImageList.from_tensors(padded_mask,
                                             self.backbone.size_divisibility)
        padded_mask = padded_mask.tensor.unsqueeze(1)  # B,H,W -> B,1,H,W
        return images_norm, lab_images, padded_mask
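
The avg_pool2d call above shrinks the padded batch by a factor of self.mask_out_stride in each spatial dimension, presumably so the LAB images line up with the mask branch's output grid. A minimal sketch of that shape relation, with made-up sizes not taken from the code above:

import torch
import torch.nn.functional as F

# Hypothetical values: a padded batch of two 3-channel images and a stride of 4.
mask_out_stride = 4
padded = torch.rand(2, 3, 64, 96)

# kernel_size == stride == s reduces H and W by exactly a factor of s (no padding).
downsampled = F.avg_pool2d(padded, kernel_size=mask_out_stride,
                           stride=mask_out_stride, padding=0)
assert downsampled.shape == (2, 3, 64 // mask_out_stride, 96 // mask_out_stride)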
Example 2
    def test_imagelist_padding_shape(self):
        ret = ImageList.from_tensors(
            [torch.ones((3, 15, 20), dtype=torch.float32)], 4).tensor
        self.assertEqual(list(ret.shape), [1, 3, 16, 20], str(ret.shape))

        ret = ImageList.from_tensors([
            torch.ones((3, 25, 20), dtype=torch.float32),
            torch.ones((3, 10, 10), dtype=torch.float32),
        ], 4).tensor
        self.assertEqual(list(ret.shape), [2, 3, 28, 20], str(ret.shape))
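
The expected shapes follow from taking the per-dimension maximum over the batch and rounding it up to the nearest multiple of the size divisibility. A quick check of that arithmetic for the second assertion, using the same numbers as the test:

import math

size_divisibility = 4
heights, widths = [25, 10], [20, 10]

# Pad to the largest image in the batch, then round up to a multiple of 4.
padded_h = math.ceil(max(heights) / size_divisibility) * size_divisibility  # 28
padded_w = math.ceil(max(widths) / size_divisibility) * size_divisibility   # 20
print(padded_h, padded_w)  # 28 20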
Example 3
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.

                For now, each item in the list is a dict that contains:

                   * "image": Tensor, image in (C, H, W) format.
                   * "sem_seg": semantic segmentation ground truth
                   * Other information that's included in the original dicts, such as:
                     "height", "width" (int): the output resolution of the model, used in inference.
                     See :meth:`postprocess` for details.

        Returns:
            list[dict]:
              Each dict is the output for one input image.
              The dict contains one key "sem_seg" whose value is a
              Tensor of the output resolution that represents the
              per-pixel segmentation prediction.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [self.normalizer(x) for x in images]
        images = ImageList.from_tensors(images,
                                        self.backbone.size_divisibility)

        features = self.backbone(images.tensor)
        if "sem_seg" in batched_inputs[0]:
            targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
            targets = ImageList.from_tensors(
                targets,
                self.backbone.size_divisibility,
                pad_value=self.sem_seg_head.ignore_value).tensor
        else:
            targets = None

        if self.training:
            _, losses = self.sem_seg_head(features, targets)
            return losses
        else:
            results, _ = self.sem_seg_head(features, images.tensor)

        processed_results = []
        for result, input_per_image, image_size in zip(results, batched_inputs,
                                                       images.image_sizes):
            height = input_per_image.get("height")
            width = input_per_image.get("width")
            r = sem_seg_postprocess(result, image_size, height, width)
            processed_results.append({"sem_seg": r})
        return processed_results
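
For reference, a minimal, hypothetical batched_inputs item matching the docstring above; the sizes and values are illustrative only:

import torch

batched_inputs = [{
    "image": torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8),  # (C, H, W)
    "sem_seg": torch.zeros(480, 640, dtype=torch.long),  # per-pixel class ids
    "height": 480,  # desired output resolution, used in inference
    "width": 640,
}]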
Example 4
def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility,
                                        device):
    """
    See get_caffe2_inputs() below.
    """
    assert all(isinstance(x, dict) for x in batched_inputs)
    assert all(x["image"].dim() == 3 for x in batched_inputs)

    images = [x["image"] for x in batched_inputs]
    images = ImageList.from_tensors(images, size_divisibility)

    im_info = []
    for input_per_image, image_size in zip(batched_inputs, images.image_sizes):
        target_height = input_per_image.get("height", image_size[0])
        target_width = input_per_image.get("width", image_size[1])  # noqa
        # NOTE: The scale inside im_info is kept by convention and provides
        # post-processing information in case further processing is needed. For
        # current Caffe2 model definitions that don't include post-processing inside
        # the model, this number is not used.
        # NOTE: There can be a slight difference between the width and height
        # scales; using a single number can result in numerical differences
        # compared with D2's post-processing.
        scale = target_height / image_size[0]
        im_info.append([image_size[0], image_size[1], scale])
    im_info = torch.Tensor(im_info)

    return images.tensor.to(device), im_info.to(device)
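
As a sanity check, each im_info row has the form [height, width, scale], where scale is derived from the requested output height. A tiny worked example with made-up numbers:

import torch

image_size = (480, 640)   # per-image size recorded in ImageList.image_sizes
target_height = 800       # "height" requested in the input dict
scale = target_height / image_size[0]

im_info = torch.Tensor([[image_size[0], image_size[1], scale]])
print(im_info)  # ~ tensor([[480.0000, 640.0000, 1.6667]])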
Example 5
 def preprocess_image(self, batched_inputs):
     """
     Normalize, pad and batch the input images.
     """
     images = [x["image"].float().to(self.device) for x in batched_inputs]
     images = [self.normalizer(x.div(255)) for x in images]
     images = ImageList.from_tensors(images, self.network.size_divisibility)
     return images
Example 6
 def preprocess_image(self, batched_inputs):
     """
     Normalize, pad and batch the input images.
     """
     images = [x["image"].to(self.device) for x in batched_inputs]
     images = [(x - self.pixel_mean) / self.pixel_std for x in images]
     images = ImageList.from_tensors(images,
                                     self.backbone.size_divisibility)
     return images
Example 7
 def test_rpn_proposals_inf(self):
     N, Hi, Wi, A = 3, 3, 3, 3
     images = ImageList.from_tensors(
         [torch.rand(3, 20, 30) for i in range(N)])
     proposals = [torch.rand(N, Hi * Wi * A, 4)]
     pred_logits = [torch.rand(N, Hi * Wi * A)]
     pred_logits[0][1][3:5].fill_(float("inf"))
     find_top_rpn_proposals(proposals, pred_logits, images, 0.5, "normal",
                            1000, 1000, 0, False)
Example 8
    def preprocess_image(self, batched_inputs):
        """
        Normalize, pad and batch the input images.
        """
        images = [x['image'].to(self.device) for x in batched_inputs]
        images = [self.normalizer(x) for x in images]
        images = ImageList.from_tensors(images,
                                        self.backbone.size_divisibility)

        return images
Example 9
    def forward(self, batched_inputs):
        """
        Args:
            Same as in :class:`GeneralizedRCNN.forward`

        Returns:
            list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "proposals" whose value is a
                :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [self.normalizer(x) for x in images]
        images = ImageList.from_tensors(images,
                                        self.backbone.size_divisibility)
        features = self.backbone(images.tensor)

        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None
        proposals, proposal_losses = self.proposal_generator(
            images, features, gt_instances)
        # In training, the proposals are not useful at all but we generate them anyway.
        # This makes RPN-only models about 5% slower.
        if self.training:
            return proposal_losses

        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                proposals, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"proposals": r})
        return processed_results
Example 10
    def preprocess_image(self, batched_inputs):
        """
        Normalize, pad and batch the input images.
        """
        images = [x["image"].float().to(self.device) for x in batched_inputs]
        images = [self.normalizer(img) for img in images]
        images = ImageList.from_tensors(images,
                                        self.backbone.size_divisibility)

        images_whwh = list()
        for bi in batched_inputs:
            h, w = bi["image"].shape[-2:]
            images_whwh.append(
                torch.tensor([w, h, w, h],
                             dtype=torch.float32,
                             device=self.device))
        images_whwh = torch.stack(images_whwh)

        return images, images_whwh
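
images_whwh packs each image's (w, h, w, h) so it can be broadcast against box coordinates; in models of this style it is typically used to scale boxes given in normalized [0, 1] coordinates up to absolute pixels. A hedged sketch of that use, with made-up shapes and values:

import torch

# Two hypothetical images and three normalized (x1, y1, x2, y2) boxes per image.
images_whwh = torch.tensor([[640., 480., 640., 480.],
                            [512., 512., 512., 512.]])
boxes_norm = torch.rand(2, 3, 4)

# Broadcast each image's (w, h, w, h) over its boxes.
boxes_abs = boxes_norm * images_whwh[:, None, :]
print(boxes_abs.shape)  # torch.Size([2, 3, 4])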
Example 11
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
        For now, each item in the list is a dict that contains:
            image: Tensor, image in (C, H, W) format.
            sem_seg: semantic segmentation ground truth
            Other information that's included in the original dicts, such as:
                "height", "width" (int): the output resolution of the model, used in inference.
                    See :meth:`postprocess` for details.
        Returns:
            list[dict]: Each dict is the output for one input image.
                The dict contains one key "sem_seg" whose value is a
                Tensor of the output resolution that represents the
                per-pixel segmentation prediction.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [self.normalizer(x) for x in images]
        images = ImageList.from_tensors(images,
                                        self.backbone.size_divisibility)

        # step_rate: a float computed as current_step / total_step,
        #     used for Scheduled Drop Path.
        step_rate = self.iter * 1.0 / self.max_iter
        self.iter += 1
        features, expt_flops, real_flops = self.backbone(
            images.tensor, step_rate)

        if "sem_seg" in batched_inputs[0]:
            targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
            targets = ImageList.from_tensors(
                targets, self.backbone.size_divisibility, False,
                self.sem_seg_head.ignore_value).tensor
        else:
            targets = None

        results, losses = self.sem_seg_head(features, targets)
        # calculate flops
        real_flops += self.sem_seg_head.flops
        # detach so the flop counts do not contribute gradients to the loss sum
        real_flops = real_flops.detach().requires_grad_(False)
        expt_flops = expt_flops.detach().requires_grad_(False)
        flops = {'real_flops': real_flops, 'expt_flops': expt_flops}
        # use budget constraint for training
        if self.training:
            if self.constrain_on and step_rate >= self.unupdate_rate:
                warm_up_rate = min(1.0,
                                   (step_rate - self.unupdate_rate) / 0.02)
                loss_budget = self.budget_constrint(expt_flops,
                                                    warm_up_rate=warm_up_rate)
                losses.update({'loss_budget': loss_budget})

            losses.update(flops)
            return losses

        processed_results = []
        for result, input_per_image, image_size in zip(results, batched_inputs,
                                                       images.image_sizes):
            height = input_per_image.get("height")
            width = input_per_image.get("width")
            r = sem_seg_postprocess(result, image_size, height, width)
            processed_results.append({"sem_seg": r, "flops": flops})
        return processed_results
Example 12
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.

                For now, each item in the list is a dict that contains:

                * "image": Tensor, image in (C, H, W) format.
                * "instances": Instances
                * "sem_seg": semantic segmentation ground truth.
                * Other information that's included in the original dicts, such as:
                  "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            list[dict]:
                each dict is the results for one image. The dict contains the following keys:

                * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
                * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
                * "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
                  See the return value of
                  :func:`combine_semantic_and_instance_outputs` for its format.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [self.normalizer(x) for x in images]
        images = ImageList.from_tensors(images,
                                        self.backbone.size_divisibility)
        features = self.backbone(images.tensor)

        if "proposals" in batched_inputs[0]:
            proposals = [
                x["proposals"].to(self.device) for x in batched_inputs
            ]
            proposal_losses = {}

        if "sem_seg" in batched_inputs[0]:
            gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
            gt_sem_seg = ImageList.from_tensors(
                gt_sem_seg,
                self.backbone.size_divisibility,
                pad_value=self.sem_seg_head.ignore_value).tensor
        else:
            gt_sem_seg = None
        sem_seg_results, sem_seg_losses = self.sem_seg_head(
            features, gt_sem_seg)

        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None
        if self.proposal_generator:
            proposals, proposal_losses = self.proposal_generator(
                images, features, gt_instances)
        detector_results, detector_losses = self.roi_heads(
            images, features, proposals, gt_instances)

        if self.training:
            losses = {}
            losses.update(sem_seg_losses)
            losses.update({
                k: v * self.instance_loss_weight
                for k, v in detector_losses.items()
            })
            losses.update(proposal_losses)
            return losses

        processed_results = []
        for sem_seg_result, detector_result, input_per_image, image_size in zip(
                sem_seg_results, detector_results, batched_inputs,
                images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height,
                                            width)
            detector_r = detector_postprocess(detector_result, height, width)

            processed_results.append({
                "sem_seg": sem_seg_r,
                "instances": detector_r
            })

            if self.combine_on:
                panoptic_r = combine_semantic_and_instance_outputs(
                    detector_r,
                    sem_seg_r.argmax(dim=0),
                    self.combine_overlap_threshold,
                    self.combine_stuff_area_limit,
                    self.combine_instances_confidence_threshold,
                )
                processed_results[-1]["panoptic_seg"] = panoptic_r
        return processed_results
Example 13
    def preprocess_image(self, batched_inputs, training):
        """
        Normalize, pad and batch the input images.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        bs = len(images)
        images = [self.normalizer(x) for x in images]

        images = ImageList.from_tensors(images,
                                        size_divisibility=0,
                                        pad_ref_long=True)

        # sync the image size across all GPUs
        comm.synchronize()
        if training and self.iter % self.change_iter == 0:
            if self.iter < self.max_iter - 20000:
                meg = torch.LongTensor(1).to(self.device)
                comm.synchronize()
                if comm.is_main_process():
                    size = np.random.choice(self.multi_size)
                    meg.fill_(size)

                if comm.get_world_size() > 1:
                    comm.synchronize()
                    dist.broadcast(meg, 0)
                self.size = meg.item()

                comm.synchronize()
            else:
                self.size = 608

        if training:

            # resize image inputs
            modes = ['bilinear', 'nearest', 'bicubic', 'area']
            mode = modes[random.randrange(4)]
            if mode == 'bilinear' or mode == 'bicubic':
                images.tensor = F.interpolate(images.tensor,
                                              size=[self.size, self.size],
                                              mode=mode,
                                              align_corners=False)
            else:
                images.tensor = F.interpolate(images.tensor,
                                              size=[self.size, self.size],
                                              mode=mode)

            if "instances" in batched_inputs[0]:
                gt_instances = [
                    x["instances"].to(self.device) for x in batched_inputs
                ]
            elif "targets" in batched_inputs[0]:
                log_first_n(
                    logging.WARN,
                    "'targets' in the model inputs is now renamed to 'instances'!",
                    n=10)
                gt_instances = [
                    x["targets"].to(self.device) for x in batched_inputs
                ]
            else:
                gt_instances = None

            targets = [
                torch.cat([
                    instance.gt_classes.float().unsqueeze(-1),
                    instance.gt_boxes.tensor
                ],
                          dim=-1) for instance in gt_instances
            ]
            labels = torch.zeros((bs, 100, 5))
            for i, target in enumerate(targets):
                labels[i][:target.shape[0]] = target
            labels[:, :, 1:] = labels[:, :, 1:] / 512. * self.size
        else:
            labels = None

        self.iter += 1
        return images, labels