Example #1
    def __init__(self, cfg):
        super(DynamicMaskHead, self).__init__()
        # fmt: off
        self.num_layers = cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS
        self.channels = cfg.MODEL.CONDINST.MASK_HEAD.HEAD_CHANNELS
        self.in_channels = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
        self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
        self.disable_rel_coords = cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS
        soi = [soi[1] for soi in cfg.MODEL.FCOS.OBJECT_SIZES_OF_INTEREST[:-1]]
        # fmt: on
        if self.disable_rel_coords:
            log_first_n("WARNING", "Training CondInst without Coord", n=1)
        self.sizes_of_interest = torch.tensor(
            soi + [soi[-1] * 2])  # 64 128 256 512 1024

        weight_nums, bias_nums = [], []
        for layer_ind in range(self.num_layers):
            if layer_ind == 0:
                if not self.disable_rel_coords:
                    weight_nums.append((self.in_channels + 2) * self.channels)
                else:
                    weight_nums.append(self.in_channels * self.channels)
                bias_nums.append(self.channels)
            elif layer_ind == self.num_layers - 1:
                weight_nums.append(self.channels * 1)
                bias_nums.append(1)
            else:
                weight_nums.append(self.channels * self.channels)
                bias_nums.append(self.channels)

        self.weight_nums = weight_nums
        self.bias_nums = bias_nums
        self.num_gen_params = sum(weight_nums) + sum(bias_nums)
        assert len(weight_nums) == len(bias_nums)
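The weight_nums / bias_nums computed above describe how a flat vector of num_gen_params generated parameters is later split into per-layer 1x1-conv weights and biases. A minimal sketch of that splitting step, using a hypothetical parse_dynamic_params helper (the name and the exact reshaping convention are assumptions; the project's own parsing code may differ):

import torch

def parse_dynamic_params(params, channels, weight_nums, bias_nums):
    # params: (num_insts, num_gen_params), one row of generated filters per instance
    num_insts = params.size(0)
    num_layers = len(weight_nums)
    splits = list(torch.split_with_sizes(params, weight_nums + bias_nums, dim=1))
    weight_splits, bias_splits = splits[:num_layers], splits[num_layers:]
    for l in range(num_layers):
        out_ch = 1 if l == num_layers - 1 else channels
        # reshape so every instance contributes its own conv filters
        weight_splits[l] = weight_splits[l].reshape(num_insts * out_ch, -1, 1, 1)
        bias_splits[l] = bias_splits[l].reshape(num_insts * out_ch)
    return weight_splits, bias_splits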
Example #2
File: fcos.py Project: zyg11/DeFCN
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                    See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        box_cls, box_delta, box_center = self.head(features)
        shifts = self.shift_generator(features)

        if self.training:
            # remove gt_instances with ignore label
            gt_instances = [
                inst[inst.gt_classes >= 0] for inst in gt_instances
            ]
            gt_classes, gt_shifts_reg_deltas, gt_centerness = self.get_ground_truth(
                shifts, gt_instances)
            return self.losses(gt_classes, gt_shifts_reg_deltas, gt_centerness,
                               box_cls, box_delta, box_center)
        else:
            results = self.inference(box_cls, box_delta, box_center, shifts,
                                     images)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
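detector_postprocess rescales each image's predictions from the resized input resolution back to the requested "height"/"width". A simplified, box-only sketch of that rescaling, assuming detectron2-style Instances/Boxes structures (the real helper also handles masks and keypoints):

from detectron2.structures import Instances

def rescale_boxes(results, output_height, output_width):
    # results.image_size is the (h, w) resolution the network actually saw
    scale_x = output_width / results.image_size[1]
    scale_y = output_height / results.image_size[0]
    out = Instances((output_height, output_width), **results.get_fields())
    out.pred_boxes.scale(scale_x, scale_y)
    out.pred_boxes.clip(out.image_size)
    return out[out.pred_boxes.nonempty()]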
Example #3
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        box_cls, box_delta = self.decoder(self.encoder(features[0]))
        anchors = self.anchor_generator(features)
        # Transpose the Hi*Wi*A dimension to the middle:
        pred_logits = [permute_to_N_HWA_K(box_cls, self.num_classes)]
        pred_anchor_deltas = [permute_to_N_HWA_K(box_delta, 4)]

        if self.training:
            indices = self.get_ground_truth(anchors, pred_anchor_deltas,
                                            gt_instances)
            losses = self.losses(indices, gt_instances, anchors, pred_logits,
                                 pred_anchor_deltas)
            return losses
        else:
            results = self.inference([box_cls], [box_delta], anchors,
                                     images.image_sizes)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
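The permute_to_N_HWA_K helper used above reshapes a (N, A*K, Hi, Wi) head output into (N, Hi*Wi*A, K), so per-location predictions line up with flattened targets. A minimal sketch of such a helper (the project's own version may differ in details):

import torch

def permute_to_N_HWA_K(tensor, K):
    # (N, A*K, H, W) -> (N, H*W*A, K)
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)    # (N, A, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)  # (N, H, W, A, K)
    return tensor.reshape(N, -1, K)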
Example #4
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                    See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                "WARNING",
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]

        if self.training:
            ins_preds_x, ins_preds_y, cate_preds = self.head(features,
                                                             eval=False)
            featmap_sizes = [featmap.size()[-2:] for featmap in ins_preds_x]
            ins_label_list, cate_label_list, ins_ind_label_list, ins_ind_label_list_xy = \
                self.get_ground_truth(gt_instances, featmap_sizes)
            return self.losses(ins_preds_x, ins_preds_y, cate_preds,
                               ins_label_list, cate_label_list,
                               ins_ind_label_list, ins_ind_label_list_xy)
        else:
            ins_preds_x, ins_preds_y, cate_preds = self.head(features,
                                                             eval=True)
            results = self.inference(ins_preds_x, ins_preds_y, cate_preds,
                                     batched_inputs)
            processed_results = [{"instances": r} for r in results]
            return processed_results
Example #5
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of : class:`DatasetMapper`.
            Each item in the list contains the input for one image.
            For now, each item in the list is a dict that contains:
             * images: Tensor, image in (C, H, W) format.
             * instances: Instances.
             Other information that' s included in the original dict ,such as:
             * "height", "width"(int): the output resolution of the model,
             used in inference.See  `postprocess` for detail
        Return:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss, Used
                during training only.
                At inference stage, return predicted bboxes.
        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x['instances'].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                "WARNING",
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x['targets'].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        cls_outs, pts_outs_init, pts_outs_refine = self.head(features)
        center_pts = self.shift_generator(features)

        if self.training:
            return self.losses(center_pts, cls_outs, pts_outs_init,
                               pts_outs_refine, gt_instances)
        else:
            results = self.inference(center_pts, cls_outs, pts_outs_init,
                                     pts_outs_refine, images)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
Example #6
    def forward(self, batched_inputs):
        """
        Args:
            Same as in :class:`GeneralizedRCNN.forward`

        Returns:
            list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "proposals" whose value is a
                :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [self.normalizer(x) for x in images]
        images = ImageList.from_tensors(images,
                                        self.backbone.size_divisibility)
        features = self.backbone(images.tensor)

        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None
        proposals, proposal_losses = self.proposal_generator(
            images, features, gt_instances)
        # In training, the proposals are not useful at all but we generate them anyway.
        # This makes RPN-only models about 5% slower.
        if self.training:
            return proposal_losses

        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                proposals, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"proposals": r})
        return processed_results
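Each returned dict exposes a single "proposals" key. A short usage sketch reading back the fields promised by the docstring (model and batched_inputs are assumed to already exist):

outputs = model(batched_inputs)           # list[dict], one dict per image
for output in outputs:
    proposals = output["proposals"]       # an Instances object
    boxes = proposals.proposal_boxes      # Boxes, one box per proposal
    scores = proposals.objectness_logits  # 1D tensor of objectness scores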
Example #7
def print_instances_class_histogram(dataset_dicts, class_names):
    """
    Args:
        dataset_dicts (list[dict]): list of dataset dicts.
        class_names (list[str]): list of class names (zero-indexed).
    """
    num_classes = len(class_names)
    hist_bins = np.arange(num_classes + 1)
    histogram = np.zeros((num_classes, ), dtype=int)
    for entry in dataset_dicts:
        annos = entry["annotations"]
        classes = [x["category_id"] for x in annos if not x.get("iscrowd", 0)]
        histogram += np.histogram(classes, bins=hist_bins)[0]

    N_COLS = min(6, len(class_names) * 2)

    def short_name(x):
        # make long class names shorter. useful for lvis
        if len(x) > 13:
            return x[:11] + ".."
        return x

    data = list(
        itertools.chain(
            *[[short_name(class_names[i]), int(v)]
              for i, v in enumerate(histogram)]))
    total_num_instances = sum(data[1::2])
    data.extend([None] * (N_COLS - (len(data) % N_COLS)))
    if num_classes > 1:
        data.extend(["total", total_num_instances])
    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
    table = tabulate(
        data,
        headers=["category", "#instances"] * (N_COLS // 2),
        tablefmt="pipe",
        numalign="left",
        stralign="center",
    )
    log_first_n(
        logging.INFO,
        "Distribution of instances among all {} categories:\n".format(
            num_classes) + colored(table, "cyan"),
        key="message",
    )
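A small usage sketch with a hand-made dataset_dicts, showing the minimal fields the function reads ("annotations" entries with "category_id" and an optional "iscrowd" flag); the class names are made up:

dataset_dicts = [
    {"annotations": [{"category_id": 0}, {"category_id": 1, "iscrowd": 1}]},
    {"annotations": [{"category_id": 1}]},
]
# the crowd annotation is skipped, so the logged table counts one "cat" and one "dog"
print_instances_class_histogram(dataset_dicts, class_names=["cat", "dog"])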
Example #8
def _flatten_to_tuple(outputs):
    result = []
    if isinstance(outputs, torch.Tensor):
        result.append(outputs)
    elif isinstance(outputs, (list, tuple)):
        for v in outputs:
            result.extend(_flatten_to_tuple(v))
    elif isinstance(outputs, dict):
        for _, v in outputs.items():
            result.extend(_flatten_to_tuple(v))
    elif isinstance(outputs, Instances):
        result.extend(_flatten_to_tuple(outputs.get_fields()))
    elif isinstance(outputs, (Boxes, BitMasks, ImageList)):
        result.append(outputs.tensor)
    else:
        log_first_n(
            "WARNING",
            f"Output of type {type(outputs)} not included in flops/activations count.",
            n=10,
        )
    return tuple(result)
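A short usage sketch showing how nested outputs collapse into a flat tuple of tensors (the container structure below is made up for illustration):

import torch

outputs = {
    "scores": torch.rand(5),
    "boxes": [torch.rand(5, 4), (torch.rand(3, 4),)],
}
flat = _flatten_to_tuple(outputs)
assert len(flat) == 3 and all(isinstance(t, torch.Tensor) for t in flat)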
Example #9
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor] or list[dict]: the output from the model.
                During training, it returns a dict mapping each named loss to a tensor storing the loss.
                During inference, it returns a list with one dict per image, each containing an
                "instances" key with the post-processed predictions.

        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        shifts = self.shift_generator(features)
        (box_cls, box_delta, box_center, bd_box_cls, bd_box_delta,
         bd_based_box) = self.head(features, shifts)

        if self.training:
            (gt_classes, gt_shifts_reg_deltas, gt_centerness,
             gt_border_classes,
             gt_border_shifts_deltas) = self.get_ground_truth(
                 shifts, gt_instances, bd_based_box)
            return self.losses(
                gt_classes,
                gt_shifts_reg_deltas,
                gt_centerness,
                gt_border_classes,
                gt_border_shifts_deltas,
                box_cls,
                box_delta,
                box_center,
                bd_box_cls,
                bd_box_delta,
            )
        else:
            results = self.inference(box_cls, box_center, bd_box_cls,
                                     bd_box_delta, bd_based_box,
                                     images.image_sizes)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
Example #10
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DetectionTransform` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.
        Returns:
            losses (dict[str: Tensor]): mapping from a named loss to a tensor
                storing the loss. Used during training only.
        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10,
            )
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        # apply the TensorMask head
        pred_logits, pred_deltas, pred_masks = self.head(features)
        # generate anchors based on features, is it image specific?
        anchors, unit_lengths, indexes = self.anchor_generator(features)

        if self.training:
            # get ground truths for class labels and box targets, it will label each anchor
            gt_class_info, gt_delta_info, gt_mask_info, num_fg = self.get_ground_truth(
                anchors, unit_lengths, indexes, gt_instances)
            # compute the loss
            return self.losses(
                gt_class_info,
                gt_delta_info,
                gt_mask_info,
                num_fg,
                pred_logits,
                pred_deltas,
                pred_masks,
            )
        else:
            # do inference to get the output
            results = self.inference(pred_logits, pred_deltas, pred_masks,
                                     anchors, indexes, images)
            processed_results = []
            for results_im, input_im, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_im.get("height", image_size[0])
                width = input_im.get("width", image_size[1])
                # this is to do post-processing with the image size
                result_box, result_mask = results_im
                r = _postprocess(result_box, result_mask, height, width)
                processed_results.append({"instances": r})
            return processed_results
Example #11
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                    See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        box_cls, box_delta, box_center = self.head(features)
        shifts = self.shift_generator(features)

        if self.training:
            gt_classes, gt_shifts_reg_deltas, gt_centerness = self.get_ground_truth(
                shifts, gt_instances)
            return self.losses(gt_classes, gt_shifts_reg_deltas, gt_centerness,
                               box_cls, box_delta, box_center)
        else:
            if self.is_dynamic_head:
                soft_cost, used_cost, full_cost = get_module_running_cost(self)
                if self.head_complexity_buffer is None:
                    self.head_complexity_buffer = [used_cost]
                else:
                    self.head_complexity_buffer.append(used_cost)
                avg_cost = float(sum(self.head_complexity_buffer) / len(self.head_complexity_buffer))
                max_cost = float(max(self.head_complexity_buffer))
                min_cost = float(min(self.head_complexity_buffer))
                print("Head Complexity, CUR: {:.3f} GFLOPs, AVG: {:.3f} GFLOPs, MAX: {:.3f} GFLOPs, MIN: {:.3f} GFLOPs".format(
                    float(used_cost) / 2 ** 30, avg_cost / 2 ** 30, max_cost / 2 ** 30, min_cost / 2 ** 30))

            results = self.inference(box_cls, box_delta, box_center, shifts,
                                     images)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
Example #12
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances (optional): groundtruth :class:`Instances`
                * proposals (optional): :class:`Instances`, precomputed proposals.

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "instances" whose value is a :class:`Instances`.
                The :class:`Instances` object has the following keys:
                "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
        """
        if not self.training:
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)

        if self.proposal_generator:
            proposals, proposal_losses = self.proposal_generator(
                images, features, gt_instances)
        else:
            assert "proposals" in batched_inputs[0]
            proposals = [
                x["proposals"].to(self.device) for x in batched_inputs
            ]
            proposal_losses = {}

        _, detector_losses = self.roi_heads(images, features, proposals,
                                            gt_instances)
        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                self.visualize_training(batched_inputs, proposals)

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses
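During training the model returns only this loss dict. A minimal sketch of how a training step might consume it (model, optimizer and batched_inputs are assumed to already exist):

loss_dict = model(batched_inputs)   # dict[str, Tensor] in training mode
total_loss = sum(loss_dict.values())
optimizer.zero_grad()
total_loss.backward()
optimizer.step()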
Example #13
    def preprocess_image(self, batched_inputs, training):
        """
        Normalize, pad and batch the input images.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        bs = len(images)
        images = [self.normalizer(x) for x in images]

        images = ImageList.from_tensors(images,
                                        size_divisibility=0,
                                        pad_ref_long=True)

        # sync image size for all gpus
        comm.synchronize()
        if training and self.iter % self.change_iter == 0:
            if self.iter < self.max_iter - 20000:
                meg = torch.LongTensor(1).to(self.device)
                comm.synchronize()
                if comm.is_main_process():
                    size = np.random.choice(self.multi_size)
                    meg.fill_(size)

                if comm.get_world_size() > 1:
                    comm.synchronize()
                    dist.broadcast(meg, 0)
                self.size = meg.item()

                comm.synchronize()
            else:
                self.size = 608

        if training:

            # resize image inputs
            modes = ['bilinear', 'nearest', 'bicubic', 'area']
            mode = modes[random.randrange(4)]
            if mode == 'bilinear' or mode == 'bicubic':
                images.tensor = F.interpolate(images.tensor,
                                              size=[self.size, self.size],
                                              mode=mode,
                                              align_corners=False)
            else:
                images.tensor = F.interpolate(images.tensor,
                                              size=[self.size, self.size],
                                              mode=mode)

            if "instances" in batched_inputs[0]:
                gt_instances = [
                    x["instances"].to(self.device) for x in batched_inputs
                ]
            elif "targets" in batched_inputs[0]:
                log_first_n(
                    logging.WARN,
                    "'targets' in the model inputs is now renamed to 'instances'!",
                    n=10)
                gt_instances = [
                    x["targets"].to(self.device) for x in batched_inputs
                ]
            else:
                gt_instances = None

            targets = [
                torch.cat([
                    instance.gt_classes.float().unsqueeze(-1),
                    instance.gt_boxes.tensor
                ],
                          dim=-1) for instance in gt_instances
            ]
            labels = torch.zeros((bs, 100, 5))
            for i, target in enumerate(targets):
                labels[i][:target.shape[0]] = target
            labels[:, :, 1:] = labels[:, :, 1:] / 512. * self.size
        else:
            labels = None

        self.iter += 1
        return images, labels
Example #14
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                    See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
        """
        images = self.preprocess_image(batched_inputs)

        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        # vgg feature maps: ['Conv4_3', 'Conv7']
        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]

        # feature map: Conv4_3
        # Conv4_3 has a different feature scale compared to the other layers, so we use
        # the L2 normalization technique to scale the feature norm at each location
        # in the feature map to 20 and learn the scale during back propagation.
        features[0] = self.l2norm(features[0])

        # Conv7
        x = features[-1]
        # compute feature maps: conv8_2, conv9_2, conv10_2, and conv11_2
        for idx, extra_layer in enumerate(self.extra_layers):
            x = F.relu(extra_layer(x), inplace=True)
            if idx % 2 == 1:
                features.append(x)

        conf_pred, loc_pred = self.head(features)

        if self.training:
            gt_conf, gt_default_boxes_deltas = self.get_ground_truth(
                self.default_boxes, gt_instances)
            return self.losses(gt_conf, gt_default_boxes_deltas, conf_pred,
                               loc_pred)
        else:
            results = self.inference(conf_pred, loc_pred, self.default_boxes,
                                     images)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
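The comment on Conv4_3 above describes SSD-style L2 normalization: rescale the feature norm at every location to 20 and make the scale learnable per channel. A minimal sketch of such a layer, assuming self.l2norm behaves along these lines (the project's own module may differ):

import torch
import torch.nn as nn
import torch.nn.functional as F

class L2Norm(nn.Module):
    """L2-normalize each spatial location, then apply a learnable per-channel scale."""

    def __init__(self, channels, initial_scale=20.0):
        super().__init__()
        self.weight = nn.Parameter(torch.full((channels,), initial_scale))

    def forward(self, x):
        # x: (N, C, H, W); normalize across the channel dimension
        x = F.normalize(x, p=2, dim=1)
        return x * self.weight.view(1, -1, 1, 1)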
Example #15
    def forward(self, batched_inputs):
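        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image
                (see the ``forward`` docstrings in the examples above).
        Returns:
            dict[str: Tensor] during training, mapping each named loss to a tensor storing the loss;
            list[dict] during inference, one dict per image with an "instances" key.
        """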

        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                "WARNING",
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        box_cls, box_delta, box_center, box_param = self.head(features)
        shifts = self.shift_generator(features)

        if self.training:
            gt_classes, gt_shifts_reg_deltas, gt_centerness, \
                gt_instance_masks, gt_inds, im_inds, fpn_levels = \
                self.get_ground_truth(
                    shifts, gt_instances, images.tensor.shape[-2:]
                )
            proposal_losses, proposals = self.proposals_losses(
                gt_classes, gt_shifts_reg_deltas, gt_centerness, gt_inds,
                im_inds, box_cls, box_delta, box_center, box_param, fpn_levels,
                shifts)

            proposals = self.generate_instance_masks(features, shifts,
                                                     proposals)
            instances_losses = self.instances_losses(gt_instance_masks,
                                                     proposals,
                                                     dummy_feature=box_cls[0])

            losses = {}
            losses.update(proposal_losses)
            losses.update(instances_losses)

            return losses
        else:
            proposals = self.proposals_inference(box_cls, box_delta,
                                                 box_center, box_param, shifts,
                                                 images)
            proposals = self.generate_instance_masks(features, shifts,
                                                     proposals)
            padded_im_h, padded_im_w = images.tensor.shape[-2:]
            processed_results = []
            for im_id, (input_per_image, image_size) in \
                    enumerate(zip(batched_inputs, images.image_sizes)):
                im_h = input_per_image.get("height", image_size[0])
                im_w = input_per_image.get("width", image_size[1])
                resized_in_h, resized_in_w = image_size
                instances_per_im = proposals[proposals.im_inds == im_id]
                instances_per_im = self.postprocess(instances_per_im, im_h,
                                                    im_w, resized_in_h,
                                                    resized_in_w, padded_im_h,
                                                    padded_im_w)
                processed_results.append({"instances": instances_per_im})

            return processed_results