def __init__(self, cfg):
    super(DynamicMaskHead, self).__init__()
    # fmt: off
    self.num_layers         = cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS
    self.channels           = cfg.MODEL.CONDINST.MASK_HEAD.HEAD_CHANNELS
    self.in_channels        = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
    self.mask_out_stride    = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.disable_rel_coords = cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS
    soi = [soi[1] for soi in cfg.MODEL.FCOS.OBJECT_SIZES_OF_INTEREST[:-1]]
    # fmt: on
    if self.disable_rel_coords:
        log_first_n(logging.WARNING, "Training CondInst without Coord", n=1)
    self.sizes_of_interest = torch.tensor(soi + [soi[-1] * 2])  # 64, 128, 256, 512, 1024

    weight_nums, bias_nums = [], []
    for layer_ind in range(self.num_layers):
        if layer_ind == 0:
            # first layer: mask-branch features, optionally concatenated with
            # 2 relative-coordinate channels
            if not self.disable_rel_coords:
                weight_nums.append((self.in_channels + 2) * self.channels)
            else:
                weight_nums.append(self.in_channels * self.channels)
            bias_nums.append(self.channels)
        elif layer_ind == self.num_layers - 1:
            # last layer: a single-channel mask logit
            weight_nums.append(self.channels * 1)
            bias_nums.append(1)
        else:
            weight_nums.append(self.channels * self.channels)
            bias_nums.append(self.channels)
    self.weight_nums = weight_nums
    self.bias_nums = bias_nums
    self.num_gen_params = sum(weight_nums) + sum(bias_nums)
    assert len(weight_nums) == len(bias_nums)
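
# The flat vector of `num_gen_params` controller outputs must later be split
# back into per-layer conv weights and biases using `weight_nums`/`bias_nums`.
# A minimal sketch of that split, assuming a `params` tensor of shape
# (num_insts, num_gen_params); the helper name and grouped-conv reshape are
# illustrative, not necessarily the exact upstream API.
def parse_dynamic_params(params, channels, weight_nums, bias_nums):
    num_insts = params.size(0)
    num_layers = len(weight_nums)
    # split the columns into [w_0, ..., w_{L-1}, b_0, ..., b_{L-1}]
    splits = list(torch.split_with_sizes(params, weight_nums + bias_nums, dim=1))
    weights, biases = splits[:num_layers], splits[num_layers:]
    for i in range(num_layers):
        out_ch = channels if i < num_layers - 1 else 1
        # reshape so all instances' 1x1 convs can run as one grouped conv
        weights[i] = weights[i].reshape(num_insts * out_ch, -1, 1, 1)
        biases[i] = biases[i].reshape(num_insts * out_ch)
    return weights, biases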
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    box_cls, box_delta, box_center = self.head(features)
    shifts = self.shift_generator(features)

    if self.training:
        # remove gt_instances with ignore label
        gt_instances = [inst[inst.gt_classes >= 0] for inst in gt_instances]
        gt_classes, gt_shifts_reg_deltas, gt_centerness = self.get_ground_truth(
            shifts, gt_instances)
        return self.losses(gt_classes, gt_shifts_reg_deltas, gt_centerness,
                           box_cls, box_delta, box_center)
    else:
        results = self.inference(box_cls, box_delta, box_center, shifts,
                                 images)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
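
# The postprocessing tail above recurs throughout this file: predictions made
# on the resized, padded input are rescaled to the original image resolution.
# A self-contained sketch, assuming detectron2-style structures; the sizes and
# boxes below are made up for illustration.
import torch
from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.structures import Boxes, Instances

res = Instances((800, 1216))  # resolution the network actually saw
res.pred_boxes = Boxes(torch.tensor([[100.0, 80.0, 300.0, 240.0]]))
res.scores = torch.tensor([0.9])
res.pred_classes = torch.tensor([0])
out = detector_postprocess(res, 480, 640)  # rescale to the original 480x640
print(out.pred_boxes)  # box coordinates scaled by (640/1216, 480/800)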
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    box_cls, box_delta = self.decoder(self.encoder(features[0]))
    anchors = self.anchor_generator(features)
    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(box_cls, self.num_classes)]
    pred_anchor_deltas = [permute_to_N_HWA_K(box_delta, 4)]

    if self.training:
        indices = self.get_ground_truth(anchors, pred_anchor_deltas,
                                        gt_instances)
        losses = self.losses(indices, gt_instances, anchors, pred_logits,
                             pred_anchor_deltas)
        return losses
    else:
        results = self.inference([box_cls], [box_delta], anchors,
                                 images.image_sizes)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
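
# `permute_to_N_HWA_K` (used above) flattens the per-anchor dimension together
# with the spatial locations. A sketch mirroring the detectron2 helper of the
# same name:
def permute_to_N_HWA_K(tensor, K):
    """(N, A*K, H, W) -> (N, H*W*A, K), matching the anchor ordering."""
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)    # (N, A, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)  # (N, H, W, A, K)
    return tensor.reshape(N, -1, K)         # (N, H*W*A, K)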
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARNING,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]

    if self.training:
        ins_preds_x, ins_preds_y, cate_preds = self.head(features, eval=False)
        featmap_sizes = [featmap.size()[-2:] for featmap in ins_preds_x]
        ins_label_list, cate_label_list, ins_ind_label_list, ins_ind_label_list_xy = \
            self.get_ground_truth(gt_instances, featmap_sizes)
        return self.losses(ins_preds_x, ins_preds_y, cate_preds,
                           ins_label_list, cate_label_list,
                           ins_ind_label_list, ins_ind_label_list_xy)
    else:
        ins_preds_x, ins_preds_y, cate_preds = self.head(features, eval=True)
        results = self.inference(ins_preds_x, ins_preds_y, cate_preds,
                                 batched_inputs)
        processed_results = [{"instances": r} for r in results]
        return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only. At inference time, the predicted
            boxes are returned instead.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARNING,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    cls_outs, pts_outs_init, pts_outs_refine = self.head(features)
    center_pts = self.shift_generator(features)

    if self.training:
        return self.losses(center_pts, cls_outs, pts_outs_init,
                           pts_outs_refine, gt_instances)
    else:
        results = self.inference(center_pts, cls_outs, pts_outs_init,
                                 pts_outs_refine, images)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def forward(self, batched_inputs):
    """
    Args:
        Same as in :class:`GeneralizedRCNN.forward`

    Returns:
        list[dict]:
            Each dict is the output for one input image.
            The dict contains one key "proposals" whose value is a
            :class:`Instances` with keys "proposal_boxes" and
            "objectness_logits".
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    features = self.backbone(images.tensor)

    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    proposals, proposal_losses = self.proposal_generator(images, features,
                                                         gt_instances)
    # In training, the proposals are not useful at all but we generate them
    # anyway. This makes RPN-only models about 5% slower.
    if self.training:
        return proposal_losses

    processed_results = []
    for results_per_image, input_per_image, image_size in zip(
            proposals, batched_inputs, images.image_sizes):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"proposals": r})
    return processed_results
def print_instances_class_histogram(dataset_dicts, class_names):
    """
    Args:
        dataset_dicts (list[dict]): list of dataset dicts.
        class_names (list[str]): list of class names (zero-indexed).
    """
    num_classes = len(class_names)
    hist_bins = np.arange(num_classes + 1)
    histogram = np.zeros((num_classes,), dtype=int)
    for entry in dataset_dicts:
        annos = entry["annotations"]
        classes = [x["category_id"] for x in annos if not x.get("iscrowd", 0)]
        histogram += np.histogram(classes, bins=hist_bins)[0]

    N_COLS = min(6, len(class_names) * 2)

    def short_name(x):
        # make long class names shorter. useful for lvis
        if len(x) > 13:
            return x[:11] + ".."
        return x

    data = list(
        itertools.chain(*[[short_name(class_names[i]), int(v)]
                          for i, v in enumerate(histogram)]))
    total_num_instances = sum(data[1::2])
    data.extend([None] * (N_COLS - (len(data) % N_COLS)))
    if num_classes > 1:
        data.extend(["total", total_num_instances])
    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
    table = tabulate(
        data,
        headers=["category", "#instances"] * (N_COLS // 2),
        tablefmt="pipe",
        numalign="left",
        stralign="center",
    )
    log_first_n(
        logging.INFO,
        "Distribution of instances among all {} categories:\n".format(
            num_classes) + colored(table, "cyan"),
        key="message",
    )
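
# Quick usage sketch with made-up dataset dicts; crowd annotations are excluded
# from the counts, so this logs a histogram of 1 "cat" and 1 "dog".
dataset_dicts = [
    {"annotations": [{"category_id": 0}, {"category_id": 1, "iscrowd": 1}]},
    {"annotations": [{"category_id": 1}]},
]
print_instances_class_histogram(dataset_dicts, class_names=["cat", "dog"])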
def _flatten_to_tuple(outputs):
    result = []
    if isinstance(outputs, torch.Tensor):
        result.append(outputs)
    elif isinstance(outputs, (list, tuple)):
        for v in outputs:
            result.extend(_flatten_to_tuple(v))
    elif isinstance(outputs, dict):
        for _, v in outputs.items():
            result.extend(_flatten_to_tuple(v))
    elif isinstance(outputs, Instances):
        result.extend(_flatten_to_tuple(outputs.get_fields()))
    elif isinstance(outputs, (Boxes, BitMasks, ImageList)):
        result.append(outputs.tensor)
    else:
        log_first_n(
            logging.WARNING,
            f"Output of type {type(outputs)} not included in flops/activations count.",
            n=10,
        )
    return tuple(result)
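
# Illustrative call: arbitrarily nested detector outputs collapse into a flat
# tuple of tensors suitable for flops/activations counting.
outputs = {"scores": torch.rand(3), "boxes": [torch.rand(3, 4), torch.rand(2, 4)]}
flat = _flatten_to_tuple(outputs)
assert len(flat) == 3 and all(isinstance(t, torch.Tensor) for t in flat)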
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        result (list[BoxList] or dict[Tensor]): the output from the model.
            During training, it returns a dict[Tensor] which contains the
            losses. During testing, it returns a list[BoxList] containing
            additional fields like `scores`, `labels` and `mask`
            (for Mask R-CNN models).
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    shifts = self.shift_generator(features)
    (box_cls, box_delta, box_center, bd_box_cls, bd_box_delta,
     bd_based_box) = self.head(features, shifts)

    if self.training:
        (gt_classes, gt_shifts_reg_deltas, gt_centerness, gt_border_classes,
         gt_border_shifts_deltas) = self.get_ground_truth(
             shifts, gt_instances, bd_based_box)
        return self.losses(
            gt_classes,
            gt_shifts_reg_deltas,
            gt_centerness,
            gt_border_classes,
            gt_border_shifts_deltas,
            box_cls,
            box_delta,
            box_center,
            bd_box_cls,
            bd_box_delta,
        )
    else:
        results = self.inference(box_cls, box_center, bd_box_cls,
                                 bd_box_delta, bd_based_box,
                                 images.image_sizes)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DetectionTransform`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        losses (dict[str: Tensor]): mapping from a named loss to a tensor
            storing the loss. Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10,
        )
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    # apply the TensorMask head
    pred_logits, pred_deltas, pred_masks = self.head(features)
    # generate anchors from the features; anchors depend only on the feature
    # map sizes, not on the image content
    anchors, unit_lengths, indexes = self.anchor_generator(features)

    if self.training:
        # get ground truth for class labels and box targets; this assigns a
        # label to each anchor
        gt_class_info, gt_delta_info, gt_mask_info, num_fg = self.get_ground_truth(
            anchors, unit_lengths, indexes, gt_instances)
        # compute the loss
        return self.losses(
            gt_class_info,
            gt_delta_info,
            gt_mask_info,
            num_fg,
            pred_logits,
            pred_deltas,
            pred_masks,
        )
    else:
        # run inference to get the outputs
        results = self.inference(pred_logits, pred_deltas, pred_masks,
                                 anchors, indexes, images)
        processed_results = []
        for results_im, input_im, image_size in zip(results, batched_inputs,
                                                    images.image_sizes):
            height = input_im.get("height", image_size[0])
            width = input_im.get("width", image_size[1])
            # post-process the boxes and masks to the output image size
            result_box, result_mask = results_im
            r = _postprocess(result_box, result_mask, height, width)
            processed_results.append({"instances": r})
        return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    box_cls, box_delta, box_center = self.head(features)
    shifts = self.shift_generator(features)

    if self.training:
        gt_classes, gt_shifts_reg_deltas, gt_centerness = self.get_ground_truth(
            shifts, gt_instances)
        return self.losses(gt_classes, gt_shifts_reg_deltas, gt_centerness,
                           box_cls, box_delta, box_center)
    else:
        if self.is_dynamic_head:
            soft_cost, used_cost, full_cost = get_module_running_cost(self)
            if self.head_complexity_buffer is None:
                self.head_complexity_buffer = [used_cost]
            else:
                self.head_complexity_buffer.append(used_cost)
            avg_cost = float(
                sum(self.head_complexity_buffer)
                / len(self.head_complexity_buffer))
            max_cost = float(max(self.head_complexity_buffer))
            min_cost = float(min(self.head_complexity_buffer))
            print("Head Complexity, CUR: {:.3f} GFLOPs, AVG: {:.3f} GFLOPs, "
                  "MAX: {:.3f} GFLOPs, MIN: {:.3f} GFLOPs".format(
                      float(used_cost) / 2 ** 30, avg_cost / 2 ** 30,
                      max_cost / 2 ** 30, min_cost / 2 ** 30))
        results = self.inference(box_cls, box_delta, box_center, shifts,
                                 images)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances (optional): groundtruth :class:`Instances`
            * proposals (optional): :class:`Instances`, precomputed proposals.

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        list[dict]:
            Each dict is the output for one input image.
            The dict contains one key "instances" whose value is a
            :class:`Instances`. The :class:`Instances` object has the
            following keys: "pred_boxes", "pred_classes", "scores",
            "pred_masks", "pred_keypoints"
    """
    if not self.training:
        return self.inference(batched_inputs)

    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)

    if self.proposal_generator:
        proposals, proposal_losses = self.proposal_generator(
            images, features, gt_instances)
    else:
        assert "proposals" in batched_inputs[0]
        proposals = [x["proposals"].to(self.device) for x in batched_inputs]
        proposal_losses = {}

    _, detector_losses = self.roi_heads(images, features, proposals,
                                        gt_instances)
    if self.vis_period > 0:
        storage = get_event_storage()
        if storage.iter % self.vis_period == 0:
            self.visualize_training(batched_inputs, proposals)

    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    return losses
def preprocess_image(self, batched_inputs, training):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    bs = len(images)
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images,
                                    size_divisibility=0,
                                    pad_ref_long=True)

    # sync the training image size across all GPUs
    comm.synchronize()
    if training and self.iter % self.change_iter == 0:
        if self.iter < self.max_iter - 20000:
            meg = torch.LongTensor(1).to(self.device)
            comm.synchronize()
            if comm.is_main_process():
                size = np.random.choice(self.multi_size)
                meg.fill_(size)
            if comm.get_world_size() > 1:
                comm.synchronize()
                dist.broadcast(meg, 0)
            self.size = meg.item()
            comm.synchronize()
        else:
            # freeze the resolution for the last 20k iterations
            self.size = 608

    if training:
        # resize the image inputs with a randomly chosen interpolation mode
        modes = ['bilinear', 'nearest', 'bicubic', 'area']
        mode = modes[random.randrange(4)]
        if mode == 'bilinear' or mode == 'bicubic':
            images.tensor = F.interpolate(images.tensor,
                                          size=[self.size, self.size],
                                          mode=mode,
                                          align_corners=False)
        else:
            images.tensor = F.interpolate(images.tensor,
                                          size=[self.size, self.size],
                                          mode=mode)

        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        # pack the ground truth into a fixed-size (bs, 100, 5) tensor of
        # [class, x1, y1, x2, y2] rows
        targets = [
            torch.cat([
                instance.gt_classes.float().unsqueeze(-1),
                instance.gt_boxes.tensor
            ], dim=-1) for instance in gt_instances
        ]
        labels = torch.zeros((bs, 100, 5))
        for i, target in enumerate(targets):
            labels[i][:target.shape[0]] = target
        # rescale box coordinates from the 512-sized reference to the
        # current input size
        labels[:, :, 1:] = labels[:, :, 1:] / 512. * self.size
    else:
        labels = None
    self.iter += 1
    return images, labels
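
# The distributed size-sync above can be hard to follow in isolation; a minimal
# standalone sketch, assuming torch.distributed is already initialized (the
# function name and candidate sizes here are illustrative):
import numpy as np
import torch
import torch.distributed as dist

def pick_input_size(multi_size=(320, 416, 512, 608), device="cuda"):
    size = torch.LongTensor(1).to(device)
    if dist.get_rank() == 0:
        # rank 0 samples the new training resolution
        size.fill_(int(np.random.choice(multi_size)))
    dist.broadcast(size, src=0)  # every rank now agrees on the same resolution
    return size.item()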
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    # VGG feature maps: ['Conv4_3', 'Conv7']
    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]

    # feature map Conv4_3:
    # Conv4_3 has a different feature scale compared to the other layers, so we
    # use the L2 normalization technique to scale the feature norm at each
    # location in the feature map to 20, and learn the scale during
    # back-propagation.
    features[0] = self.l2norm(features[0])
    # Conv7
    x = features[-1]
    # compute the feature maps conv8_2, conv9_2, conv10_2 and conv11_2
    for idx, extra_layer in enumerate(self.extra_layers):
        x = F.relu(extra_layer(x), inplace=True)
        if idx % 2 == 1:
            features.append(x)

    conf_pred, loc_pred = self.head(features)

    if self.training:
        gt_conf, gt_default_boxes_deltas = self.get_ground_truth(
            self.default_boxes, gt_instances)
        return self.losses(gt_conf, gt_default_boxes_deltas, conf_pred,
                           loc_pred)
    else:
        results = self.inference(conf_pred, loc_pred, self.default_boxes,
                                 images)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
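
# A minimal sketch of the `self.l2norm` layer used on Conv4_3 above, following
# the standard SSD/ParseNet formulation; this is an illustrative
# re-implementation with an initial scale of 20, not necessarily the exact
# upstream module.
import torch
import torch.nn as nn
import torch.nn.functional as F

class L2Norm(nn.Module):
    def __init__(self, n_channels, scale=20.0):
        super().__init__()
        # one learnable scale per channel, initialized to `scale`
        self.weight = nn.Parameter(torch.full((n_channels,), scale))

    def forward(self, x):
        # normalize each spatial location's feature vector to unit L2 norm,
        # then rescale with the learnable per-channel weight
        x = F.normalize(x, p=2, dim=1, eps=1e-10)
        return self.weight.view(1, -1, 1, 1) * x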
def forward(self, batched_inputs):
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARNING,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    box_cls, box_delta, box_center, box_param = self.head(features)
    shifts = self.shift_generator(features)

    if self.training:
        gt_classes, gt_shifts_reg_deltas, gt_centerness, \
            gt_instance_masks, gt_inds, im_inds, fpn_levels = \
            self.get_ground_truth(
                shifts, gt_instances, images.tensor.shape[-2:])
        proposal_losses, proposals = self.proposals_losses(
            gt_classes, gt_shifts_reg_deltas, gt_centerness, gt_inds,
            im_inds, box_cls, box_delta, box_center, box_param, fpn_levels,
            shifts)
        proposals = self.generate_instance_masks(features, shifts, proposals)
        instances_losses = self.instances_losses(gt_instance_masks, proposals,
                                                 dummy_feature=box_cls[0])
        losses = {}
        losses.update(proposal_losses)
        losses.update(instances_losses)
        return losses
    else:
        proposals = self.proposals_inference(box_cls, box_delta, box_center,
                                             box_param, shifts, images)
        proposals = self.generate_instance_masks(features, shifts, proposals)
        padded_im_h, padded_im_w = images.tensor.shape[-2:]
        processed_results = []
        for im_id, (input_per_image, image_size) in enumerate(
                zip(batched_inputs, images.image_sizes)):
            im_h = input_per_image.get("height", image_size[0])
            im_w = input_per_image.get("width", image_size[1])
            resized_in_h, resized_in_w = image_size
            # keep only the instances predicted for this image
            instances_per_im = proposals[proposals.im_inds == im_id]
            instances_per_im = self.postprocess(instances_per_im, im_h, im_w,
                                                resized_in_h, resized_in_w,
                                                padded_im_h, padded_im_w)
            processed_results.append({"instances": instances_per_im})
        return processed_results