def f(batched_inputs, c2_inputs, c2_results):
    image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]
    detector_results = assemble_rcnn_outputs_by_name(
        image_sizes, c2_results, force_mask_on=True)
    sem_seg_results = c2_results["sem_seg"]

    # copied from meta_arch/panoptic_fpn.py ...
    processed_results = []
    for sem_seg_result, detector_result, input_per_image, image_size in zip(
            sem_seg_results, detector_results, batched_inputs, image_sizes):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
        detector_r = detector_postprocess(detector_result, height, width)

        processed_results.append({
            "sem_seg": sem_seg_r,
            "instances": detector_r
        })

        # `combine_on` and the `combine_*` thresholds below are presumably
        # captured from the enclosing scope (e.g. copied off the model config)
        # rather than defined in this function.
        if combine_on:
            panoptic_r = combine_semantic_and_instance_outputs(
                detector_r,
                sem_seg_r.argmax(dim=0),
                combine_overlap_threshold,
                combine_stuff_area_limit,
                combine_instances_confidence_threshold,
            )
            processed_results[-1]["panoptic_seg"] = panoptic_r
    return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [
            x["instances"].to(self.device) for x in batched_inputs
        ]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [
            x["targets"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    box_cls, box_delta, box_center = self.head(features)
    shifts = self.shift_generator(features)

    if self.training:
        # remove gt_instances with ignore label
        gt_instances = [
            inst[inst.gt_classes >= 0] for inst in gt_instances
        ]
        gt_classes, gt_shifts_reg_deltas, gt_centerness = self.get_ground_truth(
            shifts, gt_instances)
        return self.losses(gt_classes, gt_shifts_reg_deltas, gt_centerness,
                           box_cls, box_delta, box_center)
    else:
        results = self.inference(box_cls, box_delta, box_center, shifts,
                                 images)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
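# A hedged usage sketch (it assumes `model` is an already-built instance of
# this meta-architecture; the tensor and the height/width values are
# placeholders): at inference time, `forward` takes a list of per-image dicts
# and returns a list of {"instances": Instances} dicts resized to the
# requested "height"/"width".
import torch

model.eval()
with torch.no_grad():
    dummy_image = torch.zeros(3, 800, 800)  # stands in for a DatasetMapper "image" tensor
    outputs = model([{"image": dummy_image, "height": 480, "width": 640}])
    instances = outputs[0]["instances"]     # typically carries pred_boxes, scores, pred_classes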
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [
            x["instances"].to(self.device) for x in batched_inputs
        ]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [
            x["targets"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    box_cls, box_delta = self.decoder(self.encoder(features[0]))
    anchors = self.anchor_generator(features)

    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(box_cls, self.num_classes)]
    pred_anchor_deltas = [permute_to_N_HWA_K(box_delta, 4)]

    if self.training:
        indices = self.get_ground_truth(anchors, pred_anchor_deltas,
                                        gt_instances)
        losses = self.losses(indices, gt_instances, anchors, pred_logits,
                             pred_anchor_deltas)
        return losses
    else:
        results = self.inference([box_cls], [box_delta], anchors,
                                 images.image_sizes)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
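# For reference, a minimal sketch of the `permute_to_N_HWA_K` helper used
# above, mirroring the detectron2-style implementation (treat it as an
# assumption about this codebase rather than its exact code): it reshapes a
# (N, A*K, H, W) prediction map into (N, H*W*A, K) so per-location predictions
# line up with the flattened anchors.
def permute_to_N_HWA_K(tensor, K):
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)      # (N, A, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)    # (N, H, W, A, K)
    return tensor.reshape(N, -1, K)           # (N, H*W*A, K)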
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only. At inference time, the predicted
            boxes are returned instead.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [
            x["instances"].to(self.device) for x in batched_inputs
        ]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [
            x["targets"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    cls_outs, pts_outs_init, pts_outs_refine = self.head(features)
    center_pts = self.shift_generator(features)

    if self.training:
        return self.losses(center_pts, cls_outs, pts_outs_init,
                           pts_outs_refine, gt_instances)
    else:
        results = self.inference(center_pts, cls_outs, pts_outs_init,
                                 pts_outs_refine, images)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def _inference_for_ms_test(self, batched_inputs):
    """
    Function used for multiscale testing; will be refactored in the future.
    Takes the same input as the `forward` function.
    """
    assert not self.training, "inference mode with training=True"
    assert len(batched_inputs) == 1, "inference image number > 1"
    images = self.preprocess_image(batched_inputs)

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    box_cls, box_delta = self.head(features)
    shifts = self.shift_generator(features)

    results = self.inference(box_cls, box_delta, shifts, images)
    for results_per_image, input_per_image, image_size in zip(
            results, batched_inputs, images.image_sizes):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        processed_results = detector_postprocess(results_per_image, height,
                                                 width)
    return processed_results
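# A hedged usage sketch of multiscale testing on top of
# `_inference_for_ms_test` (the `rescale_input` helper is a hypothetical
# placeholder; the merge uses torchvision's batched_nms): run the single-image
# inference at several resolutions, pool the detections, and de-duplicate.
# Because every call keeps the original "height"/"width", all per-scale
# results live in the same output coordinate frame.
import torch
from torchvision.ops import batched_nms


def ms_test(model, single_input, scales=(0.75, 1.0, 1.25), iou_thresh=0.6):
    boxes, scores, classes = [], [], []
    for s in scales:
        # `rescale_input` (hypothetical) resizes single_input["image"] by `s`
        # while leaving "height"/"width" untouched.
        res = model._inference_for_ms_test([rescale_input(single_input, s)])
        boxes.append(res.pred_boxes.tensor)
        scores.append(res.scores)
        classes.append(res.pred_classes)
    boxes = torch.cat(boxes)
    scores = torch.cat(scores)
    classes = torch.cat(classes)
    keep = batched_nms(boxes, scores, classes, iou_thresh)
    return boxes[keep], scores[keep], classes[keep]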
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        result (list[BoxList] or dict[Tensor]): the output from the model.
            During training, it returns a dict[Tensor] which contains the
            losses. During testing, it returns a list[BoxList] that contains
            additional fields like `scores`, `labels` and `mask`
            (for Mask R-CNN models).
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [
            x["instances"].to(self.device) for x in batched_inputs
        ]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [
            x["targets"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    shifts = self.shift_generator(features)

    (box_cls, box_delta, box_center, bd_box_cls, bd_box_delta,
     bd_based_box) = self.head(features, shifts)

    if self.training:
        (gt_classes, gt_shifts_reg_deltas, gt_centerness, gt_border_classes,
         gt_border_shifts_deltas) = self.get_ground_truth(
             shifts, gt_instances, bd_based_box)

        return self.losses(
            gt_classes,
            gt_shifts_reg_deltas,
            gt_centerness,
            gt_border_classes,
            gt_border_shifts_deltas,
            box_cls,
            box_delta,
            box_center,
            bd_box_cls,
            bd_box_delta,
        )
    else:
        results = self.inference(box_cls, box_center, bd_box_cls,
                                 bd_box_delta, bd_based_box,
                                 images.image_sizes)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [
            x["instances"].to(self.device) for x in batched_inputs
        ]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [
            x["targets"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    box_cls, box_delta, box_center = self.head(features)
    shifts = self.shift_generator(features)

    if self.training:
        gt_classes, gt_shifts_reg_deltas, gt_centerness = self.get_ground_truth(
            shifts, gt_instances)
        return self.losses(gt_classes, gt_shifts_reg_deltas, gt_centerness,
                           box_cls, box_delta, box_center)
    else:
        if self.is_dynamic_head:
            soft_cost, used_cost, full_cost = get_module_running_cost(self)
            if self.head_complexity_buffer is None:
                self.head_complexity_buffer = [used_cost]
            else:
                self.head_complexity_buffer.append(used_cost)
            avg_cost = float(
                sum(self.head_complexity_buffer) /
                len(self.head_complexity_buffer))
            max_cost = float(max(self.head_complexity_buffer))
            min_cost = float(min(self.head_complexity_buffer))
            print("Head Complexity, CUR: {:.3f} GFLOPs, AVG: {:.3f} GFLOPs, "
                  "MAX: {:.3f} GFLOPs, MIN: {:.3f} GFLOPs".format(
                      float(used_cost) / 2 ** 30, avg_cost / 2 ** 30,
                      max_cost / 2 ** 30, min_cost / 2 ** 30))

        results = self.inference(box_cls, box_delta, box_center, shifts,
                                 images)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images, labels = self.preprocess_image(batched_inputs, self.training)

    # batched_inputs[0]['image'] = images.tensor[0].cpu() * 255
    # self.visualize_data(batched_inputs[0])

    x = images.tensor
    img_size = x.shape[-2:]

    def _branch(_embedding, _in):
        for i, e in enumerate(_embedding):
            _in = e(_in)
            if i == 4:
                out_branch = _in
        return _in, out_branch

    # backbone
    # x2, x1, x0 = self.backbone(x)
    out_features = self.backbone(x)
    features = [out_features[f] for f in self.in_features]
    [x2, x1, x0] = features

    # yolo branch 0
    out0, out0_branch = _branch(self.out0, x0)

    # yolo branch 1
    x1_in = self.out1_cbl(out0_branch)
    x1_in = self.out1_upsample(x1_in)
    x1_in = torch.cat([x1_in, x1], 1)
    out1, out1_branch = _branch(self.out1, x1_in)

    # yolo branch 2
    x2_in = self.out2_cbl(out1_branch)
    x2_in = self.out2_upsample(x2_in)
    x2_in = torch.cat([x2_in, x2], 1)
    out2, out2_branch = _branch(self.out2, x2_in)

    outputs = [out0, out1, out2]

    if self.training:
        losses = [
            loss_evaluator(out, labels, img_size)
            for out, loss_evaluator in zip(outputs, self.loss_evaluators)
        ]
        keys = [
            "loss_x", "loss_y", "loss_w", "loss_h", "loss_conf", "loss_cls"
        ]
        losses_dict = {}
        for key in keys:
            losses_dict[key] = sum([loss[key] for loss in losses])
        return losses_dict
    else:
        predictions_list = [
            loss_evaluator(out, labels, img_size)
            for out, loss_evaluator in zip(outputs, self.loss_evaluators)
        ]
        predictions = torch.cat(predictions_list, 1)
        detections = postprocess(predictions,
                                 self.num_classes,
                                 self.conf_threshold,
                                 self.nms_threshold,
                                 nms_type=self.nms_type)

        results = []
        for idx, out in enumerate(detections):
            if out is None:
                out = x.new_zeros((0, 7))
            # image_size = images.image_sizes[idx]
            image_size = img_size
            result = Instances(image_size)
            # each row of `out` is assumed to be
            # (x1, y1, x2, y2, objectness, class confidence, class index)
            result.pred_boxes = Boxes(out[:, :4])
            result.scores = out[:, 5] * out[:, 4]  # class confidence * objectness
            result.pred_classes = out[:, -1]
            results.append(result)

        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [
            x["instances"].to(self.device) for x in batched_inputs
        ]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [
            x["targets"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_instances = None

    # vgg feature maps: ['Conv4_3', 'Conv7']
    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]

    # feature map: Conv4_3
    # Conv4_3 has a different feature scale compared to the other layers, so we
    # use the L2 normalization technique to scale the feature norm at each
    # location in the feature map to 20 and learn the scale during
    # back propagation.
    features[0] = self.l2norm(features[0])

    # Conv7
    x = features[-1]
    # compute feature maps: conv8_2, conv9_2, conv10_2, and conv11_2
    for idx, extra_layer in enumerate(self.extra_layers):
        x = F.relu(extra_layer(x), inplace=True)
        if idx % 2 == 1:
            features.append(x)

    conf_pred, loc_pred = self.head(features)

    if self.training:
        gt_conf, gt_default_boxes_deltas = self.get_ground_truth(
            self.default_boxes, gt_instances)
        return self.losses(gt_conf, gt_default_boxes_deltas, conf_pred,
                           loc_pred)
    else:
        results = self.inference(conf_pred, loc_pred, self.default_boxes,
                                 images)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
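# For context, a minimal sketch of the kind of `l2norm` layer referenced above
# (an assumption about its implementation, not this module's actual code): it
# L2-normalizes the Conv4_3 activations across channels at every spatial
# location and rescales them with a learnable per-channel weight initialized
# to 20, as in the original SSD paper.
import torch
import torch.nn as nn


class L2Norm(nn.Module):
    def __init__(self, n_channels, initial_scale=20.0, eps=1e-10):
        super().__init__()
        self.weight = nn.Parameter(torch.full((n_channels,), initial_scale))
        self.eps = eps

    def forward(self, x):
        # x: (N, C, H, W); normalize over the channel dimension.
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        return self.weight.view(1, -1, 1, 1) * x / norm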