def f(batched_inputs, c2_inputs, c2_results):
    # Note: `self` is not a parameter of this closure; it is captured from the
    # enclosing scope (the converter factory that defines this function).
    image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]

    num_features = len([x for x in c2_results.keys() if x.startswith("box_cls_")])
    pred_logits = [c2_results["box_cls_{}".format(i)] for i in range(num_features)]
    pred_anchor_deltas = [c2_results["box_delta_{}".format(i)] for i in range(num_features)]

    # For each feature level, feature should have the same batch size and
    # spatial dimension as the box_cls and box_delta.
    dummy_features = [x.clone()[:, 0:0, :, :] for x in pred_logits]
    anchors = self.anchor_generator(dummy_features)

    # self.num_classes can be inferred
    self.num_classes = pred_logits[0].shape[1] // (pred_anchor_deltas[0].shape[1] // 4)

    pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
    pred_anchor_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]

    results = self.inference(anchors, pred_logits, pred_anchor_deltas, image_sizes)
    return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
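# Every snippet below relies on detectron2's `permute_to_N_HWA_K` helper to go
# from the head's (N, A*K, Hi, Wi) layout to (N, Hi*Wi*A, K). A minimal sketch
# of that helper is reproduced here for reference; it mirrors the detectron2
# implementation, but treat it as an illustration rather than the canonical one.
def permute_to_N_HWA_K(tensor, K):
    """Transpose/reshape a tensor from (N, A x K, H, W) to (N, H x W x A, K)."""
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)    # (N, A, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)  # (N, H, W, A, K)
    tensor = tensor.reshape(N, -1, K)       # (N, H*W*A, K)
    return tensor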
def produce_raw_output(self, anchors, features):
    """
    Given anchors and features, produces raw pre-nms output to be used for
    custom fusion operations.
    """
    # Perform inference run
    pred_logits, pred_anchor_deltas, pred_logits_vars, pred_anchor_deltas_vars = self.head(
        features)

    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
    pred_anchor_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]

    if pred_logits_vars is not None:
        pred_logits_vars = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits_vars]
    if pred_anchor_deltas_vars is not None:
        pred_anchor_deltas_vars = [
            permute_to_N_HWA_K(x, self.bbox_cov_dims) for x in pred_anchor_deltas_vars
        ]

    # Create raw output dictionary
    raw_output = {'anchors': anchors}

    # Shapes:
    # (N x R, K) for box_cls and box_cls_var.
    # (N x R, 4) and (N x R, 10) for box_delta and box_reg_var respectively.
    raw_output.update({'box_cls': pred_logits,
                       'box_delta': pred_anchor_deltas,
                       'box_cls_var': pred_logits_vars,
                       'box_reg_var': pred_anchor_deltas_vars})
    return raw_output
def convert_outputs(self, batched_inputs, inputs, results):
    output_names = self.get_output_names()
    assert len(results) == len(output_names)
    m_results = {}
    for k, v in results.items():
        assert k in output_names, k
        m_results[k] = v.to(self._ns.device)

    image_sizes = inputs["image_sizes"]

    num_features = len([x for x in m_results.keys() if x.startswith("box_cls_")])
    pred_logits = [m_results["box_cls_{}".format(i)] for i in range(num_features)]
    pred_anchor_deltas = [m_results["box_delta_{}".format(i)] for i in range(num_features)]

    # Generate anchors: the anchor generator only needs the spatial sizes of the
    # feature maps, so the prediction tensors can stand in for the features.
    anchors = self._anchor_generator(pred_logits)

    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(x, self._ns.num_classes) for x in pred_logits]
    pred_anchor_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]

    results = self._ns.inference(anchors, pred_logits, pred_anchor_deltas, image_sizes)
    return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
def convert_outputs(self, batched_inputs, inputs, results):
    assert isinstance(self._wrapped_model, meta_arch.RetinaNet)

    image_sizes = inputs["image_sizes"]

    num_features = len([x for x in results.keys() if x.startswith("box_cls_")])
    pred_logits = [results["box_cls_{}".format(i)] for i in range(num_features)]
    pred_anchor_deltas = [results["box_delta_{}".format(i)] for i in range(num_features)]

    # Generate anchors from the wrapped model's anchor_generator; it only needs
    # the spatial sizes of the feature maps, so the prediction tensors suffice.
    anchors = self._wrapped_model.anchor_generator(pred_logits)

    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(x, self._wrapped_model.num_classes) for x in pred_logits]
    pred_anchor_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]

    results = self._wrapped_model.inference(anchors, pred_logits, pred_anchor_deltas, image_sizes)
    return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)
def inference(self, box_cls, box_delta, landmark_delta, anchors, image_sizes):
    """
    Arguments:
        box_cls, box_delta, landmark_delta: Same as the output of
            :meth:`RetinaNetHead.forward`
        anchors (list[Boxes]): A list of #feature level Boxes.
            The Boxes contain anchors of this image on the specific feature level.
        image_sizes (List[torch.Size]): the input image sizes

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    results = []

    box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls]
    box_delta = [permute_to_N_HWA_K(x, 4) for x in box_delta]
    landmark_delta = [permute_to_N_HWA_K(x, 10) for x in landmark_delta]
    # list[Tensor], one per level, each has shape (N, Hi x Wi x A, K or 4 or 10)

    for img_idx, image_size in enumerate(image_sizes):
        box_cls_per_image = [box_cls_per_level[img_idx] for box_cls_per_level in box_cls]
        box_reg_per_image = [box_reg_per_level[img_idx] for box_reg_per_level in box_delta]
        landmark_reg_per_image = [
            landmark_reg_per_level[img_idx] for landmark_reg_per_level in landmark_delta
        ]
        results_per_image = self.inference_single_image(
            box_cls_per_image,
            box_reg_per_image,
            landmark_reg_per_image,
            anchors,
            tuple(image_size),
        )
        results.append(results_per_image)
    return results
def permute_all_cls_box_landmark_to_N_HWA_K_and_concat(
        box_cls, box_delta, landmark_delta, num_classes=80):
    """
    Rearrange the tensor layout from the network output, i.e.:
        list[Tensor]: #lvl tensors of shape (N, A x K, Hi, Wi)
    to per-image predictions, i.e.:
        Tensor: of shape (N x sum(Hi x Wi x A), K)
    """
    # For each feature level, permute the outputs to make them be in the
    # same format as the labels. Note that the labels are computed for
    # all feature levels concatenated, so we keep the same representation
    # for the objectness and the box_delta
    box_cls_flattened = [permute_to_N_HWA_K(x, num_classes) for x in box_cls]
    box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
    landmark_delta_flattened = [permute_to_N_HWA_K(x, 10) for x in landmark_delta]
    # Concatenate on the first dimension (representing the feature levels), to
    # take into account the way the labels were generated (with all feature maps
    # being concatenated as well)
    box_cls = cat(box_cls_flattened, dim=1).view(-1, num_classes)
    box_delta = cat(box_delta_flattened, dim=1).view(-1, 4)
    landmark_delta = cat(landmark_delta_flattened, dim=1).view(-1, 10)
    return box_cls, box_delta, landmark_delta
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used in inference.
              See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss. Used during training only.
    """
    images = self.preprocess_image(batched_inputs)
    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]

    anchors = self.anchor_generator(features)
    pred_logits, pred_anchor_deltas = self.head(features)
    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
    pred_anchor_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]

    if self.training:
        assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]

        gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
        losses = self.losses(anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes)

        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                results = self.inference(
                    anchors, pred_logits, pred_anchor_deltas, images.image_sizes
                )
                self.visualize_training(batched_inputs, results)

        return losses
    else:
        results = self.inference(anchors, pred_logits, pred_anchor_deltas, images.image_sizes)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            results, batched_inputs, images.image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
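# Hedged usage sketch for the RetinaNet forward() above, at inference time. The
# input format follows the docstring: a list of dicts with a (C, H, W) image
# tensor and optional output "height"/"width". `model` is assumed to be a built
# and loaded detectron2 RetinaNet; the image below is dummy data only.
import torch

model.eval()
with torch.no_grad():
    dummy_image = torch.rand(3, 480, 640) * 255.0  # (C, H, W)
    outputs = model([{"image": dummy_image, "height": 480, "width": 640}])
    instances = outputs[0]["instances"]  # pred_boxes, scores, pred_classes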
def retinanet_postprocess(torch_model, images, results):
    # `results` is the flat list of raw model outputs: 5 FPN feature maps,
    # followed by the 5 per-level class logits and the 5 per-level box deltas.
    features = results[:5]
    pred_logits = results[5:10]
    pred_anchor_deltas = results[10:]

    anchors = torch_model.anchor_generator(features)
    pred_logits = [permute_to_N_HWA_K(x, torch_model.num_classes) for x in pred_logits]
    pred_anchor_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]

    results = torch_model.inference(anchors, pred_logits, pred_anchor_deltas, images.image_sizes)
    return results
def permute_all_cls_and_box_to_N_HWA_K_and_concat(
        pred_logits, pred_anchor_deltas, num_classes=80):
    """
    Rearrange the tensor layout from the network output, i.e.:
        list[Tensor]: #lvl tensors of shape (N, A x K, Hi, Wi)
    to per-image predictions, i.e.:
        Tensor: of shape (N x sum(Hi x Wi x A), K)
    """
    # For each feature level, permute the outputs to make them be in the
    # same format as the labels.
    pred_logits_flattened = [permute_to_N_HWA_K(x, num_classes) for x in pred_logits]
    pred_anchor_deltas_flattened = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]
    # Concatenate on the first dimension (representing the feature levels), to
    # take into account the way the labels were generated (with all feature maps
    # being concatenated as well)
    pred_logits = cat(pred_logits_flattened, dim=1).view(-1, num_classes)
    pred_anchor_deltas = cat(pred_anchor_deltas_flattened, dim=1).view(-1, 4)
    return pred_logits, pred_anchor_deltas
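# Quick shape check for the flatten-and-concat helper above (a sketch: it
# assumes `permute_all_cls_and_box_to_N_HWA_K_and_concat` and its `cat` /
# `permute_to_N_HWA_K` dependencies are importable in the current scope).
import torch

N, A, K = 2, 9, 80
level_shapes = [(100, 152), (50, 76)]  # (Hi, Wi) for two pyramid levels
logits = [torch.rand(N, A * K, H, W) for H, W in level_shapes]
deltas = [torch.rand(N, A * 4, H, W) for H, W in level_shapes]

flat_logits, flat_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
    logits, deltas, num_classes=K)

total_anchors = N * sum(H * W * A for H, W in level_shapes)
assert flat_logits.shape == (total_anchors, K)
assert flat_deltas.shape == (total_anchors, 4)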
def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes, images):
    """
    Arguments:
        pred_logits, pred_deltas, pred_masks: Same as the output of
            :meth:`TensorMaskHead.forward`
        anchors, indexes: Same as the input of :meth:`TensorMask.get_ground_truth`
        images (ImageList): the input images

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(anchors) == len(images)
    results = []

    pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
    pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas]

    pred_logits = cat(pred_logits, dim=1)
    pred_deltas = cat(pred_deltas, dim=1)

    for img_idx, (anchors_im, indexes_im) in enumerate(zip(anchors, indexes)):
        # Get the size of the current image
        image_size = images.image_sizes[img_idx]

        logits_im = pred_logits[img_idx]
        deltas_im = pred_deltas[img_idx]

        if self.mask_on:
            masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks]
        else:
            masks_im = [None] * self.num_levels
        results_im = self.inference_single_image(
            logits_im,
            deltas_im,
            masks_im,
            Boxes.cat(anchors_im),
            cat(indexes_im),
            tuple(image_size),
        )
        results.append(results_im)
    return results
def forward(
        self,
        batched_inputs: List[dict]
) -> Union[Dict[str, Any], List[Dict[str, Instances]]]:
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used in inference.
              See :meth:`postprocess` for details.

    Returns:
        dict[str: Tensor]:
            Mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    images: ImageList = self.preprocess_image(batched_inputs)

    features_dict: Dict[str, torch.Tensor] = self.fpn(images.tensor)
    features: List[torch.Tensor] = [features_dict[f] for f in self.in_features]

    pred_logits, pred_anchor_deltas = self.head(features)

    anchors: List[Boxes] = self.anchor_generator(features)

    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(x, K=self.num_classes) for x in pred_logits]
    pred_anchor_deltas = [permute_to_N_HWA_K(x, K=4) for x in pred_anchor_deltas]

    if self.training:
        gt_instances: List[Instances] = [
            x['instances'].to(self.device) for x in batched_inputs
        ]

        gt_classes, gt_boxes = self.get_ground_truth(
            anchors=anchors, gt_instances=gt_instances)

        losses: Dict[str, torch.Tensor] = self.losses(
            anchors=anchors,
            pred_logits=pred_logits,
            gt_classes=gt_classes,
            pred_anchor_deltas=pred_anchor_deltas,
            gt_boxes=gt_boxes)

        return losses

    # Otherwise, do inference.
    results: List[Instances] = self.inference(
        anchors=anchors,
        pred_logits=pred_logits,
        pred_anchor_deltas=pred_anchor_deltas,
        image_sizes=images.image_sizes)

    processed_results: List[Dict[str, Any]] = []
    for results_per_image, input_per_image, image_size in zip(
            results, batched_inputs, images.image_sizes):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        processed_results.append({
            "instances": detector_postprocess(results_per_image, height, width)
        })

    return processed_results
def forward(
        self, batched_inputs, return_anchorwise_output=False, num_mc_dropout_runs=-1):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used in inference.
              See :meth:`postprocess` for details.

        return_anchorwise_output (bool): returns raw output for probabilistic inference

        num_mc_dropout_runs (int): perform efficient Monte-Carlo dropout runs by running only the
            head and not the full neural network.

    Returns:
        dict[str: Tensor]: mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    # Preprocess image
    images = self.preprocess_image(batched_inputs)

    # Extract features and generate anchors
    features = self.backbone(images.tensor)
    features = [features[f] for f in self.head_in_features]
    anchors = self.anchor_generator(features)

    # MC_Dropout inference forward
    if num_mc_dropout_runs > 1:
        anchors = anchors * num_mc_dropout_runs
        features = features * num_mc_dropout_runs
        output_dict = self.produce_raw_output(anchors, features)
        return output_dict

    # Regular inference forward
    if return_anchorwise_output:
        return self.produce_raw_output(anchors, features)

    # Training and validation forward
    pred_logits, pred_anchor_deltas, pred_logits_vars, pred_anchor_deltas_vars = self.head(
        features)

    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
    pred_anchor_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]

    if pred_logits_vars is not None:
        pred_logits_vars = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits_vars]
    if pred_anchor_deltas_vars is not None:
        pred_anchor_deltas_vars = [
            permute_to_N_HWA_K(x, self.bbox_cov_dims) for x in pred_anchor_deltas_vars
        ]

    if self.training:
        assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]

        gt_classes, gt_boxes = self.label_anchors(anchors, gt_instances)

        self.anchors = torch.cat(
            [Boxes.cat(anchors).tensor for i in range(len(gt_instances))], 0)

        # Loss is computed based on what values are to be estimated by the neural
        # network
        losses = self.losses(
            anchors,
            gt_classes,
            gt_boxes,
            pred_logits,
            pred_anchor_deltas,
            pred_logits_vars,
            pred_anchor_deltas_vars)

        self.current_step += 1

        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                results = self.inference(
                    anchors, pred_logits, pred_anchor_deltas, images.image_sizes)
                self.visualize_training(batched_inputs, results)
        return losses
    else:
        results = self.inference(
            anchors, pred_logits, pred_anchor_deltas, images.image_sizes)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
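# Hedged usage sketch for the probabilistic forward() above: requesting raw
# anchor-wise output, or efficient MC-Dropout output, instead of post-processed
# Instances. `prob_model` and `batched_inputs` are assumed to exist (a built
# instance of the class above and a DatasetMapper-style list of input dicts).
import torch

with torch.no_grad():
    # Raw pre-NMS output: per-level box_cls / box_delta plus their variances.
    raw = prob_model(batched_inputs, return_anchorwise_output=True)

    # MC-Dropout: anchors and features are replicated, so the raw output lists
    # contain `num_mc_dropout_runs` copies' worth of per-level predictions.
    mc_raw = prob_model(batched_inputs, num_mc_dropout_runs=10)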
def forward(self, batched_inputs: List[dict]) -> Union[Dict[str, Any], List[Dict[str, Instances]]]:
    """
    Args:
        batched_inputs (List[dict]): A list, batched outputs of :class:`DatasetMapper` .
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used in inference.
              See :meth:`postprocess` for details.

    Returns:
        Dict[str, Tensor]:
            Mapping from a named loss to a scalar tensor storing the loss.
            Used during training only. The dict keys are: 'panel_loss_cls',
            'panel_loss_box_reg', 'label_loss_cls' and 'label_loss_box_reg'.
    """
    images: ImageList = self.preprocess_image(batched_inputs)

    # Detect panels
    panel_features_dict: Dict[str, torch.Tensor] = self.panel_fpn(images.tensor)
    panel_features: List[torch.Tensor] = [
        panel_features_dict[f] for f in self.panel_in_features
    ]
    panel_anchors: List[Boxes] = self.panel_anchor_generator(panel_features)
    panel_pred_logits, panel_pred_anchor_deltas = self.panel_head(panel_features)
    # Transpose the Hi*Wi*A dimension to the middle:
    panel_pred_logits = [permute_to_N_HWA_K(x, K=1) for x in panel_pred_logits]
    panel_pred_anchor_deltas = [permute_to_N_HWA_K(x, K=4) for x in panel_pred_anchor_deltas]

    # Detect labels
    label_features_dict: Dict[str, torch.Tensor] = self.label_fpn(images.tensor)
    label_features: List[torch.Tensor] = [
        label_features_dict[f] for f in self.label_in_features
    ]
    label_anchors: List[Boxes] = self.label_anchor_generator(label_features)
    label_pred_logits, label_pred_anchor_deltas = self.label_head(label_features)
    # Transpose the Hi*Wi*A dimension to the middle:
    label_pred_logits = [permute_to_N_HWA_K(x, K=self.num_label_classes)
                         for x in label_pred_logits]
    label_pred_anchor_deltas = [permute_to_N_HWA_K(x, K=4) for x in label_pred_anchor_deltas]

    # Training
    if self.training:
        # Panels
        panel_gt_instances: List[Instances] = [
            x['panel_instances'].to(self.device) for x in batched_inputs
        ]
        panel_gt_classes, panel_gt_boxes = self.get_ground_truth(
            anchors=panel_anchors,
            gt_instances=panel_gt_instances,
            num_classes=1)
        panel_loss_cls, panel_loss_box_reg = self._compute_single_head_losses(
            anchors=panel_anchors,
            pred_logits=panel_pred_logits,
            gt_classes=panel_gt_classes,
            pred_anchor_deltas=panel_pred_anchor_deltas,
            gt_boxes=panel_gt_boxes,
            num_classes=1)

        loss_dict: Dict[str, torch.Tensor] = {
            'panel_loss_cls': panel_loss_cls,
            'panel_loss_box_reg': panel_loss_box_reg
        }

        # Labels
        label_gt_instances: List[Instances] = [
            x['label_instances'].to(self.device) for x in batched_inputs
        ]
        label_gt_classes, label_gt_boxes = self.get_ground_truth(
            anchors=label_anchors,
            gt_instances=label_gt_instances,
            num_classes=self.num_label_classes)
        label_loss_cls, label_loss_box_reg = self._compute_single_head_losses(
            anchors=label_anchors,
            pred_logits=label_pred_logits,
            gt_classes=label_gt_classes,
            pred_anchor_deltas=label_pred_anchor_deltas,
            gt_boxes=label_gt_boxes,
            num_classes=self.num_label_classes)

        loss_dict['label_loss_cls'] = label_loss_cls
        loss_dict['label_loss_box_reg'] = label_loss_box_reg

        return loss_dict

    # Otherwise, do inference.
    batched_inference_results = self.inference(
        panel_anchors=panel_anchors,
        panel_pred_logits=panel_pred_logits,
        panel_pred_anchor_deltas=panel_pred_anchor_deltas,
        label_anchors=label_anchors,
        label_pred_logits=label_pred_logits,
        label_pred_anchor_deltas=label_pred_anchor_deltas,
        image_sizes=images.image_sizes)

    processed_results: List[Dict[str, Instances]] = []

    for inference_results, input_per_image, image_size in zip(
            batched_inference_results, batched_inputs, images.image_sizes):

        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        # TODO check that this works with two sets of boxes
        # r = detector_postprocess(results_per_image, height, width)
        panel_results, label_results = inference_results

        scale_x, scale_y = (width / panel_results.image_size[1],
                            height / panel_results.image_size[0])

        # 1) Panels
        panel_results = Instances((height, width), **panel_results.get_fields())

        # Scale and clip boxes
        panel_output_boxes = panel_results.pred_boxes
        panel_output_boxes.scale(scale_x, scale_y)
        panel_output_boxes.clip(panel_results.image_size)

        # 2) Labels
        label_results = Instances((height, width), **label_results.get_fields())

        # Scale and clip boxes
        label_output_boxes = label_results.pred_boxes
        label_output_boxes.scale(scale_x, scale_y)
        label_output_boxes.clip(label_results.image_size)

        processed_results.append({"panels": panel_results,
                                  "labels": label_results})

    return processed_results