def merge_branch_instances(instances, num_branch, nms_thresh, topk_per_image):
    """
    Merge detection results from different branches of TridentNet.
    Return detection results by applying non-maximum suppression (NMS) on bounding boxes
    and keep the unsuppressed boxes and other instances (e.g. mask) if any.

    Args:
        instances (list[Instances]): A list of N * num_branch instances that store detection
            results. Contain N images and each image has num_branch instances.
        num_branch (int): Number of branches used for merging detection results for each image.
        nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1].
        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
            all detections.

    Returns:
        results (list[Instances]): A list of N instances, one for each image in the batch,
            that stores the topk most confident detections after merging results from multiple
            branches.
    """
    if num_branch == 1:
        return instances

    batch_size = len(instances) // num_branch
    results = []
    for i in range(batch_size):
        instance = Instances.cat([instances[i + batch_size * j] for j in range(num_branch)])

        # Apply per-class NMS
        keep = batched_nms(
            instance.pred_boxes.tensor, instance.scores, instance.pred_classes, nms_thresh
        )
        keep = keep[:topk_per_image]
        result = instance[keep]
        results.append(result)

    return results
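# --- Added example (not part of the original source): a minimal, self-contained sketch of
# what the merge above does for one image and two branches. The detectron2 APIs used here
# (Instances, Boxes, batched_nms, Instances.cat) are real; the toy boxes and scores are made up.
import torch
from detectron2.layers import batched_nms
from detectron2.structures import Boxes, Instances


def _toy_branch(scores):
    # Two nearly identical boxes of the same class, with per-branch scores.
    inst = Instances((100, 100))
    inst.pred_boxes = Boxes(torch.tensor([[10.0, 10.0, 50.0, 50.0],
                                          [12.0, 11.0, 52.0, 49.0]]))
    inst.scores = torch.tensor(scores)
    inst.pred_classes = torch.tensor([0, 0])
    return inst


branches = [_toy_branch([0.9, 0.3]), _toy_branch([0.8, 0.2])]  # num_branch = 2
merged = Instances.cat(branches)                               # concatenate branch outputs
keep = batched_nms(merged.pred_boxes.tensor, merged.scores, merged.pred_classes, 0.5)
print(merged[keep[:100]])  # overlapping boxes collapse to the top-scoring detection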
def add_ground_truth_to_proposals_single_image(gt_boxes, proposals):
    """
    Augment `proposals` with ground-truth boxes from `gt_boxes`.

    Args:
        Same as `add_ground_truth_to_proposals`, but with gt_boxes and proposals
        per image.

    Returns:
        Same as `add_ground_truth_to_proposals`, but for only one image.
    """
    device = proposals.objectness_logits.device
    # Assign all ground-truth boxes an objectness logit corresponding to
    # P(object) = sigmoid(logit) =~ 1.
    gt_logit_value = math.log((1.0 - 1e-10) / (1 - (1.0 - 1e-10)))
    gt_logits = gt_logit_value * torch.ones(len(gt_boxes), device=device)

    # Concatenating gt_boxes with proposals requires them to have the same fields
    gt_proposal = Instances(proposals.image_size)
    gt_proposal.proposal_boxes = gt_boxes
    gt_proposal.objectness_logits = gt_logits
    new_proposals = Instances.cat([proposals, gt_proposal])

    return new_proposals
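# --- Added example (not part of the original source): a quick check, using only math and
# torch, that the constant computed above is the inverse sigmoid (logit) of a probability
# very close to 1, so the appended ground-truth proposals get an effective objectness of ~1.
import math
import torch

p = 1.0 - 1e-10
gt_logit_value = math.log(p / (1 - p))               # logit(p) = log(p / (1 - p)) ~= 23.03
print(gt_logit_value)                                # ~23.0258...
print(torch.sigmoid(torch.tensor(gt_logit_value)))   # tensor(1.0000)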
def predict_proposals(self):
    sampled_boxes = []

    bundle = (
        self.locations, self.logits_pred,
        self.reg_pred, self.ext_pred,
        self.ctrness_pred, self.strides
    )

    for i, (l, o, r, e, c, s) in enumerate(zip(*bundle)):
        # recall that during training, we normalize regression targets with FPN's stride.
        # we denormalize them here.
        r = r * s
        sampled_boxes.append(
            self.forward_for_single_feature_map(
                l, o, r, e, c, self.image_sizes
            )
        )

    boxlists = list(zip(*sampled_boxes))
    boxlists = [Instances.cat(boxlist) for boxlist in boxlists]
    boxlists = self.select_over_all_levels(boxlists)

    return boxlists
def predict_proposals(self):
    sampled_boxes = []

    bundle = (
        self.locations, self.logits_pred,
        self.reg_pred, self.ctrness_pred,
        self.strides, self.mask_regression,
        self.mask_prediction
    )

    for i, (l, o, r, c, s, mr, mp) in enumerate(zip(*bundle)):
        # recall that during training, we normalize regression targets with FPN's stride.
        # we denormalize them here.
        r = r * s
        # if self.thresh_with_active:
        #     mr = mr * torch.sigmoid(ma)
        sampled_boxes.append(
            self.forward_for_single_feature_map(
                l, o, r, c, mr, mp, self.image_sizes
            )
        )

    boxlists = list(zip(*sampled_boxes))
    boxlists = [Instances.cat(boxlist) for boxlist in boxlists]
    boxlists = self.select_over_all_levels(boxlists)

    num_images = len(boxlists)
    for i in range(num_images):
        per_image_masks = boxlists[i].pred_masks
        if 'mask_bce' in self.mask_loss_type:
            per_image_masks = torch.sigmoid(per_image_masks)
        else:
            per_image_masks = torch.clamp(per_image_masks, min=0.001, max=0.999)
        per_image_masks = per_image_masks.view(
            -1, 1, self.output_mask_size, self.output_mask_size
        )
        boxlists[i].pred_masks = per_image_masks

    return boxlists
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.

            For now, each item in the list is a dict that contains:

            * "image": Tensor, image in (C, H, W) format.
            * "sem_seg": semantic segmentation ground truth
            * "center": center points heatmap ground truth
            * "offset": pixel offsets to center points ground truth
            * Other information that's included in the original dicts, such as:
              "height", "width" (int): the output resolution of the model (may be different
              from input resolution), used in inference.

    Returns:
        list[dict]:
            each dict is the results for one image. The dict contains the following keys:

            * "panoptic_seg", "sem_seg": see documentation :doc:`/tutorials/models`
              for the standard output format
            * "instances": available if ``predict_instances is True``. see documentation
              :doc:`/tutorials/models` for the standard output format
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    # To avoid error in ASPP layer when input has different size.
    size_divisibility = (
        self.size_divisibility
        if self.size_divisibility > 0
        else self.backbone.size_divisibility
    )
    images = ImageList.from_tensors(images, size_divisibility)

    features = self.backbone(images.tensor)

    losses = {}
    if "sem_seg" in batched_inputs[0]:
        targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
        targets = ImageList.from_tensors(
            targets, size_divisibility, self.sem_seg_head.ignore_value
        ).tensor
        if "sem_seg_weights" in batched_inputs[0]:
            # The default D2 DatasetMapper may not contain "sem_seg_weights"
            # Avoid error in testing when default DatasetMapper is used.
            weights = [x["sem_seg_weights"].to(self.device) for x in batched_inputs]
            weights = ImageList.from_tensors(weights, size_divisibility).tensor
        else:
            weights = None
    else:
        targets = None
        weights = None
    sem_seg_results, sem_seg_losses = self.sem_seg_head(features, targets, weights)
    losses.update(sem_seg_losses)

    if "center" in batched_inputs[0] and "offset" in batched_inputs[0]:
        center_targets = [x["center"].to(self.device) for x in batched_inputs]
        center_targets = ImageList.from_tensors(
            center_targets, size_divisibility
        ).tensor.unsqueeze(1)
        center_weights = [x["center_weights"].to(self.device) for x in batched_inputs]
        center_weights = ImageList.from_tensors(center_weights, size_divisibility).tensor

        offset_targets = [x["offset"].to(self.device) for x in batched_inputs]
        offset_targets = ImageList.from_tensors(offset_targets, size_divisibility).tensor
        offset_weights = [x["offset_weights"].to(self.device) for x in batched_inputs]
        offset_weights = ImageList.from_tensors(offset_weights, size_divisibility).tensor
    else:
        center_targets = None
        center_weights = None
        offset_targets = None
        offset_weights = None

    center_results, offset_results, center_losses, offset_losses = self.ins_embed_head(
        features, center_targets, center_weights, offset_targets, offset_weights
    )
    losses.update(center_losses)
    losses.update(offset_losses)

    if self.training:
        return losses

    if self.benchmark_network_speed:
        return []

    processed_results = []
    for sem_seg_result, center_result, offset_result, input_per_image, image_size in zip(
        sem_seg_results, center_results, offset_results, batched_inputs, images.image_sizes
    ):
        height = input_per_image.get("height")
        width = input_per_image.get("width")
        r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
        c = sem_seg_postprocess(center_result, image_size, height, width)
        o = sem_seg_postprocess(offset_result, image_size, height, width)
        # Post-processing to get panoptic segmentation.
        panoptic_image, _ = get_panoptic_segmentation(
            r.argmax(dim=0, keepdim=True),
            c,
            o,
            thing_ids=self.meta.thing_dataset_id_to_contiguous_id.values(),
            label_divisor=self.meta.label_divisor,
            stuff_area=self.stuff_area,
            void_label=-1,
            threshold=self.threshold,
            nms_kernel=self.nms_kernel,
            top_k=self.top_k,
        )
        # For semantic segmentation evaluation.
        processed_results.append({"sem_seg": r})
        panoptic_image = panoptic_image.squeeze(0)
        semantic_prob = F.softmax(r, dim=0)

        # Write results to disk:
        img = input_per_image["image"]
        from detectron2.utils.visualizer import Visualizer
        from detectron2.data.detection_utils import convert_image_to_rgb
        from PIL import Image
        import os

        img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format).astype("uint8")
        img = np.array(Image.fromarray(img).resize((width, height)))
        v_panoptic = Visualizer(img, self.meta)
        v_panoptic = v_panoptic.draw_panoptic_seg_predictions(panoptic_image.cpu(), None)
        pan_img = v_panoptic.get_image()
        image_path = input_per_image['file_name'].split(os.sep)
        image_name = os.path.splitext(image_path[-1])[0]
        Image.fromarray(pan_img).save(
            os.path.join(
                '/home/ahabbas/projects/conseg/affinityNet/output_pdl/coco/eval_vis',
                image_name + '_panoptic.png',
            )
        )

        # For panoptic segmentation evaluation.
        processed_results[-1]["panoptic_seg"] = (panoptic_image, None)
        # For instance segmentation evaluation.
        if self.predict_instances:
            instances = []
            panoptic_image_cpu = panoptic_image.cpu().numpy()
            for panoptic_label in np.unique(panoptic_image_cpu):
                if panoptic_label == -1:
                    continue
                pred_class = panoptic_label // self.meta.label_divisor
                isthing = pred_class in list(
                    self.meta.thing_dataset_id_to_contiguous_id.values()
                )
                # Get instance segmentation results.
                if isthing:
                    instance = Instances((height, width))
                    # Evaluation code takes continuous id starting from 0
                    instance.pred_classes = torch.tensor(
                        [pred_class], device=panoptic_image.device
                    )
                    mask = panoptic_image == panoptic_label
                    instance.pred_masks = mask.unsqueeze(0)
                    # Average semantic probability
                    sem_scores = semantic_prob[pred_class, ...]
                    sem_scores = torch.mean(sem_scores[mask])
                    # Center point probability
                    mask_indices = torch.nonzero(mask).float()
                    center_y, center_x = (
                        torch.mean(mask_indices[:, 0]),
                        torch.mean(mask_indices[:, 1]),
                    )
                    center_scores = c[0, int(center_y.item()), int(center_x.item())]
                    # Confidence score is semantic prob * center prob.
                    instance.scores = torch.tensor(
                        [sem_scores * center_scores], device=panoptic_image.device
                    )
                    # Get bounding boxes
                    instance.pred_boxes = BitMasks(instance.pred_masks).get_bounding_boxes()
                    instances.append(instance)
            if len(instances) > 0:
                processed_results[-1]["instances"] = Instances.cat(instances)

    return processed_results
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances (optional): groundtruth :class:`Instances`
            * proposals (optional): :class:`Instances`, precomputed proposals.

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used in inference.
              See :meth:`postprocess` for details.

    Returns:
        list[dict]:
            Each dict is the output for one input image.
            The dict contains one key "instances" whose value is a :class:`Instances`.
            The :class:`Instances` object has the following keys:
            "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
    """
    if not self.training:
        self.init_model()
        return self.inference(batched_inputs)

    images, support_images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        for x in batched_inputs:
            x['instances'].set(
                'gt_classes',
                torch.full_like(x['instances'].get('gt_classes'), 0))
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)

    # support branches
    support_bboxes_ls = []
    for item in batched_inputs:
        bboxes = item['support_bboxes']
        for box in bboxes:
            box = Boxes(box[np.newaxis, :])
            support_bboxes_ls.append(box.to(self.device))

    B, N, C, H, W = support_images.tensor.shape
    assert N == self.support_way * self.support_shot

    support_images = support_images.tensor.reshape(B * N, C, H, W)
    support_features = self.backbone(support_images)

    # support feature roi pooling
    feature_pooled = self.roi_heads.roi_pooling(support_features, support_bboxes_ls)

    support_box_features = self.roi_heads._shared_roi_transform(
        [support_features[f] for f in self.in_features], support_bboxes_ls)
    assert self.support_way == 2  # now only 2 way support

    detector_loss_cls = []
    detector_loss_box_reg = []
    rpn_loss_rpn_cls = []
    rpn_loss_rpn_loc = []
    for i in range(B):  # batch
        # query
        query_gt_instances = [gt_instances[i]]  # one query gt instances
        query_images = ImageList.from_tensors([images[i]])  # one query image
        query_feature_res4 = features['res4'][i].unsqueeze(0)  # one query feature for attention rpn
        query_features = {'res4': query_feature_res4}  # one query feature for rcnn

        # positive support branch ##################################
        pos_begin = i * self.support_shot * self.support_way
        pos_end = pos_begin + self.support_shot
        pos_support_features = feature_pooled[pos_begin:pos_end].mean(
            0, True)  # pos support features from res4, average all supports, for rcnn
        pos_support_features_pool = pos_support_features.mean(
            dim=[2, 3], keepdim=True)  # average pooling support feature for attention rpn
        pos_correlation = F.conv2d(
            query_feature_res4,
            pos_support_features_pool.permute(1, 0, 2, 3),
            groups=1024)  # attention map
        pos_features = {'res4': pos_correlation}  # attention map for attention rpn
        pos_support_box_features = support_box_features[pos_begin:pos_end].mean(0, True)
        (pos_proposals, pos_anchors, pos_pred_objectness_logits, pos_gt_labels,
         pos_pred_anchor_deltas, pos_gt_boxes) = self.proposal_generator(
             query_images, pos_features, query_gt_instances)  # attention rpn
        pos_pred_class_logits, pos_pred_proposal_deltas, pos_detector_proposals = self.roi_heads(
            query_images, query_features, pos_support_box_features,
            pos_proposals, query_gt_instances)  # pos rcnn

        # negative support branch ##################################
        neg_begin = pos_end
        neg_end = neg_begin + self.support_shot
        neg_support_features = feature_pooled[neg_begin:neg_end].mean(0, True)
        neg_support_features_pool = neg_support_features.mean(dim=[2, 3], keepdim=True)
        neg_correlation = F.conv2d(
            query_feature_res4,
            neg_support_features_pool.permute(1, 0, 2, 3),
            groups=1024)
        neg_features = {'res4': neg_correlation}
        neg_support_box_features = support_box_features[neg_begin:neg_end].mean(0, True)
        (neg_proposals, neg_anchors, neg_pred_objectness_logits, neg_gt_labels,
         neg_pred_anchor_deltas, neg_gt_boxes) = self.proposal_generator(
             query_images, neg_features, query_gt_instances)
        neg_pred_class_logits, neg_pred_proposal_deltas, neg_detector_proposals = self.roi_heads(
            query_images, query_features, neg_support_box_features,
            neg_proposals, query_gt_instances)

        # rpn loss
        outputs_images = ImageList.from_tensors([images[i], images[i]])
        outputs_pred_objectness_logits = [
            torch.cat(pos_pred_objectness_logits + neg_pred_objectness_logits, dim=0)
        ]
        outputs_pred_anchor_deltas = [
            torch.cat(pos_pred_anchor_deltas + neg_pred_anchor_deltas, dim=0)
        ]
        outputs_anchors = pos_anchors  # + neg_anchors

        # convert 1 in neg_gt_labels to 0
        for item in neg_gt_labels:
            item[item == 1] = 0

        outputs_gt_boxes = pos_gt_boxes + neg_gt_boxes  # [None]
        outputs_gt_labels = pos_gt_labels + neg_gt_labels

        if self.training:
            proposal_losses = self.proposal_generator.losses(
                outputs_anchors, outputs_pred_objectness_logits,
                outputs_gt_labels, outputs_pred_anchor_deltas,
                outputs_gt_boxes)
            proposal_losses = {
                k: v * self.proposal_generator.loss_weight
                for k, v in proposal_losses.items()
            }
        else:
            proposal_losses = {}

        # detector loss
        detector_pred_class_logits = torch.cat(
            [pos_pred_class_logits, neg_pred_class_logits], dim=0)
        detector_pred_proposal_deltas = torch.cat(
            [pos_pred_proposal_deltas, neg_pred_proposal_deltas], dim=0)
        for item in neg_detector_proposals:
            item.gt_classes = torch.full_like(item.gt_classes, 1)

        # detector_proposals = pos_detector_proposals + neg_detector_proposals
        detector_proposals = [
            Instances.cat(pos_detector_proposals + neg_detector_proposals)
        ]
        if self.training:
            predictions = detector_pred_class_logits, detector_pred_proposal_deltas
            detector_losses = self.roi_heads.box_predictor.losses(
                predictions, detector_proposals)
            rpn_loss_rpn_cls.append(proposal_losses['loss_rpn_cls'])
            rpn_loss_rpn_loc.append(proposal_losses['loss_rpn_loc'])
            detector_loss_cls.append(detector_losses['loss_cls'])
            detector_loss_box_reg.append(detector_losses['loss_box_reg'])

    proposal_losses = {}
    detector_losses = {}

    proposal_losses['loss_rpn_cls'] = torch.stack(rpn_loss_rpn_cls).mean()
    proposal_losses['loss_rpn_loc'] = torch.stack(rpn_loss_rpn_loc).mean()
    detector_losses['loss_cls'] = torch.stack(detector_loss_cls).mean()
    detector_losses['loss_box_reg'] = torch.stack(detector_loss_box_reg).mean()

    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    return losses
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.

            For now, each item in the list is a dict that contains:

            * "image": Tensor, image in (C, H, W) format.
            * "sem_seg": semantic segmentation ground truth
            * "center": center points heatmap ground truth
            * "offset": pixel offsets to center points ground truth
            * Other information that's included in the original dicts, such as:
              "height", "width" (int): the output resolution of the model (may be different
              from input resolution), used in inference.

    Returns:
        list[dict]:
            each dict is the results for one image. The dict contains the following keys:

            * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
            * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
            * "panoptic_seg": see :func:`combine_semantic_and_instance_outputs` for its format.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    size_divisibility = self.backbone.size_divisibility
    images = ImageList.from_tensors(images, size_divisibility)

    features = self.backbone(images.tensor)

    losses = {}
    if "sem_seg" in batched_inputs[0]:
        targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
        targets = ImageList.from_tensors(
            targets, size_divisibility, self.sem_seg_head.ignore_value).tensor
        if "sem_seg_weights" in batched_inputs[0]:
            # The default D2 DatasetMapper may not contain "sem_seg_weights"
            # Avoid error in testing when default DatasetMapper is used.
            weights = [x["sem_seg_weights"].to(self.device) for x in batched_inputs]
            weights = ImageList.from_tensors(weights, size_divisibility).tensor
        else:
            weights = None
    else:
        targets = None
        weights = None
    sem_seg_results, sem_seg_losses = self.sem_seg_head(features, targets, weights)
    losses.update(sem_seg_losses)

    if "center" in batched_inputs[0] and "offset" in batched_inputs[0]:
        center_targets = [x["center"].to(self.device) for x in batched_inputs]
        center_targets = ImageList.from_tensors(
            center_targets, size_divisibility).tensor.unsqueeze(1)
        center_weights = [x["center_weights"].to(self.device) for x in batched_inputs]
        center_weights = ImageList.from_tensors(center_weights, size_divisibility).tensor

        offset_targets = [x["offset"].to(self.device) for x in batched_inputs]
        offset_targets = ImageList.from_tensors(offset_targets, size_divisibility).tensor
        offset_weights = [x["offset_weights"].to(self.device) for x in batched_inputs]
        offset_weights = ImageList.from_tensors(offset_weights, size_divisibility).tensor
    else:
        center_targets = None
        center_weights = None
        offset_targets = None
        offset_weights = None

    center_results, offset_results, center_losses, offset_losses = self.ins_embed_head(
        features, center_targets, center_weights, offset_targets, offset_weights)
    losses.update(center_losses)
    losses.update(offset_losses)

    if self.training:
        return losses

    processed_results = []
    for sem_seg_result, center_result, offset_result, input_per_image, image_size in zip(
        sem_seg_results, center_results, offset_results, batched_inputs, images.image_sizes
    ):
        height = input_per_image.get("height")
        width = input_per_image.get("width")
        r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
        c = sem_seg_postprocess(center_result, image_size, height, width)
        o = sem_seg_postprocess(offset_result, image_size, height, width)
        # Post-processing to get panoptic segmentation.
        panoptic_image, _ = get_panoptic_segmentation(
            r.argmax(dim=0, keepdim=True),
            c,
            o,
            thing_ids=self.meta.thing_dataset_id_to_contiguous_id.values(),
            label_divisor=self.meta.label_divisor,
            stuff_area=self.stuff_area,
            void_label=-1,
            threshold=self.threshold,
            nms_kernel=self.nms_kernel,
            top_k=self.top_k,
        )
        # For semantic segmentation evaluation.
        processed_results.append({"sem_seg": r})
        panoptic_image = panoptic_image.squeeze(0)
        semantic_prob = F.softmax(r, dim=0)
        # For panoptic segmentation evaluation.
        processed_results[-1]["panoptic_seg"] = (panoptic_image, None)
        # For instance segmentation evaluation.
        if self.predict_instances:
            instances = []
            panoptic_image_cpu = panoptic_image.cpu().numpy()
            for panoptic_label in np.unique(panoptic_image_cpu):
                if panoptic_label == -1:
                    continue
                pred_class = panoptic_label // self.meta.label_divisor
                isthing = pred_class in list(
                    self.meta.thing_dataset_id_to_contiguous_id.values())
                # Get instance segmentation results.
                if isthing:
                    instance = Instances((height, width))
                    # Evaluation code takes continuous id starting from 0
                    instance.pred_classes = torch.tensor(
                        [pred_class], device=panoptic_image.device)
                    mask = panoptic_image == panoptic_label
                    instance.pred_masks = mask.unsqueeze(0)
                    # Average semantic probability
                    sem_scores = semantic_prob[pred_class, ...]
                    sem_scores = torch.mean(sem_scores[mask])
                    # Center point probability
                    mask_indices = torch.nonzero(mask).float()
                    center_y, center_x = (
                        torch.mean(mask_indices[:, 0]),
                        torch.mean(mask_indices[:, 1]),
                    )
                    center_scores = c[0, int(center_y.item()), int(center_x.item())]
                    # Confidence score is semantic prob * center prob.
                    instance.scores = torch.tensor(
                        [sem_scores * center_scores], device=panoptic_image.device)
                    # Get bounding boxes
                    instance.pred_boxes = BitMasks(instance.pred_masks).get_bounding_boxes()
                    instances.append(instance)
            if len(instances) > 0:
                processed_results[-1]["instances"] = Instances.cat(instances)

    return processed_results
def predict_proposals_single_image(
    self, cls_scores, bbox_preds, centernesses, all_level_points, image_size
):
    """
    Single-image inference.
    Return bounding-box detection results by thresholding on scores
    and applying non-maximum suppression (NMS).

    Arguments:
        cls_scores (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (C, Hi, Wi), where i denotes a specific feature level.
        bbox_preds (list[Tensor]): Same shape as 'cls_scores' except that C becomes 4.
        centernesses (list[Tensor]): Same shape as 'cls_scores' except that C becomes 1.
        all_level_points (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (Hi*Wi, 2), a set of point coordinates (xi, yi) of all feature
            map locations on 'feature level i' in image coordinate.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `predict_proposals`, but for only one image.
    """
    assert len(cls_scores) == len(bbox_preds) == len(all_level_points)
    bboxes_list = []

    # Iterate over every feature level
    for (cls_score, bbox_pred, centerness, points) in zip(
        cls_scores, bbox_preds, centernesses, all_level_points
    ):
        assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
        # (C, Hi, Wi) -> (Hi*Wi, C)
        scores = cls_score.permute(1, 2, 0).reshape(-1, self.num_classes).sigmoid()
        # (4, Hi, Wi) -> (Hi*Wi, 4)
        bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
        # (1, Hi, Wi) -> (Hi*Wi, )
        centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid()

        """ Your code starts here """
        nms_pre_topk = scores.clamp(max=self.nms_pre_topk)
        candidate_inds = nms_pre_topk > self.score_threshold
        scores = scores * centerness[:, None]
        bbox_scores = scores[candidate_inds]
        bbox_hw = candidate_inds.nonzero()[:, 0]
        bbox_classes = candidate_inds.nonzero()[:, 1]
        bbox_lrtb = bbox_pred[bbox_hw]
        bbox_xy = points[bbox_hw]
        if len(bbox_hw) != 0:
            h, w = image_size
            detections = torch.stack([
                bbox_xy[:, 0] - bbox_lrtb[:, 0],
                bbox_xy[:, 1] - bbox_lrtb[:, 1],
                bbox_xy[:, 0] + bbox_lrtb[:, 2],
                bbox_xy[:, 1] + bbox_lrtb[:, 3],
            ], dim=1)
            bboxes = Boxes(detections)
            bbox_instances = Instances((int(h), int(w)))
            bbox_instances.set("pred_boxes", bboxes)
            bbox_instances.set("scores", bbox_scores)
            bbox_instances.set("pred_classes", bbox_classes)
            bboxes_list.append(bbox_instances)
        """ Your code ends here """

    bboxes_list = Instances.cat(bboxes_list)

    # non-maximum suppression per-image.
    results = ml_nms(
        bboxes_list,
        self.nms_threshold,
        # Limit to max_per_image detections **over all classes**
        max_proposals=self.nms_post_topk
    )
    return results
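# --- Added example (not part of the original source): a minimal sketch, with toy numbers and
# no model, of the FCOS-style decoding used above: a location (x, y) plus regressed distances
# (left, top, right, bottom) gives an (x1, y1, x2, y2) box. Boxes is the real detectron2 type.
import torch
from detectron2.structures import Boxes

points = torch.tensor([[100.0, 80.0]])           # one feature-map location in image coords
lrtb = torch.tensor([[30.0, 20.0, 40.0, 25.0]])  # predicted distances to the four box sides

boxes = Boxes(torch.stack([
    points[:, 0] - lrtb[:, 0],  # x1 = x - left
    points[:, 1] - lrtb[:, 1],  # y1 = y - top
    points[:, 0] + lrtb[:, 2],  # x2 = x + right
    points[:, 1] + lrtb[:, 3],  # y2 = y + bottom
], dim=1))
print(boxes.tensor)  # tensor([[ 70.,  60., 140., 105.]])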
def forward(self, batched_inputs):  # Also runs once per image
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances (optional): groundtruth :class:`Instances`
            * proposals (optional): :class:`Instances`, precomputed proposals.

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model, used in inference.
              See :meth:`postprocess` for details.

    Returns:
        list[dict]:
            Each dict is the output for one input image.
            The dict contains one key "instances" whose value is a :class:`Instances`.
            The :class:`Instances` object has the following keys:
            "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
    """
    if not self.training:
        ### Estimate the number of classes
        support_file_name = './support_dir/support_feature.pkl'
        if os.path.exists(support_file_name) and self.n_clases == 1:
            device = torch.cuda.current_device()
            avaliable = torch.cuda.get_device_properties(
                device).total_memory - torch.cuda.memory_reserved(device)
            # print("Available memory in MiB: ", avaliable / (1024 * 1024))
            with open(support_file_name, "rb") as hFile:
                aux = pickle.load(hFile, encoding="latin1")
            size = aux['res5_avg'][0].element_size() * aux['res5_avg'][0].nelement()
            size += aux['res4_avg'][0].element_size() * aux['res4_avg'][0].nelement()
            # print("Memory occupied by the support features: ", size)
            # print("Number of classes:", math.floor(avaliable / size))
            self.n_clases = math.floor(avaliable / (size * 1000))
            print("Classes number: ", self.n_clases)
        ### End of estimation

        # Change n_clases to a static value if desired.
        # n_clases = self.n_clases
        n_clases = 101

        # Get the list of class ids: [1, 2, 3, ...]
        metadata = MetadataCatalog.get('fsod_eval')
        class_list = list(metadata.thing_dataset_id_to_contiguous_id.values())

        # On each iteration we take n_clases ids from class_list and initialize the model
        # with them. Example with class_list=[1,2,3,4,5] and n_clases=2:
        # iter1: [1,2]; iter2: [3,4]; iter3: [5]
        aux = []
        for i in range(math.ceil(len(class_list) / n_clases)):
            self.init_model(class_list[i * n_clases:i * n_clases + n_clases])
            aux.append(self.inference(batched_inputs)[0]["instances"])

        # aux is a list of predictions [pred_for_first_n_classes, pred_for_next_n, ...]
        # We need to merge them all into a single element
        # -> using detectron2.structures.instances.Instances.cat.
        _predictions = {"instances": Instances.cat(aux)}
        return [_predictions]

    images, support_images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        for x in batched_inputs:
            x['instances'].set(
                'gt_classes',
                torch.full_like(x['instances'].get('gt_classes'), 0))
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)

    # support branches
    support_bboxes_ls = []
    for item in batched_inputs:
        bboxes = item['support_bboxes']
        for box in bboxes:
            box = Boxes(box[np.newaxis, :])
            support_bboxes_ls.append(box.to(self.device))

    B, N, C, H, W = support_images.tensor.shape
    assert N == self.support_way * self.support_shot

    support_images = support_images.tensor.reshape(B * N, C, H, W)
    support_features = self.backbone(support_images)

    # support feature roi pooling
    feature_pooled = self.roi_heads.roi_pooling(support_features, support_bboxes_ls)

    support_box_features = self.roi_heads._shared_roi_transform(
        [support_features[f] for f in self.in_features], support_bboxes_ls)
    # assert self.support_way == 2  # now only 2 way support

    detector_loss_cls = []
    detector_loss_box_reg = []
    rpn_loss_rpn_cls = []
    rpn_loss_rpn_loc = []
    for i in range(B):  # batch
        # query
        query_gt_instances = [gt_instances[i]]  # one query gt instances
        query_images = ImageList.from_tensors([images[i]])  # one query image
        query_feature_res4 = features['res4'][i].unsqueeze(0)  # one query feature for attention rpn
        query_features = {'res4': query_feature_res4}  # one query feature for rcnn

        # positive support branch ##################################
        pos_begin = i * self.support_shot * self.support_way
        pos_end = pos_begin + self.support_shot
        pos_support_features = feature_pooled[pos_begin:pos_end].mean(
            0, True)  # pos support features from res4, average all supports, for rcnn
        pos_support_features_pool = pos_support_features.mean(
            dim=[2, 3], keepdim=True)  # average pooling support feature for attention rpn
        pos_correlation = F.conv2d(
            query_feature_res4,
            pos_support_features_pool.permute(1, 0, 2, 3),
            groups=1024)  # attention map
        pos_features = {'res4': pos_correlation}  # attention map for attention rpn
        pos_support_box_features = support_box_features[pos_begin:pos_end].mean(0, True)
        (pos_proposals, pos_anchors, pos_pred_objectness_logits, pos_gt_labels,
         pos_pred_anchor_deltas, pos_gt_boxes) = self.proposal_generator(
             query_images, pos_features, query_gt_instances)  # attention rpn
        pos_pred_class_logits, pos_pred_proposal_deltas, pos_detector_proposals = self.roi_heads(
            query_images, query_features, pos_support_box_features,
            pos_proposals, query_gt_instances)  # pos rcnn

        # negative support branch ##################################
        neg_begin = pos_end
        neg_end = neg_begin + self.support_shot
        neg_support_features = feature_pooled[neg_begin:neg_end].mean(0, True)
        neg_support_features_pool = neg_support_features.mean(dim=[2, 3], keepdim=True)
        neg_correlation = F.conv2d(
            query_feature_res4,
            neg_support_features_pool.permute(1, 0, 2, 3),
            groups=1024)
        neg_features = {'res4': neg_correlation}
        neg_support_box_features = support_box_features[neg_begin:neg_end].mean(0, True)
        (neg_proposals, neg_anchors, neg_pred_objectness_logits, neg_gt_labels,
         neg_pred_anchor_deltas, neg_gt_boxes) = self.proposal_generator(
             query_images, neg_features, query_gt_instances)
        neg_pred_class_logits, neg_pred_proposal_deltas, neg_detector_proposals = self.roi_heads(
            query_images, query_features, neg_support_box_features,
            neg_proposals, query_gt_instances)

        # rpn loss
        outputs_images = ImageList.from_tensors([images[i], images[i]])
        outputs_pred_objectness_logits = [
            torch.cat(pos_pred_objectness_logits + neg_pred_objectness_logits, dim=0)
        ]
        outputs_pred_anchor_deltas = [
            torch.cat(pos_pred_anchor_deltas + neg_pred_anchor_deltas, dim=0)
        ]
        outputs_anchors = pos_anchors  # + neg_anchors

        # convert 1 in neg_gt_labels to 0
        for item in neg_gt_labels:
            item[item == 1] = 0

        outputs_gt_boxes = pos_gt_boxes + neg_gt_boxes  # [None]
        outputs_gt_labels = pos_gt_labels + neg_gt_labels

        if self.training:
            proposal_losses = self.proposal_generator.losses(
                outputs_anchors, outputs_pred_objectness_logits,
                outputs_gt_labels, outputs_pred_anchor_deltas,
                outputs_gt_boxes)
            proposal_losses = {
                k: v * self.proposal_generator.loss_weight
                for k, v in proposal_losses.items()
            }
        else:
            proposal_losses = {}

        # detector loss
        detector_pred_class_logits = torch.cat(
            [pos_pred_class_logits, neg_pred_class_logits], dim=0)
        detector_pred_proposal_deltas = torch.cat(
            [pos_pred_proposal_deltas, neg_pred_proposal_deltas], dim=0)
        for item in neg_detector_proposals:
            item.gt_classes = torch.full_like(item.gt_classes, 1)

        # detector_proposals = pos_detector_proposals + neg_detector_proposals
        detector_proposals = [
            Instances.cat(pos_detector_proposals + neg_detector_proposals)
        ]
        if self.training:
            predictions = detector_pred_class_logits, detector_pred_proposal_deltas
            detector_losses = self.roi_heads.box_predictor.losses(
                predictions, detector_proposals)
            rpn_loss_rpn_cls.append(proposal_losses['loss_rpn_cls'])
            rpn_loss_rpn_loc.append(proposal_losses['loss_rpn_loc'])
            detector_loss_cls.append(detector_losses['loss_cls'])
            detector_loss_box_reg.append(detector_losses['loss_box_reg'])

    proposal_losses = {}
    detector_losses = {}

    proposal_losses['loss_rpn_cls'] = torch.stack(rpn_loss_rpn_cls).mean()
    proposal_losses['loss_rpn_loc'] = torch.stack(rpn_loss_rpn_loc).mean()
    detector_losses['loss_cls'] = torch.stack(detector_loss_cls).mean()
    detector_losses['loss_box_reg'] = torch.stack(detector_loss_box_reg).mean()

    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    return losses
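# --- Added example (not part of the original source): a minimal sketch of the chunked
# inference pattern used in the branch above, with the model calls left out so only the
# chunking of class ids can be checked in isolation.
import math

class_list = [1, 2, 3, 4, 5]
n_clases = 2
chunks = [
    class_list[i * n_clases:i * n_clases + n_clases]
    for i in range(math.ceil(len(class_list) / n_clases))
]
print(chunks)  # [[1, 2], [3, 4], [5]] -- one init_model()/inference() pass per chunk,
               # with the per-chunk "instances" merged afterwards via Instances.cat.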
def _merge_untracked_instances(self, instances: Instances) -> Instances:
    """
    For untracked previous instances, under certain condition, still keep them
    in tracking and merge with the current instances.

    Args:
        instances: D2 Instances, for predictions of the current frame
    Return:
        D2 Instances merging current instances and instances from previous frame
        decided to keep tracking
    """
    untracked_instances = Instances(
        image_size=instances.image_size,
        pred_boxes=[],
        pred_masks=[],
        pred_classes=[],
        scores=[],
        ID=[],
        ID_period=[],
        lost_frame_count=[],
    )
    prev_bboxes = list(self._prev_instances.pred_boxes)
    prev_classes = list(self._prev_instances.pred_classes)
    prev_scores = list(self._prev_instances.scores)
    prev_ID_period = self._prev_instances.ID_period
    if instances.has("pred_masks"):
        prev_masks = list(self._prev_instances.pred_masks)
    for idx in self._untracked_prev_idx:
        x_left, y_top, x_right, y_bot = prev_bboxes[idx]
        if (
            (1.0 * (x_right - x_left) / self._video_width < self._min_box_rel_dim)
            or (1.0 * (y_bot - y_top) / self._video_height < self._min_box_rel_dim)
            or self._prev_instances.lost_frame_count[idx] >= self._max_lost_frame_count
            or prev_ID_period[idx] <= self._min_instance_period
        ):
            continue
        untracked_instances.pred_boxes.append(list(prev_bboxes[idx].numpy()))
        untracked_instances.pred_classes.append(int(prev_classes[idx]))
        untracked_instances.scores.append(float(prev_scores[idx]))
        untracked_instances.ID.append(self._prev_instances.ID[idx])
        untracked_instances.ID_period.append(self._prev_instances.ID_period[idx])
        untracked_instances.lost_frame_count.append(
            self._prev_instances.lost_frame_count[idx] + 1
        )
        if instances.has("pred_masks"):
            untracked_instances.pred_masks.append(
                prev_masks[idx].numpy().astype(np.uint8)
            )

    untracked_instances.pred_boxes = Boxes(torch.FloatTensor(untracked_instances.pred_boxes))
    untracked_instances.pred_classes = torch.IntTensor(untracked_instances.pred_classes)
    untracked_instances.scores = torch.FloatTensor(untracked_instances.scores)

    if instances.has("pred_masks"):
        untracked_instances.pred_masks = torch.IntTensor(untracked_instances.pred_masks)
    else:
        untracked_instances.remove("pred_masks")

    return Instances.cat(
        [
            instances,
            untracked_instances,
        ]
    )
def predict_proposals_single_image(self, cls_scores, bbox_preds, centernesses,
                                   all_level_points, image_size):
    """
    Single-image inference.
    Return bounding-box detection results by thresholding on scores
    and applying non-maximum suppression (NMS).

    Arguments:
        cls_scores (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (C, Hi, Wi), where i denotes a specific feature level.
        bbox_preds (list[Tensor]): Same shape as 'cls_scores' except that C becomes 4.
        centernesses (list[Tensor]): Same shape as 'cls_scores' except that C becomes 1.
        all_level_points (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (Hi*Wi, 2), a set of point coordinates (xi, yi) of all feature
            map locations on 'feature level i' in image coordinate.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `predict_proposals`, but for only one image.
    """
    assert len(cls_scores) == len(bbox_preds) == len(all_level_points)
    bboxes_list = []

    # Iterate over every feature level
    for (cls_score, bbox_pred, centerness, points) in zip(cls_scores, bbox_preds,
                                                          centernesses, all_level_points):
        assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
        # (C, Hi, Wi) -> (Hi*Wi, C)
        scores = cls_score.permute(1, 2, 0).reshape(-1, self.num_classes).sigmoid()
        # (4, Hi, Wi) -> (Hi*Wi, 4)
        bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
        # (1, Hi, Wi) -> (Hi*Wi, )
        centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid()

        # Fanchen: DEBUG
        # torch.save((cls_scores,
        #             bbox_preds,
        #             centernesses,
        #             all_level_points,
        #             image_size,
        #             scores,
        #             bbox_pred,
        #             centerness), '/home/CtrlDrive/fanchen/pyws/ee898_pa1/debugdata/inf.data')
        # print('DEBUG: inf.data')
        # exit(0)
        # >>> len(cls_scores)
        # 5
        # >>> [score.size() for score in cls_scores]
        # [torch.Size([80, 152, 100]), torch.Size([80, 76, 50]), torch.Size([80, 38, 25]),
        #  torch.Size([80, 19, 13]), torch.Size([80, 10, 7])]
        # >>> scores
        # tensor([[0.0082, 0.0043, 0.0070, ..., 0.0048, 0.0050, 0.0046],
        #         [0.0034, 0.0016, 0.0029, ..., 0.0021, 0.0017, 0.0015],
        #         [0.0024, 0.0013, 0.0020, ..., 0.0018, 0.0017, 0.0013],
        #         ...,
        #         [0.0050, 0.0022, 0.0024, ..., 0.0010, 0.0013, 0.0008],
        #         [0.0057, 0.0027, 0.0032, ..., 0.0014, 0.0015, 0.0010],
        #         [0.0129, 0.0077, 0.0085, ..., 0.0048, 0.0057, 0.0040]], device='cuda:7')
        # >>> scores.size()
        # torch.Size([15200, 80])
        # >>> bbox_pred, bbox_pred.size()
        # (tensor([[ 6.7271,  6.7130, 16.7200, 13.4471],
        #          [12.8911,  5.4016, 11.4462, 10.5563],
        #          [17.2124,  5.3992, 17.0486, 10.5352],
        #          ...,
        #          [22.8796, 15.5267, 19.0822,  8.6359],
        #          [28.2969, 15.3834, 15.9940,  9.6031],
        #          [18.1814, 19.1390, 12.2707, 13.9811]], device='cuda:7'), torch.Size([15200, 4]))
        # >>> centerness, centerness.size()
        # (tensor([0.1976, 0.2229, 0.2007, ..., 0.2555, 0.2092, 0.2774], device='cuda:7'),
        #  torch.Size([15200]))
        # >>> all_level_points[0].size()
        # torch.Size([15200, 2])

        """ Your code starts here """
        # H, W = image_size
        scores_i_th_inds = torch.zeros_like(scores) + (scores > self.score_threshold)
        scores *= scores_i_th_inds
        scores *= centerness[:, None]
        topk_cnt = scores_i_th_inds.reshape(-1).sum().clamp(max=self.nms_pre_topk)
        bbox_pred = torch.stack([
            points[:, 0] - bbox_pred[:, 0],
            points[:, 1] - bbox_pred[:, 1],
            points[:, 0] + bbox_pred[:, 2],
            points[:, 1] + bbox_pred[:, 3]
        ], dim=1)
        flatten_scores = scores.reshape(-1)  # Fanchen: size is (H*W*C, )
        # flatten_labels = torch.tensor(range(self.num_classes)). \
        #     repeat(image_size[0] * image_size[1])  # Fanchen: size is (H*W*C, )
        flatten_boxes = bbox_pred.unsqueeze(1). \
            expand(-1, self.num_classes, -1).reshape(-1, 4)  # Fanchen: size is (H*W*C, 4)
        pred_scores, topk_inds = flatten_scores.topk(int(topk_cnt))
        pred_scores = torch.sqrt(pred_scores)
        pred_boxes = Boxes(flatten_boxes[topk_inds])
        pred_classes = topk_inds % self.num_classes
        box_list = Instances(image_size,
                             pred_boxes=pred_boxes,
                             scores=pred_scores,
                             pred_classes=pred_classes)
        bboxes_list.append(box_list)
        # Fanchen: tensor (Tensor[float]): a Nx4 matrix. Each row is (x1, y1, x2, y2).
        """ Your code ends here """

    bboxes_list = Instances.cat(bboxes_list)
    # Fanchen: def cat(instance_lists: List["Instances"]) -> "Instances":

    # non-maximum suppression per-image.
    results = ml_nms(
        bboxes_list,
        # Fanchen:
        # boxes = boxlist.pred_boxes.tensor
        # scores = boxlist.scores
        # labels = boxlist.pred_classes
        self.nms_threshold,
        # Limit to max_per_image detections **over all classes**
        max_proposals=self.nms_post_topk)
    # Fanchen: DEBUG
    # torch.save((bboxes_list, results),
    #            '/home/CtrlDrive/fanchen/pyws/ee898_pa1/debugdata/infres.data')
    # exit(0)
    return results