def _match_annotations(self, image_annotations, image_predictions):
    # TODO: Evaluate the number of detected instances.
    prediction_boxes = Boxes.cat(_extract_instances_property(image_predictions, "bbox"))
    annotation_boxes = Boxes.cat(_extract_instances_property(image_annotations, "bbox"))

    match_quality_matrix = pairwise_iou(annotation_boxes, prediction_boxes)
    matched_idxs, matched_labels = self._bbox_matcher(match_quality_matrix)
    matched_image_annotations = [image_annotations[i] for i in matched_idxs]

    return matched_image_annotations, matched_labels
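# A minimal sketch of how detectron2's `Matcher` consumes the IoU matrix above: it
# takes an (M gt x N predictions) matrix and returns, per prediction, the index of
# its best-matching gt plus a label derived from the thresholds. The threshold
# values below are illustrative, not taken from `self._bbox_matcher`.
import torch
from detectron2.modeling.matcher import Matcher
from detectron2.structures import Boxes, pairwise_iou

gt = Boxes(torch.tensor([[0., 0., 10., 10.], [20., 20., 30., 30.]]))
preds = Boxes(torch.tensor([[1., 1., 9., 9.], [21., 19., 31., 29.], [50., 50., 60., 60.]]))

matcher = Matcher(thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=False)
matched_idxs, matched_labels = matcher(pairwise_iou(gt, preds))  # both of length 3
# matched_idxs[j]: best gt index for prediction j
# matched_labels[j]: 1 = match, 0 = no match, -1 = ignore (IoU between thresholds)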
def _detectAndMap(self, image):
    '''
    image: frame in which we want to find the players
    result: dict((x_cut, y_cut) -> contourList)
    '''
    st = time.time()
    # 0. Preprocess image
    frame = self.preprocess(image)
    # 1. Cut the image into small grid cells
    l_cells = self._cutImageToGrids(frame)
    # 2. Feed these cells to the multi-image predictor
    l_preds = self._predictMultipleImages(l_cells)
    # 2.1 List of the instances found in each cell
    l_preds = [x['instances'].to('cpu') for x in l_preds]
    # 2.2 Map back onto the input image, then scale up to the 5K image
    # 2.3 Adjust the score as a function of the distance from the camera (yTL)
    for inst, cell in zip(l_preds, self.gridList):
        inst.remove('pred_masks')  # pred_masks is unused; TODO: may be useful for TeamColor
        inst.pred_boxes.tensor[:, 0:4] += torch.Tensor([cell[0], cell[1], cell[0], cell[1]])
        inst.boxes_before = inst.pred_boxes.clone()  # where they are on the original, unscaled image
        inst.pred_boxes.tensor = inst.pred_boxes.tensor.divide(self.trans_value)
        inst.scores *= self.cameraDistWeight[cell[1]]
    # 2.4 Instances expressed in whole-image coordinates
    finalInstances = Instances(image_size=self.origResolution[::-1])  # (1440, 5120)
    finalInstances.pred_boxes = Boxes.cat([x.pred_boxes for x in l_preds])
    finalInstances.boxes_before = Boxes.cat([x.boxes_before for x in l_preds])
    finalInstances.scores = torch.cat([x.scores for x in l_preds])
    finalInstances.pred_classes = torch.cat([x.pred_classes for x in l_preds])
    # 3. Keep only person detections
    _person_class_ID = 0
    finalInstances = finalInstances[finalInstances.pred_classes == _person_class_ID]
    # 4. Apply NMS to eliminate overlapping detections
    iouIdx = torchvision.ops.nms(finalInstances.pred_boxes.tensor,
                                 finalInstances.scores,
                                 self.nmsThreshold)
    finalInstances = finalInstances[iouIdx]
    # 5. Filter out half-person detections
    finalInstances = self._filterHalfMan(finalInstances)
    return finalInstances, frame
def get_ground_truth(self, anchors, bbox_preds, targets):
    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
    N = len(anchors)
    # list[Tensor(R, 4)], one for each image
    all_anchors = Boxes.cat(anchors).tensor.reshape(N, -1, 4)
    # Boxes(Tensor(N*R, 4))
    box_delta = cat(bbox_preds, dim=1)
    # box_pred: xyxy; targets: xyxy
    box_pred = self.box2box_transform.apply_deltas(box_delta, all_anchors)
    indices = self.anchor_matcher(box_pred, all_anchors, targets)
    return indices
def setup(file):
    # get cfg
    cfg = get_cfg()
    cfg.merge_from_file(file)
    cfg.SOLVER.IMS_PER_BATCH = 2

    # get data loader iter
    data_loader = build_detection_train_loader(cfg)
    data_loader_iter = iter(data_loader)
    batched_inputs = next(data_loader_iter)

    # build anchors
    backbone = build_backbone(cfg).to(device)
    images = [x["image"].to(device) for x in batched_inputs]
    images = ImageList.from_tensors(images, backbone.size_divisibility)
    features = backbone(images.tensor.float())
    input_shape = backbone.output_shape()
    in_features = cfg.MODEL.RPN.IN_FEATURES
    anchor_generator = build_anchor_generator(cfg, [input_shape[f] for f in in_features])
    anchors = anchor_generator([features[f] for f in in_features])
    anchors = Boxes.cat(anchors).to(device)

    # build matcher
    raw_matcher = Matcher(cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS,
                          allow_low_quality_matches=True)
    matcher = TopKMatcher(cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, 9)

    return cfg, data_loader_iter, anchors, matcher, raw_matcher
def label_and_sample_anchors(
    self, anchors: List[Boxes], gt_instances: List[Instances]
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    """
    Args:
        anchors (list[Boxes]): anchors for each feature map.
        gt_instances: the ground-truth instances for each image.

    Returns:
        list[Tensor]:
            List of #img tensors. i-th element is a vector of labels whose length is
            the total number of anchors across all feature maps R = sum(Hi * Wi * A).
            Label values are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative
            class; 1 = positive class.
        list[Tensor]:
            i-th element is a Rx4 tensor. The values are the matched gt boxes for
            each anchor. Values are undefined for those anchors not labeled as 1.
    """
    anchors = Boxes.cat(anchors)

    gt_boxes = [x.gt_boxes for x in gt_instances]
    image_sizes = [x.image_size for x in gt_instances]
    del gt_instances

    gt_labels = []
    matched_gt_boxes = []
    for image_size_i, gt_boxes_i in zip(image_sizes, gt_boxes):
        """
        image_size_i: (h, w) for the i-th image
        gt_boxes_i: ground-truth boxes for i-th image
        """
        match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors)
        matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
        # Matching is memory-expensive and may result in CPU tensors. But the result is small
        gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
        del match_quality_matrix

        if self.anchor_boundary_thresh >= 0:
            # Discard anchors that go out of the boundaries of the image
            # NOTE: This is legacy functionality that is turned off by default in Detectron2
            anchors_inside_image = anchors.inside_box(image_size_i, self.anchor_boundary_thresh)
            gt_labels_i[~anchors_inside_image] = -1

        # A vector of labels (-1, 0, 1) for each anchor
        gt_labels_i = self._subsample_labels(gt_labels_i)

        if len(gt_boxes_i) == 0:
            # These values won't be used anyway since the anchor is labeled as background
            matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
        else:
            # TODO wasted indexing computation for ignored boxes
            matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor

        gt_labels.append(gt_labels_i)  # N,AHW
        matched_gt_boxes.append(matched_gt_boxes_i)
    return gt_labels, matched_gt_boxes
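# The `retry_if_cuda_oom` wrapper used above is detectron2's OOM guard: it runs the
# wrapped callable, and on a CUDA out-of-memory error it clears the cache and retries,
# finally falling back to CPU copies of the tensor arguments. A self-contained sketch
# with made-up box data:
import torch
from detectron2.structures import Boxes, pairwise_iou
from detectron2.utils.memory import retry_if_cuda_oom

xy = torch.rand(8, 2) * 50
gt_boxes = Boxes(torch.cat([xy, xy + 10], dim=1))        # 8 valid gt boxes
xy = torch.rand(1000, 2) * 50
anchor_boxes = Boxes(torch.cat([xy, xy + 16], dim=1))    # 1000 square anchors

iou = retry_if_cuda_oom(pairwise_iou)(gt_boxes, anchor_boxes)  # (8, 1000), possibly on CPU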
def get_ground_truth(self, anchors: List[Boxes], gt_instances: List[Instances],
                     num_classes: int) -> Tuple[List[Tensor], List[Tensor]]:
    """
    Extract the ground truth classes and boxes from a list of Instances objects.

    Args:
        anchors (List[Boxes]): A list of #feature level Boxes. The Boxes contains
            anchors of this image on the specific feature level.
        gt_instances (List[Instances]): A list of N `Instances`s. The i-th `Instances`
            contains the ground-truth per-instance annotations for the i-th input image.
        num_classes (int): The number of classes.

    Returns:
        gt_classes (List[Tensor]): List of #img tensors. i-th element is a vector of
            classes whose length is the total number of anchors across all feature maps
            (sum(Hi * Wi * A)). Label values are in {-1, 0, ..., K}, with -1 means
            ignore, and K means background.
        matched_gt_boxes (List[Tensor]): i-th element is a Rx4 tensor, where R is the
            total number of anchors across feature maps. The values are the matched gt
            boxes for each anchor. Values are undefined for those anchors not labeled
            as foreground.
    """
    anchors_boxes: Boxes = Boxes.cat(anchors)

    gt_classes: List[Tensor] = []
    matched_gt_boxes: List[Tensor] = []

    for gt_instance in gt_instances:
        match_quality_matrix: Tensor = pairwise_iou(gt_instance.gt_boxes, anchors_boxes)
        matched_idxs, anchor_classes = self.anchor_matcher(match_quality_matrix)
        del match_quality_matrix

        if len(gt_instance) > 0:
            matched_gt_boxes_i: Tensor = gt_instance.gt_boxes.tensor[matched_idxs]
            gt_classes_i: Tensor = gt_instance.gt_classes[matched_idxs]
            # Anchors with class 0 are treated as background.
            gt_classes_i[anchor_classes == 0] = num_classes
            # Anchors with class -1 are ignored.
            gt_classes_i[anchor_classes == -1] = -1
        else:
            matched_gt_boxes_i = torch.zeros_like(anchors_boxes.tensor)
            gt_classes_i = torch.zeros_like(matched_idxs) + num_classes

        gt_classes.append(gt_classes_i)
        matched_gt_boxes.append(matched_gt_boxes_i)

    return gt_classes, matched_gt_boxes
def get_proposal_clusters(self, box, proposals, label, cls_prob):
    gt_boxes = []
    gt_classes = []
    gt_scores = []

    if cls_prob.numel() > 0:
        for idx, gt_class in enumerate(label):
            curr_cls_prob = cls_prob.index_select(1, index=gt_class).clone().detach()
            max_gt_score, max_index = curr_cls_prob.max(dim=0)
            gt_boxes.append(box[max_index])
            gt_classes.append(gt_class)
            gt_scores.append(max_gt_score)
            cls_prob[max_index, :] = 0.0

        gt_classes = torch.stack(gt_classes)
        gt_scores = torch.cat(gt_scores)

        new_instance = Instances(box.image_size)
        new_instance.gt_boxes = copy.deepcopy(Boxes.cat([x.proposal_boxes for x in gt_boxes]))
        new_instance.gt_classes = label.clone().detach()
    else:
        new_instance = Instances(box.image_size)
        new_instance.gt_boxes = Boxes(torch.zeros(0, self.box_dim)).to(cls_prob.device)
        new_instance.gt_classes = torch.zeros(0).to(cls_prob.device)
        gt_scores = torch.zeros(0).to(cls_prob.device)

    return new_instance, gt_scores
def forward(self, image_sizes, box_cls, box_regression, centerness, anchors):
    sampled_boxes = []
    # anchors = list(zip(*anchors))
    for _, (o, b, c, a) in enumerate(zip(box_cls, box_regression, centerness, anchors)):
        sampled_boxes.append(self.forward_for_single_feature_map(o, b, c, a))

    boxlists = []
    for i, image_size in enumerate(image_sizes):
        boxlist = Instances(image_size)
        boxes = []
        scores = []
        classes = []
        for j in range(len(anchors)):
            boxes.append(sampled_boxes[j][i][0])
            scores.append(sampled_boxes[j][i][1])
            classes.append(sampled_boxes[j][i][2])
        boxes = Boxes.cat(boxes)
        boxes.clip(image_size)
        keep = boxes.nonempty(self.min_size)
        boxlist.pred_boxes = boxes[keep]
        boxlist.scores = torch.cat(scores, dim=0)[keep]
        boxlist.pred_classes = torch.cat(classes, dim=0)[keep]
        boxlists.append(boxlist)
    # boxlists = list(zip(*sampled_boxes))
    # boxlists = [cat_boxlist(boxlist) for boxlist in boxlists]
    # boxlists = Boxes.cat(boxlists)
    if not self.bbox_aug_enabled:
        boxlists = self.select_over_all_levels(boxlists)

    return boxlists
def get_ground_truth(self, anchors, targets):
    """
    Args:
        anchors (list[list[Boxes]]): a list of N=#image elements. Each is a list of
            #feature level Boxes. The Boxes contains anchors of this image on the
            specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th `Instances`
            contains the ground-truth per-instance annotations for the i-th input
            image. Specify `targets` during training only.

    Returns:
        gt_classes (Tensor): An integer tensor of shape (N, R) storing ground-truth
            labels for each anchor. R is the total number of anchors, i.e. the sum of
            Hi x Wi x A for all levels. Anchors with an IoU with some target higher
            than the foreground threshold are assigned their corresponding label in
            the [0, K-1] range. Anchors whose IoU are below the background threshold
            are assigned the label "K". Anchors whose IoU are between the foreground
            and background thresholds are assigned a label "-1", i.e. ignore.
        gt_anchors_deltas (Tensor): Shape (N, R, 4). The last dimension represents
            ground-truth box2box transform targets (dx, dy, dw, dh) that map each
            anchor to its matched ground-truth box. The values in the tensor are
            meaningful only when the corresponding anchor is labeled as foreground.
    """
    gt_classes = []
    gt_anchors_deltas = []
    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
    # list[Tensor(R, 4)], one for each image

    for anchors_per_image, targets_per_image in zip(anchors, targets):
        match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, anchors_per_image)
        gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix)

        has_gt = len(targets_per_image) > 0
        if has_gt:
            # ground truth box regression
            matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs]
            gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
                anchors_per_image.tensor, matched_gt_boxes.tensor)

            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Anchors with label 0 are treated as background.
            gt_classes_i[anchor_labels == 0] = self.num_classes
            # Anchors with label -1 are ignored.
            gt_classes_i[anchor_labels == -1] = -1
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes
            gt_anchors_reg_deltas_i = torch.zeros_like(anchors_per_image.tensor)

        gt_classes.append(gt_classes_i)
        gt_anchors_deltas.append(gt_anchors_reg_deltas_i)

    return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
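# The (dx, dy, dw, dh) targets above come from detectron2's `Box2BoxTransform`, which
# encodes a target box relative to an anchor and can invert the encoding. A small
# round-trip check (the weights are the common defaults, chosen here for illustration):
import torch
from detectron2.modeling.box_regression import Box2BoxTransform

transform = Box2BoxTransform(weights=(1.0, 1.0, 1.0, 1.0))
anchor = torch.tensor([[0., 0., 10., 10.]])
gt_box = torch.tensor([[2., 2., 12., 14.]])

deltas = transform.get_deltas(anchor, gt_box)       # (dx, dy, dw, dh) per anchor
recovered = transform.apply_deltas(deltas, anchor)
assert torch.allclose(recovered, gt_box, atol=1e-4)  # the encoding is invertible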
def _join_elements_pred_with_gt(pred_ele, pred_indices, gt_ele, gt_indices):
    if isinstance(pred_ele, Boxes):
        return Boxes.cat([_deselect(pred_ele, pred_indices), gt_ele[gt_indices]])
    else:
        return torch.cat([_deselect(pred_ele, pred_indices), gt_ele[gt_indices]])
def inference_single_image(self, conf_pred_per_image, loc_pred_per_image,
                           default_boxes, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Args:
        conf_pred_per_image (list[Tensor]): list of #feature levels. Each entry
            contains tensor of size [Hi x Wi x D, C].
        loc_pred_per_image (list[Tensor]): same shape as 'conf_pred_per_image'
            except that C becomes 4.
        default_boxes (list['Boxes']): a list of 'Boxes' elements. The Boxes
            contains default boxes of one image on the specific feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    # predict confidence
    conf_pred = torch.cat(conf_pred_per_image, dim=0)  # [R, C]
    conf_pred = conf_pred.softmax(dim=1)

    # predict boxes
    loc_pred = torch.cat(loc_pred_per_image, dim=0)  # [R, 4]
    default_boxes = Boxes.cat(default_boxes)  # [R, 4]
    boxes_pred = self.box2box_transform.apply_deltas(loc_pred, default_boxes.tensor)

    num_boxes, num_classes = conf_pred.shape
    boxes_pred = boxes_pred.view(num_boxes, 1, 4).expand(num_boxes, num_classes, 4)  # [R, C, 4]
    labels = torch.arange(num_classes, device=self.device)  # [0, ..., C-1]
    labels = labels.view(1, num_classes).expand_as(conf_pred)  # [R, C]

    # remove predictions with the background label
    boxes_pred = boxes_pred[:, :-1]
    conf_pred = conf_pred[:, :-1]
    labels = labels[:, :-1]

    # batch everything, by making every class prediction be a separate instance
    boxes_pred = boxes_pred.reshape(-1, 4)
    conf_pred = conf_pred.reshape(-1)
    labels = labels.reshape(-1)

    # remove low scoring boxes
    indices = torch.nonzero(conf_pred > self.score_threshold).squeeze(1)
    boxes_pred, conf_pred, labels = boxes_pred[indices], conf_pred[indices], labels[indices]

    keep = batched_nms(boxes_pred, conf_pred, labels, self.nms_threshold)
    keep = keep[:self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_pred[keep])
    result.scores = conf_pred[keep]
    result.pred_classes = labels[keep]
    return result
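# The class-aware NMS step relies on `batched_nms` (torchvision's, re-exported by
# detectron2), which only suppresses overlaps between boxes that share the same
# class index. A tiny illustration:
import torch
from torchvision.ops import batched_nms

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],    # overlaps box 0, same class -> suppressed
                      [1., 1., 11., 11.]])   # same geometry, different class -> kept
scores = torch.tensor([0.9, 0.8, 0.7])
classes = torch.tensor([0, 0, 1])

keep = batched_nms(boxes, scores, classes, iou_threshold=0.5)  # tensor([0, 2])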
def label_anchors(self, anchors, gt_instances):
    """
    Args:
        anchors (list[Boxes]): A list of #feature level Boxes. The Boxes contains
            anchors of this image on the specific feature level.
        gt_instances (list[Instances]): a list of N `Instances`s. The i-th `Instances`
            contains the ground-truth per-instance annotations for the i-th input image.

    Returns:
        list[Tensor]: List of #img tensors. i-th element is a vector of labels whose
            length is the total number of anchors across all feature maps
            (sum(Hi * Wi * A)). Label values are in {-1, 0, ..., K}, with -1 means
            ignore, and K means background.
        list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of
            anchors across feature maps. The values are the matched gt boxes for each
            anchor. Values are undefined for those anchors not labeled as foreground.
    """
    anchors = Boxes.cat(anchors)  # Rx4

    gt_labels, gt_labels_1, gt_labels_2 = [], [], []
    matched_gt_boxes = []
    for gt_per_image in gt_instances:
        match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
        matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix)
        del match_quality_matrix

        if len(gt_per_image) > 0:
            matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]

            gt_labels_i = gt_per_image.gt_classes[matched_idxs]
            gt_labels_i_1 = gt_per_image.gt_classes_1[matched_idxs]
            gt_labels_i_2 = gt_per_image.gt_classes_2[matched_idxs]
            # Anchors with label 0 are treated as background.
            gt_labels_i[anchor_labels == 0] = self.num_classes
            gt_labels_i_1[anchor_labels == 0] = 3
            gt_labels_i_2[anchor_labels == 0] = 3
            # Anchors with label -1 are ignored.
            gt_labels_i[anchor_labels == -1] = -1
            gt_labels_i_1[anchor_labels == -1] = -1
            gt_labels_i_2[anchor_labels == -1] = -1
        else:
            matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
            gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes
            gt_labels_i_1 = torch.zeros_like(matched_idxs) + 3
            gt_labels_i_2 = torch.zeros_like(matched_idxs) + 3

        gt_labels.append(gt_labels_i)
        gt_labels_1.append(gt_labels_i_1)
        gt_labels_2.append(gt_labels_i_2)
        matched_gt_boxes.append(matched_gt_boxes_i)

    return gt_labels, gt_labels_1, gt_labels_2, matched_gt_boxes
def match_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]):
    """
    Match anchors with ground truth boxes.

    Args:
        anchors: #level boxes, from the highest resolution to lower resolution
        gt_instances: ground truth instances per image

    Returns:
        List[Tensor]: #image tensors, each is a vector of matched gt indices
            (or -1 for unmatched anchors) for all anchors.
    """
    num_anchors_per_level = [len(x) for x in anchors]
    anchors = Boxes.cat(anchors)  # Rx4
    anchor_centers = anchors.get_centers()  # Rx2
    anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0]  # R

    lower_bound = anchor_sizes * 4
    lower_bound[:num_anchors_per_level[0]] = 0
    upper_bound = anchor_sizes * 8
    upper_bound[-num_anchors_per_level[-1]:] = float("inf")

    matched_indices = []
    for gt_per_image in gt_instances:
        gt_centers = gt_per_image.gt_boxes.get_centers()  # Nx2
        # FCOS with center sampling: anchor point must be close enough to gt center.
        pairwise_match = (anchor_centers[:, None, :] - gt_centers[None, :, :]).abs_().max(
            dim=2).values < self.center_sampling_radius * anchor_sizes[:, None]
        pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_per_image.gt_boxes)

        # The original FCOS anchor matching rule: anchor point must be inside gt
        pairwise_match &= pairwise_dist.min(dim=2).values > 0

        # Multilevel anchor matching in FCOS: each anchor is only responsible
        # for certain scale range.
        pairwise_dist = pairwise_dist.max(dim=2).values
        pairwise_match &= (pairwise_dist > lower_bound[:, None]) & (
            pairwise_dist < upper_bound[:, None])

        # Match the GT box with minimum area, if there are multiple GT matches
        gt_areas = gt_per_image.gt_boxes.area()  # N
        pairwise_match = pairwise_match.to(torch.float32) * (1e8 - gt_areas[None, :])
        min_values, matched_idx = pairwise_match.max(dim=1)  # R, per-anchor match
        matched_idx[min_values < 1e-5] = -1  # Unmatched anchors are assigned -1

        matched_indices.append(matched_idx)
    return matched_indices
def _match_anchors(self, gt_boxes: Boxes, anchors: List[Boxes]):
    """
    Match ground-truth boxes to a set of multi-level anchors.

    Args:
        gt_boxes: Ground-truth boxes from instances of an image.
        anchors: List of anchors for each feature map (of different scales).

    Returns:
        torch.Tensor
            A tensor of shape `(M, R)`, given `M` ground-truth boxes and total
            `R` anchor points from all feature levels, indicating the quality
            of match between m-th box and r-th anchor. Higher value indicates
            better match.
    """
    # Naming convention: (M = ground-truth boxes, R = anchor points)
    # Anchor points are represented as square boxes of size = stride.
    num_anchors_per_level = [len(x) for x in anchors]
    anchors = Boxes.cat(anchors)  # (R, 4)
    anchor_centers = anchors.get_centers()  # (R, 2)
    anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0]  # (R, )

    lower_bound = anchor_sizes * 4
    lower_bound[:num_anchors_per_level[0]] = 0
    upper_bound = anchor_sizes * 8
    upper_bound[-num_anchors_per_level[-1]:] = float("inf")

    gt_centers = gt_boxes.get_centers()

    # FCOS with center sampling: anchor point must be close enough to
    # ground-truth box center.
    center_dists = (anchor_centers[None, :, :] - gt_centers[:, None, :]).abs_()
    sampling_regions = self.center_sampling_radius * anchor_sizes[None, :]
    match_quality_matrix = center_dists.max(dim=2).values < sampling_regions

    pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_boxes)
    pairwise_dist = pairwise_dist.permute(1, 0, 2)  # (M, R, 4)

    # The original FCOS anchor matching rule: anchor point must be inside GT.
    match_quality_matrix &= pairwise_dist.min(dim=2).values > 0

    # Multilevel anchor matching in FCOS: each anchor is only responsible
    # for certain scale range.
    pairwise_dist = pairwise_dist.max(dim=2).values
    match_quality_matrix &= (pairwise_dist > lower_bound[None, :]) & (
        pairwise_dist < upper_bound[None, :])

    # Match the GT box with minimum area, if there are multiple GT matches.
    gt_areas = gt_boxes.area()  # (M, )
    match_quality_matrix = match_quality_matrix.to(torch.float32)
    match_quality_matrix *= 1e8 - gt_areas[:, None]
    return match_quality_matrix  # (M, R)
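# The `1e8 - gt_areas` trick above turns the boolean match matrix into a quality
# score so that, when an anchor matches several ground-truth boxes, a max over the
# gt dimension picks the *smallest* matching box. A toy illustration:
import torch

match = torch.tensor([[1., 0.],     # gt box 0 (area 100) vs. two anchors
                      [1., 1.]])    # gt box 1 (area 25)
gt_areas = torch.tensor([100., 25.])

quality = match * (1e8 - gt_areas[:, None])
values, matched_idx = quality.max(dim=0)   # per-anchor best gt
matched_idx[values < 1e-5] = -1            # anchors with no match at all get -1
# matched_idx == tensor([1, 1]): both anchors resolve to the smaller box 1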
def get_graph_centers(self, box, cls_prob, label):
    gt_boxes = []
    gt_classes = []
    gt_scores = []
    for idx, gt_class in enumerate(label):
        curr_cls_prob = cls_prob.index_select(1, index=gt_class)
        non_zero_idxs = torch.where(curr_cls_prob >= 0)[0]
        top_ranking_idxs = self.get_top_ranking_proposals(curr_cls_prob[non_zero_idxs])
        non_zero_idxs = non_zero_idxs[top_ranking_idxs]
        curr_box = box[non_zero_idxs]
        curr_cls_prob = curr_cls_prob[non_zero_idxs]
        graph = self.build_graph(curr_box)

        count = curr_cls_prob.size(0)
        keep_idxs = []
        curr_gt_scores = []
        while True:
            order = torch.sum(graph, 1).argsort(descending=True)
            keep_idxs.append(order[0])

            graph_idx = torch.where(graph[order[0], :] > 0)[0]
            curr_gt_scores.append(torch.max(curr_cls_prob[graph_idx]))

            graph[:, graph_idx] = 0
            graph[graph_idx, :] = 0
            count = count - len(graph_idx)
            if count <= 5:
                break

        keep_idxs = torch.stack(keep_idxs, 0)
        curr_gt_scores = torch.stack(curr_gt_scores, 0)
        curr_gt_boxes = curr_box[keep_idxs]

        keep_idxs_selected = curr_gt_scores.argsort().flip([0])[
            :min(len(curr_gt_scores), self.max_pc_num)].clone().detach()
        gt_boxes.append(curr_gt_boxes[keep_idxs_selected])
        gt_scores.append(curr_gt_scores[keep_idxs_selected])
        gt_classes.append((torch.zeros_like(keep_idxs_selected) + gt_class).long())

        # Delete selected proposals
        ids_to_remove = non_zero_idxs[keep_idxs][keep_idxs_selected]
        indexer = torch.ones(cls_prob.size(0)).to(cls_prob.device)
        indexer[ids_to_remove] = 0.
        indexer_mask = indexer == 1.
        cls_prob = cls_prob.clone().detach()[indexer_mask]
        box = copy.deepcopy(box)[indexer_mask]

    new_instance = Instances(box.image_size)
    new_instance.gt_boxes = copy.deepcopy(Boxes.cat([x.proposal_boxes for x in gt_boxes]))
    new_instance.gt_classes = torch.cat(gt_classes)
    gt_scores = torch.cat(gt_scores)

    return new_instance, gt_scores
def label_anchors(self, anchors, gt_instances):
    """
    Args:
        anchors (list[Boxes]): A list of #feature level Boxes. The Boxes contains
            anchors of this image on the specific feature level.
        gt_instances (list[Instances]): a list of N `Instances`s. The i-th `Instances`
            contains the ground-truth per-instance annotations for the i-th input image.

    Returns:
        list[Tensor]: List of #img tensors. i-th element is a vector of labels whose
            length is the total number of anchors across all feature maps
            (sum(Hi * Wi * A)). Label values are in {-1, 0, ..., K}, with -1 means
            ignore, and K means background.
        list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of
            anchors across feature maps. The values are the matched gt boxes for each
            anchor. Values are undefined for those anchors not labeled as foreground.
    """
    anchors = Boxes.cat(anchors)  # Rx4
    num_anchors = anchors.tensor.shape[0]

    gt_labels, matched_gt_boxes, matched_gt_marks, matched_gt_marks_labels = [
        [] for _ in range(4)
    ]

    for gt_per_image in gt_instances:
        match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
        matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix)
        del match_quality_matrix

        if len(gt_per_image) > 0:
            matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]
            matched_gt_marks_iv = gt_per_image.gt_keypoints.tensor[matched_idxs]
            matched_gt_marks_i = matched_gt_marks_iv[:, :, :2].flatten(1)
            matched_gt_marks_labels_i = matched_gt_marks_iv[:, :, 2].flatten(1)
            matched_gt_marks_labels_i, _ = torch.min(matched_gt_marks_labels_i, dim=1)

            gt_labels_i = gt_per_image.gt_classes[matched_idxs]
            # Anchors with label 0 are treated as background.
            gt_labels_i[anchor_labels == 0] = self.num_classes
            # Anchors with label -1 are ignored.
            gt_labels_i[anchor_labels == -1] = -1
        else:
            matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
            gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes
            matched_gt_marks_i = torch.zeros(num_anchors, self.num_landmark * 2).to(self.device)
            matched_gt_marks_labels_i = torch.zeros(num_anchors).to(self.device)

        gt_labels.append(gt_labels_i)
        matched_gt_boxes.append(matched_gt_boxes_i)
        matched_gt_marks.append(matched_gt_marks_i)
        matched_gt_marks_labels.append(matched_gt_marks_labels_i)

    return gt_labels, matched_gt_boxes, matched_gt_marks, matched_gt_marks_labels
def compute_ctrness_targets(self, anchors: List[Boxes], gt_boxes: List[torch.Tensor]):
    anchors = Boxes.cat(anchors).tensor  # Rx4
    reg_targets = [self.box2box_transform.get_deltas(anchors, m) for m in gt_boxes]
    reg_targets = torch.stack(reg_targets, dim=0)  # NxRx4
    if len(reg_targets) == 0:
        return reg_targets.new_zeros(len(reg_targets))
    left_right = reg_targets[:, :, [0, 2]]
    top_bottom = reg_targets[:, :, [1, 3]]
    ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
        top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]
    )
    return torch.sqrt(ctrness)
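# Centerness here is the FCOS-style score sqrt((min(l,r)/max(l,r)) * (min(t,b)/max(t,b)))
# over (l, t, r, b) regression targets: 1.0 when the point sits at the box center,
# decaying toward 0 at the edges. A quick sanity check with hand-picked distances:
import torch

reg = torch.tensor([[5., 5., 5., 5.],     # point at the exact center
                    [1., 5., 9., 5.]])    # point shifted far to the left
lr, tb = reg[:, [0, 2]], reg[:, [1, 3]]
ctr = torch.sqrt((lr.min(dim=1).values / lr.max(dim=1).values)
                 * (tb.min(dim=1).values / tb.max(dim=1).values))
# ctr ~= tensor([1.0000, 0.3333]): sqrt(1 * 1) and sqrt((1/9) * 1)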
def _uniform_sample_train_points(self, instances):
    assert self.training
    proposal_boxes = [x.proposal_boxes for x in instances]
    cat_boxes = Boxes.cat(proposal_boxes)
    # uniform sample
    point_coords = torch.rand(
        len(cat_boxes), self.mask_point_train_num_points, 2, device=cat_boxes.tensor.device
    )
    # sample point_labels
    point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords)
    point_labels = sample_point_labels(instances, point_coords_wrt_image)
    return point_coords, point_labels
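# `get_point_coords_wrt_image` maps the box-normalized samples into image coordinates:
# x_img = x0 + px * (x1 - x0), and likewise for y. A condensed sketch of that conversion
# (`points_to_image_coords` is an illustrative stand-in, not the library API):
import torch

def points_to_image_coords(box_tensor: torch.Tensor, point_coords: torch.Tensor) -> torch.Tensor:
    """box_tensor: (R, 4) xyxy; point_coords: (R, P, 2) in [0, 1] -> (R, P, 2) image coords."""
    wh = box_tensor[:, None, 2:4] - box_tensor[:, None, 0:2]   # per-box width/height
    return box_tensor[:, None, 0:2] + point_coords * wh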
def write_priors(self, images: Tensor, output_priors: str):
    features = self.backbone(images)
    features = [features[f] for f in self.head_in_features]
    anchors = Boxes.cat(self.anchor_generator(features)).tensor.detach().cpu().numpy()
    with open(output_priors, "wb") as f:
        import struct
        shape = anchors.shape
        f.write(struct.pack("=i", len(shape)))
        f.write(struct.pack("={}".format("i" * len(shape)), *shape))
        data = anchors.reshape([-1])
        for d in data:
            f.write(struct.pack("=f", d))
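# The prior file written above is a flat binary blob: an int32 rank, the shape as
# int32s, then the float32 anchor data, all in native byte order ("="). A matching
# reader sketch (`read_priors` is hypothetical; numpy is used purely for convenience):
import struct
import numpy as np

def read_priors(path: str) -> np.ndarray:
    with open(path, "rb") as f:
        ndim, = struct.unpack("=i", f.read(4))
        shape = struct.unpack("={}".format("i" * ndim), f.read(4 * ndim))
        data = np.frombuffer(f.read(), dtype=np.float32)  # matches the "=f" packing
    return data.reshape(shape)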
def _get_ground_truth(self):
    """
    Returns:
        gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is
            the total number of anchors in image i (i.e., len(anchors[i])). Label values
            are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative class;
            1 = positive class.
        gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), 4).
    """
    gt_objectness_logits = []
    gt_anchor_deltas = []
    # Concatenate anchors from all feature maps into a single Boxes per image
    # anchors_i holds the anchors of all feature maps of the i-th image, list(Boxes);
    # join the anchors of every feature map for each image
    anchors = [Boxes.cat(anchors_i) for anchors_i in self.anchors]
    for image_size_i, anchors_i, gt_boxes_i in zip(self.image_sizes, anchors, self.gt_boxes):
        """
        image_size_i: (h, w) for the i-th image
        anchors_i: anchors for i-th image
        gt_boxes_i: ground-truth boxes for i-th image
        """
        match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors_i)  # [N, ]
        matched_idxs, gt_objectness_logits_i = retry_if_cuda_oom(self.anchor_matcher)(
            match_quality_matrix
        )
        # Matching is memory-expensive and may result in CPU tensors. But the result is small
        gt_objectness_logits_i = gt_objectness_logits_i.to(device=gt_boxes_i.device)
        del match_quality_matrix

        if self.boundary_threshold >= 0:
            # Discard anchors that go out of the boundaries of the image
            # NOTE: This is legacy functionality that is turned off by default in Detectron2
            anchors_inside_image = anchors_i.inside_box(image_size_i, self.boundary_threshold)
            gt_objectness_logits_i[~anchors_inside_image] = -1

        if len(gt_boxes_i) == 0:
            # These values won't be used anyway since the anchor is labeled as background
            gt_anchor_deltas_i = torch.zeros_like(anchors_i.tensor)
        else:
            # TODO wasted computation for ignored boxes
            matched_gt_boxes = gt_boxes_i[matched_idxs]  # [N, 4]
            gt_anchor_deltas_i = self.box2box_transform.get_deltas(
                anchors_i.tensor, matched_gt_boxes.tensor
            )

        gt_objectness_logits.append(gt_objectness_logits_i)
        gt_anchor_deltas.append(gt_anchor_deltas_i)

    return gt_objectness_logits, gt_anchor_deltas
def ga_sampled_approxs(self, images, features, gt_instances):
    approxs = self.approx_anchor_generator(features)
    approxs = Boxes.cat(approxs)

    image_sizes = [x.image_size for x in gt_instances]
    inside_flags_list = []
    for image_size_i in image_sizes:
        if self.boundary_threshold >= 0:
            inside_flags = approxs.inside_box(image_size_i, self.boundary_threshold)
            inside_flags_list.append(inside_flags)
    # An approx is valid if it is inside the boundary for at least one image
    inside_flags = (torch.stack(inside_flags_list, 0).sum(dim=0) > 0)

    return approxs, inside_flags_list
def __call__(self, box_cls, box_regression, centerness, gt_instances, anchors):
    labels, reg_targets = self.prepare_targets(gt_instances, anchors)

    N = len(labels)
    box_cls_flatten, box_regression_flatten = concat_box_prediction_layers(box_cls, box_regression)
    centerness_flatten = [ct.permute(0, 2, 3, 1).reshape(N, -1, 1) for ct in centerness]
    centerness_flatten = torch.cat(centerness_flatten, dim=1).reshape(-1)

    labels_flatten = torch.cat(labels, dim=0)
    reg_targets_flatten = torch.cat(reg_targets, dim=0)
    # anchors_flatten = torch.cat([cat_boxlist(anchors_per_image).bbox for anchors_per_image in anchors], dim=0)
    anchors_flatten = torch.cat([Boxes.cat(anchors).tensor for _ in range(N)], dim=0)

    pos_inds = torch.nonzero(labels_flatten != self.num_classes).squeeze(1)

    num_gpus = get_num_gpus()
    total_num_pos = reduce_sum(pos_inds.new_tensor([pos_inds.numel()])).item()
    num_pos_avg_per_gpu = max(total_num_pos / float(num_gpus), 1.0)

    # one hot label for focal loss
    class_target = torch.zeros_like(box_cls_flatten)
    class_target[pos_inds, labels_flatten[pos_inds]] = 1

    cls_loss = sigmoid_focal_loss_jit(
        box_cls_flatten,
        class_target,
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) / num_pos_avg_per_gpu

    box_regression_flatten = box_regression_flatten[pos_inds]
    reg_targets_flatten = reg_targets_flatten[pos_inds]
    anchors_flatten = anchors_flatten[pos_inds]
    centerness_flatten = centerness_flatten[pos_inds]

    centerness_targets = self.compute_centerness_targets(reg_targets_flatten, anchors_flatten)
    sum_centerness_targets_avg_per_gpu = (
        reduce_sum(centerness_targets.sum()).item() / float(num_gpus))

    if pos_inds.numel() > 0:
        reg_loss = self.DIoULoss(box_regression_flatten, reg_targets_flatten, anchors_flatten,
                                 weight=centerness_targets) / sum_centerness_targets_avg_per_gpu
        centerness_loss = self.centerness_loss_func(
            centerness_flatten, centerness_targets) / num_pos_avg_per_gpu
    else:
        reg_loss = box_regression_flatten.sum()
        centerness_loss = centerness_flatten.sum()

    return cls_loss, reg_loss * self.cfg.MODEL.ATSS.REG_LOSS_WEIGHT, centerness_loss
def point_sample_fine_grained_features(features_list, feature_scales, boxes, point_coords):
    """
    Get features from feature maps in `features_list` that correspond to specific point
    coordinates inside each bounding box from `boxes`.

    Args:
        features_list (list[Tensor]): A list of feature map tensors to get features from.
        feature_scales (list[float]): A list of scales for tensors in `features_list`.
        boxes (list[Boxes]): A list of I Boxes objects that contain R_1 + ... + R_I = R
            boxes all together.
        point_coords (Tensor): A tensor of shape (R, P, 2) that contains
            [0, 1] x [0, 1] box-normalized coordinates of the P sampled points.

    Returns:
        point_features (Tensor): A tensor of shape (R, C, P) that contains features
            sampled from all features maps in feature_list for P sampled points for
            all R boxes in `boxes`.
        point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains
            image-level coordinates of P points.
    """
    cat_boxes = Boxes.cat(boxes)
    num_boxes = [len(b) for b in boxes]

    point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords)
    split_point_coords_wrt_image = torch.split(point_coords_wrt_image, num_boxes)

    point_features = []
    for idx_img, point_coords_wrt_image_per_image in enumerate(split_point_coords_wrt_image):
        point_features_per_image = []
        for idx_feature, feature_map in enumerate(features_list):
            h, w = feature_map.shape[-2:]
            scale = torch.tensor([w, h], device=feature_map.device) / feature_scales[idx_feature]
            point_coords_scaled = point_coords_wrt_image_per_image / scale
            point_features_per_image.append(
                point_sample(
                    feature_map[idx_img].unsqueeze(0),
                    point_coords_scaled.unsqueeze(0),
                    align_corners=False,
                ).squeeze(0).transpose(1, 0)
            )
        point_features.append(cat(point_features_per_image, dim=1))

    return cat(point_features, dim=0), point_coords_wrt_image
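# `point_sample` is essentially a thin wrapper around `F.grid_sample`: it rescales
# [0, 1]-normalized point coordinates into grid_sample's [-1, 1] convention and treats
# the P points as a (P x 1) sampling grid. A minimal sketch of the idea (mirroring the
# PointRend helper, written out here for clarity):
import torch
import torch.nn.functional as F

def point_sample_sketch(features: torch.Tensor, point_coords: torch.Tensor, **kwargs):
    """features: (N, C, H, W); point_coords: (N, P, 2) in [0, 1] -> (N, C, P) samples."""
    grid = 2.0 * point_coords.unsqueeze(2) - 1.0     # (N, P, 1, 2) in [-1, 1]
    out = F.grid_sample(features, grid, **kwargs)    # (N, C, P, 1)
    return out.squeeze(3)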
def label_anchors(self, anchors, gt_instances):
    """
    Args:
        anchors (list[Boxes]): A list of #feature level Boxes. The Boxes contains
            anchors of this image on the specific feature level.
        gt_instances (list[Instances]): a list of N `Instances`s. The i-th `Instances`
            contains the ground-truth per-instance annotations for the i-th input image.

    Returns:
        list[Tensor]: List of #img tensors. i-th element is a vector of labels whose
            length is the total number of anchors across all feature maps
            (sum(Hi * Wi * A)). Label values are in {-1, 0, ..., K}, with -1 means
            ignore, and K means background.
        list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of
            anchors across feature maps. The values are the matched gt boxes for each
            anchor. Values are undefined for those anchors not labeled as foreground.
    """
    # generate strides: [R]
    strides = []
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    for i in range(len(feature_shapes)):
        stride = feature_shapes[i].stride
        anchor_num_i = anchors[i].tensor.shape[0]
        stride = torch.full((anchor_num_i,), stride, device=anchors[i].tensor.device)
        strides.append(stride)

    anchors = Boxes.cat(anchors).tensor
    centers = torch.stack(((anchors[:, 0] + anchors[:, 2]) // 2,
                           (anchors[:, 1] + anchors[:, 3]) // 2), dim=1)
    strides = torch.cat(strides, 0)

    gt_labels = []
    matched_gt_boxes = []
    for gt_per_image in gt_instances:
        image_size = gt_per_image.image_size
        centers_invalid = (centers[:, 0] >= image_size[1]).logical_or(
            centers[:, 1] >= image_size[0])

        objectness_label_i, bbox_label_i = rep_points_match_with_classes(
            centers, strides, gt_per_image.gt_boxes, gt_per_image.gt_classes)
        objectness_label_i[centers_invalid] = -1

        gt_labels.append(objectness_label_i)
        matched_gt_boxes.append(bbox_label_i)

    return gt_labels, matched_gt_boxes
def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes, images):
    """
    Arguments:
        pred_logits, pred_deltas, pred_masks: Same as the output of:
            meth:`TensorMaskHead.forward`
        anchors, indexes: Same as the input of meth:`TensorMask.get_ground_truth`
        images (ImageList): the input images

    Returns:
        results (List[Instances]): a list of #images elements.
    """
    assert len(anchors) == len(images)
    results = []

    pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
    pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas]

    pred_logits = cat(pred_logits, dim=1)
    pred_deltas = cat(pred_deltas, dim=1)

    for img_idx, (anchors_im, indexes_im) in enumerate(zip(anchors, indexes)):
        # Get the size of the current image
        image_size = images.image_sizes[img_idx]

        logits_im = pred_logits[img_idx]
        deltas_im = pred_deltas[img_idx]

        if self.mask_on:
            masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks]
        else:
            masks_im = [None] * self.num_levels

        results_im = self.inference_single_image(
            logits_im,
            deltas_im,
            masks_im,
            Boxes.cat(anchors_im),
            cat(indexes_im),
            tuple(image_size),
        )
        results.append(results_im)
    return results
def _sample_train_points(self, coarse_mask, instances):
    assert self.training
    gt_classes = cat([x.gt_classes for x in instances])
    with torch.no_grad():
        # sample point_coords
        point_coords = get_uncertain_point_coords_with_randomness(
            coarse_mask,
            lambda logits: calculate_uncertainty(logits, gt_classes),
            self.mask_point_train_num_points,
            self.mask_point_oversample_ratio,
            self.mask_point_importance_sample_ratio,
        )
        # sample point_labels
        proposal_boxes = [x.proposal_boxes for x in instances]
        cat_boxes = Boxes.cat(proposal_boxes)
        point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords)
        point_labels = sample_point_labels(instances, point_coords_wrt_image)
    return point_coords, point_labels
def label_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]):
    """
    Same interface as :meth:`RetinaNet.label_anchors`, but implemented with FCOS
    anchor matching rule.

    Unlike RetinaNet, there are no ignored anchors.
    """
    gt_labels, matched_gt_boxes = [], []

    for inst in gt_instances:
        if len(inst) > 0:
            match_quality_matrix = self._match_anchors(inst.gt_boxes, anchors)

            # Find matched ground-truth box per anchor. Un-matched anchors are
            # assigned -1. This is equivalent to using an anchor matcher as used
            # in R-CNN/RetinaNet: `Matcher(thresholds=[1e-5], labels=[0, 1])`
            match_quality, matched_idxs = match_quality_matrix.max(dim=0)
            matched_idxs[match_quality < 1e-5] = -1

            matched_gt_boxes_i = inst.gt_boxes.tensor[matched_idxs.clip(min=0)]
            gt_labels_i = inst.gt_classes[matched_idxs.clip(min=0)]
            # Anchors with matched_idxs = -1 are labeled background.
            gt_labels_i[matched_idxs < 0] = self.num_classes
        else:
            matched_gt_boxes_i = torch.zeros_like(Boxes.cat(anchors).tensor)
            gt_labels_i = torch.full(
                (len(matched_gt_boxes_i),),
                fill_value=self.num_classes,
                dtype=torch.long,
                device=matched_gt_boxes_i.device,
            )

        gt_labels.append(gt_labels_i)
        matched_gt_boxes.append(matched_gt_boxes_i)

    return gt_labels, matched_gt_boxes
def _get_boxes_from_image(image, scale_xy=None):
    """Extract boxes from image created by `_get_image_with_box()`"""
    cur_img_int = ((image / 10.0 + 0.5).int().float() * 10.0).int()
    values = torch.unique(cur_img_int)
    gt_values = [x * 10 for x in range(len(values))]
    assert set(values.tolist()) == set(gt_values)
    boxes = []
    for idx in range(cur_img_int.shape[0]):
        val = torch.unique(cur_img_int[idx]).tolist()
        val = max(val)
        if val == 0:
            continue
        # mask = (cur_img_int[idx, :, :] == val).int()
        mask = (cur_img_int[idx, :, :] > 0).int()
        box_xywh = bu.get_box_from_mask(mask.numpy())
        boxes.append(bu.to_boxes_from_xywh(box_xywh))
    ret = Boxes.cat(boxes)
    if scale_xy is not None:
        ret.scale(*scale_xy)
    return ret
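# `Boxes.scale(scale_x, scale_y)` multiplies the stored x and y coordinates in place,
# which is what the `scale_xy` branch above relies on. For example:
import torch
from detectron2.structures import Boxes

b = Boxes(torch.tensor([[10., 20., 30., 40.]]))
b.scale(scale_x=2.0, scale_y=0.5)
# b.tensor is now [[20., 10., 60., 20.]]: x-coordinates doubled, y-coordinates halved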
def prepare_iou_based_targets(self, targets, anchors):
    """Compute IoU-based targets"""
    cls_labels = []
    reg_targets = []
    matched_idx_all = []

    for im_i in range(len(targets)):
        targets_per_im = targets[im_i]
        assert targets_per_im.mode == "xyxy"
        bboxes_per_im = targets_per_im.bbox
        labels_per_im = targets_per_im.get_field("labels")
        anchors_per_im = Boxes.cat(anchors[im_i])
        num_gt = bboxes_per_im.shape[0]

        match_quality_matrix = boxlist_iou(targets_per_im, anchors_per_im)
        matched_idxs = self.matcher(match_quality_matrix)
        targets_per_im = targets_per_im.copy_with_fields(['labels'])
        matched_targets = targets_per_im[matched_idxs.clamp(min=0)]

        cls_labels_per_im = matched_targets.get_field("labels")
        cls_labels_per_im = cls_labels_per_im.to(dtype=torch.float32)

        # Background (negative examples)
        bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD
        cls_labels_per_im[bg_indices] = 0

        # discard indices that are between thresholds
        inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS
        cls_labels_per_im[inds_to_discard] = -1

        matched_gts = matched_targets.bbox
        matched_idx_all.append(matched_idxs.view(1, -1))

        # `anchors_per_im` is a detectron2 `Boxes`, whose tensor lives in `.tensor`
        # (the BoxList-style `.bbox` attribute used elsewhere here does not exist on it).
        reg_targets_per_im = self.box_coder.encode(matched_gts, anchors_per_im.tensor)
        cls_labels.append(cls_labels_per_im)
        reg_targets.append(reg_targets_per_im)

    return cls_labels, reg_targets, matched_idx_all
def func_cat(x: torch.Tensor):
    boxes1 = Boxes(x)
    boxes2 = Boxes(x)
    # boxes3 = Boxes.cat([boxes1, boxes2])  # this is not supported by TorchScript for now.
    boxes3 = boxes1.cat([boxes1, boxes2])
    return boxes3
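# The instance-call workaround exists because TorchScript (at the time this was
# written) could not resolve `Boxes.cat` as a classmethod, while the same call made
# through an instance scripts fine. A quick way to check the behavior (assuming a
# scriptable `Boxes`, as exercised in detectron2's own tests):
import torch

scripted = torch.jit.script(func_cat)
out = scripted(torch.tensor([[0., 0., 4., 4.]]))
assert len(out) == 2  # the two copies of the single input box were concatenated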