def get_ground_truth(self, anchors, targets):
    """
    Args:
        anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
            list of #feature level Boxes. The Boxes contains anchors of this
            image on the specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image. Specify `targets` during training only.

    Returns:
        gt_classes (Tensor): An integer tensor of shape (N, R) storing
            ground-truth labels for each anchor. R is the total number of
            anchors, i.e. the sum of Hi x Wi x A for all levels. Anchors with
            an IoU with some target higher than the foreground threshold are
            assigned their corresponding label in the [0, K-1] range. Anchors
            whose IoU are below the background threshold are assigned the
            label "K". Anchors whose IoU are between the foreground and
            background thresholds are assigned a label "-1", i.e. ignore.
        gt_anchors_deltas (Tensor): Shape (N, R, 4). The last dimension
            represents ground-truth box2box transform targets (dx, dy, dw, dh)
            that map each anchor to its matched ground-truth box. The values
            in the tensor are meaningful only when the corresponding anchor
            is labeled as foreground.
    """
    gt_classes = []
    gt_anchors_deltas = []
    # list[Tensor(R, 4)], one for each image
    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]

    for anchors_per_image, targets_per_image in zip(anchors, targets):
        match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes,
                                            anchors_per_image)
        gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix)

        has_gt = len(targets_per_image) > 0
        if has_gt:
            # ground truth box regression
            matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs]
            gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
                anchors_per_image.tensor, matched_gt_boxes.tensor)

            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Anchors with label 0 are treated as background.
            gt_classes_i[anchor_labels == 0] = self.num_classes
            # Anchors with label -1 are ignored.
            gt_classes_i[anchor_labels == -1] = -1
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes
            gt_anchors_reg_deltas_i = torch.zeros_like(anchors_per_image.tensor)

        gt_classes.append(gt_classes_i)
        gt_anchors_deltas.append(gt_anchors_reg_deltas_i)

    return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
def bbox_targets(self,
                 candidate_bboxes,
                 gt_bboxes,
                 gt_labels,
                 pos_iou_thr=0.5,
                 neg_iou_thr=0.4,
                 gt_max_matching=True):
    """
    Target assign: MaxIoU assign.

    Args:
        candidate_bboxes (Tensor): candidate boxes of shape (N, 4) in XYXY format.
        gt_bboxes (Boxes): ground-truth boxes, with tensor of shape (M, 4).
        gt_labels (Tensor): ground-truth class labels of shape (M,).
        pos_iou_thr (float): IoU threshold at or above which a candidate is
            assigned the label of its best-matching ground-truth box.
        neg_iou_thr (float): IoU threshold below which a candidate is treated
            as background.
        gt_max_matching (bool): if True, additionally assign each ground-truth
            box to the candidate(s) that overlap it the most.

    Returns:
        assigned_bboxes (Tensor): shape (N, 4), the ground-truth box assigned
            to each foreground candidate (zeros for background candidates).
        assigned_labels (Tensor): shape (N,), the assigned class label per
            candidate; background candidates get `self.num_classes`.
    """
    if candidate_bboxes.size(0) == 0 or gt_bboxes.tensor.size(0) == 0:
        raise ValueError('No gt or anchors')

    candidate_bboxes[:, 0].clamp_(min=0)
    candidate_bboxes[:, 1].clamp_(min=0)
    candidate_bboxes[:, 2].clamp_(min=0)
    candidate_bboxes[:, 3].clamp_(min=0)

    num_candidates = candidate_bboxes.size(0)
    overlaps = pairwise_iou(Boxes(candidate_bboxes), gt_bboxes)
    assigned_labels = overlaps.new_full((overlaps.size(0), ),
                                        self.num_classes,
                                        dtype=torch.long)

    # for each anchor, which gt best overlaps with it
    # for each anchor, the max iou of all gts
    max_overlaps, argmax_overlaps = overlaps.max(dim=1)
    # for each gt, which anchor best overlaps with it
    # for each gt, the max iou of all proposals
    gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=0)

    bg_inds = max_overlaps < neg_iou_thr
    assigned_labels[bg_inds] = self.num_classes

    fg_inds = max_overlaps >= pos_iou_thr
    assigned_labels[fg_inds] = gt_labels[argmax_overlaps[fg_inds]]

    if gt_max_matching:
        fg_inds = torch.nonzero(overlaps == gt_max_overlaps)[:, 0]
        assigned_labels[fg_inds] = gt_labels[argmax_overlaps[fg_inds]]

    assigned_bboxes = overlaps.new_zeros((num_candidates, 4))
    fg_inds = (assigned_labels >= 0) & (assigned_labels != self.num_classes)
    assigned_bboxes[fg_inds] = gt_bboxes.tensor[argmax_overlaps[fg_inds]]

    return assigned_bboxes, assigned_labels
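# Illustrative only: a minimal, self-contained sketch of the max-IoU assignment
# rule used by `bbox_targets` above, on toy tensors and with a plain IoU
# computation, so it runs without the `Boxes`/`pairwise_iou` helpers. The
# thresholds and the background label value (`num_classes=80`) are assumptions
# for the example, not values taken from any particular config.
import torch


def toy_max_iou_assign(anchors, gts, labels, num_classes=80,
                       pos_iou_thr=0.5, neg_iou_thr=0.4):
    """anchors: (N, 4), gts: (M, 4) in XYXY; labels: (M,) in [0, num_classes)."""
    # pairwise IoU, shape (N, M)
    lt = torch.max(anchors[:, None, :2], gts[None, :, :2])
    rb = torch.min(anchors[:, None, 2:], gts[None, :, 2:])
    inter = (rb - lt).clamp(min=0).prod(dim=-1)
    area_a = (anchors[:, 2:] - anchors[:, :2]).prod(dim=-1)
    area_g = (gts[:, 2:] - gts[:, :2]).prod(dim=-1)
    iou = inter / (area_a[:, None] + area_g[None, :] - inter)

    max_iou, argmax = iou.max(dim=1)
    # everything starts as background (like `assigned_labels` above); anchors
    # whose best IoU clears pos_iou_thr take that ground-truth's label, and
    # anchors between the two thresholds simply stay background here
    assigned = torch.full((len(anchors),), num_classes, dtype=torch.long)
    fg = max_iou >= pos_iou_thr
    assigned[fg] = labels[argmax[fg]]
    return assigned


anchors = torch.tensor([[0., 0., 10., 10.], [20., 20., 30., 30.]])
gts = torch.tensor([[0., 0., 9., 9.]])
print(toy_max_iou_assign(anchors, gts, torch.tensor([3])))  # tensor([ 3, 80])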
def get_transform(self, img, annotations):
    """
    Args:
        img (ndarray): of shape HxWxC (RGB). The array can be of type uint8
            in range [0, 255], or floating point in range [0, 255].
        annotations (list[dict]): Each item in the list is a bbox label of an
            object. The object is represented by a dict, which contains:
            - bbox (list): bbox coordinates, top left and bottom right.
            - bbox_mode (str): bbox label mode, for example: `XYXY_ABS`,
              `XYWH_ABS` and so on...
    """
    sample_mode = (1, *self.min_ious, 0)
    h, w = img.shape[:2]

    boxes = []
    for obj in annotations:
        boxes.append(
            BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS))
    boxes = torch.tensor(boxes)

    while True:
        mode = np.random.choice(sample_mode)
        if mode == 1:
            return NoOpTransform()

        min_iou = mode
        for i in range(50):
            new_w = np.random.uniform(self.min_crop_size * w, w)
            new_h = np.random.uniform(self.min_crop_size * h, h)

            # h / w in [0.5, 2]
            if new_h / new_w < 0.5 or new_h / new_w > 2:
                continue

            left = np.random.uniform(w - new_w)
            top = np.random.uniform(h - new_h)

            patch = np.array(
                (int(left), int(top), int(left + new_w), int(top + new_h)))
            overlaps = pairwise_iou(Boxes(patch.reshape(-1, 4)),
                                    Boxes(boxes.reshape(-1, 4)))
            if overlaps.min() < min_iou:
                continue

            # the center of each box should be inside the cropped image
            center = (boxes[:, :2] + boxes[:, 2:]) / 2
            mask = ((center[:, 0] > patch[0]) * (center[:, 1] > patch[1])
                    * (center[:, 0] < patch[2]) * (center[:, 1] < patch[3]))
            if not mask.any():
                continue

            return IoUCropTransform(int(left), int(top), int(new_w), int(new_h))
def _match_and_label_boxes(self, proposals, stage, targets):
    """
    Match proposals with groundtruth using the matcher at the given stage.
    Label the proposals as foreground or background based on the match.

    Args:
        proposals (list[Instances]): One Instances for each image, with
            the field "proposal_boxes".
        stage (int): the current stage
        targets (list[Instances]): the ground truth instances

    Returns:
        list[Instances]: the same proposals, but with fields "gt_classes" and
            "gt_boxes"
    """
    num_fg_samples, num_bg_samples = [], []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
        # proposal_labels are 0 or 1
        matched_idxs, proposal_labels = self.proposal_matchers[stage](
            match_quality_matrix)
        if len(targets_per_image) > 0:
            gt_classes = targets_per_image.gt_classes[matched_idxs]
            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
            gt_classes[proposal_labels == 0] = self.num_classes
            gt_boxes = targets_per_image.gt_boxes[matched_idxs]
        else:
            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
            gt_boxes = Boxes(
                targets_per_image.gt_boxes.tensor.new_zeros(
                    (len(proposals_per_image), 4)))
        proposals_per_image.gt_classes = gt_classes
        proposals_per_image.gt_boxes = gt_boxes

        num_fg_samples.append((proposal_labels == 1).sum().item())
        num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1])

    # Log the number of fg/bg samples in each stage
    storage = get_event_storage()
    storage.put_scalar(
        "stage{}/roi_head/num_fg_samples".format(stage),
        sum(num_fg_samples) / len(num_fg_samples),
    )
    storage.put_scalar(
        "stage{}/roi_head/num_bg_samples".format(stage),
        sum(num_bg_samples) / len(num_bg_samples),
    )
    return proposals
def test_pairwise_iou(self):
    boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]])

    boxes2 = torch.tensor(
        [
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.5, 1.0],
            [0.0, 0.0, 1.0, 0.5],
            [0.0, 0.0, 0.5, 0.5],
            [0.5, 0.5, 1.0, 1.0],
            [0.5, 0.5, 1.5, 1.5],
        ]
    )

    expected_ious = torch.tensor(
        [
            [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)],
            [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)],
        ]
    )

    ious = pairwise_iou(Boxes(boxes1), Boxes(boxes2))

    self.assertTrue(torch.allclose(ious, expected_ious))
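# A quick sanity check of the last expected entry above, worked out by hand
# (illustrative sketch, plain torch only). For [0, 0, 1, 1] vs
# [0.5, 0.5, 1.5, 1.5]: intersection = 0.5 * 0.5 = 0.25 and
# union = 1 + 1 - 0.25 = 1.75, hence IoU = 0.25 / 1.75 = 0.25 / (2 - 0.25).
import torch

a = torch.tensor([0.0, 0.0, 1.0, 1.0])
b = torch.tensor([0.5, 0.5, 1.5, 1.5])
lt, rb = torch.max(a[:2], b[:2]), torch.min(a[2:], b[2:])
inter = (rb - lt).clamp(min=0).prod()
union = (a[2:] - a[:2]).prod() + (b[2:] - b[:2]).prod() - inter
assert torch.isclose(inter / union, torch.tensor(0.25 / (2 - 0.25)))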
def get_ground_truth(self, shifts, targets, pre_boxes_list):
    """
    Args:
        shifts (list[list[Tensor]]): a list of N=#image elements. Each is a
            list of #feature level tensors. The tensors contains shifts of
            this image on the specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image. Specify `targets` during training only.

    Returns:
        gt_classes (Tensor): An integer tensor of shape (N, R) storing
            ground-truth labels for each shift. R is the total number of
            shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the
            valid boxes are assigned their corresponding label in the
            [0, K-1] range. Shifts in the background are assigned the label
            "K". Shifts in the ignore areas are assigned a label "-1",
            i.e. ignore.
        gt_shifts_deltas (Tensor): Shape (N, R, 4). The last dimension
            represents ground-truth shift2box transform targets
            (dl, dt, dr, db) that map each shift to its matched ground-truth
            box. The values in the tensor are meaningful only when the
            corresponding shift is labeled as foreground.
        gt_centerness (Tensor): A float tensor of shape (N, R) whose values
            in [0, 1] store the ground-truth centerness for each shift.
        border_classes (Tensor): An integer tensor of shape (N, R) storing
            ground-truth labels for each shift. R is the total number of
            shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the
            valid boxes are assigned their corresponding label in the
            [0, K-1] range. Shifts in the background are assigned the label
            "K". Shifts in the ignore areas are assigned a label "-1",
            i.e. ignore.
        border_shifts_deltas (Tensor): Shape (N, R, 4). The last dimension
            represents ground-truth shift2box transform targets
            (dl, dt, dr, db) that map each shift to its matched ground-truth
            box. The values in the tensor are meaningful only when the
            corresponding shift is labeled as foreground.
    """
    gt_classes = []
    gt_shifts_deltas = []
    gt_centerness = []

    border_classes = []
    border_shifts_deltas = []

    for shifts_per_image, targets_per_image, pre_boxes in zip(
            shifts, targets, pre_boxes_list):
        object_sizes_of_interest = torch.cat([
            shifts_i.new_tensor(size).unsqueeze(0).expand(
                shifts_i.size(0), -1) for shifts_i, size in zip(
                    shifts_per_image, self.object_sizes_of_interest)
        ], dim=0)

        shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0)

        gt_boxes = targets_per_image.gt_boxes

        deltas = self.shift2box_transform.get_deltas(
            shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1))

        if self.center_sampling_radius > 0:
            centers = gt_boxes.get_centers()
            is_in_boxes = []
            for stride, shifts_i in zip(self.fpn_strides, shifts_per_image):
                radius = stride * self.center_sampling_radius
                center_boxes = torch.cat((
                    torch.max(centers - radius, gt_boxes.tensor[:, :2]),
                    torch.min(centers + radius, gt_boxes.tensor[:, 2:]),
                ), dim=-1)
                center_deltas = self.shift2box_transform.get_deltas(
                    shifts_i, center_boxes.unsqueeze(1))
                is_in_boxes.append(center_deltas.min(dim=-1).values > 0)
            is_in_boxes = torch.cat(is_in_boxes, dim=1)
        else:
            # no center sampling, it will use all the locations within a ground-truth box
            is_in_boxes = deltas.min(dim=-1).values > 0

        max_deltas = deltas.max(dim=-1).values
        # limit the regression range for each location
        is_cared_in_the_level = \
            (max_deltas >= object_sizes_of_interest[None, :, 0]) & \
            (max_deltas <= object_sizes_of_interest[None, :, 1])

        gt_positions_area = gt_boxes.area().unsqueeze(1).repeat(
            1, shifts_over_all_feature_maps.size(0))
        gt_positions_area[~is_in_boxes] = math.inf
        gt_positions_area[~is_cared_in_the_level] = math.inf

        # if there are still more than one objects for a position,
        # we choose the one with minimal area
        positions_min_area, gt_matched_idxs = gt_positions_area.min(dim=0)

        # ground truth box regression
        gt_shifts_reg_deltas_i = self.shift2box_transform.get_deltas(
            shifts_over_all_feature_maps, gt_boxes[gt_matched_idxs].tensor)

        # ground truth classes
        has_gt = len(targets_per_image) > 0
        if has_gt:
            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Shifts with area inf are treated as background.
            gt_classes_i[positions_min_area == math.inf] = self.num_classes
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes

        # ground truth centerness
        left_right = gt_shifts_reg_deltas_i[:, [0, 2]]
        top_bottom = gt_shifts_reg_deltas_i[:, [1, 3]]
        gt_centerness_i = torch.sqrt(
            (left_right.min(dim=-1).values /
             left_right.max(dim=-1).values).clamp_(min=0)
            * (top_bottom.min(dim=-1).values /
               top_bottom.max(dim=-1).values).clamp_(min=0))

        gt_classes.append(gt_classes_i)
        gt_shifts_deltas.append(gt_shifts_reg_deltas_i)
        gt_centerness.append(gt_centerness_i)

        # border
        iou = pairwise_iou(Boxes(pre_boxes), gt_boxes)
        (max_iou, argmax_iou) = iou.max(dim=1)
        invalid = max_iou < self.border_iou_thresh
        gt_target = gt_boxes[argmax_iou].tensor

        border_cls_target = targets_per_image.gt_classes[argmax_iou]
        border_cls_target[invalid] = self.num_classes

        border_bbox_std = pre_boxes.new_tensor(self.border_bbox_std)
        pre_boxes_wh = pre_boxes[:, 2:4] - pre_boxes[:, 0:2]
        pre_boxes_wh = torch.cat([pre_boxes_wh, pre_boxes_wh], dim=1)
        border_off_target = (gt_target - pre_boxes) / (pre_boxes_wh * border_bbox_std)

        border_classes.append(border_cls_target)
        border_shifts_deltas.append(border_off_target)

    return (
        torch.stack(gt_classes),
        torch.stack(gt_shifts_deltas),
        torch.stack(gt_centerness),
        torch.stack(border_classes),
        torch.stack(border_shifts_deltas),
    )
def get_ground_truth(self, shifts, targets, box_cls, box_delta):
    """
    Args:
        shifts (list[list[Tensor]]): a list of N=#image elements. Each is a
            list of #feature level tensors. The tensors contains shifts of
            this image on the specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image. Specify `targets` during training only.

    Returns:
        gt_classes (Tensor): An integer tensor of shape (N, R) storing
            ground-truth labels for each shift. R is the total number of
            shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the
            valid boxes are assigned their corresponding label in the
            [0, K-1] range. Shifts in the background are assigned the label
            "K". Shifts in the ignore areas are assigned a label "-1",
            i.e. ignore.
        gt_shifts_deltas (Tensor): Shape (N, R, 4). The last dimension
            represents ground-truth shift2box transform targets
            (dl, dt, dr, db) that map each shift to its matched ground-truth
            box. The values in the tensor are meaningful only when the
            corresponding shift is labeled as foreground.
    """
    gt_classes = []
    gt_shifts_deltas = []

    box_cls = torch.cat(
        [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls], dim=1)
    box_delta = torch.cat([permute_to_N_HWA_K(x, 4) for x in box_delta], dim=1)
    box_cls = box_cls.sigmoid_()

    num_fg = 0
    num_gt = 0

    for shifts_per_image, targets_per_image, box_cls_per_image, box_delta_per_image in zip(
            shifts, targets, box_cls, box_delta):
        shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0)

        gt_boxes = targets_per_image.gt_boxes

        prob = box_cls_per_image[:, targets_per_image.gt_classes].t()
        boxes = self.shift2box_transform.apply_deltas(
            box_delta_per_image, shifts_over_all_feature_maps)
        iou = pairwise_iou(gt_boxes, Boxes(boxes))
        quality = prob**(1 - self.poto_alpha) * iou**self.poto_alpha

        deltas = self.shift2box_transform.get_deltas(
            shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1))

        if self.center_sampling_radius > 0:
            centers = gt_boxes.get_centers()
            is_in_boxes = []
            for stride, shifts_i in zip(self.fpn_strides, shifts_per_image):
                radius = stride * self.center_sampling_radius
                center_boxes = torch.cat((
                    torch.max(centers - radius, gt_boxes.tensor[:, :2]),
                    torch.min(centers + radius, gt_boxes.tensor[:, 2:]),
                ), dim=-1)
                center_deltas = self.shift2box_transform.get_deltas(
                    shifts_i, center_boxes.unsqueeze(1))
                is_in_boxes.append(center_deltas.min(dim=-1).values > 0)
            is_in_boxes = torch.cat(is_in_boxes, dim=1)
        else:
            # no center sampling, it will use all the locations within a ground-truth box
            is_in_boxes = deltas.min(dim=-1).values > 0

        quality[~is_in_boxes] = -1

        # because argmax is the approximate solution of bipartite matching
        # in dense prediction scenario, we can replace linear sum assignment
        # by argmax operation to achieve faster training time (~10%)
        foreground_idxs = quality.argmax(dim=1, keepdim=True)
        is_foreground = torch.zeros_like(is_in_boxes).scatter_(
            1, foreground_idxs, True)
        quality[~is_foreground] = -1

        # if there are still more than one objects for a position,
        # we choose the one with maximum quality
        positions_max_quality, gt_matched_idxs = quality.max(dim=0)
        num_fg += (positions_max_quality != -1).sum().item()
        num_gt += len(targets_per_image)

        # ground truth box regression
        gt_shifts_reg_deltas_i = self.shift2box_transform.get_deltas(
            shifts_over_all_feature_maps, gt_boxes[gt_matched_idxs].tensor)

        # ground truth classes
        has_gt = len(targets_per_image) > 0
        if has_gt:
            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Shifts with quality -1 are treated as background.
            gt_classes_i[positions_max_quality == -1] = self.num_classes
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes

        gt_classes.append(gt_classes_i)
        gt_shifts_deltas.append(gt_shifts_reg_deltas_i)

    get_event_storage().put_scalar("num_fg_per_gt", num_fg / num_gt)

    return torch.stack(gt_classes), torch.stack(gt_shifts_deltas)
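# Illustrative only: the classification-regression quality metric and the
# per-ground-truth argmax selection used above, on toy tensors. `alpha` stands
# in for `self.poto_alpha`; all numbers are made up.
import torch

alpha = 0.8
prob = torch.tensor([[0.9, 0.2, 0.6],    # (num_gt=2, num_shifts=3):
                     [0.1, 0.8, 0.3]])   # predicted prob of each gt's class
iou = torch.tensor([[0.7, 0.1, 0.5],
                    [0.2, 0.6, 0.4]])    # IoU of each predicted box with each gt
quality = prob ** (1 - alpha) * iou ** alpha
foreground_idxs = quality.argmax(dim=1)  # one shift per ground-truth box
print(foreground_idxs)  # tensor([0, 1])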
def get_ground_truth(self, shifts, targets):
    """
    Args:
        shifts (list[list[Tensor]]): a list of N=#image elements. Each is a
            list of #feature level tensors. The tensors contains shifts of
            this image on the specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image. Specify `targets` during training only.

    Returns:
        gt_classes (Tensor): An integer tensor of shape (N, R) storing
            ground-truth labels for each shift. R is the total number of
            shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the
            valid boxes are assigned their corresponding label in the
            [0, K-1] range. Shifts in the background are assigned the label
            "K". Shifts in the ignore areas are assigned a label "-1",
            i.e. ignore.
        gt_shifts_deltas (Tensor): Shape (N, R, 4). The last dimension
            represents ground-truth shift2box transform targets
            (dl, dt, dr, db) that map each shift to its matched ground-truth
            box. The values in the tensor are meaningful only when the
            corresponding shift is labeled as foreground.
        gt_centerness (Tensor): A float tensor of shape (N, R) whose values
            in [0, 1] store the ground-truth centerness for each shift.
    """
    gt_classes = []
    gt_shifts_deltas = []
    gt_centerness = []

    for shifts_per_image, targets_per_image in zip(shifts, targets):
        shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0)

        gt_boxes = targets_per_image.gt_boxes

        is_in_boxes = self.shift2box_transform.get_deltas(
            shifts_over_all_feature_maps,
            gt_boxes.tensor.unsqueeze(1)).min(dim=-1).values > 0

        gt_positions_iou = []
        candidate_idxs = []
        base = 0
        for stride, shifts_i in zip(self.fpn_strides, shifts_per_image):
            gt_positions_iou.append(
                pairwise_iou(
                    gt_boxes,
                    Boxes(
                        torch.cat((
                            shifts_i - stride * self.anchor_scale / 2,
                            shifts_i + stride * self.anchor_scale / 2,
                        ), dim=1))))
            distances = (gt_boxes.get_centers().unsqueeze(1) -
                         shifts_i).pow_(2).sum(dim=-1).sqrt_()
            _, topk_idxs = distances.topk(self.atss_topk, dim=1, largest=False)
            candidate_idxs.append(base + topk_idxs)
            base += len(shifts_i)
        gt_positions_iou = torch.cat(gt_positions_iou, dim=1)
        candidate_idxs = torch.cat(candidate_idxs, dim=1)

        candidate_ious = gt_positions_iou.gather(1, candidate_idxs)
        ious_thr = (candidate_ious.mean(dim=1, keepdim=True) +
                    candidate_ious.std(dim=1, keepdim=True))
        is_foreground = torch.zeros_like(is_in_boxes).scatter_(
            1, candidate_idxs, True)
        is_foreground &= gt_positions_iou >= ious_thr

        gt_positions_iou[~is_in_boxes] = -1
        gt_positions_iou[~is_foreground] = -1

        # if there are still more than one objects for a position,
        # we choose the one with maximum iou
        positions_max_iou, gt_matched_idxs = gt_positions_iou.max(dim=0)

        # ground truth box regression
        gt_shifts_reg_deltas_i = self.shift2box_transform.get_deltas(
            shifts_over_all_feature_maps, gt_boxes[gt_matched_idxs].tensor)

        # ground truth classes
        has_gt = len(targets_per_image) > 0
        if has_gt:
            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Shifts with iou -1 are treated as background.
            gt_classes_i[positions_max_iou == -1] = self.num_classes
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes

        # ground truth centerness
        left_right = gt_shifts_reg_deltas_i[:, [0, 2]]
        top_bottom = gt_shifts_reg_deltas_i[:, [1, 3]]
        gt_centerness_i = torch.sqrt(
            (left_right.min(dim=-1).values /
             left_right.max(dim=-1).values).clamp_(min=0)
            * (top_bottom.min(dim=-1).values /
               top_bottom.max(dim=-1).values).clamp_(min=0))

        gt_classes.append(gt_classes_i)
        gt_shifts_deltas.append(gt_shifts_reg_deltas_i)
        gt_centerness.append(gt_centerness_i)

    return torch.stack(gt_classes), torch.stack(
        gt_shifts_deltas), torch.stack(gt_centerness)
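# Illustrative only: the adaptive IoU threshold used by the ATSS-style
# selection above (mean + standard deviation of the per-ground-truth candidate
# IoUs), on a toy candidate set. Candidates whose IoU clears this per-gt
# threshold (and whose location lies inside the gt box) are kept as foreground.
import torch

# IoUs of one ground-truth box with its top-k closest candidate locations
candidate_ious = torch.tensor([[0.62, 0.55, 0.48, 0.20, 0.15]])
iou_thr = candidate_ious.mean(dim=1, keepdim=True) + \
    candidate_ious.std(dim=1, keepdim=True)
keep = candidate_ious >= iou_thr
print(iou_thr, keep)  # threshold ~0.61 -> only the best candidate survives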
def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None,
                            area="all", limit=None):
    """
    Evaluate detection proposal recall metrics. This function is a much
    faster alternative to the official COCO API recall evaluation code.
    However, it produces slightly different results.
    """
    # Record max overlap value for each gt box
    # Return vector of overlap values
    areas = {
        "all": 0,
        "small": 1,
        "medium": 2,
        "large": 3,
        "96-128": 4,
        "128-256": 5,
        "256-512": 6,
        "512-inf": 7,
    }
    area_ranges = [
        [0**2, 1e5**2],    # all
        [0**2, 32**2],     # small
        [32**2, 96**2],    # medium
        [96**2, 1e5**2],   # large
        [96**2, 128**2],   # 96-128
        [128**2, 256**2],  # 128-256
        [256**2, 512**2],  # 256-512
        [512**2, 1e5**2],  # 512-inf
    ]
    assert area in areas, "Unknown area range: {}".format(area)
    area_range = area_ranges[areas[area]]
    gt_overlaps = []
    num_pos = 0

    for prediction_dict in dataset_predictions:
        predictions = prediction_dict["proposals"]

        # sort predictions in descending order
        # TODO maybe remove this and make it explicit in the documentation
        inds = predictions.objectness_logits.sort(descending=True)[1]
        predictions = predictions[inds]

        ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"])
        anno = coco_api.loadAnns(ann_ids)
        gt_boxes = [
            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
            for obj in anno if obj["iscrowd"] == 0
        ]
        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
        gt_boxes = Boxes(gt_boxes)
        gt_areas = torch.as_tensor(
            [obj["area"] for obj in anno if obj["iscrowd"] == 0])

        if len(gt_boxes) == 0 or len(predictions) == 0:
            continue

        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
        gt_boxes = gt_boxes[valid_gt_inds]

        num_pos += len(gt_boxes)

        if len(gt_boxes) == 0:
            continue

        if limit is not None and len(predictions) > limit:
            predictions = predictions[:limit]

        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)

        _gt_overlaps = torch.zeros(len(gt_boxes))
        for j in range(min(len(predictions), len(gt_boxes))):
            # find which proposal box maximally covers each gt box
            # and get the iou amount of coverage for each gt box
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)

            # find which gt box is 'best' covered (i.e. 'best' = most iou)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            assert gt_ovr >= 0
            # find the proposal box that covers the best covered gt box
            box_ind = argmax_overlaps[gt_ind]
            # record the iou coverage of this gt box
            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert _gt_overlaps[j] == gt_ovr
            # mark the proposal box and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1

        # append recorded iou coverage level
        gt_overlaps.append(_gt_overlaps)
    gt_overlaps = torch.cat(gt_overlaps, dim=0)
    gt_overlaps, _ = torch.sort(gt_overlaps)

    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    recalls = torch.zeros_like(thresholds)
    # compute recall for each iou threshold
    for i, t in enumerate(thresholds):
        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
    # ar = 2 * np.trapz(recalls, thresholds)
    ar = recalls.mean()
    return {
        "ar": ar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": num_pos,
    }
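# Illustrative only: the greedy matching loop above, run on a toy overlap
# matrix (proposals x gt). At each step the best-covered gt box is matched to
# the proposal that covers it, both are marked as used, and the achieved IoU
# is recorded; the recorded IoUs later drive the recall computation.
import torch

overlaps = torch.tensor([[0.9, 0.3],
                         [0.8, 0.7],
                         [0.1, 0.6]])   # 3 proposals, 2 gt boxes
gt_overlaps = torch.zeros(overlaps.size(1))
for j in range(min(overlaps.size(0), overlaps.size(1))):
    max_overlaps, argmax_overlaps = overlaps.max(dim=0)   # best proposal per gt
    gt_ovr, gt_ind = max_overlaps.max(dim=0)              # best-covered gt
    box_ind = argmax_overlaps[gt_ind]
    gt_overlaps[j] = overlaps[box_ind, gt_ind]
    overlaps[box_ind, :] = -1                             # mark both as used
    overlaps[:, gt_ind] = -1
print(gt_overlaps)  # tensor([0.9000, 0.7000])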
def label_and_sample_proposals(
        self, proposals: List[Instances],
        targets: List[Instances]) -> List[Instances]:
    """
    Prepare some proposals to be used to train the ROI heads.
    It performs box matching between `proposals` and `targets`, and assigns
    training labels to the proposals.
    It returns ``self.batch_size_per_image`` random samples from proposals and
    groundtruth boxes, with a fraction of positives that is no larger than
    ``self.positive_sample_fraction``.

    Args:
        See :meth:`ROIHeads.forward`

    Returns:
        list[Instances]: length `N` list of `Instances`s containing the
        proposals sampled for training. Each `Instances` has the following
        fields:

        - proposal_boxes: the proposal boxes
        - gt_boxes: the ground-truth box that the proposal is assigned to
          (this is only meaningful if the proposal has a label > 0; if
          label = 0 then the ground-truth box is random)

        Other fields such as "gt_classes" and "gt_masks" that are included in
        `targets`.
    """
    gt_boxes = [x.gt_boxes for x in targets]
    # Augment proposals with ground-truth boxes.
    # In the case of learned proposals (e.g., RPN), when training starts
    # the proposals will be low quality due to random initialization.
    # It's possible that none of these initial
    # proposals have high enough overlap with the gt objects to be used
    # as positive examples for the second stage components (box head,
    # cls head, mask head). Adding the gt boxes to the set of proposals
    # ensures that the second stage components will have some positive
    # examples from the start of training. For RPN, this augmentation improves
    # convergence and empirically improves box AP on COCO by about 0.5
    # points (under one tested configuration).
    if self.proposal_append_gt:
        proposals = add_ground_truth_to_proposals(gt_boxes, proposals)

    proposals_with_gt = []

    num_fg_samples = []
    num_bg_samples = []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        has_gt = len(targets_per_image) > 0
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
        matched_idxs, matched_labels = self.proposal_matcher(
            match_quality_matrix)
        sampled_idxs, gt_classes = self._sample_proposals(
            matched_idxs, matched_labels, targets_per_image.gt_classes)

        # Set target attributes of the sampled proposals:
        proposals_per_image = proposals_per_image[sampled_idxs]
        proposals_per_image.gt_classes = gt_classes

        # We index all the attributes of targets that start with "gt_"
        # and have not been added to proposals yet (="gt_classes").
        if has_gt:
            sampled_targets = matched_idxs[sampled_idxs]
            # NOTE: here the indexing wastes some compute, because heads
            # like masks, keypoints, etc, will filter the proposals again,
            # (by foreground/background, or number of keypoints in the image, etc)
            # so we essentially index the data twice.
            for (trg_name, trg_value) in targets_per_image.get_fields().items():
                if trg_name.startswith(
                        "gt_") and not proposals_per_image.has(trg_name):
                    proposals_per_image.set(trg_name, trg_value[sampled_targets])
        else:
            gt_boxes = Boxes(
                targets_per_image.gt_boxes.tensor.new_zeros(
                    (len(sampled_idxs), 4)))
            proposals_per_image.gt_boxes = gt_boxes

        num_bg_samples.append((gt_classes == self.num_classes).sum().item())
        num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
        proposals_with_gt.append(proposals_per_image)

    # Log the number of fg/bg samples that are selected for training ROI heads
    storage = get_event_storage()
    storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
    storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))

    return proposals_with_gt
def losses(self, anchors, gt_instances, box_cls, box_delta):
    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]

    box_cls_flattened = [
        permute_to_N_HWA_K(x, self.num_classes) for x in box_cls
    ]
    box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
    pred_class_logits = cat(box_cls_flattened, dim=1)
    pred_anchor_deltas = cat(box_delta_flattened, dim=1)

    pred_class_probs = pred_class_logits.sigmoid()

    pred_box_probs = []
    num_foreground = 0
    positive_losses = []
    for anchors_per_image, \
        gt_instances_per_image, \
        pred_class_probs_per_image, \
        pred_anchor_deltas_per_image in zip(
            anchors, gt_instances, pred_class_probs, pred_anchor_deltas):
        gt_classes_per_image = gt_instances_per_image.gt_classes

        with torch.no_grad():
            # predicted_boxes_per_image: a_{j}^{loc}, shape: [j, 4]
            predicted_boxes_per_image = self.box2box_transform.apply_deltas(
                pred_anchor_deltas_per_image, anchors_per_image.tensor)
            # gt_pred_iou: IoU_{ij}^{loc}, shape: [i, j]
            gt_pred_iou = pairwise_iou(gt_instances_per_image.gt_boxes,
                                       Boxes(predicted_boxes_per_image))

            t1 = self.bbox_threshold
            t2 = gt_pred_iou.max(dim=1, keepdim=True).values.clamp_(
                min=t1 + torch.finfo(torch.float32).eps)
            # gt_pred_prob: P{a_{j} -> b_{i}}, shape: [i, j]
            gt_pred_prob = ((gt_pred_iou - t1) / (t2 - t1)).clamp_(min=0, max=1)

            # pred_box_prob_per_image: P{a_{j} \in A_{+}}, shape: [j, c]
            nonzero_idxs = torch.nonzero(gt_pred_prob, as_tuple=True)
            pred_box_prob_per_image = torch.zeros_like(
                pred_class_probs_per_image)
            pred_box_prob_per_image[nonzero_idxs[1],
                                    gt_classes_per_image[nonzero_idxs[0]]] \
                = gt_pred_prob[nonzero_idxs]
            pred_box_probs.append(pred_box_prob_per_image)

        # construct bags for objects
        match_quality_matrix = pairwise_iou(gt_instances_per_image.gt_boxes,
                                            anchors_per_image)
        _, foreground_idxs = torch.topk(match_quality_matrix,
                                        self.pos_anchor_topk,
                                        dim=1,
                                        sorted=False)

        # matched_pred_class_probs_per_image: P_{ij}^{cls}
        matched_pred_class_probs_per_image = torch.gather(
            pred_class_probs_per_image[foreground_idxs], 2,
            gt_classes_per_image.view(-1, 1, 1).repeat(
                1, self.pos_anchor_topk, 1)).squeeze(2)

        # matched_gt_anchor_deltas_per_image: P_{ij}^{loc}
        matched_gt_anchor_deltas_per_image = self.box2box_transform.get_deltas(
            anchors_per_image.tensor[foreground_idxs],
            gt_instances_per_image.gt_boxes.tensor.unsqueeze(1))
        loss_box_reg = smooth_l1_loss(
            pred_anchor_deltas_per_image[foreground_idxs],
            matched_gt_anchor_deltas_per_image,
            beta=self.smooth_l1_loss_beta,
            reduction="none").sum(dim=-1) * self.reg_weight
        matched_pred_reg_probs_per_image = (-loss_box_reg).exp()

        # positive_losses: { -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) }
        num_foreground += len(gt_instances_per_image)
        positive_losses.append(
            positive_bag_loss(matched_pred_class_probs_per_image *
                              matched_pred_reg_probs_per_image,
                              dim=1))
    # positive_loss: \sum_{i}{ -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) } / ||B||
    positive_loss = torch.cat(positive_losses).sum() / max(1, num_foreground)

    # pred_box_probs: P{a_{j} \in A_{+}}
    pred_box_probs = torch.stack(pred_box_probs, dim=0)
    # negative_loss: \sum_{j}{ FL( (1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}) ) } / n||B||
    negative_loss = negative_bag_loss(
        pred_class_probs * (1 - pred_box_probs),
        self.focal_loss_gamma).sum() / max(
            1, num_foreground * self.pos_anchor_topk)

    loss_pos = positive_loss * self.focal_loss_alpha
    loss_neg = negative_loss * (1 - self.focal_loss_alpha)
    return {"loss_pos": loss_pos, "loss_neg": loss_neg}
def get_ground_truth(self, default_boxes, targets):
    """
    Args:
        default_boxes (list[Boxes]): a list of 'Boxes' elements. The Boxes
            contains default boxes of one image on the specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image. Specify `targets` during training only.

    Returns:
        gt_conf (Tensor): An integer tensor of shape [N, R] storing
            ground-truth labels for each default box. R is the total number
            of default boxes, i.e. the sum of Hi x Wi x D for all levels.

            * Default boxes with an IoU with some target higher than the
              foreground threshold are assigned their corresponding label in
              the [0, C-1] range.
            * Default boxes whose IoU is below the background threshold are
              assigned the label "C".
            * Default boxes whose IoU is between the foreground and background
              thresholds are assigned the label "-1", i.e. ignore.
        gt_default_boxes_deltas (Tensor): Shape [N, R, 4]. The last dimension
            represents ground-truth box2box transform targets
            (g^cx, g^cy, g^w, g^h) that map each default box to its matched
            ground-truth box. The values in the tensor are meaningful only
            when the corresponding default box is labeled as foreground.
    """
    gt_conf = list()
    gt_default_boxes_deltas = list()

    # list[Tensor(R, 4)], one for each image
    default_boxes_per_image = Boxes.cat(default_boxes)

    # each Instances (for one image)
    for targets_per_image in targets:
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, default_boxes_per_image)  # M * N
        gt_matched_idxs, default_box_labels = self.matcher(match_quality_matrix)

        has_gt = len(targets_per_image) > 0
        if has_gt:
            # ground truth box regression
            matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs]
            # meaningful only when the corresponding default box is labeled as foreground.
            gt_default_boxes_deltas_i = self.box2box_transform.get_deltas(
                default_boxes_per_image.tensor, matched_gt_boxes.tensor)

            gt_conf_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Default boxes with label 0 are treated as background.
            gt_conf_i[default_box_labels == 0] = self.num_classes
            # Default boxes with label -1 are ignored.
            gt_conf_i[default_box_labels == -1] = -1
        else:
            gt_conf_i = torch.zeros_like(gt_matched_idxs) + self.num_classes
            gt_default_boxes_deltas_i = torch.zeros_like(
                default_boxes_per_image.tensor)

        gt_conf.append(gt_conf_i)
        gt_default_boxes_deltas.append(gt_default_boxes_deltas_i)

    return torch.stack(gt_conf), torch.stack(gt_default_boxes_deltas)
def get_ground_truth(self, shifts, targets, box_cls, box_delta):
    """
    Args:
        shifts (list[list[Tensor]]): a list of N=#image elements. Each is a
            list of #feature level tensors. The tensors contains shifts of
            this image on the specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image. Specify `targets` during training only.

    Returns:
        gt_classes (Tensor): An integer tensor of shape (N, R) storing
            ground-truth labels for each shift. R is the total number of
            shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the
            valid boxes are assigned their corresponding label in the
            [0, K-1] range. Shifts in the background are assigned the label
            "K". Shifts in the ignore areas are assigned a label "-1",
            i.e. ignore.
        gt_shifts_deltas (Tensor): Shape (N, R, 4). The last dimension
            represents ground-truth shift2box transform targets
            (dl, dt, dr, db) that map each shift to its matched ground-truth
            box. The values in the tensor are meaningful only when the
            corresponding shift is labeled as foreground.
    """
    gt_classes = []
    gt_shifts_deltas = []

    box_cls = torch.cat(
        [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls], dim=1)
    box_delta = torch.cat([permute_to_N_HWA_K(x, 4) for x in box_delta], dim=1)
    box_cls = box_cls.sigmoid_()

    num_fg = 0
    num_gt = 0

    for shifts_per_image, targets_per_image, box_cls_per_image, box_delta_per_image in zip(
            shifts, targets, box_cls, box_delta):
        shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0)

        gt_boxes = targets_per_image.gt_boxes

        prob = box_cls_per_image[:, targets_per_image.gt_classes].t()
        boxes = self.shift2box_transform.apply_deltas(
            box_delta_per_image, shifts_over_all_feature_maps)
        iou = pairwise_iou(gt_boxes, Boxes(boxes))
        quality = prob**(1 - self.poto_alpha) * iou**self.poto_alpha

        deltas = self.shift2box_transform.get_deltas(
            shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1))

        if self.center_sampling_radius > 0:
            centers = gt_boxes.get_centers()
            is_in_boxes = []
            for stride, shifts_i in zip(self.fpn_strides, shifts_per_image):
                radius = stride * self.center_sampling_radius
                center_boxes = torch.cat((
                    torch.max(centers - radius, gt_boxes.tensor[:, :2]),
                    torch.min(centers + radius, gt_boxes.tensor[:, 2:]),
                ), dim=-1)
                center_deltas = self.shift2box_transform.get_deltas(
                    shifts_i, center_boxes.unsqueeze(1))
                is_in_boxes.append(center_deltas.min(dim=-1).values > 0)
            is_in_boxes = torch.cat(is_in_boxes, dim=1)
        else:
            # no center sampling, it will use all the locations within a ground-truth box
            is_in_boxes = deltas.min(dim=-1).values > 0

        quality[~is_in_boxes] = -1

        gt_idxs, shift_idxs = linear_sum_assignment(quality.cpu().numpy(),
                                                    maximize=True)

        num_fg += len(shift_idxs)
        num_gt += len(targets_per_image)

        gt_classes_i = shifts_over_all_feature_maps.new_full(
            (len(shifts_over_all_feature_maps), ),
            self.num_classes,
            dtype=torch.long)
        gt_shifts_reg_deltas_i = shifts_over_all_feature_maps.new_zeros(
            len(shifts_over_all_feature_maps), 4)
        if len(targets_per_image) > 0:
            # ground truth classes
            gt_classes_i[shift_idxs] = targets_per_image.gt_classes[gt_idxs]
            # ground truth box regression
            gt_shifts_reg_deltas_i[shift_idxs] = \
                self.shift2box_transform.get_deltas(
                    shifts_over_all_feature_maps[shift_idxs],
                    gt_boxes[gt_idxs].tensor)

        gt_classes.append(gt_classes_i)
        gt_shifts_deltas.append(gt_shifts_reg_deltas_i)

    get_event_storage().put_scalar("num_fg_per_gt", num_fg / num_gt)

    return torch.stack(gt_classes), torch.stack(gt_shifts_deltas)
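# Illustrative only: the Hungarian (linear sum assignment) step used above, on
# a toy quality matrix. Unlike a per-ground-truth argmax (which here would pick
# shift 0 for both gts), it enforces a strict one-to-one matching between
# ground-truth boxes and shifts. The numbers are made up.
import numpy as np
from scipy.optimize import linear_sum_assignment

quality = np.array([[0.9, 0.4, 0.1],
                    [0.7, 0.6, 0.2]])   # (num_gt=2, num_shifts=3)
gt_idxs, shift_idxs = linear_sum_assignment(quality, maximize=True)
print(gt_idxs, shift_idxs)  # [0 1] [0 1]: gt 0 -> shift 0, gt 1 -> shift 1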
def get_aux_ground_truth(self, shifts, targets, box_cls, box_delta):
    """
    Args:
        shifts (list[list[Tensor]]): a list of N=#image elements. Each is a
            list of #feature level tensors. The tensors contains shifts of
            this image on the specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image. Specify `targets` during training only.

    Returns:
        gt_classes (Tensor): An integer tensor of shape (N, R) storing
            ground-truth labels for each shift. R is the total number of
            shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the
            valid boxes are assigned their corresponding label in the
            [0, K-1] range. Shifts in the background are assigned the label
            "K". Shifts in the ignore areas are assigned a label "-1",
            i.e. ignore.
    """
    gt_classes = []

    box_cls = torch.cat(
        [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls], dim=1)
    box_delta = torch.cat([permute_to_N_HWA_K(x, 4) for x in box_delta], dim=1)
    box_cls = box_cls.sigmoid_()

    num_fg = 0
    num_gt = 0

    for shifts_per_image, targets_per_image, box_cls_per_image, box_delta_per_image in zip(
            shifts, targets, box_cls, box_delta):
        shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0)

        gt_boxes = targets_per_image.gt_boxes

        prob = box_cls_per_image[:, targets_per_image.gt_classes].t()
        boxes = self.shift2box_transform.apply_deltas(
            box_delta_per_image, shifts_over_all_feature_maps)
        iou = pairwise_iou(gt_boxes, Boxes(boxes))
        quality = prob**(1 - self.poto_alpha) * iou**self.poto_alpha

        candidate_idxs = []
        st, ed = 0, 0
        for shifts_i in shifts_per_image:
            ed += len(shifts_i)
            _, topk_idxs = quality[:, st:ed].topk(self.poto_aux_topk, dim=1)
            candidate_idxs.append(st + topk_idxs)
            st = ed
        candidate_idxs = torch.cat(candidate_idxs, dim=1)

        is_in_boxes = self.shift2box_transform.get_deltas(
            shifts_over_all_feature_maps,
            gt_boxes.tensor.unsqueeze(1)).min(dim=-1).values > 0

        candidate_qualities = quality.gather(1, candidate_idxs)
        quality_thr = candidate_qualities.mean(dim=1, keepdim=True) + \
            candidate_qualities.std(dim=1, keepdim=True)
        is_foreground = torch.zeros_like(is_in_boxes).scatter_(
            1, candidate_idxs, True)
        is_foreground &= quality >= quality_thr

        quality[~is_in_boxes] = -1
        quality[~is_foreground] = -1

        # if there are still more than one objects for a position,
        # we choose the one with maximum quality
        positions_max_quality, gt_matched_idxs = quality.max(dim=0)
        num_fg += (positions_max_quality != -1).sum().item()
        num_gt += len(targets_per_image)

        # ground truth classes
        has_gt = len(targets_per_image) > 0
        if has_gt:
            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Shifts with quality -1 are treated as background.
            gt_classes_i[positions_max_quality == -1] = self.num_classes
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes

        gt_classes.append(gt_classes_i)

    get_event_storage().put_scalar("num_fg_per_gt_aux", num_fg / num_gt)

    return torch.stack(gt_classes)
def losses(self, shifts, gt_instances, box_cls, box_delta, box_center):
    box_cls_flattened = [
        permute_to_N_HWA_K(x, self.num_classes) for x in box_cls
    ]
    box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
    box_center_flattened = [permute_to_N_HWA_K(x, 1) for x in box_center]
    pred_class_logits = cat(box_cls_flattened, dim=1)
    pred_shift_deltas = cat(box_delta_flattened, dim=1)
    pred_obj_logits = cat(box_center_flattened, dim=1)

    pred_class_probs = pred_class_logits.sigmoid()
    pred_obj_probs = pred_obj_logits.sigmoid()
    pred_box_probs = []
    num_foreground = pred_class_logits.new_zeros(1)
    num_background = pred_class_logits.new_zeros(1)
    positive_losses = []
    gaussian_norm_losses = []

    for shifts_per_image, gt_instances_per_image, \
        pred_class_probs_per_image, pred_shift_deltas_per_image, \
        pred_obj_probs_per_image in zip(
            shifts, gt_instances, pred_class_probs, pred_shift_deltas,
            pred_obj_probs):
        locations = torch.cat(shifts_per_image, dim=0)
        labels = gt_instances_per_image.gt_classes
        gt_boxes = gt_instances_per_image.gt_boxes

        target_shift_deltas = self.shift2box_transform.get_deltas(
            locations, gt_boxes.tensor.unsqueeze(1))
        is_in_boxes = target_shift_deltas.min(dim=-1).values > 0

        foreground_idxs = torch.nonzero(is_in_boxes, as_tuple=True)

        with torch.no_grad():
            # predicted_boxes_per_image: a_{j}^{loc}, shape: [j, 4]
            predicted_boxes_per_image = self.shift2box_transform.apply_deltas(
                pred_shift_deltas_per_image, locations)
            # gt_pred_iou: IoU_{ij}^{loc}, shape: [i, j]
            gt_pred_iou = pairwise_iou(
                gt_boxes, Boxes(predicted_boxes_per_image)).max(
                    dim=0, keepdim=True).values.repeat(
                        len(gt_instances_per_image), 1)

            # pred_box_prob_per_image: P{a_{j} \in A_{+}}, shape: [j, c]
            pred_box_prob_per_image = torch.zeros_like(
                pred_class_probs_per_image)
            box_prob = 1 / (1 - gt_pred_iou[foreground_idxs]).clamp_(1e-12)
            for i in range(len(gt_instances_per_image)):
                idxs = foreground_idxs[0] == i
                if idxs.sum() > 0:
                    box_prob[idxs] = normalize(box_prob[idxs])
            pred_box_prob_per_image[foreground_idxs[1],
                                    labels[foreground_idxs[0]]] = box_prob
            pred_box_probs.append(pred_box_prob_per_image)

        normal_probs = []
        for stride, shifts_i in zip(self.fpn_strides, shifts_per_image):
            gt_shift_deltas = self.shift2box_transform.get_deltas(
                shifts_i, gt_boxes.tensor.unsqueeze(1))
            distances = (gt_shift_deltas[..., :2] - gt_shift_deltas[..., 2:]) / 2
            normal_probs.append(
                normal_distribution(distances / stride,
                                    self.mu[labels].unsqueeze(1),
                                    self.sigma[labels].unsqueeze(1)))
        normal_probs = torch.cat(normal_probs, dim=1).prod(dim=-1)

        composed_cls_prob = \
            pred_class_probs_per_image[:, labels] * pred_obj_probs_per_image

        # matched_gt_shift_deltas: P_{ij}^{loc}
        loss_box_reg = iou_loss(pred_shift_deltas_per_image.unsqueeze(0),
                                target_shift_deltas,
                                box_mode="ltrb",
                                loss_type=self.iou_loss_type,
                                reduction="none") * self.reg_weight
        pred_reg_probs = (-loss_box_reg).exp()

        # positive_losses: { -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) }
        positive_losses.append(
            positive_bag_loss(
                composed_cls_prob.permute(1, 0) * pred_reg_probs,
                is_in_boxes.float(), normal_probs))

        num_foreground += len(gt_instances_per_image)
        num_background += normal_probs[foreground_idxs].sum().item()

        gaussian_norm_losses.append(
            len(gt_instances_per_image) /
            normal_probs[foreground_idxs].sum().clamp_(1e-12))

    if dist.is_initialized():
        dist.all_reduce(num_foreground)
        num_foreground /= dist.get_world_size()
        dist.all_reduce(num_background)
        num_background /= dist.get_world_size()

    # positive_loss: \sum_{i}{ -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) } / ||B||
    positive_loss = torch.cat(positive_losses).sum() / max(1, num_foreground)

    # pred_box_probs: P{a_{j} \in A_{+}}
    pred_box_probs = torch.stack(pred_box_probs, dim=0)
    # negative_loss: \sum_{j}{ FL( (1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}) ) } / n||B||
    negative_loss = negative_bag_loss(
        pred_class_probs * pred_obj_probs * (1 - pred_box_probs),
        self.focal_loss_gamma).sum() / max(1, num_background)

    loss_pos = positive_loss * self.focal_loss_alpha
    loss_neg = negative_loss * (1 - self.focal_loss_alpha)
    loss_norm = torch.stack(gaussian_norm_losses).mean() * (
        1 - self.focal_loss_alpha)

    return {
        "loss_pos": loss_pos,
        "loss_neg": loss_neg,
        "loss_norm": loss_norm,
    }
def get_ground_truth(self, anchors, targets):
    """
    Args:
        anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
            list of #feature level Boxes. The Boxes contains anchors of this
            image on the specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image. Specify `targets` during training only.

    Returns:
        gt_classes (Tensor): An integer tensor of shape (N, R) storing
            ground-truth labels for each anchor. R is the total number of
            anchors, i.e. the sum of Hi x Wi for all levels. Anchors with an
            IoU with some target higher than the foreground threshold are
            assigned their corresponding label in the [0, K-1] range. Anchors
            whose IoU are below the background threshold are assigned the
            label "K". Anchors whose IoU are between the foreground and
            background thresholds are assigned a label "-1", i.e. ignore.
        gt_anchors_deltas (Tensor): Shape (N, R, 4). The last dimension
            represents ground-truth box2box transform targets (dx, dy, dw, dh)
            that map each anchor to its matched ground-truth box. The values
            in the tensor are meaningful only when the corresponding anchor
            is labeled as foreground.
    """
    gt_classes = []
    gt_anchors_deltas = []

    num_fg = 0
    num_gt = 0

    for anchors_per_image, targets_per_image in zip(anchors, targets):
        anchors_per_image = Boxes.cat(anchors_per_image)
        gt_boxes = targets_per_image.gt_boxes

        match_quality_matrix = pairwise_iou(gt_boxes, anchors_per_image)
        _, is_positive = match_quality_matrix.topk(self.iou_topk, dim=1)
        is_foreground = torch.zeros_like(match_quality_matrix,
                                         dtype=torch.bool).scatter_(
                                             1, is_positive, True)
        match_quality_matrix[~is_foreground] = -1

        # if there are still more than one objects for a position,
        # we choose the one with maximum quality
        anchor_labels, gt_matched_idxs = match_quality_matrix.max(dim=0)
        num_fg += (anchor_labels != -1).sum().item()
        num_gt += len(targets_per_image)

        # ground truth box regression
        gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
            anchors_per_image.tensor, gt_boxes[gt_matched_idxs].tensor)

        # ground truth classes
        has_gt = len(targets_per_image) > 0
        if has_gt:
            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Anchors with label -1 are treated as background.
            gt_classes_i[anchor_labels == -1] = self.num_classes
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes

        gt_classes.append(gt_classes_i)
        gt_anchors_deltas.append(gt_anchors_reg_deltas_i)

    get_event_storage().put_scalar("num_fg_per_gt", num_fg / num_gt)

    return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)