def detect(self, predicted_locs, predicted_scores, threshold, max_overlap):
    batch_size = predicted_locs.size(0)
    predicted_scores = torch.nn.functional.softmax(predicted_scores, dim=2)  # (batch_size, 8732, 2)

    all_image_boxes = list()
    all_image_scores = list()

    for i in range(batch_size):
        decoded_locs = cxcy_to_xy(gcxgcy_to_cxcy(predicted_locs[i], self.priors_cxcy))

        # Keep only boxes whose text-class score clears the threshold
        text_scores = predicted_scores[i][:, 1]
        score_above_threshold = text_scores > threshold
        n_score_above_threshold = score_above_threshold.sum().item()
        text_scores = text_scores[score_above_threshold]
        text_decoded_locs = decoded_locs[score_above_threshold]

        # Non-maximum suppression: pairwise overlap between all surviving boxes
        overlap = IoU(xy_to_cxcy(text_decoded_locs), xy_to_cxcy(text_decoded_locs))
        suppress = torch.zeros(n_score_above_threshold, dtype=torch.bool).to(device)
        for box in range(text_decoded_locs.size(0)):
            if suppress[box]:
                continue
            # Suppress every box that overlaps this one by more than max_overlap
            suppress = suppress | (overlap[box] > max_overlap)
            # Never suppress the box under consideration itself
            suppress[box] = False

        image_boxes = text_decoded_locs[~suppress]
        image_scores = text_scores[~suppress]

        # If nothing survived thresholding/NMS, fall back to a single dummy detection
        if image_boxes.size(0) == 0:
            image_boxes = torch.FloatTensor([[0., 0., 1., 1.]]).to(device)
            image_scores = torch.FloatTensor([0.]).to(device)

        all_image_boxes.append(image_boxes)
        all_image_scores.append(image_scores)

    return all_image_boxes, all_image_scores
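# The `detect` routine above leans on coordinate helpers that are not shown in this
# section. A minimal sketch of what they plausibly look like follows, assuming the
# standard SSD offset encoding with variance factors 10 (centers) and 5 (sizes);
# the actual repo's definitions may differ.

import torch

def gcxgcy_to_cxcy(gcxgcy, priors_cxcy):
    # Undo the offset encoding: recover center-size coordinates from predicted offsets
    return torch.cat([gcxgcy[:, :2] * priors_cxcy[:, 2:] / 10 + priors_cxcy[:, :2],  # c_x, c_y
                      torch.exp(gcxgcy[:, 2:] / 5) * priors_cxcy[:, 2:]], dim=1)     # w, h

def cxcy_to_xy(cxcy):
    # Center-size (c_x, c_y, w, h) to boundary (x_min, y_min, x_max, y_max) coordinates
    return torch.cat([cxcy[:, :2] - cxcy[:, 2:] / 2,
                      cxcy[:, :2] + cxcy[:, 2:] / 2], dim=1)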
def match_gt_priors(self, boxes, labels):
    '''
    Given gt boxes, labels and the (8732) priors, match each prior to its best-suited ground truth
    N: batch size
    Params:
        boxes: true object bounding boxes in boundary coordinates (xy), a list of N tensors: N(n_objects, 4)
        labels: true object labels, a list of N tensors: N(n_objects,)
    Return:
        truth_offsets: tensor (N, 8732, 4)
        truth_classes: tensor (N, 8732,)
    '''
    N = len(boxes)  # batch size
    n_priors = self.priors_cxcy.size(0)

    truth_offsets = torch.zeros((N, n_priors, 4), dtype=torch.float).to(device)
    truth_classes = torch.zeros((N, n_priors), dtype=torch.long).to(device)

    # For each image
    for i in range(N):
        n_objects = labels[i].shape[0]

        overlap = find_jaccard_overlap(self.priors_xy, boxes[i])  # (n_priors, n_objects)

        # For each prior, find the max IoU and the corresponding object id
        prior_iou, prior_obj = overlap.max(dim=1)  # (n_priors)

        # For each object, find its best-suited prior id
        _, object_prior = overlap.max(dim=0)  # (n_objects)

        # For each object, assign its best-suited prior the object id
        for j in range(n_objects):
            prior_obj[object_prior[j]] = j
        # Give each object's best-suited prior a high IoU to ensure it survives the thresholding
        prior_iou[object_prior] = 1.

        # Match bbox coordinates
        boxes_xy = boxes[i][prior_obj]  # (8732, 4)

        # Match prior class
        prior_class = labels[i][prior_obj]  # (8732)
        # Thresholding: assign priors with IoU < threshold to class 0 (background)
        prior_class[prior_iou < self.threshold] = 0

        # Save into the truth tensors
        truth_offsets[i, :, :] = cxcy_to_gcxgcy(xy_to_cxcy(boxes_xy), self.priors_cxcy)
        truth_classes[i, :] = prior_class

    return truth_offsets, truth_classes
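# `match_gt_priors` encodes the matched boxes with `xy_to_cxcy` and `cxcy_to_gcxgcy`,
# the inverses of the decoding helpers sketched above. Again a hedged sketch under
# the same assumed variance factors (10 and 5):

import torch

def xy_to_cxcy(xy):
    # Boundary (x_min, y_min, x_max, y_max) to center-size (c_x, c_y, w, h) coordinates
    return torch.cat([(xy[:, 2:] + xy[:, :2]) / 2,
                      xy[:, 2:] - xy[:, :2]], dim=1)

def cxcy_to_gcxgcy(cxcy, priors_cxcy):
    # Encode center-size boxes as offsets w.r.t. the priors (the regression targets)
    return torch.cat([(cxcy[:, :2] - priors_cxcy[:, :2]) / (priors_cxcy[:, 2:] / 10),  # g_cx, g_cy
                      torch.log(cxcy[:, 2:] / priors_cxcy[:, 2:]) * 5], dim=1)         # g_w, g_h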
def forward(self, predicted_locs, predicted_scores, boxes, labels, device):
    """
    Forward propagation.

    :param predicted_locs: predicted locations/boxes w.r.t the 8732 prior boxes, a tensor of dimensions (N, 8732, 4)
    :param predicted_scores: class scores for each of the encoded locations/boxes, a tensor of dimensions (N, 8732, n_classes)
    :param boxes: true object bounding boxes in boundary coordinates, a list of N tensors
    :param labels: true object labels, a list of N tensors
    :return: multibox loss, a scalar
    """
    batch_size = predicted_locs.size(0)
    n_priors = self.priors_cxcy.size(0)
    n_classes = predicted_scores.size(2)

    assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)

    true_locs = torch.zeros((batch_size, n_priors, 4), dtype=torch.float).to(device)  # (N, 8732, 4)
    true_classes = torch.zeros((batch_size, n_priors), dtype=torch.long).to(device)  # (N, 8732)

    # For each image
    for i in range(batch_size):
        n_objects = boxes[i].size(0)

        overlap = find_jaccard_overlap(boxes[i], self.priors_xy)  # (n_objects, 8732)

        # For each prior, find the object that has the maximum overlap
        overlap_for_each_prior, object_for_each_prior = overlap.max(dim=0)  # (8732)

        # We don't want a situation where an object is not represented in our positive (non-background) priors -
        # 1. An object might not be the best object for all priors, and is therefore not in object_for_each_prior.
        # 2. All priors with the object may be assigned as background based on the threshold (0.5).

        # To remedy this -
        # First, find the prior that has the maximum overlap for each object.
        _, prior_for_each_object = overlap.max(dim=1)  # (N_o)

        # Then, assign each object to the corresponding maximum-overlap-prior. (This fixes 1.)
        object_for_each_prior[prior_for_each_object] = torch.LongTensor(range(n_objects)).to(device)

        # To ensure these priors qualify, artificially give them an overlap of greater than 0.5. (This fixes 2.)
        overlap_for_each_prior[prior_for_each_object] = 1.

        # Labels for each prior
        label_for_each_prior = labels[i][object_for_each_prior]  # (8732)
        # Set priors whose overlaps with objects are less than the threshold to be background (no object)
        label_for_each_prior[overlap_for_each_prior < self.threshold] = 0  # (8732)

        # Store
        true_classes[i] = label_for_each_prior

        # Encode center-size object coordinates into the form we regressed predicted boxes to
        true_locs[i] = cxcy_to_gcxgcy(xy_to_cxcy(boxes[i][object_for_each_prior]), self.priors_cxcy)  # (8732, 4)

    # Identify priors that are positive (object/non-background)
    positive_priors = true_classes != 0  # (N, 8732)

    # LOCALIZATION LOSS
    # Localization loss is computed only over positive (non-background) priors
    loc_loss = self.smooth_l1(predicted_locs[positive_priors], true_locs[positive_priors])  # (), scalar

    # Note: indexing with a boolean (or, in older PyTorch, torch.uint8) mask flattens the tensor when indexing is across multiple dimensions (N & 8732)
    # So, if predicted_locs has the shape (N, 8732, 4), predicted_locs[positive_priors] will have (total positives, 4)

    # CONFIDENCE LOSS
    # Confidence loss is computed over positive priors and the most difficult (hardest) negative priors in each image
    # That is, FOR EACH IMAGE,
    # we will take the hardest (neg_pos_ratio * n_positives) negative priors, i.e. where there is maximum loss
    # This is called Hard Negative Mining - it concentrates on the hardest negatives in each image, and also minimizes the pos/neg imbalance

    # Number of positive and hard-negative priors per image
    n_positives = positive_priors.sum(dim=1)  # (N)
    n_hard_negatives = self.neg_pos_ratio * n_positives  # (N)

    # First, find the loss for all priors
    conf_loss_all = self.cross_entropy(predicted_scores.view(-1, n_classes), true_classes.view(-1))  # (N * 8732)
    conf_loss_all = conf_loss_all.view(batch_size, n_priors)  # (N, 8732)

    # We already know which priors are positive
    conf_loss_pos = conf_loss_all[positive_priors]  # (sum(n_positives))

    # Next, find which priors are hard-negative
    # To do this, sort ONLY negative priors in each image in order of decreasing loss and take top n_hard_negatives
    conf_loss_neg = conf_loss_all.clone()  # (N, 8732)
    conf_loss_neg[positive_priors] = 0.  # (N, 8732), positive priors are ignored (never in top n_hard_negatives)
    conf_loss_neg, _ = conf_loss_neg.sort(dim=1, descending=True)  # (N, 8732), sorted by decreasing hardness
    hardness_ranks = torch.LongTensor(range(n_priors)).unsqueeze(0).expand_as(conf_loss_neg).to(device)  # (N, 8732)
    hard_negatives = hardness_ranks < n_hard_negatives.unsqueeze(1)  # (N, 8732)
    conf_loss_hard_neg = conf_loss_neg[hard_negatives]  # (sum(n_hard_negatives))

    # As in the paper, averaged over positive priors only, although computed over both positive and hard-negative priors
    conf_loss = (conf_loss_hard_neg.sum() + conf_loss_pos.sum()) / n_positives.sum().float()  # (), scalar

    # TOTAL LOSS
    return conf_loss + self.alpha * loc_loss
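# The rank-mask trick used for hard negative mining above is easy to miss. A tiny
# self-contained demo (toy numbers, not from the model): sort the negative losses,
# then keep, per image, only ranks below that image's n_hard_negatives.

import torch

conf_loss_neg = torch.tensor([[0.9, 0.1, 0.5, 0.3],
                              [0.2, 0.8, 0.4, 0.6]])         # (N=2, n_priors=4)
n_hard_negatives = torch.tensor([2, 1])                      # per-image negative budget

sorted_loss, _ = conf_loss_neg.sort(dim=1, descending=True)  # hardest first
ranks = torch.arange(4).unsqueeze(0).expand_as(sorted_loss)  # ranks 0..3 per row
hard = ranks < n_hard_negatives.unsqueeze(1)                 # top-k mask with per-row k

print(sorted_loss[hard])  # tensor([0.9000, 0.5000, 0.8000]): top-2 of image 0, top-1 of image 1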
def forward(self, predicted_locs, predicted_scores, boxes, labels):
    """
    Forward propagation.

    :param predicted_locs: predicted locations/boxes w.r.t the 8732 prior boxes, a tensor of dimensions (N, 8732, 4)
    :param predicted_scores: class scores for each of the encoded locations/boxes, a tensor of dimensions (N, 8732, n_classes)
    :param boxes: true object bounding boxes in boundary coordinates, a list of N tensors
    :param labels: true object labels, a list of N tensors
    :return: multibox loss, a scalar
    """
    batch_size = predicted_locs.size(0)
    n_priors = self.priors_cxcy.size(0)
    n_classes = predicted_scores.size(2)

    assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)

    true_locs = torch.zeros((batch_size, n_priors, 4), dtype=torch.float).to(device)  # (N, 8732, 4)
    true_classes = torch.zeros((batch_size, n_priors), dtype=torch.long).to(device)  # (N, 8732)

    # For each image
    for i in range(batch_size):
        n_objects = boxes[i].size(0)

        overlap = find_jaccard_overlap(boxes[i], self.priors_xy)  # (n_objects, 8732)

        # For each prior, find the object that has the maximum overlap
        overlap_for_each_prior, object_for_each_prior = overlap.max(dim=0)  # (8732)

        # We don't want a situation where an object is not represented in our positive (non-background) priors -
        # 1. An object might not be the best object for all priors, and is therefore not in object_for_each_prior.
        # 2. All priors with the object may be assigned as background based on the threshold (0.5).

        # To remedy this -
        # First, find the prior that has the maximum overlap for each object.
        _, prior_for_each_object = overlap.max(dim=1)  # (N_o)

        # Then, assign each object to the corresponding maximum-overlap-prior. (This fixes 1.)
        object_for_each_prior[prior_for_each_object] = torch.LongTensor(range(n_objects)).to(device)

        # To ensure these priors qualify, artificially give them an overlap of greater than 0.5. (This fixes 2.)
        overlap_for_each_prior[prior_for_each_object] = 1.

        # Labels for each prior
        label_for_each_prior = labels[i][object_for_each_prior]  # (8732)
        # Set priors whose overlaps with objects are less than the threshold to be background (no object)
        label_for_each_prior[overlap_for_each_prior < self.threshold] = 0  # (8732)

        # Store
        true_classes[i] = label_for_each_prior

        # Encode center-size object coordinates into the form we regressed predicted boxes to
        true_locs[i] = cxcy_to_gcxgcy(xy_to_cxcy(boxes[i][object_for_each_prior]), self.priors_cxcy)  # (8732, 4)

    positive_priors = true_classes != 0  # (N, 8732)

    loc_loss = self.smooth_l1(predicted_locs[positive_priors], true_locs[positive_priors])  # (), scalar

    n_positives = positive_priors.sum(dim=1)  # (N)
    n_hard_negatives = self.neg_pos_ratio * n_positives  # (N)

    # First, find the loss for all priors
    conf_loss_all = self.cross_entropy(predicted_scores.view(-1, n_classes), true_classes.view(-1))  # (N * 8732)
    conf_loss_all = conf_loss_all.view(batch_size, n_priors)  # (N, 8732)

    # We already know which priors are positive
    conf_loss_pos = conf_loss_all[positive_priors]  # (sum(n_positives))

    # Next, find which priors are hard-negative
    # To do this, sort ONLY negative priors in each image in order of decreasing loss and take top n_hard_negatives
    conf_loss_neg = conf_loss_all.clone()  # (N, 8732)
    conf_loss_neg[positive_priors] = 0.  # (N, 8732), positive priors are ignored (never in top n_hard_negatives)
    conf_loss_neg, _ = conf_loss_neg.sort(dim=1, descending=True)  # (N, 8732), sorted by decreasing hardness
    hardness_ranks = torch.LongTensor(range(n_priors)).unsqueeze(0).expand_as(conf_loss_neg).to(device)  # (N, 8732)
    hard_negatives = hardness_ranks < n_hard_negatives.unsqueeze(1)  # (N, 8732)
    conf_loss_hard_neg = conf_loss_neg[hard_negatives]  # (sum(n_hard_negatives))

    conf_loss = (conf_loss_hard_neg.sum() + conf_loss_pos.sum()) / n_positives.sum().float()  # (), scalar

    return conf_loss + self.alpha * loc_loss
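# How one of these loss modules might be called. The wrapper name `MultiBoxLoss`
# and its constructor arguments are assumptions for illustration; only `forward`
# is shown in this section. Box coordinates are fractional (0-1) boundary form.

import torch

N, n_priors, n_classes = 2, 8732, 21
predicted_locs = torch.randn(N, n_priors, 4)
predicted_scores = torch.randn(N, n_priors, n_classes)
boxes = [torch.tensor([[0.1, 0.1, 0.4, 0.5]]),                        # 1 object in image 0
         torch.tensor([[0.2, 0.3, 0.9, 0.8], [0.5, 0.5, 0.7, 0.9]])]  # 2 objects in image 1
labels = [torch.tensor([12]), torch.tensor([3, 7])]

# criterion = MultiBoxLoss(priors_cxcy, threshold=0.5, neg_pos_ratio=3, alpha=1.)  # hypothetical
# loss = criterion(predicted_locs, predicted_scores, boxes, labels)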
def forward(self, locs_pred, cls_pred, boxes, labels):
    '''
    Forward propagation

    locs_pred: predicted locations, a tensor of dimensions (N, 8732, 4)
    cls_pred: predicted class scores for each of the encoded boxes, a tensor of dimensions (N, 8732, n_classes)
    boxes: true object bounding boxes, a list of N tensors
    labels: true object labels, a list of N tensors

    Out: multibox loss
    '''
    batch_size = locs_pred.size(0)  # N
    n_default_boxes = self.default_boxes.size(0)  # 8732
    num_classes = cls_pred.size(2)  # num_classes

    t_locs = torch.zeros((batch_size, n_default_boxes, 4), dtype=torch.float).to(device)  # (N, 8732, 4)
    t_classes = torch.zeros((batch_size, n_default_boxes), dtype=torch.long).to(device)  # (N, 8732)

    default_boxes_xy = cxcy_to_xy(self.default_boxes)

    for i in range(batch_size):
        n_objects = boxes[i].size(0)
        overlap = find_IoU(boxes[i], default_boxes_xy)  # (n_objects, 8732)

        # For each default box, find the object that has the maximum overlap
        overlap_each_default_box, object_each_default_box = overlap.max(dim=0)  # (8732)

        # Find the default box that has the maximum overlap for each object
        _, default_boxes_each_object = overlap.max(dim=1)

        object_each_default_box[default_boxes_each_object] = torch.LongTensor(range(n_objects)).to(device)
        overlap_each_default_box[default_boxes_each_object] = 1.

        # Labels for each default box
        label_each_default_box = labels[i][object_each_default_box]  # (8732)
        label_each_default_box[overlap_each_default_box < self.threshold] = 0  # (8732)

        # Save
        t_classes[i] = label_each_default_box

        # Encode gt bboxes as offsets w.r.t. the default boxes
        t_locs[i] = encode_bboxes(xy_to_cxcy(boxes[i][object_each_default_box]), self.default_boxes)  # (8732, 4)

    # Identify default boxes that are positive
    pos_default_boxes = t_classes != 0  # (N, 8732)

    # Localization loss: computed only over positive default boxes
    smooth_L1_loss = nn.SmoothL1Loss()
    loc_loss = smooth_L1_loss(locs_pred[pos_default_boxes], t_locs[pos_default_boxes])

    # Confidence loss: apply hard negative mining
    # Number of positive and hard-negative default boxes per image
    n_positive = pos_default_boxes.sum(dim=1)
    n_hard_negatives = self.neg_pos * n_positive

    # Find the loss for all default boxes
    cross_entropy_loss = nn.CrossEntropyLoss(reduction='none')
    confidence_loss_all = cross_entropy_loss(cls_pred.view(-1, num_classes), t_classes.view(-1))  # (N*8732)
    confidence_loss_all = confidence_loss_all.view(batch_size, n_default_boxes)  # (N, 8732)

    confidence_pos_loss = confidence_loss_all[pos_default_boxes]

    # Find which default boxes are hard-negative
    confidence_neg_loss = confidence_loss_all.clone()  # (N, 8732)
    confidence_neg_loss[pos_default_boxes] = 0.
    confidence_neg_loss, _ = confidence_neg_loss.sort(dim=1, descending=True)

    hardness_ranks = torch.LongTensor(range(n_default_boxes)).unsqueeze(0).expand_as(confidence_neg_loss).to(device)  # (N, 8732)
    hard_negatives = hardness_ranks < n_hard_negatives.unsqueeze(1)  # (N, 8732)
    confidence_hard_neg_loss = confidence_neg_loss[hard_negatives]

    confidence_loss = (confidence_hard_neg_loss.sum() + confidence_pos_loss.sum()) / n_positive.sum().float()

    return self.alpha * loc_loss + confidence_loss
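# The snippets call `find_IoU` / `find_jaccard_overlap` without defining them. One
# plausible implementation (a sketch, not necessarily this repo's): pairwise IoU
# between two sets of boxes in boundary coordinates, computed via broadcasting.

import torch

def find_jaccard_overlap(set_1, set_2):
    # Intersection: (n1, n2, 2) lower-left and upper-right corners of each overlap
    lower = torch.max(set_1[:, :2].unsqueeze(1), set_2[:, :2].unsqueeze(0))
    upper = torch.min(set_1[:, 2:].unsqueeze(1), set_2[:, 2:].unsqueeze(0))
    wh = torch.clamp(upper - lower, min=0)        # non-overlapping pairs clip to 0
    intersection = wh[:, :, 0] * wh[:, :, 1]      # (n1, n2)

    areas_1 = (set_1[:, 2] - set_1[:, 0]) * (set_1[:, 3] - set_1[:, 1])  # (n1)
    areas_2 = (set_2[:, 2] - set_2[:, 0]) * (set_2[:, 3] - set_2[:, 1])  # (n2)
    union = areas_1.unsqueeze(1) + areas_2.unsqueeze(0) - intersection   # (n1, n2)

    return intersection / union  # (n1, n2)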
def forward(self, predicted_locs, predicted_scores, boxes, labels):
    """
    Forward propagation.

    :param predicted_locs: predicted locations/boxes w.r.t the prior boxes, a tensor of dimensions (N, n_priors, 4)
    :param predicted_scores: class scores for each of the encoded locations/boxes, a tensor of dimensions (N, n_priors, n_classes)
    :param boxes: true object bounding boxes in boundary coordinates, a list of N tensors
    :param labels: true object labels, a list of N tensors
    :return: multibox loss, a scalar
    """
    batch_size = predicted_locs.size(0)
    n_priors = self.priors_cxcy.size(0)
    n_classes = predicted_scores.size(2)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)

    true_locs = torch.zeros((batch_size, n_priors, 4), dtype=torch.float).to(device)
    true_classes = torch.zeros((batch_size, n_priors), dtype=torch.long).to(device)

    # For each image
    for i in range(batch_size):
        n_objects = boxes[i].size(0)

        overlap = find_jaccard_overlap(boxes[i], self.priors_xy)  # (n_objects, n_priors)

        # For each prior, find the object that has the maximum overlap
        overlap_for_each_prior, object_for_each_prior = overlap.max(dim=0)  # (n_priors)

        # An object might not be the best match for any prior (1.), and all priors matched
        # to an object may fall below the threshold (2.), so:
        # First, find the prior that has the maximum overlap for each object.
        _, prior_for_each_object = overlap.max(dim=1)  # (N_o)

        # Then, assign each object to the corresponding maximum-overlap-prior. (This fixes 1.)
        object_for_each_prior[prior_for_each_object] = torch.LongTensor(range(n_objects)).to(device)

        # To ensure these priors qualify, artificially give them an overlap of greater than 0.5. (This fixes 2.)
        overlap_for_each_prior[prior_for_each_object] = 1.

        # Labels for each prior
        label_for_each_prior = labels[i][object_for_each_prior]  # (n_priors)
        # Set priors whose overlaps with objects are less than the threshold to be background (no object)
        label_for_each_prior[overlap_for_each_prior < self.threshold] = 0  # (n_priors)

        # Store
        true_classes[i] = label_for_each_prior

        # Encode center-size object coordinates into the form we regressed predicted boxes to
        true_locs[i] = cxcy_to_gcxgcy(xy_to_cxcy(boxes[i][object_for_each_prior]), self.priors_cxcy)  # (n_priors, 4)

    # Identify priors that are positive (object/non-background)
    positive_priors = true_classes != 0  # (N, n_priors)

    # Localization loss is computed only over positive (non-background) priors
    loc_loss = self.smooth_l1(predicted_locs[positive_priors], true_locs[positive_priors])  # (), scalar

    # Number of positive and hard-negative priors per image
    n_positives = positive_priors.sum(dim=1)  # (N)
    n_hard_negatives = self.neg_pos_ratio * n_positives  # (N)

    # First, find the loss for all priors
    conf_loss_all = self.cross_entropy(predicted_scores.view(-1, n_classes), true_classes.view(-1))  # (N * n_priors)
    conf_loss_all = conf_loss_all.view(batch_size, n_priors)  # (N, n_priors)

    # We already know which priors are positive
    conf_loss_pos = conf_loss_all[positive_priors]  # (sum(n_positives))

    # Next, find which priors are hard-negative
    # To do this, sort ONLY negative priors in each image in order of decreasing loss and take top n_hard_negatives
    conf_loss_neg = conf_loss_all.clone()  # (N, n_priors)
    conf_loss_neg[positive_priors] = 0.  # (N, n_priors), positive priors are ignored (never in top n_hard_negatives)
    conf_loss_neg, _ = conf_loss_neg.sort(dim=1, descending=True)  # (N, n_priors), sorted by decreasing hardness
    hardness_ranks = torch.LongTensor(range(n_priors)).unsqueeze(0).expand_as(conf_loss_neg).to(device)  # (N, n_priors)
    hard_negatives = hardness_ranks < n_hard_negatives.unsqueeze(1)  # (N, n_priors)
    conf_loss_hard_neg = conf_loss_neg[hard_negatives]  # (sum(n_hard_negatives))

    # As in the paper, averaged over positive priors only, although computed over both positive and hard-negative priors
    conf_loss = (conf_loss_hard_neg.sum() + conf_loss_pos.sum()) / n_positives.sum().float()  # (), scalar

    # TOTAL LOSS
    return conf_loss + self.alpha * loc_loss
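# Several comments in these snippets note that masking across the (N, n_priors)
# dimensions flattens the result. A two-line check of that behaviour:

import torch

locs = torch.arange(24, dtype=torch.float).view(2, 3, 4)          # (N=2, n_priors=3, 4)
mask = torch.tensor([[True, False, True], [False, False, True]])  # (N, n_priors)
print(locs[mask].shape)  # torch.Size([3, 4]): (total positives, 4)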
def forward(self, output_boxes, output_scores, true_boxes, true_labels):
    batch_size = output_boxes.size(0)
    n_classes = output_scores.size(2)
    n_priors = self.default_cxcy.size(0)

    gt_locs = torch.Tensor(batch_size, n_priors, 4).to(self.device)
    gt_class = torch.LongTensor(batch_size, n_priors).to(self.device)

    for im in range(batch_size):
        n_objects = true_boxes[im].size(0)

        # Compute IoU for each ground truth box with the default boxes
        overlaps = find_jaccard_overlap(true_boxes[im], self.default_xy)  # (n_objects, 8732)

        # Find the highest-overlap object for each default box, and then the
        # highest-overlap default box for each object
        overlap_per_default, object_per_default = overlaps.max(dim=0)
        overlap_per_object, default_per_object = overlaps.max(dim=1)

        # Assign each object to its highest-overlap default box
        object_per_default[default_per_object] = torch.LongTensor(range(n_objects)).to(self.device)

        # Give these default boxes an overlap of 1 (ensure positive)
        overlap_per_default[default_per_object] = 1.

        # Assign labels to the default boxes according to the best overlap
        default_labels = true_labels[im][object_per_default]
        default_labels[overlap_per_default < self.threshold] = 0

        gt_class[im] = default_labels
        gt_locs[im] = cxcy_to_gcxgcy(xy_to_cxcy(true_boxes[im][object_per_default]), self.default_cxcy)

    positive_defaults = (gt_class > 0)

    # Localization loss: computed against the encoded ground-truth offsets
    L_loc = self.smooth_l1(output_boxes[positive_defaults], gt_locs[positive_defaults])

    # Confidence loss
    n_positives = positive_defaults.sum(dim=1)  # (N)
    n_hard_negatives = self.hard_neg_scale * n_positives

    conf_all = output_scores.view(-1, n_classes)
    L_conf_all = self.cross_entropy(conf_all, gt_class.view(-1))
    L_conf_all = L_conf_all.view(batch_size, n_priors)  # (N, 8732)

    # We already know which priors are positive
    L_conf_pos = L_conf_all[positive_defaults]  # (sum(n_positives))

    # Next, find which priors are hard-negative
    # To do this, sort ONLY negative priors in each image in order of decreasing loss and take top n_hard_negatives
    L_conf_neg = L_conf_all.clone()  # (N, 8732)
    L_conf_neg[positive_defaults] = 0.  # (N, 8732), positive priors are ignored (never in top n_hard_negatives)
    L_conf_neg, _ = L_conf_neg.sort(dim=1, descending=True)  # (N, 8732), sorted by decreasing hardness
    hardness_ranks = torch.LongTensor(range(n_priors)).unsqueeze(0).expand_as(L_conf_neg).to(self.device)  # (N, 8732)
    hard_negatives = hardness_ranks < n_hard_negatives.unsqueeze(1)
    L_conf_hard_neg = L_conf_neg[hard_negatives]

    # As in the paper, averaged over positive priors only, although computed over both positive and hard-negative priors
    L_conf = (L_conf_hard_neg.sum() + L_conf_pos.sum()) / n_positives.sum().float()  # (), scalar

    loss = L_conf + self.alpha * L_loc
    return loss
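# All of the matching loops above share the same two-sided trick: take the best
# object per prior, then force each object's own best prior to point back at it.
# A toy demo (made-up overlaps) of why both fixes matter:

import torch

# (n_objects=2, n_priors=2): object 0 is never any prior's argmax, and its best
# overlap (0.4) would also fall below a 0.5 threshold.
overlap = torch.tensor([[0.4, 0.1],
                        [0.5, 0.7]])

overlap_per_prior, object_per_prior = overlap.max(dim=0)
print(object_per_prior)                     # tensor([1, 1]): object 0 unrepresented

_, prior_per_object = overlap.max(dim=1)
object_per_prior[prior_per_object] = torch.arange(2)  # fixes 1: object 0 claims prior 0
overlap_per_prior[prior_per_object] = 1.              # fixes 2: survives thresholding
print(object_per_prior, overlap_per_prior)  # tensor([0, 1]) tensor([1., 1.])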