def build_target(self, anno):
    """
    Building a target for loss calculation is encapsulated in the detection
    model class. This method is intended to be called externally, in data
    loader threads, and must have no side effects on the self object.

    :param anno: list of boxes with class ids
    :return: ((loc, cls), matches):
        loc: float tensor of shape (A, 4), A - total number of anchors
        cls: int tensor of shape (A,) of class labels, where 0 - background,
             1 - class 0, etc.
        matches: statistics of coverage of GT boxes by anchors
    """
    anno = self._anno_class_names_to_ids(anno)
    if len(anno) > 0:
        gt_boxes = np.stack([obj['bbox'] for obj in anno], axis=0)
        gt_classes = np.stack([obj['class_id'] for obj in anno], axis=0).astype(np.int32)
    else:
        gt_boxes = np.zeros((0, 4), dtype=np.float32)
        gt_classes = np.zeros((0,), dtype=np.int32)
    gt_boxes = torch.from_numpy(gt_boxes)
    gt_classes = torch.from_numpy(gt_classes).long()
    loc, cls, matches = box_utils.match(self.iou_anchor_and_gt, gt_boxes,
                                        self.anchors_cxcywh, self.variances,
                                        gt_classes)
    return (loc, cls), matches
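For reference, here is a minimal sketch of what a box_utils.match-style assignment typically does; the exact signature and return values of this codebase's box_utils.match are assumptions (it also encodes loc offsets, omitted here), and corner-form anchors are assumed rather than the cxcywh form stored on the model:

import torch

def iou_matrix(boxes_a, boxes_b):
    # Pairwise IoU; boxes in corner form (xmin, ymin, xmax, ymax),
    # shapes (A, 4) and (B, 4) -> (A, B).
    lt = torch.max(boxes_a[:, None, :2], boxes_b[None, :, :2])
    rb = torch.min(boxes_a[:, None, 2:], boxes_b[None, :, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[..., 0] * wh[..., 1]
    area_a = ((boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1]))[:, None]
    area_b = ((boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1]))[None, :]
    return inter / (area_a + area_b - inter)

def match_anchors(iou_threshold, gt_boxes, anchors, gt_classes):
    # Returns per-anchor class labels (0 = background) and matched GT indices.
    if gt_boxes.numel() == 0:
        return torch.zeros(anchors.size(0), dtype=torch.long), None
    iou = iou_matrix(anchors, gt_boxes)            # (A, G)
    best_gt_iou, best_gt = iou.max(dim=1)          # best GT per anchor
    best_anchor = iou.max(dim=0)[1]                # best anchor per GT
    best_gt_iou[best_anchor] = 2.0                 # force-match every GT box
    for g, a in enumerate(best_anchor):
        best_gt[a] = g
    cls = gt_classes[best_gt] + 1                  # shift: 0 is background
    cls[best_gt_iou < iou_threshold] = 0
    return cls, best_gt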
def forward(self, predictions, targets):
    """Multibox Loss
    Args:
        predictions (tuple): A tuple containing loc preds, conf preds,
            and prior boxes from SSD net.
            conf shape: torch.size(batch_size, num_priors, num_classes)
            loc shape: torch.size(batch_size, num_priors, 4)
            priors shape: torch.size(num_priors, 4)
        targets (tensor): Ground truth boxes and labels for a batch,
            shape: [batch_size, num_objs, 5] (last idx is the label).
    """
    # Unpack predictions into offsets, confidences, and prior boxes
    loc_data, conf_data, priors = predictions
    num = loc_data.size(0)
    priors = priors[:loc_data.size(1), :]
    num_priors = priors.size(0)

    # match priors (default boxes) and ground truth boxes
    # Allocate tensors for ground-truth offsets and labels
    loc_t = torch.Tensor(num, num_priors, 4)
    conf_t = torch.LongTensor(num, num_priors)
    # Loop over the batch, splitting each target into GT boxes and labels
    for idx in range(num):
        truths = targets[idx][:, :-1].data
        labels = targets[idx][:, -1].data
        defaults = priors.data
        # Match ground-truth boxes to default boxes
        match(self.threshold, truths, defaults, self.variance, labels,
              loc_t, conf_t, idx)
    if self.use_gpu:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        loc_t = loc_t.to(device)
        conf_t = conf_t.to(device)

    # Mask of positive boxes (class id > 0)
    pos = conf_t > 0
    # Number of positive boxes
    num_pos = pos.sum(dim=1, keepdim=True)

    # Localization Loss (Smooth L1)
    # Shape: [batch, num_priors, 4]
    # Index mask pos_idx for the positive boxes
    pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
    # Predicted offsets
    loc_p = loc_data[pos_idx].view(-1, 4)
    # Ground-truth offsets
    loc_t = loc_t[pos_idx].view(-1, 4)
    # Localization loss
    loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')

    # Compute max conf across batch for hard negative mining
    batch_conf = conf_data.view(-1, self.num_classes)
    loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))

    # Hard Negative Mining
    # Reshape before zeroing positives so `pos` indexes the batch rows correctly
    loss_c = loss_c.view(num, -1)
    loss_c[pos] = 0  # filter out pos boxes for now
    _, loss_idx = loss_c.sort(1, descending=True)
    _, idx_rank = loss_idx.sort(1)
    num_pos = pos.long().sum(1, keepdim=True)
    num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1)
    neg = idx_rank < num_neg.expand_as(idx_rank)

    # Confidence Loss Including Positive and Negative Examples
    pos_idx = pos.unsqueeze(2).expand_as(conf_data)
    neg_idx = neg.unsqueeze(2).expand_as(conf_data)
    # Filter predicted confidences conf_data with pos_idx + neg_idx
    conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view(-1, self.num_classes)
    # Filter ground-truth labels conf_t with pos and neg
    targets_weighted = conf_t[(pos + neg).gt(0)]
    # Class confidence loss
    loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')

    # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
    # Cast to double before dividing by the positive count
    N = num_pos.data.sum().double()
    loss_l = loss_l.double()
    loss_c = loss_c.double()
    loss_l /= N
    loss_c /= N
    return loss_l, loss_c
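The log_sum_exp helper used above is not shown in this snippet; the variant commonly paired with this loss (e.g. in ssd.pytorch) subtracts the batch max for numerical stability before exponentiating:

def log_sum_exp(x):
    """Numerically stable log-sum-exp over the class dimension.
    x: (batch * num_priors, num_classes) confidence logits."""
    x_max = x.data.max()
    return torch.log(torch.sum(torch.exp(x - x_max), 1, keepdim=True)) + x_max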
loc_data, conf_data, priors = out
num = loc_data.size(0)
priors = priors[:loc_data.size(1), :]
num_priors = priors.size(0)

# match priors (default boxes) and ground truth boxes
# Allocate tensors for ground-truth offsets and labels
loc_t = torch.Tensor(num, num_priors, 4)
conf_t = torch.LongTensor(num, num_priors)
# Loop over the batch, splitting each target into GT boxes and labels
for idx in range(num):
    truths = targets[idx][:, :-1].data
    labels = targets[idx][:, -1].data
    defaults = priors.data
    # Match ground-truth boxes to default boxes
    match(0.5, truths, defaults, [0.1, 0.2], labels, loc_t, conf_t, idx)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
loc_t = loc_t.to(device)
conf_t = conf_t.to(device)
# Mask of positive boxes (class id > 0)
pos = conf_t > 0
# Number of positive boxes
num_pos = pos.sum(dim=1, keepdim=True)

# Localization Loss (Smooth L1)
# Shape: [batch, num_priors, 4]
# Index mask pos_idx for the positive boxes
pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
# Predicted offsets
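The hard-coded [0.1, 0.2] are the SSD center/size variances. Inside match(), each matched corner-form GT box is typically encoded against its cxcywh prior as below; this is a sketch of the standard SSD encoding, not necessarily this codebase's exact helper:

def encode(matched, priors, variances):
    # matched: (num_priors, 4) matched GT boxes in corner form
    # priors:  (num_priors, 4) default boxes in (cx, cy, w, h) form
    g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2]
    g_cxcy /= variances[0] * priors[:, 2:]          # scaled center offsets
    g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
    g_wh = torch.log(g_wh) / variances[1]           # log-scale size offsets
    return torch.cat([g_cxcy, g_wh], 1)             # (num_priors, 4)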
def forward(self, predictions, targets):
    """Multibox Loss
    Args:
        predictions (tuple): A tuple containing loc preds, conf preds,
            and prior boxes from SSD net.
            conf shape: torch.size(batch_size,num_priors,num_classes)
            loc shape: torch.size(batch_size,num_priors,4)
            priors shape: torch.size(num_priors,4)
        targets (tensor): Ground truth boxes and labels for a batch,
            shape: [batch_size,num_objs,5] (last idx is the label).
    """
    loc_data, conf_data, priors = predictions
    num = loc_data.size(0)
    priors = priors[:loc_data.size(1), :]
    num_priors = priors.size(0)
    num_classes = self.num_classes

    # match priors (default boxes) and ground truth boxes
    loc_t = torch.Tensor(num, num_priors, 4)
    conf_t = torch.LongTensor(num, num_priors)
    for idx in range(num):
        truths = targets[idx][:, :-1].data
        labels = targets[idx][:, -1].data
        defaults = priors.data
        match(self.threshold, truths, defaults, self.variance, labels,
              loc_t, conf_t, idx)
    if self.use_gpu:
        loc_t = loc_t.cuda()
        conf_t = conf_t.cuda()
    # wrap targets
    loc_t.requires_grad = False
    conf_t.requires_grad = False

    pos = conf_t > 0
    num_pos = pos.sum(dim=1, keepdim=True)

    # Localization Loss (Smooth L1)
    # Shape: [batch,num_priors,4]
    pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
    loc_p = loc_data[pos_idx].view(-1, 4)
    loc_t = loc_t[pos_idx].view(-1, 4)
    # reduction='sum' replaces the deprecated size_average=False
    loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')

    # Compute max conf across batch for hard negative mining
    batch_conf = conf_data.view(-1, self.num_classes)
    loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))

    # Hard Negative Mining
    loss_c = loss_c.view(num, -1)
    loss_c[pos] = 0  # filter out pos boxes for now
    _, loss_idx = loss_c.sort(1, descending=True)
    _, idx_rank = loss_idx.sort(1)
    num_pos = pos.long().sum(1, keepdim=True)
    num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1)
    neg = idx_rank < num_neg.expand_as(idx_rank)

    # Confidence Loss Including Positive and Negative Examples
    pos_idx = pos.unsqueeze(2).expand_as(conf_data)
    neg_idx = neg.unsqueeze(2).expand_as(conf_data)
    conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view(-1, self.num_classes)
    targets_weighted = conf_t[(pos + neg).gt(0)]
    loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')

    # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
    N = num_pos.data.sum().double()
    loss_c = loss_c.double()
    loss_l = loss_l.double()
    loss_l /= N
    loss_c /= N
    return loss_l, loss_c
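A hypothetical smoke test of this loss with random tensors; the class name, constructor arguments, and prior count (8732, as in SSD300) are assumptions, and with purely random priors the matching step may produce no positives, so real priors and targets are needed for a meaningful value:

criterion = MultiBoxLoss(num_classes=21, overlap_thresh=0.5,
                         neg_pos=3, use_gpu=False)      # hypothetical ctor args
loc = torch.randn(2, 8732, 4, requires_grad=True)
conf = torch.randn(2, 8732, 21, requires_grad=True)
priors = torch.rand(8732, 4)                            # (cx, cy, w, h) in [0, 1]
targets = [torch.tensor([[0.1, 0.1, 0.5, 0.5, 14.0]]) for _ in range(2)]
loss_l, loss_c = criterion((loc, conf, priors), targets)
(loss_l + loss_c).backward()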
def forward(self, predictions, targets, masks, num_crowds): """Multibox Loss Args: predictions (tuple): A tuple containing loc preds, conf preds, mask preds, and prior boxes from SSD net. loc shape: torch.size(batch_size,num_priors,4) conf shape: torch.size(batch_size,num_priors,num_classes) masks shape: torch.size(batch_size,num_priors,mask_dim) priors shape: torch.size(num_priors,4) proto* shape: torch.size(batch_size,mask_h,mask_w,mask_dim) targets (list<tensor>): Ground truth boxes and labels for a batch, shape: [batch_size][num_objs,5] (last idx is the label). masks (list<tensor>): Ground truth masks for each object in each image, shape: [batch_size][num_objs,im_height,im_width] num_crowds (list<int>): Number of crowd annotations per batch. The crowd annotations should be the last num_crowds elements of targets and masks. * Only if mask_type == lincomb """ loc_data = predictions["loc"] conf_data = predictions["conf"] mask_data = predictions["mask"] priors = predictions["priors"] if cfg.mask_type == mask_type.lincomb: proto_data = predictions["proto"] score_data = predictions["score"] if cfg.use_mask_scoring else None inst_data = predictions["inst"] if cfg.use_instance_coeff else None labels = [None] * len(targets) # Used in sem segm loss batch_size = loc_data.size(0) num_priors = priors.size(0) num_classes = self.num_classes loc_t = loc_data.new(batch_size, num_priors, 4) gt_box_t = loc_data.new(batch_size, num_priors, 4) conf_t = loc_data.new(batch_size, num_priors).long() idx_t = loc_data.new(batch_size, num_priors).long() if cfg.use_class_existence_loss: class_existence_t = loc_data.new(batch_size, num_classes - 1) for idx in range(batch_size): truths = targets[idx][:, :-1].data labels[idx] = targets[idx][:, -1].data.long() if cfg.use_class_existence_loss: class_existence_t[idx, :] = ( torch.eye(num_classes - 1)[labels[idx]].cuda().max(dim=0)[0]) # Split the crowd annotations because they come bundled in cur_crowds = num_crowds[idx] if cur_crowds > 0: def split(x): return (x[-cur_crowds:], x[:-cur_crowds]) crowd_boxes, truths = split(truths) # We don't use the crowd labels or masks _, labels[idx] = split(labels[idx]) _, masks[idx] = split(masks[idx]) else: crowd_boxes = None match( self.pos_threshold, self.neg_threshold, truths, priors.data, labels[idx], crowd_boxes, loc_t, conf_t, idx_t, idx, loc_data[idx], ) gt_box_t[idx, :, :] = truths[idx_t[idx]] # wrap targets loc_t = Variable(loc_t, requires_grad=False) conf_t = Variable(conf_t, requires_grad=False) idx_t = Variable(idx_t, requires_grad=False) pos = conf_t > 0 num_pos = pos.sum(dim=1, keepdim=True) # Shape: [batch,num_priors,4] pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) losses = {} # Localization Loss (Smooth L1) if cfg.train_boxes: loc_p = loc_data[pos_idx].view(-1, 4) loc_t = loc_t[pos_idx].view(-1, 4) losses["B"] = (F.smooth_l1_loss(loc_p, loc_t, reduction="sum") * cfg.bbox_alpha) if cfg.train_masks: if cfg.mask_type == mask_type.direct: if cfg.use_gt_bboxes: pos_masks = [] for idx in range(batch_size): pos_masks.append(masks[idx][idx_t[idx, pos[idx]]]) masks_t = torch.cat(pos_masks, 0) masks_p = mask_data[pos, :].view(-1, cfg.mask_dim) losses["M"] = (F.binary_cross_entropy( torch.clamp(masks_p, 0, 1), masks_t, reduction="sum", ) * cfg.mask_alpha) else: losses["M"] = self.direct_mask_loss( pos_idx, idx_t, loc_data, mask_data, priors, masks) elif cfg.mask_type == mask_type.lincomb: losses.update( self.lincomb_mask_loss( pos, idx_t, loc_data, mask_data, priors, proto_data, masks, gt_box_t, score_data, inst_data, 
)) if cfg.mask_proto_loss is not None: if cfg.mask_proto_loss == "l1": losses["P"] = (torch.mean(torch.abs(proto_data)) / self.l1_expected_area * self.l1_alpha) elif cfg.mask_proto_loss == "disj": losses["P"] = -torch.mean( torch.max(F.log_softmax(proto_data, dim=-1), dim=-1)[0]) # Confidence loss if cfg.use_focal_loss: if cfg.use_sigmoid_focal_loss: losses["C"] = self.focal_conf_sigmoid_loss(conf_data, conf_t) elif cfg.use_objectness_score: losses["C"] = self.focal_conf_objectness_loss( conf_data, conf_t) else: losses["C"] = self.focal_conf_loss(conf_data, conf_t) else: if cfg.use_objectness_score: losses["C"] = self.conf_objectness_loss( conf_data, conf_t, batch_size, loc_p, loc_t, priors) else: losses["C"] = self.ohem_conf_loss(conf_data, conf_t, pos, batch_size) # These losses also don't depend on anchors if cfg.use_class_existence_loss: losses["E"] = self.class_existence_loss(predictions["classes"], class_existence_t) if cfg.use_semantic_segmentation_loss: losses["S"] = self.semantic_segmentation_loss( predictions["segm"], masks, labels) # Divide all losses by the number of positives. # Don't do it for loss[P] because that doesn't depend on the anchors. total_num_pos = num_pos.data.sum().float() for k in losses: if k not in ("P", "E", "S"): losses[k] /= total_num_pos else: losses[k] /= batch_size # Loss Key: # - B: Box Localization Loss # - C: Class Confidence Loss # - M: Mask Loss # - P: Prototype Loss # - D: Coefficient Diversity Loss # - E: Class Existence Loss # - S: Semantic Segmentation Loss return losses
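A sketch of how a YOLACT-style training step might consume this dictionary, using the loss key legend above; the criterion/optimizer names are assumptions:

losses = criterion(predictions, targets, masks, num_crowds)
total_loss = sum(losses.values())   # each entry is already normalized above
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
print({k: v.item() for k, v in losses.items()})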
def __getitem__(self, idx):
    # total number of samples in the dataset
    n = len(self.files)
    # the last batch may be smaller than batch_size
    if n >= (idx + 1) * self.batch_size:
        current_batch_size = self.batch_size
    else:
        current_batch_size = n - idx * self.batch_size
    file_names = self.files[idx * self.batch_size:
                            idx * self.batch_size + current_batch_size]

    batch_x = []
    batch_y = []
    num_priors = self.priors.shape[0]

    for m, files in enumerate(file_names):
        labels = np.zeros(shape=(num_priors, self.num_classes + 4),
                          dtype=np.float32)
        image_path = self.root_path / files[0] / 'JPEGImages' / files[1]
        annotation_path = self.root_path / files[0] / 'Annotations' / files[1]
        image_file = image_path.with_suffix('.jpg')
        annotation_file = annotation_path.with_suffix('.xml')

        # Read the image
        image = load_image(image_file, target_size=self.target_size)
        image = np.array(image, dtype=np.float32)

        # Get the ground truth
        self.ReadVOCAnnotations(annotation_file=annotation_file)
        ground_truth = np.array(self.TransformBNDBoxes(), dtype=np.float32)

        image, ground_truth[:, 1:] = self.image_data_generator.random_transforms(
            (image, ground_truth[:, 1:]))
        image = self.image_data_generator.standardize(image)

        image = torch.from_numpy(image).float()
        ground_truth = torch.from_numpy(ground_truth).float()
        bndbox_loc = ground_truth[:, 1:]
        class_ids = ground_truth[:, 0]
        loc, class_id = match(
            truths=point_form(bndbox_loc),  # convert to corner form (xmin, ymin, xmax, ymax)
            labels=class_ids,
            priors=self.priors,
            variances=[0.1, 0.2],
            threshold=0.5)
        class_id = to_categorical(class_id, num_classes=self.num_classes)
        labels[:, :4] = loc
        labels[:, 4:] = class_id

        batch_x.append(image)
        batch_y.append(labels)

    return (batch_x, batch_y)
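point_form above converts (cx, cy, w, h) boxes to corner form; a minimal version consistent with how it is called here:

def point_form(boxes):
    # (cx, cy, w, h) -> (xmin, ymin, xmax, ymax)
    return torch.cat((boxes[:, :2] - boxes[:, 2:] / 2,
                      boxes[:, :2] + boxes[:, 2:] / 2), 1)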
def forward(self, predictions, prior_boxes, targets):
    """
    Multibox Loss
    Args:
        predictions (tuple): A tuple containing loc preds and conf preds
            from blazefaceNet.
            loc shape: torch.size(batch_size, num_prior_boxes, 4)
            conf shape: torch.size(batch_size, num_prior_boxes, num_classes)
        prior_boxes shape: torch.size(num_prior_boxes, 4)
        targets (Tensor): ground truth boxes for a batch
    """
    loc_data, conf_data = predictions
    batch_size = loc_data.size(0)
    num_prior_boxes = loc_data.size(1)
    priorboxes = prior_boxes

    # match prior_boxes with ground truth boxes
    loc_target = torch.Tensor(batch_size, num_prior_boxes, 4)
    conf_target = torch.LongTensor(batch_size, num_prior_boxes)
    for idx in range(batch_size):
        truths = targets[idx].data
        # single-class (face) detection: every GT box gets label 1
        labels = torch.ones([truths.size(0), 1])
        defaults = priorboxes.data
        match(self.threshold, truths, defaults, self.variance, labels,
              loc_target, conf_target, idx)
    if self.use_gpu:
        loc_target = loc_target.cuda()
        conf_target = conf_target.cuda()

    loc_target = Variable(loc_target, requires_grad=False)
    conf_target = Variable(conf_target, requires_grad=False)
    # ----------------------------------------------------
    pos = conf_target > 0
    num_pos = pos.sum(dim=1, keepdim=True)

    # Localization Loss (Smooth L1)
    # Shape: [batch, num_priors, 4]
    pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
    loc_p = loc_data[pos_idx].view(-1, 4)
    loc_t = loc_target[pos_idx].view(-1, 4)
    loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')

    # Compute max conf across batch for hard negative mining
    batch_conf = conf_data.view(-1, self.num_classes)
    loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_target.view(-1, 1))

    # Hard Negative Mining
    loss_c = loss_c.view(batch_size, -1)
    loss_c[pos] = 0  # filter out pos boxes for now
    _, loss_idx = loss_c.sort(1, descending=True)
    _, idx_rank = loss_idx.sort(1)
    num_pos = pos.long().sum(1, keepdim=True)
    num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1)
    neg = idx_rank < num_neg.expand_as(idx_rank)

    # Confidence Loss Including Positive and Negative Examples
    pos_idx = pos.unsqueeze(2).expand_as(conf_data)
    neg_idx = neg.unsqueeze(2).expand_as(conf_data)
    conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view(-1, self.num_classes)
    targets_weighted = conf_target[(pos + neg).gt(0)]
    loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')

    # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
    # ----------------------------------------------------
    N = num_pos.data.sum().float()
    loss_l /= N
    loss_c /= N
    return loss_l, loss_c
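The sort-of-a-sort idiom in the hard negative mining section computes, for every prior, the rank of its loss within its batch row; a tiny standalone demo:

import torch

loss_c = torch.tensor([[0.2, 0.9, 0.1, 0.5]])
_, loss_idx = loss_c.sort(1, descending=True)  # indices by decreasing loss
_, idx_rank = loss_idx.sort(1)                 # rank of each element
print(idx_rank)     # tensor([[2, 0, 3, 1]]): 0.9 has rank 0, 0.5 has rank 1, ...
neg = idx_rank < 2  # keep the 2 hardest negatives per row
print(neg)          # tensor([[False,  True, False,  True]])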
import os
import sys

import cv2
import torch

from box_utils import match

# convert height to ymax and scale to the 300x300 input resolution
boxes[:, 3] = boxes[:, 1] + boxes[:, 3]
boxes_priors = boxes * 300

images = files_with_ext(sys.argv[1], '.JPG')
xmls = files_with_ext(sys.argv[2], '.xml')

scores = 0
for image in images:
    print(image)
    image_name = image
    xml_name = os.path.join(
        sys.argv[2],
        os.path.basename(image_name).replace('.JPG', '.xml'))
    objects, width, height = read_xml(xml_name)
    boxes = []
    for obj in objects:
        cboxes = objects[obj]
        newboxes = [[int(i) for i in box] for box in cboxes]
        boxes += newboxes
    match_score = match(0.9, torch.FloatTensor(boxes), boxes_priors)
    scores += match_score
    print(match_score)
print("total scores:", scores)
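files_with_ext and read_xml are helpers defined elsewhere in this codebase; a plausible minimal files_with_ext, assuming it returns the sorted full paths of all files with the given extension:

import os

def files_with_ext(directory, ext):
    # Return sorted full paths of all files in `directory` ending in `ext`.
    return sorted(os.path.join(directory, f)
                  for f in os.listdir(directory) if f.endswith(ext))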