Example #1
    def build_target(self, anno):
        """
        Building the target for loss calculation is encapsulated in the detection model class.
        This method is intended to be called externally, from data loader threads, and must have no side effects on the self object.

        :param anno: list of boxes with class ids
        :return:
            (loc, cls): encoded target: location regression and classification targets
                loc: float tensor of shape (A, 4), A - total number of anchors
                cls: int tensor of shape (A,) of class labels, where 0 - background, 1 - class 0, etc
            matches: statistics of coverage of GT boxes by anchors
        """

        anno = self._anno_class_names_to_ids(anno)

        if len(anno) > 0:
            gt_boxes = np.stack([obj['bbox'] for obj in anno], axis=0)
            gt_classes = np.stack([obj['class_id'] for obj in anno],
                                  axis=0).astype(np.int32)
        else:
            gt_boxes = np.zeros((0, 4), dtype=np.float32)
            gt_classes = np.zeros((0, ), dtype=np.int32)

        gt_boxes = torch.from_numpy(gt_boxes)
        gt_classes = torch.from_numpy(gt_classes).long()

        loc, cls, matches = box_utils.match(self.iou_anchor_and_gt, gt_boxes,
                                            self.anchors_cxcywh,
                                            self.variances, gt_classes)

        return (loc, cls), matches
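A minimal sketch of how a build_target method like this might be driven from a dataset, so the encoding runs inside data loader worker threads; the DetectionDataset class and its attributes are hypothetical, not part of the example above.

import torch
from torch.utils.data import Dataset

class DetectionDataset(Dataset):
    # Hypothetical wrapper: samples is a list of (image, anno) pairs and
    # model is a detection model exposing the build_target() shown above
    def __init__(self, samples, model):
        self.samples = samples
        self.model = model

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        image, anno = self.samples[idx]
        # build_target has no side effects on the model, so it is safe to
        # call from multiple loader workers (num_workers > 0)
        (loc, cls), matches = self.model.build_target(anno)
        return image, loc, cls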
Example #2
    def forward(self, predictions, targets):
        """Multibox Loss
        Args:
            predictions (tuple): A tuple containing loc preds, conf preds,
            and prior boxes from SSD net.
                conf shape: torch.size(batch_size,num_priors,num_classes)
                loc shape: torch.size(batch_size,num_priors,4)
                priors shape: torch.size(num_priors,4)

            targets (tensor): Ground truth boxes and labels for a batch,
                shape: [batch_size,num_objs,5] (last idx is the label).
        """
        # Unpack predictions into offsets, confidences, and prior box coordinates
        loc_data, conf_data, priors = predictions
        num = loc_data.size(0)
        priors = priors[:loc_data.size(1), :]
        num_priors = (priors.size(0))

        # match priors (default boxes) and ground truth boxes
        # Create tensors for the ground-truth offsets and ground-truth labels
        loc_t = torch.Tensor(num, num_priors, 4)
        conf_t = torch.LongTensor(num, num_priors)
        # Loop over the batch, splitting each sample into GT coordinates and GT labels
        for idx in range(num):
            truths = targets[idx][:, :-1].data
            labels = targets[idx][:, -1].data
            defaults = priors.data
            # Match ground-truth boxes to prior boxes
            match(self.threshold, truths, defaults, self.variance, labels,
                  loc_t, conf_t, idx)
        if self.use_gpu:
            # handbook
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            loc_t = loc_t.to(device)
            conf_t = conf_t.to(device)
            # handbook

        # Mask of positive boxes (class id greater than 0)
        pos = conf_t > 0
        # Number of positive boxes
        num_pos = pos.sum(dim=1, keepdim=True)

        # Localization Loss (Smooth L1)
        # Shape: [batch,num_priors,4]
        # Indices pos_idx of the positive boxes
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
        # Predicted offsets
        loc_p = loc_data[pos_idx].view(-1, 4)
        # Ground-truth offsets
        loc_t = loc_t[pos_idx].view(-1, 4)
        # Smooth L1 loss over the positive boxes
        loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')

        # Compute max conf across batch for hard negative mining
        batch_conf = conf_data.view(-1, self.num_classes)
        loss_c = log_sum_exp(batch_conf) - batch_conf.gather(
            1, conf_t.view(-1, 1))

        # Hard Negative Mining
        # handbook fix: reshape to (num, num_priors) before masking; the original
        # order (commented out below) breaks on newer PyTorch because loss_c has
        # shape (num*num_priors, 1) while pos has shape (num, num_priors)
        #loss_c[pos] = 0  # filter out pos boxes for now
        #loss_c = loss_c.view(num, -1)
        loss_c = loss_c.view(num, -1)
        loss_c[pos] = 0  # filter out pos boxes for now
        # handbook
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)
        num_pos = pos.long().sum(1, keepdim=True)
        num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1)
        neg = idx_rank < num_neg.expand_as(idx_rank)

        # Confidence Loss Including Positive and Negative Examples
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        # Filter the predicted confidences conf_data by pos_idx + neg_idx
        conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view(
            -1, self.num_classes)
        # Filter the ground-truth labels conf_t by pos and neg
        targets_weighted = conf_t[(pos + neg).gt(0)]
        # Class confidence loss
        loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')

        # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N

        # handbook
        #N = num_pos.data.sum()
        N = num_pos.data.sum().double()
        loss_l = loss_l.double()
        loss_c = loss_c.double()
        # handbook
        loss_l /= N
        loss_c /= N
        return loss_l, loss_c
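The log_sum_exp helper called above is not shown on this page; a numerically stable sketch consistent with how these examples call it on a (num*num_priors, num_classes) tensor:

import torch

def log_sum_exp(x):
    # Stable log(sum(exp(x), dim=1)): subtracting the max before
    # exponentiating avoids overflow; output shape is (N, 1)
    x_max = x.data.max()
    return torch.log(torch.sum(torch.exp(x - x_max), 1, keepdim=True)) + x_max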
Example #3
loc_data, conf_data, priors = out
num = loc_data.size(0)
priors = priors[:loc_data.size(1), :]
num_priors = (priors.size(0))

# match priors (default boxes) and ground truth boxes
# Create tensors for the ground-truth offsets and ground-truth labels
loc_t = torch.Tensor(num, num_priors, 4)
conf_t = torch.LongTensor(num, num_priors)
# Loop over the batch, splitting each sample into GT coordinates and GT labels
for idx in range(num):
    truths = targets[idx][:, :-1].data
    labels = targets[idx][:, -1].data
    defaults = priors.data
    # Match ground-truth boxes to prior boxes
    match(0.5, truths, defaults, [0.1, 0.2], labels, loc_t, conf_t, idx)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
loc_t = loc_t.to(device)
conf_t = conf_t.to(device)

# Mask of positive boxes (class id greater than 0)
pos = conf_t > 0
# Number of positive boxes
num_pos = pos.sum(dim=1, keepdim=True)

print(loc_data.shape)
# Localization Loss (Smooth L1)
# Shape: [batch,num_priors,4]
# Indices pos_idx of the positive boxes
pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
# Predicted offsets
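The chained sort calls in the hard negative mining step compute, for each prior, the rank of its loss within the image. A small self-contained demonstration of that idiom with made-up numbers:

import torch

loss_c = torch.tensor([[0.2, 0.9, 0.1, 0.7, 0.4]])  # toy per-prior losses

_, loss_idx = loss_c.sort(1, descending=True)  # priors ordered by descending loss
_, idx_rank = loss_idx.sort(1)                 # rank of each prior (0 = hardest)
print(idx_rank)  # tensor([[3, 0, 4, 1, 2]])

num_neg = 2
neg = idx_rank < num_neg  # keep the num_neg hardest negatives
print(neg)       # tensor([[False,  True, False,  True, False]])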
Example #4
    def forward(self, predictions, targets):
        """Multibox Loss
        Args:
            predictions (tuple): A tuple containing loc preds, conf preds,
            and prior boxes from SSD net.
                conf shape: torch.size(batch_size,num_priors,num_classes)
                loc shape: torch.size(batch_size,num_priors,4)
                priors shape: torch.size(num_priors,4)

            targets (tensor): Ground truth boxes and labels for a batch,
                shape: [batch_size,num_objs,5] (last idx is the label).
        """
        loc_data, conf_data, priors = predictions
        num = loc_data.size(0)
        priors = priors[:loc_data.size(1), :]
        num_priors = (priors.size(0))
        num_classes = self.num_classes

        # match priors (default boxes) and ground truth boxes
        loc_t = torch.Tensor(num, num_priors, 4)
        conf_t = torch.LongTensor(num, num_priors)
        for idx in range(num):
            truths = targets[idx][:, :-1].data
            labels = targets[idx][:, -1].data
            defaults = priors.data
            match(self.threshold, truths, defaults, self.variance, labels,
                  loc_t, conf_t, idx)
        if self.use_gpu:
            loc_t = loc_t.cuda()
            conf_t = conf_t.cuda()
        # wrap targets
        loc_t.requires_grad = False
        conf_t.requires_grad = False

        pos = conf_t > 0
        num_pos = pos.sum(dim=1, keepdim=True)

        # Localization Loss (Smooth L1)
        # Shape: [batch,num_priors,4]
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')

        # Compute max conf across batch for hard negative mining
        batch_conf = conf_data.view(-1, self.num_classes)
        loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))

        # Hard Negative Mining
        loss_c = loss_c.view(num, -1)
        loss_c[pos] = 0  # filter out pos boxes for now
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)
        num_pos = pos.long().sum(1, keepdim=True)
        num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
        neg = idx_rank < num_neg.expand_as(idx_rank)

        # Confidence Loss Including Positive and Negative Examples
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
        targets_weighted = conf_t[(pos+neg).gt(0)]
        loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')

        # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N

        N = num_pos.data.sum().double()
        loss_c = loss_c.double()
        loss_l = loss_l.double()
        loss_l /= N
        loss_c /= N
        return loss_l, loss_c
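A minimal sketch of how the two returned losses are typically combined in a training step; criterion, out, targets, and optimizer are assumed context, not part of the example:

# hypothetical training step (names are placeholders)
loss_l, loss_c = criterion(out, targets)  # criterion is a MultiBoxLoss instance
loss = loss_l + loss_c                    # alpha = 1 in L = (Lconf + a*Lloc) / N
optimizer.zero_grad()
loss.backward()
optimizer.step()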
Example #5
    def forward(self, predictions, targets, masks, num_crowds):
        """Multibox Loss
        Args:
            predictions (tuple): A tuple containing loc preds, conf preds,
            mask preds, and prior boxes from SSD net.
                loc shape: torch.size(batch_size,num_priors,4)
                conf shape: torch.size(batch_size,num_priors,num_classes)
                masks shape: torch.size(batch_size,num_priors,mask_dim)
                priors shape: torch.size(num_priors,4)
                proto* shape: torch.size(batch_size,mask_h,mask_w,mask_dim)

            targets (list<tensor>): Ground truth boxes and labels for a batch,
                shape: [batch_size][num_objs,5] (last idx is the label).

            masks (list<tensor>):
                Ground truth masks for each object in each image,
                shape: [batch_size][num_objs,im_height,im_width]

            num_crowds (list<int>):
                Number of crowd annotations per batch. The crowd
                annotations should be the
                last num_crowds elements of targets and masks.
            * Only if mask_type == lincomb
        """

        loc_data = predictions["loc"]
        conf_data = predictions["conf"]
        mask_data = predictions["mask"]
        priors = predictions["priors"]

        if cfg.mask_type == mask_type.lincomb:
            proto_data = predictions["proto"]

        score_data = predictions["score"] if cfg.use_mask_scoring else None
        inst_data = predictions["inst"] if cfg.use_instance_coeff else None

        labels = [None] * len(targets)  # Used in sem segm loss

        batch_size = loc_data.size(0)
        num_priors = priors.size(0)
        num_classes = self.num_classes

        loc_t = loc_data.new(batch_size, num_priors, 4)
        gt_box_t = loc_data.new(batch_size, num_priors, 4)
        conf_t = loc_data.new(batch_size, num_priors).long()
        idx_t = loc_data.new(batch_size, num_priors).long()

        if cfg.use_class_existence_loss:
            class_existence_t = loc_data.new(batch_size, num_classes - 1)

        for idx in range(batch_size):
            truths = targets[idx][:, :-1].data
            labels[idx] = targets[idx][:, -1].data.long()

            if cfg.use_class_existence_loss:
                class_existence_t[idx, :] = (
                    torch.eye(num_classes -
                              1)[labels[idx]].cuda().max(dim=0)[0])

            # Split the crowd annotations because they come bundled in
            cur_crowds = num_crowds[idx]
            if cur_crowds > 0:

                def split(x):
                    return (x[-cur_crowds:], x[:-cur_crowds])

                crowd_boxes, truths = split(truths)

                # We don't use the crowd labels or masks
                _, labels[idx] = split(labels[idx])
                _, masks[idx] = split(masks[idx])
            else:
                crowd_boxes = None

            match(
                self.pos_threshold,
                self.neg_threshold,
                truths,
                priors.data,
                labels[idx],
                crowd_boxes,
                loc_t,
                conf_t,
                idx_t,
                idx,
                loc_data[idx],
            )

            gt_box_t[idx, :, :] = truths[idx_t[idx]]

        # wrap targets
        loc_t = Variable(loc_t, requires_grad=False)
        conf_t = Variable(conf_t, requires_grad=False)
        idx_t = Variable(idx_t, requires_grad=False)

        pos = conf_t > 0
        num_pos = pos.sum(dim=1, keepdim=True)

        # Shape: [batch,num_priors,4]
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)

        losses = {}

        # Localization Loss (Smooth L1)
        if cfg.train_boxes:
            loc_p = loc_data[pos_idx].view(-1, 4)
            loc_t = loc_t[pos_idx].view(-1, 4)
            losses["B"] = (F.smooth_l1_loss(loc_p, loc_t, reduction="sum") *
                           cfg.bbox_alpha)

        if cfg.train_masks:
            if cfg.mask_type == mask_type.direct:
                if cfg.use_gt_bboxes:
                    pos_masks = []
                    for idx in range(batch_size):
                        pos_masks.append(masks[idx][idx_t[idx, pos[idx]]])
                    masks_t = torch.cat(pos_masks, 0)
                    masks_p = mask_data[pos, :].view(-1, cfg.mask_dim)
                    losses["M"] = (F.binary_cross_entropy(
                        torch.clamp(masks_p, 0, 1),
                        masks_t,
                        reduction="sum",
                    ) * cfg.mask_alpha)
                else:
                    losses["M"] = self.direct_mask_loss(
                        pos_idx, idx_t, loc_data, mask_data, priors, masks)
            elif cfg.mask_type == mask_type.lincomb:
                losses.update(
                    self.lincomb_mask_loss(
                        pos,
                        idx_t,
                        loc_data,
                        mask_data,
                        priors,
                        proto_data,
                        masks,
                        gt_box_t,
                        score_data,
                        inst_data,
                    ))

                if cfg.mask_proto_loss is not None:
                    if cfg.mask_proto_loss == "l1":
                        losses["P"] = (torch.mean(torch.abs(proto_data)) /
                                       self.l1_expected_area * self.l1_alpha)
                    elif cfg.mask_proto_loss == "disj":
                        losses["P"] = -torch.mean(
                            torch.max(F.log_softmax(proto_data, dim=-1),
                                      dim=-1)[0])

        # Confidence loss
        if cfg.use_focal_loss:
            if cfg.use_sigmoid_focal_loss:
                losses["C"] = self.focal_conf_sigmoid_loss(conf_data, conf_t)
            elif cfg.use_objectness_score:
                losses["C"] = self.focal_conf_objectness_loss(
                    conf_data, conf_t)
            else:
                losses["C"] = self.focal_conf_loss(conf_data, conf_t)
        else:
            if cfg.use_objectness_score:
                losses["C"] = self.conf_objectness_loss(
                    conf_data, conf_t, batch_size, loc_p, loc_t, priors)
            else:
                losses["C"] = self.ohem_conf_loss(conf_data, conf_t, pos,
                                                  batch_size)

        # These losses also don't depend on anchors
        if cfg.use_class_existence_loss:
            losses["E"] = self.class_existence_loss(predictions["classes"],
                                                    class_existence_t)
        if cfg.use_semantic_segmentation_loss:
            losses["S"] = self.semantic_segmentation_loss(
                predictions["segm"], masks, labels)

        # Divide all losses by the number of positives.
        # Don't do it for loss[P] because that doesn't depend on the anchors.
        total_num_pos = num_pos.data.sum().float()
        for k in losses:
            if k not in ("P", "E", "S"):
                losses[k] /= total_num_pos
            else:
                losses[k] /= batch_size

        # Loss Key:
        #  - B: Box Localization Loss
        #  - C: Class Confidence Loss
        #  - M: Mask Loss
        #  - P: Prototype Loss
        #  - D: Coefficient Diversity Loss
        #  - E: Class Existence Loss
        #  - S: Semantic Segmentation Loss
        return losses
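Since this variant returns a dict of named loss terms instead of a tuple, the caller has to reduce it to one scalar; a sketch under the assumption that each term is already weighted inside forward():

# hypothetical training step for a dict-returning criterion
losses = criterion(predictions, targets, masks, num_crowds)
loss = sum(losses.values())  # each entry was weighted in forward()
optimizer.zero_grad()
loss.backward()
optimizer.step()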
Example #6
    def __getitem__(self, idx):

        # total number of samples in the dataset
        n = len(self.files)

        if (idx + 1) * self.batch_size <= n:
            current_batch_size = self.batch_size
        else:
            # the last batch may be smaller than batch_size
            current_batch_size = n - idx * self.batch_size

        start = idx * self.batch_size
        file_names = self.files[start:start + current_batch_size]
        batch_x = []
        batch_y = []

        num_priors = self.priors.shape[0]

        for m, files in enumerate(file_names):

            labels = np.zeros(shape=(num_priors, self.num_classes + 4),
                              dtype=np.float32)

            image_path = self.root_path / files[0] / 'JPEGImages' / files[1]
            annotation_path = self.root_path / files[0] / 'Annotations' / files[1]

            image_file = image_path.with_suffix('.jpg')
            annotation_file = annotation_path.with_suffix('.xml')

            # Read the image
            image = load_image(image_file, target_size=self.target_size)
            image = np.array(image, dtype=np.float32)

            # Get the ground truth
            self.ReadVOCAnnotations(annotation_file=annotation_file)

            ground_truth = np.array(self.TransformBNDBoxes(), dtype=np.float32)

            image, ground_truth[:, 1:] = self.image_data_generator.random_transforms(
                (image, ground_truth[:, 1:]))
            image = self.image_data_generator.standardize(image)

            image = torch.from_numpy(image).float()
            ground_truth = torch.from_numpy(ground_truth).float()

            bndbox_loc = ground_truth[:, 1:]
            class_ids = ground_truth[:, 0]

            loc, class_id = match(
                truths=point_form(
                    bndbox_loc),  # convert to corner form (xmin, ymin, xmax, ymax)
                labels=class_ids,
                priors=self.priors,
                variances=[0.1, 0.2],
                threshold=0.5)

            class_id = to_categorical(class_id, num_classes=self.num_classes)

            labels[:, :4] = loc
            labels[:, 4:] = class_id

            batch_x.append(image)
            batch_y.append(labels)

#        batch_x = np.array(batch_x, dtype = np.float32)
#        batch_y = np.array(batch_y, dtype = np.float32)

        return (batch_x, batch_y)
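The point_form call above converts center-size boxes to corner coordinates; a sketch of such a helper as it appears in ssd.pytorch-style codebases (input assumed to be an (N, 4) tensor of (cx, cy, w, h)):

import torch

def point_form(boxes):
    # (cx, cy, w, h) -> (xmin, ymin, xmax, ymax)
    return torch.cat((boxes[:, :2] - boxes[:, 2:] / 2,    # xmin, ymin
                      boxes[:, :2] + boxes[:, 2:] / 2), 1)  # xmax, ymax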
Example #7
    def forward(self, predictions, prior_boxes, targets):
        """
        Multibox Loss
        Args:
            predictions (tuple): A tuple containing loc preds, conf preds,
                                 and prior boxes from blazefaceNet.
                loc shape:  torch.size(batch_size, num_prior_boxes, 4)
                conf shape: torch.size(batch_size, num_prior_boxes, num_classes)
                prior_boxes shape: torch.size(num_prior_boxes, 4)
            
            targets (Tensor): a DoubleTensor of ground truth boxes and labels for a batch
        """
        loc_data, conf_data = predictions
        batch_size = loc_data.size(0)
        num_prior_boxes = loc_data.size(1)
        priorboxes = prior_boxes

        # match prior_boxes with ground truth boxes
        loc_target = torch.Tensor(batch_size, num_prior_boxes, 4)
        conf_target = torch.LongTensor(batch_size, num_prior_boxes)
        for idx in range(batch_size):
            truths = targets[idx].data
            labels = torch.ones([truths.size(0),1])
            print("multiboxloss中的truths和labels: ",truths,labels)
            defaults = priorboxes.data
            match(self.threshold, truths, defaults, self.variance, labels,
                  loc_target, conf_target, idx)
        if self.use_gpu:
            loc_target = loc_target.cuda()
            conf_target = conf_target.cuda()
        loc_target = Variable(loc_target, requires_grad=False)
        conf_target = Variable(conf_target, requires_grad=False)

        #----------------------------------------------------
        pos = conf_target > 0
        print(conf_target.shape)
        num_pos = pos.sum(dim=1, keepdim=True)

        # Localization Loss (Smooth L1)
        # Shape: [batch,num_priors,4]
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_target[pos_idx].view(-1, 4)
        loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')

        # Compute max conf across batch for hard negative mining
        batch_conf = conf_data.view(-1, self.num_classes)
        loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_target.view(-1, 1))

        # Hard Negative Mining
        loss_c = loss_c.view(batch_size, -1)
        loss_c[pos] = 0  # filter out pos boxes for now
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)
        num_pos = pos.long().sum(1, keepdim=True)
        num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
        neg = idx_rank < num_neg.expand_as(idx_rank)

        # Confidence Loss Including Positive and Negative Examples
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
        targets_weighted = conf_target[(pos+neg).gt(0)]
        loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')

        # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N

        #----------------------------------------------------

        
        N = num_pos.data.sum()
        print("N: ", N, type(N))
        loss_l /= N
        loss_c /= N
        return loss_l, loss_c
boxes[:, 3] = boxes[:, 1] + boxes[:, 3]

boxes_priors = boxes * 300

import torch  # needed for torch.FloatTensor below
from box_utils import match

import cv2, sys, os

# files_with_ext and read_xml are project helpers not shown in this snippet
images = files_with_ext(sys.argv[1], '.JPG')
xmls = files_with_ext(sys.argv[2], '.xml')

scores = 0
for image in images:
    print(image)
    image_name = image
    xml_name = os.path.join(
        sys.argv[2],
        os.path.basename(image_name).replace('.JPG', '.xml'))
    objects, width, height = read_xml(xml_name)

    boxes = []
    for obj in objects:
        cboxes = objects[obj]
        newboxes = [[int(i) for i in box] for box in cboxes]
        boxes += newboxes
    match_score = match(0.9, torch.FloatTensor(boxes), boxes_priors)
    scores += match_score
    print(match_score)

print("total scores:", scores)