예제 #1
0
def bbox_head_loss_pre(rois, roi_indices, std, bboxes, labels):
    """Loss function for Head (pre).

    This function processes RoIs for :func:`bbox_head_loss_post`.
    It matches every RoI with a ground truth box of the same image,
    encodes the regression target, assigns a class label and subsamples
    foreground/background RoIs per image.

    Args:
        rois (iterable of arrays): An iterable of arrays of
            shape :math:`(R_l, 4)`, where :math:`R_l` is the number
            of RoIs in the :math:`l`-th feature map.
        roi_indices (iterable of arrays): An iterable of arrays of
            shape :math:`(R_l,)`.
        std (tuple of floats): Two coefficients used for encoding
            bounding boxes.
        bboxes (list of arrays): A list of arrays whose shape is
            :math:`(R_n, 4)`, where :math:`R_n` is the number of
            ground truth bounding boxes.
        labels (list of arrays): A list of arrays whose shape is
            :math:`(R_n,)`.

    Returns:
        tuple of four lists:
        :obj:`rois`, :obj:`roi_indices`, :obj:`gt_locs`, and
        :obj:`gt_labels`.

        * **rois**: A list of arrays of shape :math:`(R'_l, 4)`,
          where :math:`R'_l` is the number of RoIs in the :math:`l`-th
          feature map.
        * **roi_indices**: A list of arrays of shape :math:`(R'_l,)`.
        * **gt_locs**: A list of arrays of shape :math:`(R'_l, 4)`
          indicating the bounding boxes of ground truth.
        * **gt_labels**: A list of arrays of shape :math:`(R'_l,)`
          indicating the classes of ground truth (0 is background).
    """

    thresh = 0.5
    batchsize_per_image = 512
    fg_ratio = 0.25
    # Maximum number of foreground RoIs kept per image.
    n_fg = int(batchsize_per_image * fg_ratio)

    xp = cuda.get_array_module(*rois)

    n_level = len(rois)
    # Remember which feature level each RoI came from so the flattened
    # arrays can be split back per level at the end.  A list (not a
    # generator) is passed because NumPy's stacking functions require a
    # sequence.
    roi_levels = xp.hstack(
        [xp.array((l, ) * len(rois[l]))
         for l in range(n_level)]).astype(np.int32)
    rois = xp.vstack(rois).astype(np.float32)
    roi_indices = xp.hstack(roi_indices).astype(np.int32)

    rois_yx = (rois[:, 2:] + rois[:, :2]) / 2
    rois_hw = rois[:, 2:] - rois[:, :2]
    indices = np.unique(cuda.to_cpu(roi_indices))

    gt_locs = xp.empty_like(rois)
    gt_labels = xp.empty_like(roi_indices)
    for i in indices:
        mask = roi_indices == i

        if len(bboxes[i]) > 0:
            iou = utils.bbox_iou(rois[mask], bboxes[i])
            gt_index = iou.argmax(axis=1)

            gt_loc = bboxes[i][gt_index].copy()
            # tlbr -> yxhw
            gt_loc[:, 2:] -= gt_loc[:, :2]
            gt_loc[:, :2] += gt_loc[:, 2:] / 2
            # offset
            gt_loc[:, :2] = (gt_loc[:, :2] - rois_yx[mask]) / \
                rois_hw[mask] / std[0]
            gt_loc[:, 2:] = xp.log(gt_loc[:, 2:] / rois_hw[mask]) / std[1]

            # Shift labels by one so that 0 can denote background.
            gt_label = labels[i][gt_index] + 1
            gt_label[iou.max(axis=1) < thresh] = 0
        else:
            # No ground truth in this image: every RoI is background.
            # Use zeros instead of running the encoding on uninitialised
            # memory, which would leave garbage/NaN in the targets.
            gt_loc = xp.zeros_like(rois[mask])
            gt_label = xp.zeros(int(mask.sum()), dtype=np.int32)

        # Drop surplus foreground RoIs by marking them as ignored (-1).
        # NOTE(review): `choice` is defined outside this block; it is
        # presumably sampling `size` distinct elements -- confirm.
        fg_index = xp.where(gt_label > 0)[0]
        if len(fg_index) > n_fg:
            gt_label[choice(fg_index, size=len(fg_index) - n_fg)] = -1

        # Fill the rest of the per-image budget with background RoIs and
        # ignore the surplus.
        bg_index = xp.where(gt_label == 0)[0]
        n_bg = batchsize_per_image - int((gt_label > 0).sum())
        if len(bg_index) > n_bg:
            gt_label[choice(bg_index, size=len(bg_index) - n_bg)] = -1

        gt_locs[mask] = gt_loc
        gt_labels[mask] = gt_label

    # Keep only the sampled (non-ignored) RoIs.
    mask = gt_labels >= 0
    rois = rois[mask]
    roi_indices = roi_indices[mask]
    roi_levels = roi_levels[mask]
    gt_locs = gt_locs[mask]
    gt_labels = gt_labels[mask]

    # Split everything back into one array per feature level.
    masks = [roi_levels == l for l in range(n_level)]
    rois = [rois[m] for m in masks]
    roi_indices = [roi_indices[m] for m in masks]
    gt_locs = [gt_locs[m] for m in masks]
    gt_labels = [gt_labels[m] for m in masks]

    return rois, roi_indices, gt_locs, gt_labels
예제 #2
0
def rpn_loss(locs, confs, anchors, sizes, bboxes):
    """Loss function for RPN.

    For each image, anchors are labelled as foreground (1), background (0)
    or ignored (-1), subsampled to a fixed budget, and the localization
    and objectness losses are accumulated and averaged over the batch.

    Args:
        locs (iterable of arrays): An iterable of arrays whose shape is
            :math:`(N, K_l, 4)`, where :math:`K_l` is the number of
            the anchor boxes of the :math:`l`-th level.
        confs (iterable of arrays): An iterable of arrays whose shape is
            :math:`(N, K_l)`.
        anchors (list of arrays): A list of arrays returned by
            :meth:`anchors`.
        sizes (list of tuples of two ints): A list of
            :math:`(H_n, W_n)`, where :math:`H_n` and :math:`W_n`
            are height and width of the :math:`n`-th image.
        bboxes (list of arrays): A list of arrays whose shape is
            :math:`(R_n, 4)`, where :math:`R_n` is the number of
            ground truth bounding boxes.

    Returns:
        tuple of two variables:
        :obj:`loc_loss` and :obj:`conf_loss`.
    """
    fg_thresh = 0.7
    bg_thresh = 0.3
    batchsize_per_image = 256
    fg_ratio = 0.25
    # Maximum number of foreground anchors kept per image.
    n_fg = int(batchsize_per_image * fg_ratio)

    locs = F.concat(locs)
    confs = F.concat(confs)

    xp = cuda.get_array_module(locs.array, confs.array)

    anchors = xp.vstack(anchors)
    anchors_yx = (anchors[:, 2:] + anchors[:, :2]) / 2
    anchors_hw = anchors[:, 2:] - anchors[:, :2]

    loc_loss = 0
    conf_loss = 0
    for i, size in enumerate(sizes):
        has_gt = len(bboxes[i]) > 0

        if has_gt:
            iou = utils.bbox_iou(anchors, bboxes[i])

            gt_loc = bboxes[i][iou.argmax(axis=1)].copy()
            # tlbr -> yxhw
            gt_loc[:, 2:] -= gt_loc[:, :2]
            gt_loc[:, :2] += gt_loc[:, 2:] / 2
            # offset
            gt_loc[:, :2] = (gt_loc[:, :2] - anchors_yx) / anchors_hw
            gt_loc[:, 2:] = xp.log(gt_loc[:, 2:] / anchors_hw)
        else:
            # Never read: without ground truth no anchor is labelled 1.
            gt_loc = xp.empty_like(anchors)

        # Every anchor starts as "ignored".
        gt_label = xp.empty(len(anchors), dtype=np.int32)
        gt_label[:] = -1

        # Only anchors lying completely inside the image take part.
        mask = xp.logical_and(anchors[:, :2] >= 0,
                              anchors[:, 2:] < xp.array(size)).all(axis=1)

        if has_gt:
            inside = xp.where(mask)[0]
            # Guard against an empty selection: reducing a zero-length
            # axis with max() raises.
            if len(inside) > 0:
                iou_inside = iou[mask]
                # The best inside anchor(s) for each ground truth box are
                # foreground regardless of the IoU threshold.
                best = (iou_inside == iou_inside.max(axis=0)).any(axis=1)
                gt_label[inside[best]] = 1
            gt_label[xp.logical_and(mask, iou.max(axis=1) >= fg_thresh)] = 1

        # Drop surplus foreground anchors by marking them as ignored.
        fg_index = xp.where(gt_label == 1)[0]
        if len(fg_index) > n_fg:
            gt_label[choice(fg_index, size=len(fg_index) - n_fg)] = -1

        if has_gt:
            bg_index = xp.where(
                xp.logical_and(mask,
                               iou.max(axis=1) < bg_thresh))[0]
        else:
            bg_index = xp.where(mask)[0]
        # Anchors kept as foreground must not be relabelled as background
        # (a best-matching anchor may have an IoU below bg_thresh).
        bg_index = bg_index[gt_label[bg_index] != 1]

        n_bg = batchsize_per_image - int((gt_label == 1).sum())
        if len(bg_index) > n_bg:
            # Sampling is with replacement, so slightly fewer than n_bg
            # distinct anchors may be selected.
            bg_index = bg_index[
                xp.random.randint(len(bg_index), size=n_bg)]
        # When there are no more candidates than the budget allows, all of
        # them are used; otherwise the loss would see no negatives at all.
        gt_label[bg_index] = 0

        fg_mask = gt_label == 1
        valid_mask = gt_label >= 0
        # Clamp to 1 so that an image without any sampled anchor
        # contributes a zero loss instead of 0 / 0 = NaN.
        n_sample = max(int(valid_mask.sum()), 1)

        loc_loss += F.sum(
            smooth_l1(locs[i][fg_mask], gt_loc[fg_mask],
                      1 / 9)) / n_sample
        conf_loss += F.sum(F.sigmoid_cross_entropy(
            confs[i][valid_mask], gt_label[valid_mask], reduce='no')) \
            / n_sample

    loc_loss /= len(sizes)
    conf_loss /= len(sizes)

    return loc_loss, conf_loss