def bbox_head_loss_pre(rois, roi_indices, std, bboxes, labels):
    """Loss function for Head (pre).

    This function processes RoIs for :func:`bbox_head_loss_post`.

    For every image, each RoI is matched to the ground truth box with
    the highest IoU and the encoded offset to that box is computed.
    RoIs whose best IoU is below 0.5 are labelled as background (``0``);
    the others get ``label + 1``. The RoIs of each image are then
    subsampled to at most 512, of which at most 128 are foreground,
    and the RoIs that were not sampled are dropped from the outputs.

    Args:
        rois (iterable of arrays): An iterable of arrays of
            shape :math:`(R_l, 4)`, where :math:`R_l` is the number
            of RoIs in the :math:`l`-th feature map.
        roi_indices (iterable of arrays): An iterable of arrays of
            shape :math:`(R_l,)`.
        std (tuple of floats): Two coefficients used for encoding
            bounding boxes.
        bboxes (list of arrays): A list of arrays whose shape is
            :math:`(R_n, 4)`, where :math:`R_n` is the number of
            ground truth bounding boxes.
        labels (list of arrays): A list of arrays whose shape is
            :math:`(R_n,)`.

    Returns:
        tuple of four lists:
        :obj:`rois`, :obj:`roi_indices`, :obj:`gt_locs`, and
        :obj:`gt_labels`.

        * **rois**: A list of arrays of shape :math:`(R'_l, 4)`,
          where :math:`R'_l` is the number of RoIs in the
          :math:`l`-th feature map.
        * **roi_indices**: A list of arrays of shape :math:`(R'_l,)`.
        * **gt_locs**: A list of arrays of shape :math:`(R'_l, 4)`
          indicating the bounding boxes of ground truth.
        * **gt_labels**: A list of arrays of shape :math:`(R'_l,)`
          indicating the classes of ground truth.

    """

    thresh = 0.5
    batchsize_per_image = 512
    fg_ratio = 0.25
    # Upper bound on foreground RoIs per image (loop-invariant).
    n_fg = int(batchsize_per_image * fg_ratio)

    # The inputs are documented as iterables but are traversed several
    # times below, so materialise them once.
    rois = list(rois)
    roi_indices = list(roi_indices)

    xp = cuda.get_array_module(*rois)

    n_level = len(rois)
    # Remember which feature level every RoI came from so that the
    # flattened arrays can be split per level again at the end.
    # NOTE: the stacking functions need a real sequence; recent NumPy
    # versions reject generators.
    roi_levels = xp.hstack([
        xp.full(len(level_rois), level, dtype=np.int32)
        for level, level_rois in enumerate(rois)])
    rois = xp.vstack(rois).astype(np.float32)
    roi_indices = xp.hstack(roi_indices).astype(np.int32)

    rois_yx = (rois[:, 2:] + rois[:, :2]) / 2
    rois_hw = rois[:, 2:] - rois[:, :2]
    indices = np.unique(cuda.to_cpu(roi_indices))

    gt_locs = xp.empty_like(rois)
    gt_labels = xp.empty_like(roi_indices)
    for i in indices:
        mask = roi_indices == i

        if len(bboxes[i]) > 0:
            iou = utils.bbox_iou(rois[mask], bboxes[i])
            gt_index = iou.argmax(axis=1)
            gt_loc = bboxes[i][gt_index].copy()
            # tlbr -> yxhw
            gt_loc[:, 2:] -= gt_loc[:, :2]
            gt_loc[:, :2] += gt_loc[:, 2:] / 2
            # offset
            gt_loc[:, :2] = (gt_loc[:, :2] - rois_yx[mask]) / \
                rois_hw[mask] / std[0]
            gt_loc[:, 2:] = xp.log(gt_loc[:, 2:] / rois_hw[mask]) / std[1]

            # Class ids are shifted by one so that 0 means background.
            gt_label = labels[i][gt_index] + 1
            gt_label[iou.max(axis=1) < thresh] = 0
        else:
            # No ground truth in this image: every RoI is background.
            # Use zeros rather than encoding uninitialised memory, which
            # would yield arbitrary values and spurious warnings.
            gt_loc = xp.zeros_like(rois[mask])
            gt_label = xp.zeros(int(mask.sum()), dtype=np.int32)

        # Mark surplus foreground RoIs with -1 so they are dropped below.
        fg_index = xp.where(gt_label > 0)[0]
        if len(fg_index) > n_fg:
            gt_label[choice(fg_index, size=len(fg_index) - n_fg)] = -1

        # Background fills whatever is left of the per-image budget.
        bg_index = xp.where(gt_label == 0)[0]
        n_bg = batchsize_per_image - int((gt_label > 0).sum())
        if len(bg_index) > n_bg:
            gt_label[choice(bg_index, size=len(bg_index) - n_bg)] = -1

        gt_locs[mask] = gt_loc
        gt_labels[mask] = gt_label

    # Keep only the sampled RoIs (label >= 0).
    mask = gt_labels >= 0
    rois = rois[mask]
    roi_indices = roi_indices[mask]
    roi_levels = roi_levels[mask]
    gt_locs = gt_locs[mask]
    gt_labels = gt_labels[mask]

    # Split the flattened arrays back into one array per feature level.
    masks = [roi_levels == level for level in range(n_level)]
    rois = [rois[m] for m in masks]
    roi_indices = [roi_indices[m] for m in masks]
    gt_locs = [gt_locs[m] for m in masks]
    gt_labels = [gt_labels[m] for m in masks]

    return rois, roi_indices, gt_locs, gt_labels
def rpn_loss(locs, confs, anchors, sizes, bboxes):
    """Loss function for RPN.

    For every image, anchors that lie completely inside the image are
    labelled as foreground when they have the highest IoU for some
    ground truth box or an IoU of at least 0.7, and are candidates for
    background when their best IoU is below 0.3. At most 64 foreground
    anchors are kept and the rest of a 256-anchor budget is filled with
    background anchors. The localization loss is computed on foreground
    anchors and the confidence loss on all sampled anchors; both are
    normalized by the number of sampled anchors and averaged over the
    images.

    Args:
        locs (iterable of arrays): An iterable of arrays whose shape is
            :math:`(N, K_l, 4)`, where :math:`K_l` is the number of
            the anchor boxes of the :math:`l`-th level.
        confs (iterable of arrays): An iterable of arrays whose shape is
            :math:`(N, K_l)`.
        anchors (list of arrays): A list of arrays returned by
            :meth:`anchors`.
        sizes (list of tuples of two ints): A list of
            :math:`(H_n, W_n)`, where :math:`H_n` and :math:`W_n`
            are height and width of the :math:`n`-th image.
        bboxes (list of arrays): A list of arrays whose shape is
            :math:`(R_n, 4)`, where :math:`R_n` is the number of
            ground truth bounding boxes.

    Returns:
        tuple of two variables:
        :obj:`loc_loss` and :obj:`conf_loss`.

    """
    fg_thresh = 0.7
    bg_thresh = 0.3
    batchsize_per_image = 256
    fg_ratio = 0.25
    # Upper bound on foreground anchors per image (loop-invariant).
    n_fg = int(batchsize_per_image * fg_ratio)

    locs = F.concat(locs)
    confs = F.concat(confs)

    xp = cuda.get_array_module(locs.array, confs.array)

    anchors = xp.vstack(anchors)
    anchors_yx = (anchors[:, 2:] + anchors[:, :2]) / 2
    anchors_hw = anchors[:, 2:] - anchors[:, :2]

    loc_loss = 0
    conf_loss = 0
    for i, size in enumerate(sizes):
        has_gt = len(bboxes[i]) > 0

        if has_gt:
            iou = utils.bbox_iou(anchors, bboxes[i])
            max_iou = iou.max(axis=1)

            gt_loc = bboxes[i][iou.argmax(axis=1)].copy()
            # tlbr -> yxhw
            gt_loc[:, 2:] -= gt_loc[:, :2]
            gt_loc[:, :2] += gt_loc[:, 2:] / 2
            # offset
            gt_loc[:, :2] = (gt_loc[:, :2] - anchors_yx) / anchors_hw
            gt_loc[:, 2:] = xp.log(gt_loc[:, 2:] / anchors_hw)
        else:
            # Never read: without ground truth no anchor is labelled 1.
            gt_loc = xp.empty_like(anchors)

        # -1: ignored, 0: background, 1: foreground.
        gt_label = xp.full(len(anchors), -1, dtype=np.int32)

        # Only anchors lying completely inside the image are used.
        mask = xp.logical_and(
            anchors[:, :2] >= 0,
            anchors[:, 2:] < xp.array(size)).all(axis=1)

        if has_gt:
            # For every ground truth box, the inside anchor(s) with the
            # highest IoU are foreground regardless of the threshold.
            inside_iou = iou[mask]
            is_best = (inside_iou == inside_iou.max(axis=0)).any(axis=1)
            gt_label[xp.where(mask)[0][is_best]] = 1
            gt_label[xp.logical_and(mask, max_iou >= fg_thresh)] = 1

        fg_index = xp.where(gt_label == 1)[0]
        if len(fg_index) > n_fg:
            gt_label[choice(fg_index, size=len(fg_index) - n_fg)] = -1

        if has_gt:
            bg_index = xp.where(
                xp.logical_and(mask, max_iou < bg_thresh))[0]
        else:
            bg_index = xp.where(mask)[0]
        n_bg = batchsize_per_image - int((gt_label == 1).sum())
        if len(bg_index) > n_bg:
            # Sampled with replacement, so slightly fewer than n_bg
            # distinct anchors may be selected.
            gt_label[bg_index[
                xp.random.randint(len(bg_index), size=n_bg)]] = 0
        else:
            # Fewer candidates than the budget: use all of them instead
            # of leaving the image without any background sample.
            # Anchors already labelled as foreground are kept.
            gt_label[bg_index[gt_label[bg_index] != 1]] = 0

        n_sample = (gt_label >= 0).sum()
        is_fg = gt_label == 1
        is_sampled = gt_label >= 0
        loc_loss += F.sum(
            smooth_l1(locs[i][is_fg], gt_loc[is_fg], 1 / 9)) / n_sample
        conf_loss += F.sum(F.sigmoid_cross_entropy(
            confs[i][is_sampled], gt_label[is_sampled], reduce='no')) \
            / n_sample

    loc_loss /= len(sizes)
    conf_loss /= len(sizes)

    return loc_loss, conf_loss