def get_cls_reg_ctr_targets(points, gt_bboxes, bbox_scale=0.25):
    """
    Build classification, regression and centerness targets for a point grid.

    Args:
        points (Tensor): point coordinates, documented by the original
            author as (1, 2, 19, 19).
        gt_bboxes (Tensor): ground-truth box of each image, (B, 4), in
            [tl_x, tl_y, br_x, br_y] format.
        bbox_scale (float): fraction of the box extent that is kept as the
            foreground region; the rest is trimmed equally from both sides.

    Returns:
        cls_labels (Tensor): (B, 1, 19, 19); non-zero where the point lies
            strictly inside the shrunken box, zero elsewhere.
        bbox_targets (Tensor): (B, 4, 19, 19); offsets are computed for every
            point, so background points may hold negative values and their
            loss should be set to 0 by the caller.
        centerness_targets (Tensor): (B, 19, 19). No channel axis is added
            here; only foreground points are meaningful.
    """
    # (B, 4) -> (B, 4, 1, 1) so the boxes broadcast against the point grid.
    boxes = F.add_axis(F.add_axis(gt_bboxes, axis=-1), axis=-1)

    # cls_labels
    # The margin is derived from the extent between box entries 0 and 2 and
    # is applied to all four sides.
    margin = (boxes[:, 2, ...] - boxes[:, 0, ...]) * (1 - bbox_scale) / 2
    coord0 = points[:, 0, ...]
    coord1 = points[:, 1, ...]
    inside = (
        (coord0 > boxes[:, 0, ...] + margin)
        * (coord1 > boxes[:, 1, ...] + margin)
        * (coord0 < boxes[:, 2, ...] - margin)
        * (coord1 < boxes[:, 3, ...] - margin)
    )
    cls_labels = F.add_axis(inside, axis=1)  # (B, 1, 19, 19)

    # bbox_targets
    # Offsets are evaluated for every grid point, hence negatives can occur.
    up_left = points - boxes[:, 0:2, ...]  # (B, 2, 19, 19)
    bottom_right = boxes[:, 2:4, ...] - points
    bbox_targets = F.concat([up_left, bottom_right], axis=1)  # (B, 4, 19, 19)

    # centerness_targets
    def _min_over_max(near, far):
        # Ratio of the smaller to the larger of two opposite-side distances.
        return F.minimum(near, far) / F.maximum(near, far)

    ratio0 = _min_over_max(up_left[:, 0, ...], bottom_right[:, 0, ...])
    ratio1 = _min_over_max(up_left[:, 1, ...], bottom_right[:, 1, ...])
    centerness_targets = F.sqrt(F.abs(ratio0 * ratio1))
    return cls_labels, bbox_targets, centerness_targets
def mask_anchor_opr(gtboxes, im_info, anchors, labels):
    """Relabel background anchors that mostly lie inside ignore regions.

    Ground-truth rows whose column 4 is negative are treated as ignore
    regions. For every anchor the intersection-over-anchor-area (IoA) with
    each ground-truth row is computed; an anchor whose label is 0 and whose
    IoA with any ignore region exceeds 0.5 has 1 subtracted from its label.

    Args:
        gtboxes: ground-truth rows; columns 0-3 are read as box corners and
            column 4 as the tag. Only the first ``im_info[5]`` rows are used.
        im_info: per-image info vector; entry 5 gives the number of valid
            ground-truth rows.
        anchors: (N, >=4) anchor boxes.
        labels: (N,) anchor labels.

    Returns:
        The labels with the affected zero entries decreased by one.
    """
    eps = 1e-6
    gtboxes = gtboxes[:im_info[5].astype(np.int32), :]
    ignore_mask = (gtboxes[:, 4] < 0).astype(np.float32)

    # Pair every anchor with every ground-truth row: (N, K, C).
    N, K = anchors.shape[0], gtboxes.shape[0]
    p_pred = F.broadcast_to(F.expand_dims(anchors, 1), (N, K, anchors.shape[1]))
    p_gt = F.broadcast_to(F.expand_dims(gtboxes, 0), (N, K, gtboxes.shape[1]))

    # Corners of the intersection rectangle.
    max_off = F.concat(
        [
            F.maximum(p_pred[:, :, :2], p_gt[:, :, :2]),
            F.minimum(p_pred[:, :, 2:4], p_gt[:, :, 2:4]),
        ],
        axis=2,
    )
    # Widths and heights use the inclusive "+ 1" pixel convention.
    inter = F.maximum(max_off[:, :, 2] - max_off[:, :, 0] + 1, 0) * F.maximum(
        max_off[:, :, 3] - max_off[:, :, 1] + 1, 0)
    anchor_area = F.maximum(p_pred[:, :, 2] - p_pred[:, :, 0] + 1, 0) * F.maximum(
        p_pred[:, :, 3] - p_pred[:, :, 1] + 1, 0)

    # Zero out the columns that belong to regular (non-ignore) boxes.
    ioa = inter / (anchor_area + eps)
    ioa = ioa * F.expand_dims(ignore_mask, 0)
    mask_flag = (ioa > 0.5).sum(axis=1) > 0

    labels = labels - F.equal(labels, 0).astype(np.float32) * mask_flag.astype(
        np.float32)
    return labels
def get_smooth_l1_loss(
    pred_bbox: Tensor,
    gt_bbox: Tensor,
    label: Tensor,
    sigma: int = 3,
    background: int = 0,
    ignore_label: int = -1,
    fix_smooth_l1: bool = False,
    norm_type: str = "fg",
) -> Tensor:
    r"""Smooth l1 loss used in RetinaNet.

    Only foreground boxes (label neither ``background`` nor ``ignore_label``)
    contribute to the numerator, whatever the normalization.

    Args:
        pred_bbox (Tensor):
            the predicted bbox with the shape of :math:`(B, A, 4)`
        gt_bbox (Tensor):
            the ground-truth bbox with the shape of :math:`(B, A, 4)`
        label (Tensor):
            the assigned label of boxes with shape of :math:`(B, A)`
        sigma (int):
            the parameter of smooth l1 loss. Default: 3
        background (int):
            the value of background class. Default: 0
        ignore_label (int):
            the value of ignore class. Default: -1
        fix_smooth_l1 (bool):
            forwarded to ``get_smooth_l1_base`` as ``is_fix``; the original
            author describes True as using the huber loss. Default: False
        norm_type (str): current support 'fg', 'all', 'none':
            'fg': loss will be normalized by number of fore-ground samples
            'all': loss will be normalized by number of non-ignored samples
            'none': not norm

    Returns:
        the calculated smooth l1 loss.

    Raises:
        NotImplementedError: if ``norm_type`` is not one of the above.
    """
    pred_bbox = pred_bbox.reshape(-1, 4)
    gt_bbox = gt_bbox.reshape(-1, 4)
    label = label.reshape(-1)

    fg_mask = (label != background) * (label != ignore_label)
    losses = get_smooth_l1_base(pred_bbox, gt_bbox, sigma, is_fix=fix_smooth_l1)
    fg_loss = (losses.sum(axis=1) * fg_mask).sum()

    if norm_type == "fg":
        # The lower bound of 1 avoids dividing by zero when no box is foreground.
        return fg_loss / F.maximum(fg_mask.sum(), 1)
    if norm_type == "all":
        all_mask = label != ignore_label
        return fg_loss / F.maximum(all_mask.sum(), 1)
    if norm_type == "none":
        # Documented mode that previously fell through to NotImplementedError.
        return fg_loss
    raise NotImplementedError(
        "unsupported norm_type: {!r}".format(norm_type))
def get_smooth_l1_loss(
    pred_bbox: Tensor,
    gt_bbox: Tensor,
    labels: Tensor,
    beta: int = 1,
    background: int = 0,
    ignore_label: int = -1,
    norm_type: str = "fg",
) -> Tensor:
    r"""Smooth l1 loss used in RetinaNet.

    The per-box loss is summed over the 4 coordinates and accumulated over
    foreground boxes only (label neither ``background`` nor ``ignore_label``).

    Args:
        pred_bbox (Tensor):
            the predicted bbox with the shape of :math:`(B, A, 4)`
        gt_bbox (Tensor):
            the ground-truth bbox with the shape of :math:`(B, A, 4)`
        labels (Tensor):
            the assigned labels of boxes with shape of :math:`(B, A)`
        beta (int):
            the parameter of smooth l1 loss. Default: 1
        background (int):
            the value of background class. Default: 0
        ignore_label (int):
            the value of ignore class. Default: -1
        norm_type (str): current support "fg", "all", "none":
            "fg": loss will be normalized by number of fore-ground samples
            "all": loss will be normalized by number of non-ignored samples
            "none": not norm

    Returns:
        the calculated smooth l1 loss.
    """
    flat_pred = pred_bbox.reshape(-1, 4)
    flat_gt = gt_bbox.reshape(-1, 4)
    flat_labels = labels.reshape(-1)

    fg_mask = (flat_labels != background) * (flat_labels != ignore_label)
    per_box = get_smooth_l1_base(flat_pred, flat_gt, beta).sum(axis=1)
    total = (per_box * fg_mask).sum()

    if norm_type == "none":
        return total
    if norm_type == "fg":
        # Denominator is clamped to 1 so an all-background batch is safe.
        return total / F.maximum(fg_mask.sum(), 1)
    if norm_type == "all":
        valid_mask = flat_labels != ignore_label
        return total / F.maximum(valid_mask.sum(), 1)
    raise NotImplementedError
def get_clipped_box(boxes, hw):
    """Clamp box coordinates to the image region.

    Columns are taken with a stride of 4 (x1, y1, x2, y2 repeating). The x
    coordinates are clamped to ``[0, hw[1]]`` and the y coordinates to
    ``[0, hw[0]]``.

    Args:
        boxes: 2-D tensor whose columns repeat as (x1, y1, x2, y2).
        hw: indexable pair; ``hw[0]`` bounds y and ``hw[1]`` bounds x.

    Returns:
        The clamped coordinates concatenated along axis 1 in the order
        all-x1, all-y1, all-x2, all-y2.
    """

    def _clamp(values, upper):
        # Cap at the upper bound first, then floor at zero.
        return F.maximum(F.minimum(values, upper), 0)

    height, width = hw[0], hw[1]
    clipped = [
        _clamp(boxes[:, 0::4], width),   # x1
        _clamp(boxes[:, 1::4], height),  # y1
        _clamp(boxes[:, 2::4], width),   # x2
        _clamp(boxes[:, 3::4], height),  # y2
    ]
    return F.concat(clipped, axis=1)
def forward(self, x):
    """Return the element-wise max of the input and the sum of two branches.

    Each branch applies its own ``conv_frelu*`` layer followed by its own
    batch-norm layer to the same input ``x``.
    """
    branch_one = self.bn1(self.conv_frelu1(x))
    branch_two = self.bn2(self.conv_frelu2(x))
    return F.maximum(x, branch_one + branch_two)
def _bce_loss_with_logits(output, labels, **kwargs):
    r"""
    Mean sigmoid cross entropy computed directly from logits.

    Uses the form ``max(x, 0) - x * z + log(1 + exp(-|x|))``, see tensorflow
    https://www.tensorflow.org/api_docs/python/tf/nn/sigmoid_cross_entropy_with_logits

    Extra keyword arguments are accepted and ignored.
    """
    positive_part = F.maximum(output, 0)
    label_term = output * labels
    # exp is only ever taken of a non-positive number, so it cannot overflow.
    log_term = F.log(1 + F.exp(-F.abs(output)))
    per_element = positive_part - label_term + log_term
    return per_element.mean()
def forward(self, pred, target, weight=None):
    """
    Sum an IoU-style loss between predicted and target side distances.

    pred: (B*H*W, 4), columns are read as (top, left, bottom, right)
    target: same layout as pred
    weight: (B*H*W, ), optional per-row multiplier

    The loss form is selected by ``self.loc_loss_type``: 'iou' (-log IoU),
    'linear_iou' (1 - IoU) or 'giou' (1 - GIoU); anything else raises
    NotImplementedError. The result is a sum, not a mean.
    """
    p_top, p_left, p_bottom, p_right = (
        pred[:, 0], pred[:, 1], pred[:, 2], pred[:, 3])
    t_top, t_left, t_bottom, t_right = (
        target[:, 0], target[:, 1], target[:, 2], target[:, 3])

    target_area = (t_left + t_right) * (t_top + t_bottom)
    pred_area = (p_left + p_right) * (p_top + p_bottom)

    # Intersection extents (smaller distance on each side).
    inter_w = F.minimum(p_left, t_left) + F.minimum(p_right, t_right)
    inter_h = F.minimum(p_bottom, t_bottom) + F.minimum(p_top, t_top)
    # Smallest enclosing box extents (larger distance on each side).
    hull_w = F.maximum(p_left, t_left) + F.maximum(p_right, t_right)
    hull_h = F.maximum(p_bottom, t_bottom) + F.maximum(p_top, t_top)

    hull_area = hull_w * hull_h
    inter_area = inter_w * inter_h
    union_area = target_area + pred_area - inter_area

    # The +1.0 smoothing keeps the ratio finite when the union is zero.
    ious = (inter_area + 1.0) / (union_area + 1.0)
    # NOTE(review): no epsilon guards this division by the enclosing area.
    gious = ious - (hull_area - union_area) / hull_area

    mode = self.loc_loss_type
    if mode == 'iou':
        losses = -F.log(ious)
    elif mode == 'linear_iou':
        losses = 1 - ious
    elif mode == 'giou':
        losses = 1 - gious
    else:
        raise NotImplementedError

    if weight is None:
        return losses.sum()
    return (losses * weight).sum()
def get_cls_reg_ctr_targets(self, points, gt_bboxes, bbox_scale=0.15):
    """
    Build classification, regression and centerness targets for a point grid.

    Args:
        points (Tensor): (1, 2, 37, 37). Position in the original image that
            each grid point corresponds to.
        gt_bboxes (Tensor): ground-truth box of each image in original-image
            coordinates, (B, 4), in [tl_x, tl_y, br_x, br_y] format
            (top-left and bottom-right corners).
        bbox_scale (float): fraction of the box extent that is kept as the
            foreground region.

    Returns:
        cls_labels (Tensor): (B, 1, 37, 37); non-zero where the point lies
            strictly inside the shrunken box, zero for background.
        bbox_targets (Tensor): (B, 4, 37, 37); only meaningful for the
            foreground, the background loss should be set to 0.
        centerness_targets (Tensor): (B, 1, 37, 37); only meaningful for the
            foreground, the background loss should be set to 0.

    All three outputs have ``requires_grad`` set to False.
    """
    # Unpacking into two names also enforces that gt_bboxes is 2-D.
    _batch_size, _ = gt_bboxes.shape
    # (B, 4) -> (B, 4, 1, 1) so the boxes broadcast against the point grid.
    boxes = F.add_axis(F.add_axis(gt_bboxes, axis=-1), axis=-1)

    # cls_labels
    # Shrink the box: the margin comes from the extent between entries 0 and
    # 2 and is applied to all four sides.
    margin = (boxes[:, 2, ...] - boxes[:, 0, ...]) * (1 - bbox_scale) / 2
    coord0 = points[:, 0, ...]
    coord1 = points[:, 1, ...]
    inside = (
        (coord0 > boxes[:, 0, ...] + margin)
        * (coord1 > boxes[:, 1, ...] + margin)
        * (coord0 < boxes[:, 2, ...] - margin)
        * (coord1 < boxes[:, 3, ...] - margin)
    )
    cls_labels = F.add_axis(inside, axis=1)  # (B, 1, 37, 37)
    cls_labels.requires_grad = False

    # bbox_targets
    # Offsets are evaluated for every grid point, hence negatives can occur.
    up_left = points - boxes[:, 0:2, ...]  # (B, 2, 37, 37), point minus top-left corner
    bottom_right = boxes[:, 2:4, ...] - points
    bbox_targets = F.concat([up_left, bottom_right], axis=1)  # (B, 4, 37, 37)
    bbox_targets.requires_grad = False

    # centerness_targets
    def _min_over_max(near, far):
        # Ratio of the smaller to the larger of two opposite-side distances.
        return F.minimum(near, far) / F.maximum(near, far)

    ratio0 = _min_over_max(up_left[:, 0, ...], bottom_right[:, 0, ...])
    ratio1 = _min_over_max(up_left[:, 1, ...], bottom_right[:, 1, ...])
    centerness_targets = F.sqrt(F.abs(ratio0 * ratio1))
    centerness_targets = F.add_axis(centerness_targets, axis=1)  # (B, 1, 37, 37)
    centerness_targets.requires_grad = False

    return cls_labels, bbox_targets, centerness_targets
def roi_pool(
    rpn_fms,
    rois,
    stride,
    pool_shape,
    pooler_type="roi_align",
):
    """Pool every RoI from the pyramid level that matches its size.

    Each RoI is assigned to a level by
    ``floor(4 + log2(sqrt(area) / 224))``, clamped to the levels implied by
    ``stride``, pooled from that level's feature map, and the pooled features
    are returned in the original RoI order.

    Args:
        rpn_fms: list of feature maps, one per pyramid level.
        rois: 2-D tensor; columns 1-4 are read as (x1, y1, x2, y2).
            Column 0 is presumably the batch index -- confirm with caller.
            The tensor is detached before use.
        stride: list of strides, one per entry of ``rpn_fms``, ordered so
            that the first and last entries give the min and max level.
        pool_shape: output size passed to the pooling operator.
        pooler_type: "roi_pool" (max RoI pooling) or "roi_align".

    Returns:
        Pooled features, one row per input RoI.

    Raises:
        ValueError: if ``pooler_type`` is not a supported value.
    """
    rois = rois.detach()
    assert len(stride) == len(rpn_fms)
    # Without this check an unknown type left `pool_fm` unbound in the loop.
    if pooler_type not in ("roi_pool", "roi_align"):
        raise ValueError("unsupported pooler_type: {!r}".format(pooler_type))

    canonical_level = 4
    canonical_box_size = 224
    min_level = int(math.log2(stride[0]))
    max_level = int(math.log2(stride[-1]))

    num_fms = len(rpn_fms)
    box_area = (rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2])
    assigned_level = F.floor(
        canonical_level + F.log(F.sqrt(box_area) / canonical_box_size) / np.log(2)
    ).astype("int32")
    assigned_level = F.minimum(assigned_level, max_level)
    assigned_level = F.maximum(assigned_level, min_level)
    assigned_level = assigned_level - min_level

    # avoid empty assignment: append one dummy all-zero RoI per level so that
    # every level receives at least one box.
    assigned_level = F.concat(
        [
            assigned_level,
            F.arange(num_fms, dtype="int32", device=assigned_level.device),
        ],
    )
    rois = F.concat([rois, F.zeros((num_fms, rois.shape[-1]))])

    pool_list, inds_list = [], []
    for i in range(num_fms):
        _, inds = F.cond_take(assigned_level == i, assigned_level)
        level_rois = rois[inds]

        if pooler_type == "roi_pool":
            pool_fm = F.nn.roi_pooling(
                rpn_fms[i], level_rois, pool_shape,
                mode="max", scale=1.0 / stride[i],
            )
        else:  # "roi_align", guaranteed by the check above
            pool_fm = F.nn.roi_align(
                rpn_fms[i],
                level_rois,
                pool_shape,
                mode="average",
                spatial_scale=1.0 / stride[i],
                sample_points=2,
                aligned=True,
            )
        pool_list.append(pool_fm)
        inds_list.append(inds)

    # The concatenated indices form a permutation; its argsort restores the
    # input order, after which the trailing dummy rows are dropped.
    fm_order = F.argsort(F.concat(inds_list, axis=0))
    pool_feature = F.concat(pool_list, axis=0)
    pool_feature = pool_feature[fm_order][:-num_fms]

    return pool_feature
def forward(self, x):
    """Normalize by the per-channel mean square, then apply a learned floor.

    ``x`` must be 4-D. The mean of ``x ** 2`` is taken over everything but
    the first two axes, ``x`` is divided by ``sqrt(mean_square + |eps|)``,
    and the result of ``gamma * x + beta`` is lower-bounded by ``tau``.
    """
    batch, channels, _, _ = x.shape
    squared = F.pow(x, 2).reshape(batch, channels, -1)
    mean_square = F.expand_dims(
        squared.mean(axis=-1, keepdims=True), axis=-1)  # [B, C, 1, 1]
    normalized = x / F.sqrt(mean_square + F.abs(self.eps))
    return F.maximum(self.gamma * normalized + self.beta, self.tau)
def layernorm(x):
    """Standardize each sample over all of its non-batch elements.

    The input is flattened to (batch, -1), the per-row mean is removed, and
    the result is divided by the per-row standard deviation (floored at
    1e-6). The output has the same shape as the input; no affine parameters
    are applied.
    """
    input_shape = x.shape
    flat = x.reshape(input_shape[0], -1)
    mean = F.mean(flat, axis=1, keepdims=True)
    centered = flat - mean
    variance = F.mean(centered**2, axis=1, keepdims=True)
    # Flooring the std avoids division by zero for constant rows.
    std = F.maximum(F.sqrt(variance), 1e-6)
    return (centered / std).reshape(input_shape)
def forward(self, input):
    """
    Return the element-wise max of the input and a learned threshold.

    The threshold is ``bn_frelu(conv_frelu(input))``.
    """
    threshold = self.bn_frelu(self.conv_frelu(input))
    return F.maximum(input, threshold)
def roi_pool(
    rpn_fms,
    rois,
    stride,
    pool_shape,
    roi_type="roi_align",
):
    """Pool every RoI from the pyramid level that matches its size.

    Each RoI is assigned to a level by
    ``floor(4 + log2(sqrt(area) / 224))``, clamped to the levels implied by
    ``stride``, pooled from that level's feature map, and the pooled features
    are put back into the original RoI order.

    Args:
        rpn_fms: list of feature maps, one per pyramid level.
        rois: 2-D tensor; columns 1-4 are read as (x1, y1, x2, y2).
            Column 0 is presumably the batch index -- confirm with caller.
        stride: list of strides, one per entry of ``rpn_fms``, ordered so
            that the first and last entries give the min and max level.
        pool_shape: output size passed to the pooling operator.
        roi_type: "roi_pool" (max RoI pooling) or "roi_align".

    Returns:
        Pooled features, one row per input RoI.

    Raises:
        ValueError: if ``roi_type`` is not a supported value.
    """
    assert len(stride) == len(rpn_fms)
    # Without this check an unknown type left `pool_fm` unbound in the loop.
    if roi_type not in ("roi_pool", "roi_align"):
        raise ValueError("unsupported roi_type: {!r}".format(roi_type))

    canonical_level = 4
    canonical_box_size = 224
    min_level = math.log2(stride[0])
    max_level = math.log2(stride[-1])

    num_fms = len(rpn_fms)
    box_area = (rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2])
    level_assignments = F.floor(
        canonical_level + F.log(box_area.sqrt() / canonical_box_size) / np.log(2)
    )
    level_assignments = F.minimum(level_assignments, max_level)
    level_assignments = F.maximum(level_assignments, min_level)
    level_assignments = level_assignments - min_level

    # avoid empty assignment: append one dummy all-zero RoI per level so that
    # every level receives at least one box.
    level_assignments = F.concat(
        [level_assignments, mge.tensor(np.arange(num_fms, dtype=np.int32))],
    )
    rois = F.concat([rois, mge.zeros((num_fms, rois.shapeof(-1)))])

    pool_list, inds_list = [], []
    for i in range(num_fms):
        mask = level_assignments == i
        _, inds = F.cond_take(mask == 1, mask)
        level_rois = rois.ai[inds]
        if roi_type == "roi_pool":
            pool_fm = F.roi_pooling(
                rpn_fms[i], level_rois, pool_shape,
                mode="max", scale=1.0 / stride[i],
            )
        else:  # "roi_align", guaranteed by the check above
            pool_fm = F.roi_align(
                rpn_fms[i],
                level_rois,
                pool_shape,
                mode="average",
                spatial_scale=1.0 / stride[i],
                sample_points=2,
                aligned=True,
            )
        pool_list.append(pool_fm)
        inds_list.append(inds)

    # The concatenated indices form a permutation; sorting it yields the
    # indices that restore the input order (element [1] of the argsort
    # result is presumably the index tensor -- confirm for this API version).
    fm_order = F.concat(inds_list, axis=0)
    fm_order = F.argsort(fm_order.reshape(1, -1))[1].reshape(-1)
    pool_feature = F.concat(pool_list, axis=0)
    # Drop the trailing dummy rows after reordering.
    pool_feature = pool_feature.ai[fm_order][:-num_fms]

    return pool_feature
def box_overlap_opr(box: Tensor, gt: Tensor) -> Tensor:
    """
    Compute the IoU (intersection over union) between all N x K pairs of
    boxes from two lists of size N and K.
    The box order must be (xmin, ymin, xmax, ymax).

    Args:
        box (Tensor): N boxes; only the first four columns are read.
        gt (Tensor): K boxes; only the first four columns are read.

    Returns:
        Tensor: IoU, sized [N, K]. Negative widths and heights are clamped
        to zero before any area is formed.
    """

    def _area(rects):
        # Area of each row, with negative side lengths treated as zero.
        width = F.maximum(rects[:, 2] - rects[:, 0], 0)
        height = F.maximum(rects[:, 3] - rects[:, 1], 0)
        return width * height

    num_box, num_gt = box.shape[0], gt.shape[0]
    # Expand both lists to (N, K, C) so every pair is aligned.
    pair_box = F.broadcast_to(
        F.expand_dims(box, 1), (num_box, num_gt, box.shape[1]))
    pair_gt = F.broadcast_to(
        F.expand_dims(gt, 0), (num_box, num_gt, gt.shape[1]))

    inter_w = F.minimum(pair_box[:, :, 2], pair_gt[:, :, 2]) - F.maximum(
        pair_box[:, :, 0], pair_gt[:, :, 0])
    inter_h = F.minimum(pair_box[:, :, 3], pair_gt[:, :, 3]) - F.maximum(
        pair_box[:, :, 1], pair_gt[:, :, 1])
    inter = F.maximum(inter_w, 0) * F.maximum(inter_h, 0)

    pair_area_box = F.broadcast_to(
        F.expand_dims(_area(box), 1), (num_box, num_gt))
    pair_area_gt = F.broadcast_to(
        F.expand_dims(_area(gt), 0), (num_box, num_gt))

    union = pair_area_box + pair_area_gt - inter
    return F.maximum(inter / union, 0)
def softmax_cross_entropy(pred, label, axis=1, ignore_index=255):
    """Softmax cross entropy averaged over non-ignored labels.

    Args:
        pred: logits; classes lie along ``axis``.
        label: class indices; entries equal to ``ignore_index`` are skipped.
        axis: class axis of ``pred``.
        ignore_index: label value excluded from the loss.

    Returns:
        Summed negative log-probability of the labelled class divided by the
        number of non-ignored labels (at least 1).
    """
    # Subtract the (gradient-free) max for numerical stability.
    shift = F.zero_grad(pred.max(axis=axis, keepdims=True))
    shifted = pred - shift
    log_prob = shifted - F.log(F.exp(shifted).sum(axis=axis, keepdims=True))

    valid = 1 - F.equal(label, ignore_index)
    # Ignored labels are mapped to class 0 so the lookup stays in range.
    safe_label = label * valid
    picked = F.indexing_one_hot(log_prob, safe_label, axis)
    return -(picked * valid).sum() / F.maximum(valid.sum(), 1)
def softmax_loss(score, label, ignore_label=-1):
    """Softmax cross entropy averaged over non-ignored labels.

    Args:
        score: logits with classes along axis 1. The caller's tensor is not
            modified.
        label: class indices; entries equal to ``ignore_label`` are skipped.
        ignore_label: label value excluded from the loss.

    Returns:
        Summed negative log-probability of the labelled class divided by the
        number of non-ignored labels (at least 1).
    """
    # Subtract the (gradient-free) row max for numerical stability. Rebind
    # instead of `-=` so the argument passed by the caller is left untouched.
    max_score = F.zero_grad(score.max(axis=1, keepdims=True))
    score = score - max_score
    log_prob = score - F.log(F.exp(score).sum(axis=1, keepdims=True))

    mask = (label != ignore_label)
    # Ignored labels are mapped to class 0 so the lookup stays in range.
    vlabel = label * mask
    loss = -(F.indexing_one_hot(log_prob, vlabel.astype("int32"), 1) * mask).sum()
    loss = loss / F.maximum(mask.sum(), 1)
    return loss
def get_focal_loss(
    logits: Tensor,
    labels: Tensor,
    ignore_label: int = -1,
    background: int = 0,
    alpha: float = 0.5,
    gamma: float = 0,
    norm_type: str = "fg",
) -> Tensor:
    r"""Focal Loss for Dense Object Detection:
    <https://arxiv.org/pdf/1708.02002.pdf>

    .. math::

        FL(p_t) = -\alpha_t(1-p_t)^\gamma \log(p_t)

    Class ``k`` (1-based) is matched against channel ``k - 1`` of the logits;
    ignored boxes contribute to neither the positive nor the negative term.

    Args:
        logits (Tensor):
            the predicted logits with the shape of :math:`(B, A, C)`
        labels (Tensor):
            the assigned labels of boxes with shape of :math:`(B, A)`
        ignore_label (int):
            the value of ignore class. Default: -1
        background (int):
            the value of background class. Default: 0
        alpha (float):
            parameter to mitigate class imbalance. Default: 0.5
        gamma (float):
            parameter to mitigate easy/hard loss imbalance. Default: 0
        norm_type (str): current support "fg", "none":
            "fg": loss will be normalized by number of fore-ground samples
            "none": not norm

    Returns:
        the calculated focal loss.
    """
    class_ids = F.arange(1, logits.shape[2] + 1)
    # (B, A) -> (B, A, 1) so labels broadcast against the class ids.
    label_col = F.add_axis(labels, axis=2)

    probs = F.sigmoid(logits)
    pos_term = (1 - probs)**gamma * layers.logsigmoid(logits)
    neg_term = probs**gamma * layers.logsigmoid(-logits)

    is_target = label_col == class_ids
    not_target = label_col != class_ids
    not_ignored = label_col != ignore_label

    pos_loss = -is_target * pos_term * alpha
    neg_loss = -not_target * not_ignored * neg_term * (1 - alpha)
    total = (pos_loss + neg_loss).sum()

    if norm_type == "none":
        return total
    if norm_type == "fg":
        fg_mask = (label_col != background) * not_ignored
        # Denominator is clamped to 1 so a batch without foreground is safe.
        return total / F.maximum(fg_mask.sum(), 1)
    raise NotImplementedError
def softmax_loss(pred, label, ignore_label=-1):
    """Softmax cross entropy averaged over non-ignored labels.

    Args:
        pred: logits with classes along axis 1. The caller's tensor is not
            modified.
        label: class indices; entries equal to ``ignore_label`` are skipped.
        ignore_label: label value excluded from the loss.

    Returns:
        Summed negative log-probability of the labelled class divided by the
        number of non-ignored labels (at least 1).
    """
    # Subtract the detached row max for numerical stability. Rebind instead
    # of `-=` so the argument passed by the caller is left untouched.
    max_pred = pred.max(axis=1, keepdims=True).detach()
    pred = pred - max_pred
    log_prob = pred - F.log(F.exp(pred).sum(axis=1, keepdims=True))

    mask = 1 - F.equal(label, ignore_label)
    # Ignored labels are mapped to class 0 so the lookup stays in range.
    vlabel = label * mask.astype(np.float32)
    loss = -(F.nn.indexing_one_hot(
        log_prob, vlabel.astype(np.int32), 1).flatten() * mask)
    loss = loss.sum() / F.maximum(mask.sum(), 1)
    return loss
def iou_l1_loss(pred, max_overlaps, gt, ignore_label=-1, background=0):
    """Mean absolute error between predictions and overlaps on foreground.

    Args:
        pred: predictions; reshaped to
            ``(pred.shape[0], -1, max_overlaps.shape[2])``.
        max_overlaps: 3-D regression target.
        gt: labels; entries equal to ``background`` or ``ignore_label`` are
            excluded from the loss.
        ignore_label: label value marking ignored entries.
        background: label value marking background entries.

    Returns:
        Sum of masked absolute errors divided by the number of kept entries
        (at least 1).
    """
    pred = pred.reshape(pred.shape[0], -1, max_overlaps.shape[2])
    abs_err = F.abs(pred - max_overlaps)

    not_background = 1 - F.equal(gt, background).astype(np.float32)
    not_ignored = 1 - F.equal(gt, ignore_label).astype(np.float32)
    keep = not_background * not_ignored
    keep = keep.reshape(keep.shape[0], -1, pred.shape[2])

    return (abs_err * keep).sum() / F.maximum(keep.sum(), 1)
def smooth_l1_loss_retina(pred, gt, label, sigma=3, background=0,
                          ignore_label=-1, axis=2):
    """Smooth-L1 loss averaged over the boxes selected by the label mask.

    The element-wise loss from ``_smooth_l1_base`` is summed along ``axis``,
    weighted by the first mask returned by ``_get_mask_of_label`` and divided
    by that mask's sum (at least 1). The second mask is not used.
    """
    elementwise = _smooth_l1_base(pred, gt, sigma)
    keep, _ = _get_mask_of_label(label, background, ignore_label)
    per_box = elementwise.sum(axis=axis)
    return (per_box * keep).sum() / F.maximum(keep.sum(), 1)
def compute_gemini_loss(self, prob, bbox_targets, labels):
    """Order-invariant loss over pairs of predictions.

    Consecutive rows of ``prob`` are grouped in pairs. The EMD loss is
    evaluated for both orderings of each pair and the smaller one is kept.
    The total is divided by the number of pairs in which both labels are
    greater than -1 (at least 1).
    """
    channels = prob.shape[1]
    # (2 * n, c) -> (n, 2, c) -> (2, n, c): split each pair into two heads.
    heads = prob.reshape(-1, 2, channels).transpose(1, 0, 2)
    first, second = heads[0], heads[1]

    forward_loss = self.compute_emd_loss(first, second, bbox_targets, labels)
    swapped_loss = self.compute_emd_loss(second, first, bbox_targets, labels)
    both = F.stack([forward_loss, swapped_loss], axis=1)

    # A pair counts only when neither of its two labels is ignored.
    valid_pairs = (labels > -1).reshape(-1, 2).sum(axis=1) > 1
    return both.min(axis=1).sum() / F.maximum(valid_pairs.sum(), 1)
def softmax_loss(scores: Tensor, labels: Tensor, ignore_label: int = -1) -> Tensor:
    """Softmax cross entropy averaged over non-ignored labels.

    Args:
        scores: logits with classes along axis 1. The caller's tensor is not
            modified.
        labels: class indices; entries equal to ``ignore_label`` are skipped.
        ignore_label: label value excluded from the loss.

    Returns:
        Summed negative log-probability of the labelled class divided by the
        number of non-ignored labels (at least 1).
    """
    # Subtract the (gradient-free) row max for numerical stability. Rebind
    # instead of `-=` so the argument passed by the caller is left untouched.
    max_scores = F.zero_grad(scores.max(axis=1, keepdims=True))
    scores = scores - max_scores
    log_prob = scores - F.log(F.exp(scores).sum(axis=1, keepdims=True))

    mask = labels != ignore_label
    # Ignored labels are mapped to class 0 so the lookup stays in range.
    vlabels = labels * mask
    loss = -(F.indexing_one_hot(log_prob, vlabels.astype("int32"), 1) * mask).sum()
    loss = loss / F.maximum(mask.sum(), 1)
    return loss
def compute_gemini_loss_opr(self, prob, bbox_targets, labels):
    """Order-invariant EMD loss over the two predictions of each sample.

    Every row of ``prob`` holds two predictions. The EMD loss is evaluated
    for both orderings and the smaller value is kept per sample; the sum is
    divided by the number of samples (at least 1).

    Returns:
        dict with the single key ``'rcnn_emd_loss'``.
    """
    # (n, 2 * c) -> (n, 2, c) -> (2, n, c): split the two predictions.
    prob = prob.reshape(prob.shape[0], 2, -1)
    prob = prob.transpose(1, 0, 2)
    a, b = prob[0], prob[1]
    loss0 = self.compute_emd_loss_opr(a, b, bbox_targets, labels)
    loss1 = self.compute_emd_loss_opr(b, a, bbox_targets, labels)
    # `axis`, not the PyTorch-style `dim`, is the keyword used elsewhere in
    # this file for F.stack.
    loss = F.stack([loss0, loss1], axis=1)
    # The reduction yields one minimum per sample; indexing it with [0]
    # would keep only the first sample.
    emd_loss = loss.min(axis=1).sum() / F.maximum(loss.shape[0], 1)
    loss = {'rcnn_emd_loss': emd_loss}
    return loss
def get_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """
    Compute the IoU (intersection over union) between all N x M pairs of
    boxes from two lists of size N and M.
    The box order must be (xmin, ymin, xmax, ymax).

    Args:
        boxes1 (Tensor): N boxes with 4 columns.
        boxes2 (Tensor): M boxes with 4 columns.

    Returns:
        Tensor: IoU, sized [N,M].
    """
    num_rows = boxes1.shape[0]
    num_cols = boxes2.shapeof()[0]

    # Expand both lists to (N, M, 4) so every pair is aligned.
    pair_a = F.add_axis(boxes1, 1).broadcast(num_rows, num_cols, 4)
    pair_b = F.add_axis(boxes2, 0).broadcast(num_rows, num_cols, 4)

    inter_w = F.minimum(pair_a[:, :, 2], pair_b[:, :, 2]) - F.maximum(
        pair_a[:, :, 0], pair_b[:, :, 0]
    )
    inter_h = F.minimum(pair_a[:, :, 3], pair_b[:, :, 3]) - F.maximum(
        pair_a[:, :, 1], pair_b[:, :, 1]
    )
    # Non-overlapping pairs give negative extents, which are clamped to zero.
    inter = F.maximum(inter_w, 0) * F.maximum(inter_h, 0)

    area_a = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area_b = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    pair_area_a = F.add_axis(area_a, 1).broadcast(num_rows, num_cols)
    pair_area_b = F.add_axis(area_b, 0).broadcast(num_rows, num_cols)

    union = pair_area_a + pair_area_b - inter
    return F.maximum(inter / union, 0)
def roi_pool(rpn_fms, rois, stride, pool_shape, roi_type='roi_align',
             labels=None, bbox_targets=None):
    """Pool every RoI from the pyramid level that matches its size.

    Each RoI is assigned to a level by
    ``floor(4 + log2(sqrt(area) / 224))``, clamped to the levels implied by
    ``stride``, and pooled from that level's feature map. The outputs are
    grouped by pyramid level (not restored to the input order); ``rois``,
    ``labels`` and ``bbox_targets`` are permuted the same way so that rows
    stay aligned with the pooled features.

    Args:
        rpn_fms: list of feature maps, one per pyramid level.
        rois: 2-D tensor; columns 1-4 are read as (x1, y1, x2, y2).
            Column 0 is presumably the batch index -- confirm with caller.
        stride: list of strides, one per entry of ``rpn_fms``, ordered so
            that the first and last entries give the min and max level.
        pool_shape: output size passed to the pooling operator.
        roi_type: 'roi_pool' (max RoI pooling) or 'roi_align'.
        labels: optional 2-D per-RoI labels.
        bbox_targets: optional 2-D per-RoI regression targets; must be given
            whenever ``labels`` is given.

    Returns:
        ``(pool_feature, rois, labels, bbox_targets)``. The last two are
        gradient-free tensors when ``labels`` was given, otherwise ``None``.

    Raises:
        ValueError: if ``roi_type`` is not a supported value.
    """
    assert len(stride) == len(rpn_fms)
    # Without this check an unknown type left `pool_fm` unbound in the loop.
    if roi_type not in ('roi_pool', 'roi_align'):
        raise ValueError('unsupported roi_type: {!r}'.format(roi_type))

    canonical_level = 4
    canonical_box_size = 224
    min_level = math.log2(stride[0])
    max_level = math.log2(stride[-1])

    num_fms = len(rpn_fms)
    box_sizes = F.sqrt((rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2]))
    level_assignments = F.floor(
        canonical_level + F.log(box_sizes / canonical_box_size) / np.log(2)
    )
    level_assignments = F.minimum(level_assignments, max_level)
    level_assignments = F.maximum(level_assignments, min_level)
    level_assignments = level_assignments - min_level

    # Append one dummy RoI per level so that no level is empty. The mask
    # records which rows are real (1) and which are padding (0).
    available_masks = F.concat(
        [mge.ones(level_assignments.shapeof()[0]), mge.zeros(num_fms)], axis=0)
    level_assignments = F.concat(
        [level_assignments, mge.tensor(np.arange(num_fms, dtype=np.int32))],
        axis=0)
    rois = F.concat([rois, mge.zeros((num_fms, rois.shapeof()[-1]))], axis=0)
    if labels is not None:
        labels = F.concat(
            [labels, mge.ones((num_fms, labels.shapeof()[-1]))], axis=0)
        bbox_targets = F.concat(
            [bbox_targets, mge.zeros((num_fms, bbox_targets.shapeof()[-1]))],
            axis=0)

    pool_list, inds_list = [], []
    for i in range(len(rpn_fms)):
        mask = level_assignments == i
        inds = mask_to_inds(mask)
        rois_fm = rois.ai[inds]
        if roi_type == 'roi_pool':
            pool_fm = F.roi_pooling(
                rpn_fms[i], rois_fm, pool_shape,
                mode='max', scale=1.0/stride[i])
        else:  # 'roi_align', guaranteed by the check above
            pool_fm = F.roi_align(
                rpn_fms[i], rois_fm, pool_shape, mode='average',
                spatial_scale=1.0/stride[i], sample_points=2, aligned=True)
        pool_list.append(pool_fm)
        inds_list.append(inds)

    fm_order = F.concat(inds_list, axis=0)
    pool_feature = F.concat(pool_list, axis=0)

    # Drop the padding rows: reorder the real/padding mask like the pooled
    # features, then keep only the rows flagged as real.
    ordered_available_masks = available_masks.ai[fm_order]
    available_inds = mask_to_inds(ordered_available_masks)
    pool_feature = pool_feature.ai[available_inds]
    rois = rois.ai[fm_order, :].ai[available_inds, :]
    if labels is not None:
        labels = labels.ai[fm_order].ai[available_inds]
        bbox_targets = bbox_targets.ai[fm_order, :].ai[available_inds, :]
        return pool_feature, rois, F.zero_grad(labels), F.zero_grad(bbox_targets)
    else:
        return pool_feature, rois, None, None
def smooth_l1_loss_rcnn(pred, gt, label, sigma=1, background=0,
                        ignore_label=-1):
    """
    Smooth-L1 box loss normalized by the number of positive labels.

    pred : (minibatch, class_num, 4)
    gt : (minibatch, 4)
    label : (minibatch, )

    The element-wise loss comes from ``smooth_l1_loss_rcnn_opr``; its sum is
    divided by the count of labels greater than 0 (at least 1).
    ``background`` and ``ignore_label`` are accepted but not used here.
    """
    elementwise = smooth_l1_loss_rcnn_opr(pred, gt, label, sigma)
    num_positive = F.maximum((label > 0).sum(), 1)
    return elementwise.sum() / num_positive
def compute_regular_loss(self, prob, bbox_targets, labels):
    """Classification and box-regression losses for one head.

    The last ``self.n`` columns of ``prob`` are class scores; the preceding
    columns are box offsets, viewed as groups of 4. The box loss is
    normalized by the number of labels greater than 0 (at least 1).

    Returns:
        dict with keys ``'<name>_cls_loss'`` and ``'<name>_bbox_loss'``,
        where ``<name>`` is ``self.name``.
    """
    raw_offsets = prob[:, :-self.n]
    cls_scores = prob[:, -self.n:]
    box_offsets = raw_offsets.reshape(raw_offsets.shape[0], -1, 4)

    cls_loss = softmax_loss(cls_scores, labels)
    bbox_loss = smooth_l1_loss_rcnn_opr(
        box_offsets, bbox_targets, labels, config.rcnn_smooth_l1_beta)
    bbox_loss = bbox_loss.sum() / F.maximum((labels > 0).sum(), 1)

    return {
        '{}_cls_loss'.format(self.name): cls_loss,
        '{}_bbox_loss'.format(self.name): bbox_loss,
    }
def forward(self, fpn_fms, rcnn_rois, im_info=None, gt_boxes=None):
    """Run the RoI head: pool features, predict classes and box offsets.

    Args:
        fpn_fms: mapping of feature maps; the entries named in
            ``self.in_features`` are used.
        rcnn_rois: proposals; columns 1-4 are read as the box when decoding.
        im_info: image info forwarded to ``self.get_ground_truth``.
        gt_boxes: ground-truth boxes forwarded to ``self.get_ground_truth``.

    Returns:
        In training mode, a dict with ``"loss_rcnn_cls"`` and
        ``"loss_rcnn_bbox"``. Otherwise ``(pred_bbox, pred_scores)`` with the
        background score column removed.
    """
    rcnn_rois, labels, bbox_targets = self.get_ground_truth(
        rcnn_rois, im_info, gt_boxes)

    selected_fms = [fpn_fms[name] for name in self.in_features]
    pooled = layers.roi_pool(
        selected_fms, rcnn_rois, self.stride,
        self.pooling_size, self.pooling_method,
    )
    hidden = F.flatten(pooled, start_axis=1)
    hidden = F.relu(self.fc1(hidden))
    hidden = F.relu(self.fc2(hidden))
    pred_logits = self.pred_cls(hidden)
    pred_offsets = self.pred_delta(hidden)

    if not self.training:
        # slice 1 for removing background
        pred_scores = F.softmax(pred_logits, axis=1)[:, 1:]
        flat_offsets = pred_offsets.reshape(-1, 4)
        # rois (N, 4) -> (N, 1, 4) -> (N, num_classes, 4) -> (N * num_classes, 4)
        tiled_shape = (rcnn_rois.shape[0], self.cfg.num_classes, 4)
        base_rois = F.broadcast_to(
            F.expand_dims(rcnn_rois[:, 1:5], axis=1), tiled_shape
        ).reshape(-1, 4)
        pred_bbox = self.box_coder.decode(base_rois, flat_offsets)
        return pred_bbox, pred_scores

    # loss for rcnn classification
    loss_rcnn_cls = F.loss.cross_entropy(pred_logits, labels, axis=1)

    # loss for rcnn regression: only foreground rows contribute, each using
    # the offsets predicted for its own class (label - 1 skips background).
    per_class_offsets = pred_offsets.reshape(-1, self.cfg.num_classes, 4)
    num_samples = labels.shape[0]
    fg_mask = labels > 0
    bbox_loss_sum = layers.smooth_l1_loss(
        per_class_offsets[fg_mask, labels[fg_mask] - 1],
        bbox_targets[fg_mask],
        self.cfg.rcnn_smooth_l1_beta,
    ).sum()
    # Normalized by the number of all sampled RoIs, not just foreground ones.
    loss_rcnn_bbox = bbox_loss_sum / F.maximum(num_samples, 1)

    return {
        "loss_rcnn_cls": loss_rcnn_cls,
        "loss_rcnn_bbox": loss_rcnn_bbox,
    }
def forward(self, data, idx, roi):
    """Warp a region of each NHWC image to the full image size.

    Args:
        data: 4-D tensor unpacked as (N, H, W, C).
        idx: passed to ``F.warp_perspective`` as ``mat_idx``.
        roi: tensor indexed as ``roi[:, corner, coord]`` with corner 0 = min,
            corner 1 = max and coord 0 = x, coord 1 = y.

    Returns:
        float32 tensor transposed to (N, C, H, W) order.
    """
    batch, height, width, _ = data.shape
    x_min, y_min = roi[:, 0, 0], roi[:, 0, 1]
    x_max, y_max = roi[:, 1, 0], roi[:, 1, 1]
    # One isotropic scale per sample: the larger of the two axis ratios.
    scale = F.maximum((x_max - x_min) / width, (y_max - y_min) / height)

    diag = F.broadcast_to(self.I, (batch, ))
    matrix = F.broadcast_to(self.M, (batch, 3, 3))
    # Fill the scale on the diagonal and the region origin as translation.
    matrix[:, 0, 0] = scale
    matrix[:, 0, 2] = x_min
    matrix[:, 1, 1] = scale
    matrix[:, 1, 2] = y_min
    matrix[:, 2, 2] = diag

    warped = F.warp_perspective(
        data, matrix, (height, width),
        mat_idx=idx, border_mode="CONSTANT", format="NHWC",
    )
    return warped.transpose(0, 3, 1, 2).astype(np.float32)