def _calc_ious(self, anchor, bbox, inside_index):
    """Compute anchor/ground-truth IoUs and the best matches in both directions.

    Args:
        anchor (array): Anchor boxes, passed to :func:`bbox_iou` as the
            first argument (one row of the IoU table per anchor).
        bbox (array): Ground truth boxes, passed to :func:`bbox_iou` as the
            second argument (one column of the IoU table per box).
        inside_index (array): Only its length is used; it determines how
            many rows are gathered for the per-anchor maximum.

    Returns:
        (array, array, array):

        * **argmax_ious**: For every anchor, the column index of its
          largest IoU.
        * **max_ious**: The IoU value found at that index.
        * **gt_argmax_ious**: Row indices of every entry that equals the
          maximum of its column, so ties are all reported.

    """
    iou_table = bbox_iou(anchor, bbox)

    # Best ground truth box for each anchor.
    best_gt_of_anchor = iou_table.argmax(axis=1)
    anchor_rows = np.arange(len(inside_index))
    best_iou_of_anchor = iou_table[anchor_rows, best_gt_of_anchor]

    # Best IoU achieved for each ground truth box.
    best_anchor_of_gt = iou_table.argmax(axis=0)
    gt_cols = np.arange(iou_table.shape[1])
    best_iou_of_gt = iou_table[best_anchor_of_gt, gt_cols]

    # Broadcasting against the per-column maxima picks up every anchor that
    # ties for the best IoU, not only the first one argmax reported.
    tied_anchor_rows = np.where(iou_table == best_iou_of_gt)[0]

    return best_gt_of_anchor, best_iou_of_anchor, tied_anchor_rows
def mask_voting(
        rois, cls_probs, mask_probs,
        n_class, H, W,
        score_thresh=0.7,
        nms_thresh=0.3,
        mask_merge_thresh=0.5,
        binary_thresh=0.4):
    """Merge overlapping RoI masks class by class after NMS.

    For every foreground class (column ``l + 1`` of :obj:`cls_probs`), RoIs
    with probability of at least ``0.001`` go through
    :func:`non_maximum_suppression` (at most 100 survivors). Each survivor
    is merged with all RoIs whose IoU with it exceeds
    :obj:`mask_merge_thresh`, weighted by their normalized class
    probabilities, via :func:`mask_aggregation`. Finally only survivors
    whose probability exceeds :obj:`score_thresh` are returned.

    Args:
        rois (array): RoI boxes; presumably :math:`(R, 4)` -- confirm
            against the caller.
        cls_probs (array): Class probabilities indexed as
            ``cls_probs[:, l + 1]``; column 0 is never read here.
        mask_probs (array): Per-RoI mask probabilities. The size of the
            last axis is used as the side of the (square) output masks.
        n_class (int): Number of classes; ``n_class - 1`` classes are
            processed.
        H, W: Forwarded unchanged to :func:`mask_aggregation`.
        score_thresh (float): Survivors must score strictly above this.
        nms_thresh (float): Threshold handed to NMS.
        mask_merge_thresh (float): RoIs with IoU strictly above this are
            merged into a survivor.
        binary_thresh (float): Forwarded to :func:`mask_aggregation`.

    Returns:
        (array, array, array, array):

        * **v_labels**: Zero-based class index ``l`` for each output.
        * **v_masks**: Merged masks resized to
          ``(mask_size, mask_size)``.
        * **v_bboxes**: Boxes returned by :func:`mask_aggregation`.
        * **v_cls_probs**: Class probability of each output.

    """
    mask_size = mask_probs.shape[-1]
    side = (mask_size, mask_size)

    # Each accumulator starts with the typed empty array so that the final
    # concatenation promotes dtypes exactly like a running concatenation.
    label_parts = [np.empty((0, ), dtype=np.int32)]
    mask_parts = [np.empty((0, mask_size, mask_size), dtype=np.float32)]
    bbox_parts = [np.empty((0, 4), dtype=np.float32)]
    prob_parts = [np.empty((0, ), dtype=np.float32)]

    for l in range(n_class - 1):
        column = l + 1

        # Non maximum suppression over RoIs that are not hopeless.
        prob_l = cls_probs[:, column]
        plausible = prob_l >= 0.001
        cand_l = rois[plausible]
        prob_l = prob_l[plausible]
        survivors = non_maximum_suppression(
            cand_l, nms_thresh, prob_l, limit=100)
        cand_l = cand_l[survivors]
        prob_l = prob_l[survivors]

        n_cand = len(cand_l)
        merged_mask_l = np.zeros((n_cand, mask_size, mask_size))
        merged_bbox_l = np.zeros((n_cand, 4))

        for i, cand in enumerate(cand_l):
            overlap = bbox_iou(rois, cand[np.newaxis, :])
            voters = np.where(overlap > mask_merge_thresh)[0]

            # Voters contribute proportionally to their class probability.
            weights = cls_probs[voters, column]
            weights = weights / weights.sum()

            full_mask, merged_bbox_l[i] = mask_aggregation(
                rois[voters], mask_probs[voters], weights,
                H, W, binary_thresh)
            merged_mask_l[i] = cv2.resize(
                full_mask.astype(np.float32), side)

        confident = prob_l > score_thresh
        kept_bbox_l = merged_bbox_l[confident]

        prob_parts.append(prob_l[confident])
        mask_parts.append(merged_mask_l[confident])
        bbox_parts.append(kept_bbox_l)
        label_parts.append(np.repeat(l, kept_bbox_l.shape[0]))

    v_labels = np.concatenate(label_parts)
    v_masks = np.concatenate(mask_parts)
    v_bboxes = np.concatenate(bbox_parts)
    v_cls_probs = np.concatenate(prob_parts)
    return v_labels, v_masks, v_bboxes, v_cls_probs
def __call__(
        self, roi, mask, label, bbox,
        loc_normalize_mean=(0., 0., 0., 0.),
        loc_normalize_std=(0.2, 0.2, 0.5, 0.5),
        mask_size=(21, 21),
):
    """Assigns ground truth to sampled proposals.

    This function samples RoIs from the combination of :obj:`roi` and
    :obj:`bbox`. The RoIs are assigned with the ground truth class labels,
    bounding box offsets and scales to match the ground truth bounding
    boxes, and ground truth masks cropped to the RoI.
    As many as :obj:`pos_ratio * n_sample` RoIs are sampled as
    foregrounds, where :obj:`n_sample` is :obj:`self.n_sample`, or the
    number of candidate RoIs when :obj:`self.n_sample` is :obj:`None`.

    Offsets and scales of bounding boxes are calculated using
    :func:`chainercv.links.model.faster_rcnn.bbox2loc`.
    Also, types of input arrays and output arrays are same.

    Here are notations.

    * :math:`S` is the total number of sampled RoIs, which is at most
      :obj:`n_sample`.
    * :math:`L` is number of object classes possibly including the
      background.
    * :math:`H` is the image height.
    * :math:`W` is the image width.
    * :math:`RH` is the mask height.
    * :math:`RW` is the mask width.

    Args:
        roi (array): Region of Interests (RoIs) from which we sample.
            Its shape is :math:`(R, 4)`
        mask (array): The ground truth masks.
            Its shape is :math:`(R', H, W)`.
        label (array): Ground truth bounding box labels. Its shape
            is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where
            :math:`L` is the number of foreground classes.
        bbox (array): The coordinates of ground truth bounding boxes.
            Its shape is :math:`(R', 4)`.
        loc_normalize_mean (tuple of four floats): Mean values to normalize
            coordinates of bounding boxes.
        loc_normalize_std (tuple of four floats): Standard deviation of
            the coordinates of bounding boxes.
        mask_size (tuple of int or int): Generated mask size, which is
            equal to :math:`(RH, RW)`. A non-tuple value is used for both
            sides.

    Returns:
        (array, array, array, array):

        * **sample_roi**: Regions of interests that are sampled.
          Its shape is :math:`(S, 4)`.
        * **gt_roi_mask**: Masks assigned to sampled RoIs. Its shape is
          :math:`(S, RH, RW)`. Rows of background RoIs are filled
          with -1.
        * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is
          :math:`(S,)`. Its range is :math:`[0, L]`. The label with
          value 0 is the background.
        * **gt_roi_loc**: Offsets and scales to match
          the sampled RoIs to the ground truth bounding boxes.
          Its shape is :math:`(S, 4)`.

    """
    xp = cuda.get_array_module(roi)
    roi = cuda.to_cpu(roi)
    mask = cuda.to_cpu(mask)
    label = cuda.to_cpu(label)
    bbox = cuda.to_cpu(bbox)
    if not isinstance(mask_size, tuple):
        mask_size = (mask_size, mask_size)

    # Unpacking fails early unless bbox is two dimensional.
    n_bbox, _ = bbox.shape

    # Ground truth boxes are added to the candidates.
    roi = np.concatenate((roi, bbox), axis=0)

    if self.n_sample is None:
        n_sample = roi.shape[0]
    else:
        n_sample = self.n_sample

    pos_roi_per_image = np.round(n_sample * self.pos_ratio)
    iou = bbox_iou(roi, bbox)
    gt_assignment = iou.argmax(axis=1)
    max_iou = iou.max(axis=1)

    # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
    # The label with value 0 is the background.
    gt_roi_label = label[gt_assignment] + 1

    # Select foreground RoIs as those with >= pos_iou_thresh IoU.
    pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
    pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
    if pos_index.size > 0:
        pos_index = np.random.choice(
            pos_index, size=pos_roi_per_this_image, replace=False)

    # Select background RoIs as those within
    # [neg_iou_thresh_lo, neg_iou_thresh_hi).
    neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                         (max_iou >= self.neg_iou_thresh_lo))[0]
    # Use the resolved n_sample: self.n_sample may be None, in which case
    # subtracting from it would raise TypeError.
    neg_roi_per_this_image = n_sample - pos_roi_per_this_image
    neg_roi_per_this_image = int(
        min(neg_roi_per_this_image, neg_index.size))
    if neg_index.size > 0:
        neg_index = np.random.choice(
            neg_index, size=neg_roi_per_this_image, replace=False)

    # The indices that we're selecting (both foreground and background).
    # Foregrounds come first, so row i < len(pos_index) of every sampled
    # array belongs to pos_index[i].
    keep_index = np.append(pos_index, neg_index)
    gt_roi_label = gt_roi_label[keep_index]
    gt_roi_label[pos_roi_per_this_image:] = 0  # negative labels --> 0
    sample_roi = roi[keep_index]

    # locs
    # Compute offsets and scales to match sampled RoIs to the GTs.
    loc_normalize_mean = np.array(loc_normalize_mean, np.float32)
    loc_normalize_std = np.array(loc_normalize_std, np.float32)
    gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
    gt_roi_loc = gt_roi_loc - loc_normalize_mean
    gt_roi_loc = gt_roi_loc / loc_normalize_std

    # masks
    # Background rows keep the initial value -1.
    gt_roi_mask = -1 * np.ones(
        (len(keep_index), mask_size[0], mask_size[1]),
        dtype=np.int32)

    for i, pos_ind in enumerate(pos_index):
        # The builtin int replaces the np.int alias removed in NumPy 1.24.
        bb = np.round(sample_roi[i]).astype(int)
        gt_msk = mask[gt_assignment[pos_ind]]
        gt_roi_msk = gt_msk[bb[0]:bb[2], bb[1]:bb[3]]
        # resize is called with a leading channel axis, which is dropped
        # again afterwards.
        gt_roi_msk = resize(
            gt_roi_msk.astype(np.float32)[None], mask_size)[0]
        gt_roi_msk = (gt_roi_msk >= self.binary_thresh).astype(np.int32)
        gt_roi_mask[i] = gt_roi_msk

    if xp != np:
        sample_roi = cuda.to_gpu(sample_roi)
        gt_roi_mask = cuda.to_gpu(gt_roi_mask)
        gt_roi_label = cuda.to_gpu(gt_roi_label)
        gt_roi_loc = cuda.to_gpu(gt_roi_loc)
    return sample_roi, gt_roi_mask, gt_roi_label, gt_roi_loc
def mask_voting(roi_cmask_prob, bbox, roi_cls_prob, size, score_thresh,
                nms_thresh, mask_merge_thresh, binary_thresh,
                limit=100, bg_label=0):
    """Refine mask probabilities by merging multiple masks.

    First, this function discards invalid masks with non maximum
    suppression. Then, it merges masks with weights calculated from class
    probabilities of the RoIs that overlap enough with each kept box.
    This function improves the mask qualities by merging overlapped masks
    predicted as the same object class.

    Here are notations used.

    * :math:`R` is the total number of RoIs produced across batches.
    * :math:`L` is the number of classes excluding the background.
    * :math:`RH` is the height of pooled image.
    * :math:`RW` is the width of pooled image.
    * :math:`N` is the number of merged masks.

    Args:
        roi_cmask_prob (array): A mask probability array whose shape is
            :math:`(R, RH, RW)`.
        bbox (array): A bounding box array whose shape is
            :math:`(R, 4)`.
        roi_cls_prob (array): A class probability array whose shape is
            :math:`(R, L + 1)`.
        size (tuple of int): Original image size.
        score_thresh (float): A threshold value of the class score.
        nms_thresh (float): A threshold value of non maximum suppression.
        mask_merge_thresh (float): A threshold value of the bounding box iou
            for mask merging.
        binary_thresh (float): A threshold value of mask score
            for mask merging.
        limit (int): The maximum number of outputs.
        bg_label (int): The id of the background label.

    Returns:
        array, array, array, array:

        * **v_cmask_prob**: Merged mask probability. Its shape is
          :math:`(N, RH, RW)`.
        * **v_bbox**: Bounding boxes for the merged masks. Its shape is
          :math:`(N, 4)`.
        * **v_label**: Class labels for the merged masks. Its shape is
          :math:`(N, )`. A label is the position of the class among the
          non-background classes, which is ``class_id - 1`` for the
          default :obj:`bg_label` of 0.
        * **v_score**: Class probabilities for the merged masks. Its shape
          is :math:`(N, )`.

        When nothing survives (no RoIs, no foreground class, or
        ``limit <= 0``), four empty arrays of those shapes are returned.

    """
    roi_cmask_size = roi_cmask_prob.shape[1:]
    n_class = roi_cls_prob.shape[1]

    def _empty():
        # Labels are int32 to match the dtype of the non-empty result.
        return (
            np.empty((0, roi_cmask_size[0], roi_cmask_size[1])),
            np.empty((0, 4)),
            np.empty((0, ), dtype=np.int32),
            np.empty((0, )))

    # Indexing the per-class lists by position among foreground labels
    # stays correct for any bg_label, not only for bg_label == 0.
    fg_labels = [label for label in range(n_class) if label != bg_label]

    # Stage 1: non maximum suppression, independently for each class.
    cls_bbox = []
    cls_score = []
    for label in fg_labels:
        score_l = roi_cls_prob[:, label]
        keep_indices = non_maximum_suppression(bbox, nms_thresh, score_l)
        cls_bbox.append(bbox[keep_indices])
        cls_score.append(score_l[keep_indices])

    # Raise the score threshold so that at most `limit` candidates pass.
    n_candidate = sum(len(score_l) for score_l in cls_score)
    n_keep = min(n_candidate, limit)
    if n_keep <= 0:
        # np.concatenate rejects an empty list and index -1 would pick the
        # wrong element, so bail out before touching the scores.
        return _empty()
    sorted_score = np.sort(np.concatenate(cls_score))[::-1]
    score_thresh = max(sorted_score[n_keep - 1], score_thresh)

    # Stage 2: merge the masks of RoIs overlapping each kept box.
    v_cmask_prob = []
    v_bbox = []
    v_label = []
    v_cls_prob = []
    for fg_index, label in enumerate(fg_labels):
        bbox_l = cls_bbox[fg_index]
        score_l = cls_score[fg_index]
        keep_indices = np.where(score_l >= score_thresh)
        bbox_l = bbox_l[keep_indices]
        score_l = score_l[keep_indices]

        v_cmask_prob_l = []
        v_bbox_l = []
        v_score_l = []

        for i, bb in enumerate(bbox_l):
            iou = bbox_iou(bbox, bb[np.newaxis, :])
            keep_indices = np.where(iou >= mask_merge_thresh)[0]
            # Overlapping RoIs vote with their normalized class probability.
            cmask_weight = roi_cls_prob[keep_indices, label]
            cmask_weight = cmask_weight / cmask_weight.sum()
            cmask_prob_i = roi_cmask_prob[keep_indices]
            bbox_i = bbox[keep_indices]
            m_cmask, m_bbox = _mask_aggregation(
                bbox_i, cmask_prob_i, cmask_weight, size, binary_thresh)
            # _mask_aggregation may return None for either value; such
            # boxes are dropped.
            if m_cmask is not None and m_bbox is not None:
                m_cmask = resize(
                    m_cmask.astype(np.float32), roi_cmask_size)
                v_cmask_prob_l.append(m_cmask)
                v_bbox_l.append(m_bbox)
                v_score_l.append(score_l[i])

        if len(v_cmask_prob_l) > 0:
            # NOTE(review): concatenating along axis 0 presumably relies on
            # each mask / box carrying a leading axis of length one --
            # confirm against _mask_aggregation.
            v_cmask_prob_l = np.concatenate(v_cmask_prob_l)
            v_bbox_l = np.concatenate(v_bbox_l)
            v_score_l = np.array(v_score_l)

            v_label_l = np.repeat(fg_index, v_bbox_l.shape[0])
            v_label_l = v_label_l.astype(np.int32)
            v_cmask_prob.append(v_cmask_prob_l)
            v_bbox.append(v_bbox_l)
            v_label.append(v_label_l)
            v_cls_prob.append(v_score_l)

    if len(v_cmask_prob) == 0:
        return _empty()

    v_cmask_prob = np.concatenate(v_cmask_prob)
    v_bbox = np.concatenate(v_bbox)
    v_label = np.concatenate(v_label)
    v_cls_prob = np.concatenate(v_cls_prob)
    return v_cmask_prob, v_bbox, v_label, v_cls_prob
def __call__(self, roi, bbox, label, mask,
             loc_normalize_mean=(0., 0., 0., 0.),
             loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
    """Assigns ground truth to sampled proposals.

    This function samples total of :obj:`self.n_sample` RoIs
    from the combination of :obj:`roi` and :obj:`bbox`.
    The RoIs are assigned with the ground truth class labels as well as
    bounding box offsets and scales to match the ground truth bounding
    boxes. As many as :obj:`pos_ratio * self.n_sample` RoIs are
    sampled as foregrounds. For every foreground RoI, the assigned ground
    truth mask is cropped to the RoI and resized.

    Offsets and scales of bounding boxes are calculated using
    :func:`chainercv.links.model.faster_rcnn.bbox2loc`.
    Also, types of input arrays and output arrays are same.

    Here are notations.

    * :math:`S` is the total number of sampled RoIs, which is at most
      :obj:`self.n_sample`.
    * :math:`P` is the number of sampled foreground RoIs.
    * :math:`L` is number of object classes possibly including the
      background.

    Args:
        roi (array): Region of Interests (RoIs) from which we sample.
            Its shape is :math:`(R, 4)`
        bbox (array): The coordinates of ground truth bounding boxes.
            Its shape is :math:`(R', 4)`.
        label (array): Ground truth bounding box labels. Its shape
            is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where
            :math:`L` is the number of foreground classes.
        mask (array): Ground truth masks. It is three dimensional and
            indexed by ground truth on the first axis; the remaining
            axes are the image height and width.
        loc_normalize_mean (tuple of four floats): Mean values to normalize
            coordinates of bounding boxes.
        loc_normalize_std (tuple of four floats): Standard deviation of
            the coordinates of bounding boxes.

    Returns:
        (array, array, array, array):

        * **sample_roi**: Regions of interests that are sampled.
          Its shape is :math:`(S, 4)`.
        * **gt_roi_loc**: Offsets and scales to match
          the sampled RoIs to the ground truth bounding boxes.
          Its shape is :math:`(S, 4)`.
        * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is
          :math:`(S,)`. Its range is :math:`[0, L]`. The label with
          value 0 is the background.
        * **gt_roi_mask**: Masks of the foreground RoIs only. Its shape is
          :math:`(P, 2 * roi\\_size, 2 * roi\\_size)` and its dtype is
          :obj:`numpy.int32`. It is empty when no foreground is sampled.

    """
    xp = cuda.get_array_module(roi)
    roi = cuda.to_cpu(roi)
    bbox = cuda.to_cpu(bbox)
    label = cuda.to_cpu(label)
    mask = cuda.to_cpu(mask)

    # Unpacking fails early unless bbox is two dimensional.
    n_bbox, _ = bbox.shape

    # Ground truth boxes are added to the candidates.
    roi = np.concatenate((roi, bbox), axis=0)

    pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
    iou = bbox_iou(roi, bbox)
    gt_assignment = iou.argmax(axis=1)
    max_iou = iou.max(axis=1)
    # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
    # The label with value 0 is the background.
    gt_roi_label = label[gt_assignment] + 1

    # Select foreground RoIs as those with >= pos_iou_thresh IoU.
    pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
    pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
    if pos_index.size > 0:
        pos_index = np.random.choice(
            pos_index, size=pos_roi_per_this_image, replace=False)

    # Select background RoIs as those within
    # [neg_iou_thresh_lo, neg_iou_thresh_hi).
    neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                         (max_iou >= self.neg_iou_thresh_lo))[0]
    neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
    neg_roi_per_this_image = int(
        min(neg_roi_per_this_image, neg_index.size))
    if neg_index.size > 0:
        neg_index = np.random.choice(
            neg_index, size=neg_roi_per_this_image, replace=False)

    # The indices that we're selecting (both positive and negative).
    # Positives come first, so row i < len(pos_index) of sample_roi
    # belongs to pos_index[i].
    keep_index = np.append(pos_index, neg_index)
    gt_roi_label = gt_roi_label[keep_index]
    gt_roi_label[pos_roi_per_this_image:] = 0  # negative labels --> 0
    sample_roi = roi[keep_index]

    # Compute offsets and scales to match sampled RoIs to the GTs.
    gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
    gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)) /
                  np.array(loc_normalize_std, np.float32))

    # Prepare ground truth masks for the foreground RoIs.
    _, h, w = mask.shape
    mask_side = self.roi_size * 2
    gt_roi_mask = []
    for i, idx in enumerate(gt_assignment[pos_index]):
        # Clamp the RoI to the image so the crop never wraps around.
        y_min = max(int(sample_roi[i, 0]), 0)
        y_max = min(int(sample_roi[i, 2]), h)
        x_min = max(int(sample_roi[i, 1]), 0)
        x_max = min(int(sample_roi[i, 3]), w)
        crop = mask[idx, y_min:y_max, x_min:x_max]
        gt_roi_mask.append(cv2.resize(crop, (mask_side, mask_side)))

    if gt_roi_mask:
        gt_roi_mask = np.stack(gt_roi_mask).astype(np.int32)
    else:
        # np.stack raises on an empty list, so build the empty result
        # explicitly when no foreground RoI was sampled.
        gt_roi_mask = np.empty((0, mask_side, mask_side), dtype=np.int32)

    if xp != np:
        sample_roi = cuda.to_gpu(sample_roi)
        gt_roi_loc = cuda.to_gpu(gt_roi_loc)
        gt_roi_label = cuda.to_gpu(gt_roi_label)
        gt_roi_mask = cuda.to_gpu(gt_roi_mask)
    return sample_roi, gt_roi_loc, gt_roi_label, gt_roi_mask
def calc_detection_voc_prec_rec(
        pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
        gt_difficults=None,
        iou_thresh=0.5):
    """Calculate precision and recall based on evaluation code of PASCAL VOC.

    This function calculates precision and recall of
    predicted bounding boxes obtained from a dataset which has :math:`N`
    images.
    The code is based on the evaluation code used in PASCAL VOC Challenge.

    Args:
        pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
            sets of bounding boxes.
            Its index corresponds to an index for the base dataset.
            Each element of :obj:`pred_bboxes` is a set of coordinates
            of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
            where :math:`R` corresponds
            to the number of bounding boxes, which may vary among boxes.
            The second axis corresponds to
            :obj:`y_min, x_min, y_max, x_max` of a bounding box.
        pred_labels (iterable of numpy.ndarray): An iterable of labels.
            Similar to :obj:`pred_bboxes`, its index corresponds to an
            index for the base dataset. Its length is :math:`N`.
        pred_scores (iterable of numpy.ndarray): An iterable of confidence
            scores for predicted bounding boxes. Similar to
            :obj:`pred_bboxes`, its index corresponds to an index for the
            base dataset. Its length is :math:`N`.
        gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
            bounding boxes
            whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
            bounding box whose shape is :math:`(R, 4)`. Note that the number
            of bounding boxes in each image does not need to be same as the
            number of corresponding predicted boxes.
        gt_labels (iterable of numpy.ndarray): An iterable of ground truth
            labels which are organized similarly to :obj:`gt_bboxes`.
        gt_difficults (iterable of numpy.ndarray): An iterable of boolean
            arrays which is organized similarly to :obj:`gt_bboxes`.
            This tells whether the
            corresponding ground truth bounding box is difficult or not.
            By default, this is :obj:`None`. In that case, this function
            considers all bounding boxes to be not difficult.
        iou_thresh (float): A prediction is correct if its Intersection over
            Union with the ground truth is at least this value.

    Returns:
        tuple of two lists:
        This function returns two lists: :obj:`prec` and :obj:`rec`.

        * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision
          for class :math:`l`. If class :math:`l` does not exist in
          either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is
          set to :obj:`None`.
        * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall
          for class :math:`l`. If class :math:`l` that is not marked as
          difficult does not exist in
          :obj:`gt_labels`, :obj:`rec[l]` is
          set to :obj:`None`.

        Both lists are empty when no label is found in the inputs.

    Raises:
        ValueError: If the input iterables do not have the same length.

    """

    pred_bboxes = iter(pred_bboxes)
    pred_labels = iter(pred_labels)
    pred_scores = iter(pred_scores)
    gt_bboxes = iter(gt_bboxes)
    gt_labels = iter(gt_labels)
    if gt_difficults is None:
        gt_difficults = itertools.repeat(None)
    else:
        gt_difficults = iter(gt_difficults)

    # Per class: number of non-difficult ground truths, prediction scores,
    # and match flags (1: true positive, 0: false positive, -1: ignored).
    n_pos = defaultdict(int)
    score = defaultdict(list)
    match = defaultdict(list)

    for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \
            six.moves.zip(
                pred_bboxes, pred_labels, pred_scores,
                gt_bboxes, gt_labels, gt_difficults):

        if gt_difficult is None:
            gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool)

        for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
            pred_mask_l = pred_label == l
            pred_bbox_l = pred_bbox[pred_mask_l]
            pred_score_l = pred_score[pred_mask_l]
            # sort by score
            order = pred_score_l.argsort()[::-1]
            pred_bbox_l = pred_bbox_l[order]
            pred_score_l = pred_score_l[order]

            gt_mask_l = gt_label == l
            gt_bbox_l = gt_bbox[gt_mask_l]
            gt_difficult_l = gt_difficult[gt_mask_l]

            n_pos[l] += np.logical_not(gt_difficult_l).sum()
            score[l].extend(pred_score_l)

            if len(pred_bbox_l) == 0:
                continue
            if len(gt_bbox_l) == 0:
                # Nothing to match against: every prediction is a false
                # positive.
                match[l].extend((0, ) * pred_bbox_l.shape[0])
                continue

            # VOC evaluation follows integer typed bounding boxes.
            pred_bbox_l = pred_bbox_l.copy()
            pred_bbox_l[:, 2:] += 1
            gt_bbox_l = gt_bbox_l.copy()
            gt_bbox_l[:, 2:] += 1

            iou = bbox_iou(pred_bbox_l, gt_bbox_l)
            gt_index = iou.argmax(axis=1)
            # set -1 if there is no matching ground truth
            gt_index[iou.max(axis=1) < iou_thresh] = -1
            del iou

            # A ground truth can be claimed only once; predictions are
            # visited in descending score order.
            selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
            for gt_idx in gt_index:
                if gt_idx >= 0:
                    if gt_difficult_l[gt_idx]:
                        match[l].append(-1)
                    else:
                        if not selec[gt_idx]:
                            match[l].append(1)
                        else:
                            match[l].append(0)
                    selec[gt_idx] = True
                else:
                    match[l].append(0)

    # After zip stops at the shortest input, any iterator that still yields
    # a value was longer than the others.
    for iter_ in (
            pred_bboxes, pred_labels, pred_scores,
            gt_bboxes, gt_labels, gt_difficults):
        if next(iter_, None) is not None:
            raise ValueError('Length of input iterables need to be same.')

    # max() of an empty sequence raises ValueError, so an input without any
    # label yields empty result lists instead.
    n_fg_class = max(n_pos.keys()) + 1 if n_pos else 0
    prec = [None] * n_fg_class
    rec = [None] * n_fg_class

    for l in n_pos.keys():
        score_l = np.array(score[l])
        match_l = np.array(match[l], dtype=np.int8)

        order = score_l.argsort()[::-1]
        match_l = match_l[order]

        tp = np.cumsum(match_l == 1)
        fp = np.cumsum(match_l == 0)

        # If an element of fp + tp is 0,
        # the corresponding element of prec[l] is nan.
        prec[l] = tp / (fp + tp)
        # If n_pos[l] is 0, rec[l] is None.
        if n_pos[l] > 0:
            rec[l] = tp / n_pos[l]

    return prec, rec
def __call__(self, rois, bboxes, whole_mask, labels):
    """Sample RoIs and build box, mask and label targets for them.

    Ground truth boxes are appended to :obj:`rois`, foreground and
    background RoIs are drawn at random according to their best IoU, and
    targets are computed for the drawn RoIs. Foreground RoIs always come
    first in the outputs.

    Args:
        rois (array): Candidate RoIs.
        bboxes (array): Ground truth bounding boxes. It must be two
            dimensional.
        whole_mask (array): Ground truth masks, indexed by ground truth on
            the first axis.
        labels (array): Ground truth labels, indexed by ground truth. They
            are used as they are for foreground RoIs, and 0 is written
            for background RoIs.

    Returns:
        (array, array, array, array):

        * **sample_rois**: The sampled RoIs.
        * **gt_roi_locs**: Output of :func:`bbox2loc` for the sampled
          RoIs, normalized with :obj:`self.loc_normalize_mean` and
          :obj:`self.loc_normalize_std`.
        * **gt_roi_masks**: An :obj:`int32` array with one
          ``(mask_size, mask_size)`` mask per sampled RoI. Rows of
          background RoIs are filled with -1.
        * **gt_roi_labels**: Labels of the sampled RoIs.

        All four outputs are passed through :func:`cuda.to_gpu`
        regardless of where the inputs live.

    """
    rois = cuda.to_cpu(rois)
    bboxes = cuda.to_cpu(bboxes)
    whole_mask = cuda.to_cpu(whole_mask)
    labels = cuda.to_cpu(labels)

    n_bbox, _ = bboxes.shape

    # Ground truth boxes take part in the sampling as well.
    rois = np.concatenate((rois, bboxes), axis=0)
    n_sample = rois.shape[0] if self.n_sample is None else self.n_sample

    fg_quota = np.round(n_sample * self.fg_ratio)
    overlap = bbox_iou(rois, bboxes)
    gt_assignment = overlap.argmax(axis=1)
    best_overlap = overlap.max(axis=1)

    # Foreground: best IoU of at least fg_iou_thresh.
    is_fg = best_overlap >= self.fg_iou_thresh
    fg_indices = np.where(is_fg)[0]
    n_fg = int(min(fg_quota, fg_indices.size))
    if fg_indices.size > 0:
        fg_indices = np.random.choice(
            fg_indices, size=n_fg, replace=False)

    # Background: best IoU within [bg_iou_thresh_lo, bg_iou_thresh_hi).
    is_bg = ((best_overlap < self.bg_iou_thresh_hi) &
             (best_overlap >= self.bg_iou_thresh_lo))
    bg_indices = np.where(is_bg)[0]
    n_bg = int(min(n_sample - n_fg, bg_indices.size))
    if bg_indices.size > 0:
        bg_indices = np.random.choice(
            bg_indices, size=n_bg, replace=False)

    # Foreground first, background second.
    keep_indices = np.append(fg_indices, bg_indices)
    kept_gt = gt_assignment[keep_indices]
    sample_rois = rois[keep_indices]

    # Box regression targets.
    loc_mean = np.array(self.loc_normalize_mean, np.float32)
    loc_std = np.array(self.loc_normalize_std, np.float32)
    gt_roi_locs = bbox2loc(sample_rois, bboxes[kept_gt])
    gt_roi_locs = gt_roi_locs - loc_mean
    gt_roi_locs = gt_roi_locs / loc_std

    # Mask targets; rows that are never overwritten stay at -1.
    side = (self.mask_size, self.mask_size)
    gt_roi_masks = -1 * np.ones(
        (len(keep_indices), ) + side, dtype=np.int32)
    for i, fg_index in enumerate(fg_indices):
        gt_id = gt_assignment[fg_index]
        roi = np.round(sample_rois[i]).astype(np.int32)
        gt_roi = np.round(bboxes[gt_id]).astype(np.int32)
        roi_mask = fcis.mask.intersect_bbox_mask(
            roi, gt_roi, whole_mask[gt_id], self.mask_size)
        roi_mask = cv2.resize(roi_mask.astype(np.float32), side)
        roi_mask = (roi_mask >= self.binary_thresh).astype(np.int32)
        gt_roi_masks[i, ...] = roi_mask

    # Label targets; everything after the foreground rows is background.
    gt_roi_labels = labels[kept_gt]
    gt_roi_labels[n_fg:] = 0

    sample_rois = cuda.to_gpu(sample_rois)
    gt_roi_locs = cuda.to_gpu(gt_roi_locs)
    gt_roi_masks = cuda.to_gpu(gt_roi_masks)
    gt_roi_labels = cuda.to_gpu(gt_roi_labels)
    return sample_rois, gt_roi_locs, gt_roi_masks, gt_roi_labels
def __call__(self, roi, bbox, label, mask,
             loc_normalize_mean=(0., 0., 0., 0.),
             loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
    """Assigns ground truth to sampled proposals.

    This function samples total of :obj:`self.n_sample` RoIs
    from the combination of :obj:`roi` and :obj:`bbox`.
    The RoIs are assigned with the ground truth class labels as well as
    bounding box offsets and scales to match the ground truth bounding
    boxes. As many as :obj:`pos_ratio * self.n_sample` RoIs are
    sampled as foregrounds. For every foreground RoI, the assigned ground
    truth mask is cropped to the RoI and resized.

    Offsets and scales of bounding boxes are calculated using
    :func:`chainercv.links.model.faster_rcnn.bbox2loc`.
    Also, types of input arrays and output arrays are same.

    Here are notations.

    * :math:`S` is the total number of sampled RoIs, which is at most
      :obj:`self.n_sample`.
    * :math:`L` is number of object classes possibly including the
      background.

    Args:
        roi (array): Region of Interests (RoIs) from which we sample.
            Its shape is :math:`(R, 4)`
        bbox (array): The coordinates of ground truth bounding boxes.
            Its shape is :math:`(R', 4)`.
        label (array): Ground truth bounding box labels. Its shape
            is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where
            :math:`L` is the number of foreground classes.
        mask (array): Ground truth masks, indexed by ground truth on the
            first axis. Pixel values are treated as integer labels: they
            are one-hot encoded, resized and decoded with argmax.
        loc_normalize_mean (tuple of four floats): Mean values to normalize
            coordinates of bounding boxes.
        loc_normalize_std (tuple of four floats): Standard deviation of
            the coordinates of bounding boxes.

    Returns:
        (array, array, array, array):

        * **sample_roi**: Regions of interests that are sampled.
          Its shape is :math:`(S, 4)`.
        * **gt_roi_loc**: Offsets and scales to match
          the sampled RoIs to the ground truth bounding boxes.
          Its shape is :math:`(S, 4)`.
        * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is
          :math:`(S,)`. Its range is :math:`[0, L]`. The label with
          value 0 is the background.
        * **gt_roi_mask**: Masks assigned to sampled RoIs. Its shape is
          :math:`(S, mask\\_size, mask\\_size)` and its dtype is
          :obj:`numpy.int32`. Rows of background RoIs are filled
          with -1.

    Raises:
        ValueError: If :obj:`bbox` is empty.

    """
    xp = cuda.get_array_module(roi)
    roi = cuda.to_cpu(roi)
    bbox = cuda.to_cpu(bbox)
    label = cuda.to_cpu(label)
    # The mask is processed with NumPy and OpenCV below, so it has to live
    # on the host like the other inputs.
    mask = cuda.to_cpu(mask)

    n_bbox, _ = bbox.shape
    if n_bbox == 0:
        raise ValueError('Empty bbox is not supported.')

    # Ground truth boxes are added to the candidates.
    roi = np.concatenate((roi, bbox), axis=0)

    pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
    iou = bbox_iou(roi, bbox)
    gt_assignment = iou.argmax(axis=1)
    max_iou = iou.max(axis=1)
    # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
    # The label with value 0 is the background.
    gt_roi_label = label[gt_assignment] + 1

    # Select foreground RoIs as those with >= pos_iou_thresh IoU.
    pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
    pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
    if pos_index.size > 0:
        pos_index = np.random.choice(
            pos_index, size=pos_roi_per_this_image, replace=False)

    # Select background RoIs as those within
    # [neg_iou_thresh_lo, neg_iou_thresh_hi).
    neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                         (max_iou >= self.neg_iou_thresh_lo))[0]
    neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
    neg_roi_per_this_image = int(
        min(neg_roi_per_this_image, neg_index.size))
    if neg_index.size > 0:
        neg_index = np.random.choice(
            neg_index, size=neg_roi_per_this_image, replace=False)

    # The indices that we're selecting (both positive and negative).
    # Positives come first, so row i < len(pos_index) of sample_roi
    # belongs to pos_index[i].
    keep_index = np.append(pos_index, neg_index)
    gt_roi_label = gt_roi_label[keep_index]
    gt_roi_label[pos_roi_per_this_image:] = 0  # negative labels --> 0
    sample_roi = roi[keep_index]

    # Compute offsets and scales to match sampled RoIs to the GTs.
    gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
    gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)) /
                  np.array(loc_normalize_std, np.float32))

    # Compute gt masks. Background rows keep the initial value -1.
    gt_roi_mask = -np.ones(
        (len(sample_roi), self.mask_size, self.mask_size),
        dtype=np.int32)
    for i, pos_ind in enumerate(pos_index):
        roi = np.round(sample_roi[i]).astype(np.int32)
        gt_mask = mask[gt_assignment[pos_ind]]
        gt_roi_mask_i = gt_mask[roi[0]:roi[2], roi[1]:roi[3]]
        # label -> onehot, so that resizing interpolates per-label scores
        # instead of mixing label ids.
        gt_roi_mask_i_score = (
            np.arange(gt_roi_mask_i.max() + 1) ==
            gt_roi_mask_i[..., None]).astype(np.float32)
        gt_roi_mask_i_score = cv2.resize(
            gt_roi_mask_i_score, (self.mask_size, self.mask_size))
        # A two dimensional result (no channel axis) gets one appended so
        # that argmax over axis 2 still works.
        if gt_roi_mask_i_score.ndim == 2:
            gt_roi_mask_i_score = gt_roi_mask_i_score.reshape(
                gt_roi_mask_i_score.shape[:2] + (1, ))
        gt_roi_mask_i = np.argmax(gt_roi_mask_i_score, axis=2)
        gt_roi_mask[i] = gt_roi_mask_i.astype(np.int32)

    if xp != np:
        sample_roi = cuda.to_gpu(sample_roi)
        gt_roi_loc = cuda.to_gpu(gt_roi_loc)
        gt_roi_label = cuda.to_gpu(gt_roi_label)
        gt_roi_mask = cuda.to_gpu(gt_roi_mask)
    return sample_roi, gt_roi_loc, gt_roi_label, gt_roi_mask
def __call__(self, roi, bbox, label,
             loc_normalize_mean=(0., 0., 0., 0.),
             loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
    """Sample proposals and attach ground truth targets to them.

    Ground truth boxes are appended to :obj:`roi`. Foreground RoIs (best
    IoU of at least :obj:`self.pos_iou_thresh`) and background RoIs (best
    IoU within :obj:`[self.neg_iou_thresh_lo, self.neg_iou_thresh_hi)`)
    are then drawn at random without replacement. At most
    :obj:`pos_ratio * self.n_sample` foregrounds are drawn, and
    backgrounds fill up to :obj:`self.n_sample` RoIs in total.

    Regression targets come from
    :func:`chainercv.links.model.faster_rcnn.bbox2loc`.
    Outputs are moved to GPU when :obj:`roi` was given on GPU.

    Here are notations.

    * :math:`S` is the number of sampled RoIs, at most
      :obj:`self.n_sample`.
    * :math:`L` is the number of foreground classes.

    Args:
        roi (array): RoIs to sample from. Its shape is :math:`(R, 4)`.
        bbox (array): Ground truth bounding boxes. Its shape is
            :math:`(R', 4)`.
        label (array): Ground truth labels in :math:`[0, L - 1]`.
            Its shape is :math:`(R',)`.
        loc_normalize_mean (tuple of four floats): Subtracted from the
            regression targets.
        loc_normalize_std (tuple of four floats): Divides the regression
            targets after the mean is subtracted.

    Returns:
        (array, array, array):

        * **sample_roi**: The sampled RoIs, foregrounds first.
          Its shape is :math:`(S, 4)`.
        * **gt_roi_loc**: Normalized offsets and scales from each sampled
          RoI to its assigned ground truth box.
          Its shape is :math:`(S, 4)`.
        * **gt_roi_label**: Labels in :math:`[0, L]`, where 0 marks the
          background. Its shape is :math:`(S,)`.

    """
    xp = cuda.get_array_module(roi)
    roi = cuda.to_cpu(roi)
    bbox = cuda.to_cpu(bbox)
    label = cuda.to_cpu(label)

    n_bbox, _ = bbox.shape

    # Ground truth boxes take part in the sampling as well.
    roi = np.concatenate((roi, bbox), axis=0)

    pos_quota = np.round(self.n_sample * self.pos_ratio)
    overlap = bbox_iou(roi, bbox)
    gt_assignment = overlap.argmax(axis=1)
    best_overlap = overlap.max(axis=1)

    # Shift class ids by one so that 0 is free for the background.
    shifted_label = label[gt_assignment] + 1

    # Foreground candidates and their random subset.
    is_pos = best_overlap >= self.pos_iou_thresh
    pos_index = np.where(is_pos)[0]
    n_pos = int(min(pos_quota, pos_index.size))
    if pos_index.size > 0:
        pos_index = np.random.choice(pos_index, size=n_pos, replace=False)

    # Background candidates and their random subset.
    is_neg = ((best_overlap < self.neg_iou_thresh_hi) &
              (best_overlap >= self.neg_iou_thresh_lo))
    neg_index = np.where(is_neg)[0]
    n_neg = int(min(self.n_sample - n_pos, neg_index.size))
    if neg_index.size > 0:
        neg_index = np.random.choice(neg_index, size=n_neg, replace=False)

    # Foregrounds first, backgrounds second.
    keep_index = np.append(pos_index, neg_index)
    sample_roi = roi[keep_index]
    gt_roi_label = shifted_label[keep_index]
    gt_roi_label[n_pos:] = 0  # everything after the foregrounds

    # Regression targets towards the assigned ground truth boxes.
    loc_mean = np.array(loc_normalize_mean, np.float32)
    loc_std = np.array(loc_normalize_std, np.float32)
    assigned_bbox = bbox[gt_assignment[keep_index]]
    gt_roi_loc = (bbox2loc(sample_roi, assigned_bbox) - loc_mean) / loc_std

    if xp != np:
        sample_roi = cuda.to_gpu(sample_roi)
        gt_roi_loc = cuda.to_gpu(gt_roi_loc)
        gt_roi_label = cuda.to_gpu(gt_roi_label)
    return sample_roi, gt_roi_loc, gt_roi_label
def __call__(self, roi, bbox, label, mask, levels,
             loc_normalize_mean=(0., 0., 0., 0.),
             loc_normalize_std=(0.1, 0.1, 0.2, 0.2),
             mask_size=14, binary_mask=True):
    """Sample RoIs and build box, label and mask/keypoint targets.

    The ground truth boxes :obj:`bbox` are appended to :obj:`roi` (and
    their levels, computed by :func:`map_rois_to_fpn_levels`, to
    :obj:`levels`). Each candidate is matched to the ground truth box
    with the highest IoU. Foregrounds (IoU of at least
    :obj:`self.pos_iou_thresh`) and backgrounds (IoU in
    :obj:`[self.neg_iou_thresh_lo, self.neg_iou_thresh_hi)`) are drawn
    at random without replacement, up to :obj:`self.n_sample` RoIs in
    total with at most :obj:`round(self.n_sample * self.pos_ratio)`
    foregrounds. Foregrounds come first in every returned array.

    Args:
        roi (array): Candidate RoIs. Presumably of shape
            :math:`(R, 4)` in :math:`(y_0, x_0, y_1, x_1)` order, which
            is how the columns are read below.
        bbox (array): Ground truth boxes, a two-dimensional array.
        label (array): Ground truth labels, one per row of
            :obj:`bbox`. They are shifted by one in the output so that
            0 can denote the background.
        mask (array): A three-dimensional array indexed by ground
            truth along the first axis. With :obj:`binary_mask=True`
            each entry is an image-sized mask that is cropped to the
            RoI. With :obj:`binary_mask=False` each entry holds
            keypoints, one per row, read as :math:`(y, x, v)`.
            This array is not modified.
        levels (array): One value per row of :obj:`roi`; presumably
            the FPN level of each RoI.
        loc_normalize_mean (tuple of four floats): Mean values
            subtracted from the offsets and scales.
        loc_normalize_std (tuple of four floats): Standard deviations
            by which the offsets and scales are divided.
        mask_size (int): Side length of the target grid.
        binary_mask (bool): If :obj:`True`, build segmentation
            targets; otherwise build keypoint targets.

    Returns:
        (array, array, array, array, array):

        * **sample_roi**: The sampled RoIs.
        * **sample_levels**: Entries of the extended :obj:`levels`
          for the sampled RoIs.
        * **gt_roi_loc**: Normalized output of :func:`bbox2loc`
          towards the assigned ground truth boxes.
        * **gt_roi_label**: Labels of the sampled RoIs; 0 is the
          background.
        * **gt_roi_mask**: One target per sampled foreground RoI.
          In binary mode an :obj:`int32` grid of shape
          :obj:`(mask_size, mask_size)`. In keypoint mode one
          :obj:`int32` label per keypoint, equal to
          :obj:`y * mask_size + x` on the target grid, or -1 when the
          keypoint has :obj:`v != 2` or falls outside the grid.

    """
    xp = cuda.get_array_module(roi)
    roi = cuda.to_cpu(roi)
    bbox = cuda.to_cpu(bbox)
    label = cuda.to_cpu(label)
    mask = cuda.to_cpu(mask)
    levels = cuda.to_cpu(levels)

    # Ground truth boxes take part in the sampling as extra candidates.
    roi = np.concatenate((roi, bbox), axis=0)

    # Assign feature levels to the ground truth boxes as well.
    bbox_levels = map_rois_to_fpn_levels(np, bbox)
    levels = np.concatenate([levels, bbox_levels])

    pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
    iou = bbox_iou(roi, bbox)
    gt_assignment = iou.argmax(axis=1)
    max_iou = iou.max(axis=1)
    # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
    # The label with value 0 is the background.
    gt_roi_label = label[gt_assignment] + 1

    # Select foreground RoIs as those with >= pos_iou_thresh IoU.
    pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
    pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
    if pos_index.size > 0:
        pos_index = np.random.choice(
            pos_index, size=pos_roi_per_this_image, replace=False)

    # Select background RoIs as those within
    # [neg_iou_thresh_lo, neg_iou_thresh_hi).
    neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                         (max_iou >= self.neg_iou_thresh_lo))[0]
    neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
    neg_roi_per_this_image = int(
        min(neg_roi_per_this_image, neg_index.size))
    if neg_index.size > 0:
        neg_index = np.random.choice(
            neg_index, size=neg_roi_per_this_image, replace=False)

    # The indices that we're selecting (both positive and negative).
    # Positives come first, so sample_roi[i] belongs to pos_index[i].
    keep_index = np.append(pos_index, neg_index)
    gt_roi_label = gt_roi_label[keep_index]
    gt_roi_label[pos_roi_per_this_image:] = 0  # negative labels --> 0
    sample_roi = roi[keep_index]
    sample_levels = levels[keep_index]

    # Compute offsets and scales to match sampled RoIs to the GTs.
    gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
    gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32))
                  / np.array(loc_normalize_std, np.float32))

    # https://engineer.dena.jp/2017/12/chainercvmask-r-cnn.html
    gt_roi_mask = []
    _, h, w = mask.shape
    pos_gt_index = gt_assignment[pos_index]
    if binary_mask:
        for i, idx in enumerate(pos_gt_index):
            # Crop the ground truth mask to the RoI, clipped to the image.
            y0 = max(int(sample_roi[i, 0]), 0)
            y1 = min(int(sample_roi[i, 2]), h)
            x0 = max(int(sample_roi[i, 1]), 0)
            x1 = min(int(sample_roi[i, 3]), w)
            crop = mask[idx, y0:y1, x0:x1]
            gt_roi_mask.append(
                cv2.resize(crop, (mask_size, mask_size)).astype(np.int32))
    else:
        # Original author's note: keypoints have shape (N, 17, 3), where
        # N is the number of boxes and v=0 is unlabeled, v=1 labeled but
        # invisible, v=2 labeled and visible. The code reads each row as
        # (y, x, v).
        for i, idx in enumerate(pos_gt_index):
            # RoI corners (y0, x0), (y1, x1).
            y0, x0, y1, x1 = list(map(int, sample_roi[i, :4]))
            # Copy: mask[idx] is a view, and several RoIs are usually
            # assigned to the same ground truth. Rescaling in place would
            # corrupt the keypoints for every later RoI and for the caller.
            kp = mask[idx].copy()
            # Map (y, x) into the [0, mask_size] x [0, mask_size] grid.
            kp[:, :2] = (kp[:, :2] - [y0, x0]) / \
                [max(y1 - y0, 1), max(x1 - x0, 1)] * mask_size
            # The cell of the mask_size x mask_size grid a keypoint falls
            # into is used as its class label (softmax cross entropy is
            # applied to it later); -1 marks entries to be ignored.
            # NOTE(review): int() truncates toward zero, so coordinates in
            # (-1, 0) land in cell 0 instead of being ignored -- confirm
            # whether that is intended.
            keypoint_labels = np.zeros(kp.shape[0], dtype=np.int32)
            for j, r in enumerate(kp):
                y, x, v = list(map(int, r))
                if v == 2 and 0 <= y < mask_size and 0 <= x < mask_size:
                    keypoint_labels[j] = y * mask_size + x
                else:
                    keypoint_labels[j] = -1
            gt_roi_mask.append(keypoint_labels)

    # Build on the host; the transfer below handles the GPU case.
    gt_roi_mask = np.array(gt_roi_mask)

    if xp != np:
        sample_roi = cuda.to_gpu(sample_roi)
        gt_roi_loc = cuda.to_gpu(gt_roi_loc)
        gt_roi_label = cuda.to_gpu(gt_roi_label)
        gt_roi_mask = cuda.to_gpu(gt_roi_mask)
        sample_levels = cuda.to_gpu(sample_levels)
    return sample_roi, sample_levels, gt_roi_loc, gt_roi_label, gt_roi_mask
def calc_detection_inria_prec_rec(
        pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
        gt_difficults=None,
        iou_thresh=0.5):
    """Calculate per-class precision and recall of predicted boxes.

    All inputs are iterated in lock step, one element per image. Within
    an image and a class, predictions are visited in order of decreasing
    score and matched to the ground truth box with the highest IoU. A
    prediction is a true positive when that IoU is at least
    :obj:`iou_thresh` and the ground truth box has not been matched
    before; it is ignored when the box is marked difficult, and is a
    false positive otherwise.

    Args:
        pred_bboxes (iterable of numpy.ndarray): Predicted boxes, one
            two-dimensional array per image. The last two columns are
            incremented by one before the IoU is computed, so the
            layout is presumably
            :math:`y_{min}, x_{min}, y_{max}, x_{max}`.
        pred_labels (iterable of numpy.ndarray): Predicted labels.
        pred_scores (iterable of numpy.ndarray): Confidence scores of
            the predictions.
        gt_bboxes (iterable of numpy.ndarray): Ground truth boxes, laid
            out like :obj:`pred_bboxes`.
        gt_labels (iterable of numpy.ndarray): Ground truth labels.
        gt_difficults (iterable of numpy.ndarray): Boolean arrays that
            mark difficult ground truth boxes. If this is :obj:`None`,
            or an element is :obj:`None`, no box of that image is
            treated as difficult.
        iou_thresh (float): Minimum IoU for a prediction to match a
            ground truth box.

    Returns:
        tuple of two lists:
        :obj:`prec` and :obj:`rec`, both indexed by class label and of
        length one more than the largest label seen. Both lists are
        empty when no label is seen at all.

        * :obj:`prec[l]` is the cumulative precision for class
          :math:`l`, or :obj:`None` if the class appears in neither the
          predictions nor the ground truth.
        * :obj:`rec[l]` is the cumulative recall for class :math:`l`,
          or :obj:`None` if there is no ground truth box of the class
          that is not difficult.

    Raises:
        ValueError: If the input iterables differ in length.

    """
    iterators = [iter(pred_bboxes), iter(pred_labels), iter(pred_scores),
                 iter(gt_bboxes), iter(gt_labels)]
    if gt_difficults is not None:
        iterators.append(iter(gt_difficults))

    n_pos = defaultdict(int)
    score = defaultdict(list)
    match = defaultdict(list)

    # Advance every iterable by hand so that a length mismatch is always
    # noticed. zip() silently drops the element it already pulled from the
    # longer iterables when a later one runs out.
    exhausted = object()
    while True:
        items = [next(it, exhausted) for it in iterators]
        n_exhausted = sum(item is exhausted for item in items)
        if n_exhausted == len(items):
            break
        if n_exhausted:
            raise ValueError('Length of input iterables need to be same.')

        pred_bbox, pred_label, pred_score, gt_bbox, gt_label = items[:5]
        gt_difficult = items[5] if len(items) > 5 else None
        if gt_difficult is None:
            gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool)

        for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
            pred_mask_l = pred_label == l
            pred_bbox_l = pred_bbox[pred_mask_l]
            pred_score_l = pred_score[pred_mask_l]
            # sort by score
            order = pred_score_l.argsort()[::-1]
            pred_bbox_l = pred_bbox_l[order]
            pred_score_l = pred_score_l[order]

            gt_mask_l = gt_label == l
            gt_bbox_l = gt_bbox[gt_mask_l]
            gt_difficult_l = gt_difficult[gt_mask_l]

            n_pos[l] += np.logical_not(gt_difficult_l).sum()
            score[l].extend(pred_score_l)

            if len(pred_bbox_l) == 0:
                continue
            if len(gt_bbox_l) == 0:
                # Nothing to match against: every prediction is a miss.
                match[l].extend((0,) * pred_bbox_l.shape[0])
                continue

            # VOC evaluation follows integer typed bounding boxes.
            pred_bbox_l = pred_bbox_l.copy()
            pred_bbox_l[:, 2:] += 1
            gt_bbox_l = gt_bbox_l.copy()
            gt_bbox_l[:, 2:] += 1

            iou = bbox_iou(pred_bbox_l, gt_bbox_l)
            gt_index = iou.argmax(axis=1)
            # set -1 if there is no matching ground truth
            gt_index[iou.max(axis=1) < iou_thresh] = -1
            del iou

            # selec[k] becomes True once ground truth k has been claimed.
            selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
            for gt_idx in gt_index:
                if gt_idx >= 0:
                    if gt_difficult_l[gt_idx]:
                        match[l].append(-1)
                    else:
                        if not selec[gt_idx]:
                            match[l].append(1)
                        else:
                            match[l].append(0)
                    selec[gt_idx] = True
                else:
                    match[l].append(0)

    if not n_pos:
        # No image or no label at all: there is no class to report on.
        return [], []

    n_fg_class = max(n_pos.keys()) + 1
    prec = [None] * n_fg_class
    rec = [None] * n_fg_class

    for l in n_pos.keys():
        score_l = np.array(score[l])
        match_l = np.array(match[l], dtype=np.int8)

        order = score_l.argsort()[::-1]
        match_l = match_l[order]

        # Entries equal to -1 (difficult) count as neither tp nor fp.
        tp = np.cumsum(match_l == 1)
        fp = np.cumsum(match_l == 0)

        # If an element of fp + tp is 0,
        # the corresponding element of prec[l] is nan.
        prec[l] = tp / (fp + tp)
        # If n_pos[l] is 0, rec[l] is None.
        if n_pos[l] > 0:
            rec[l] = tp / n_pos[l]

    return prec, rec
def mask_head_loss_pre(rois, roi_indices, gt_masks, gt_bboxes,
                       gt_head_labels, segm_size):
    """Loss function for Mask Head (pre).

    This function processes RoIs for :func:`mask_head_loss_post` by
    selecting RoIs for mask loss calculation and preparing ground truth
    network output. Only RoIs whose head label is greater than 0 are
    kept. Each kept RoI is matched to the ground truth box of its image
    with the highest IoU, and that object's mask is converted by
    :func:`mask_to_segm`.

    Args:
        rois (iterable of arrays): An iterable of arrays of
            shape :math:`(R_l, 4)`, where :math:`R_l` is the number
            of RoIs in the :math:`l`-th feature map.
        roi_indices (iterable of arrays): An iterable of arrays of
            shape :math:`(R_l,)`. Each value is used as an index into
            :obj:`gt_masks` and :obj:`gt_bboxes`.
        gt_masks (iterable of arrays): An iterable of arrays whose shape
            is :math:`(R_n, H, W)`, where :math:`R_n` is the number of
            ground truth objects.
        gt_bboxes (iterable of arrays): An iterable of arrays of ground
            truth boxes, indexed like :obj:`gt_masks`. Presumably of
            shape :math:`(R_n, 4)`.
        gt_head_labels (iterable of arrays): An iterable of arrays of
            shape :math:`(R_l,)`. This is a collection of ground-truth
            labels assigned to :obj:`rois` during bounding box
            localization stage. Values greater than 0 are treated as
            foreground.
        segm_size (int): Size of the ground truth network output.

    Returns:
        tuple of four lists:
        :obj:`mask_rois`, :obj:`mask_roi_indices`,
        :obj:`gt_segms`, and :obj:`gt_mask_labels`. Every list has one
        entry per feature map.

        * **mask_rois**: A list of arrays of shape :math:`(R'_l, 4)`,
          where :math:`R'_l` is the number of selected RoIs in the
          :math:`l`-th feature map.
        * **mask_roi_indices**: A list of arrays of
          shape :math:`(R'_l,)`.
        * **gt_segms**: A list of arrays of shape :math:`(R'_l, M, M)`.
          :math:`M` is the argument :obj:`segm_size`.
        * **gt_mask_labels**: A list of arrays of shape
          :math:`(R'_l,)` indicating the classes of ground truth.

    """
    xp = cuda.get_array_module(*rois)

    n_level = len(rois)

    # Remember which feature map each RoI came from. hstack must be given
    # a real sequence: passing a generator is deprecated since NumPy 1.16.
    roi_levels = xp.hstack(
        [xp.full((len(level_rois),), level, dtype=np.int32)
         for level, level_rois in enumerate(rois)]).astype(np.int32)
    rois = xp.vstack(rois).astype(np.float32)
    roi_indices = xp.hstack(roi_indices).astype(np.int32)
    gt_head_labels = xp.hstack(gt_head_labels)

    # Keep foreground RoIs only.
    index = (gt_head_labels > 0).nonzero()[0]
    mask_roi_levels = roi_levels[index]
    mask_rois = rois[index]
    mask_roi_indices = roi_indices[index]
    gt_mask_labels = gt_head_labels[index]

    gt_segms = xp.empty(
        (len(mask_rois), segm_size, segm_size), dtype=np.float32)

    # Fill the targets image by image.
    for i in np.unique(cuda.to_cpu(mask_roi_indices)):
        gt_mask = gt_masks[i]
        gt_bbox = gt_bboxes[i]

        in_image = (mask_roi_indices == i).nonzero()[0]
        mask_roi = mask_rois[in_image]
        iou = bbox_iou(mask_roi, gt_bbox)
        gt_index = iou.argmax(axis=1)
        gt_segms[in_image] = xp.array(
            mask_to_segm(gt_mask, mask_roi, segm_size, gt_index))

    # Split the flat arrays back into one entry per feature map.
    flag_masks = [mask_roi_levels == l for l in range(n_level)]
    mask_rois = [mask_rois[m] for m in flag_masks]
    mask_roi_indices = [mask_roi_indices[m] for m in flag_masks]
    gt_segms = [gt_segms[m] for m in flag_masks]
    gt_mask_labels = [gt_mask_labels[m] for m in flag_masks]
    return mask_rois, mask_roi_indices, gt_segms, gt_mask_labels
def mask_voting(
        rois, mask_probs, cls_probs,
        n_class, H, W,
        score_thresh=0.7,
        nms_thresh=0.3,
        mask_merge_thresh=0.5,
        binary_thresh=0.4,
        max_num=100):
    """Merge the masks of overlapping RoIs into per-detection masks.

    For every class except class 0 (background), non maximum
    suppression is applied to :obj:`rois`. A common score threshold is
    then chosen so that roughly the :obj:`max_num` best detections over
    all classes survive (never below :obj:`1e-3`). For each surviving
    box, the masks of all RoIs whose IoU with it is at least
    :obj:`mask_merge_thresh` are combined by :func:`mask_aggregation`,
    weighted by their normalized class probabilities. The merged mask
    is resized to the input mask size. Finally only detections scoring
    above :obj:`score_thresh` are returned.

    Args:
        rois (array): Candidate boxes, one per row. Presumably of shape
            :math:`(R, 4)`.
        mask_probs (array): Mask probabilities, indexed by RoI along
            the first axis. The size of the last axis is used as the
            output mask size.
        cls_probs (array): Class probabilities, indexed as
            :obj:`cls_probs[roi, class]`.
        n_class (int): Number of classes including the background.
        H (int): Passed to :func:`mask_aggregation`; presumably the
            image height.
        W (int): Passed to :func:`mask_aggregation`; presumably the
            image width.
        score_thresh (float): Detections must score above this value.
        nms_thresh (float): Threshold passed to
            :func:`non_maximum_suppression`.
        mask_merge_thresh (float): Minimum IoU for a RoI to contribute
            to the mask of a detection.
        binary_thresh (float): Passed to :func:`mask_aggregation`.
        max_num (int): Limit of boxes kept by non maximum suppression
            per class, and the rank used to choose the common score
            threshold.

    Returns:
        tuple of four arrays:
        :obj:`v_bboxes`, :obj:`v_masks`, :obj:`v_labels` and
        :obj:`v_cls_probs`, all with the same length. :obj:`v_masks`
        has shape :math:`(N, M, M)` where :math:`M` is the input mask
        size. :obj:`v_labels` holds the class index of each detection
        (background is 0 and never appears). All arrays are empty when
        there is nothing to score.

    """
    mask_size = mask_probs.shape[-1]
    v_labels = np.empty((0, ), dtype=np.int32)
    v_masks = np.empty((0, mask_size, mask_size), dtype=np.float32)
    v_bboxes = np.empty((0, 4), dtype=np.float32)
    v_cls_probs = np.empty((0, ), dtype=np.float32)

    # Per-class non maximum suppression; class 0 is the background.
    tmp_all_scores = np.empty((0, ), dtype=np.float32)
    tmp_cls_probs = []
    tmp_bbox = []
    for label in range(1, n_class):
        cls_prob_l = cls_probs[:, label]
        keep_indices = non_maximum_suppression(
            rois, nms_thresh, cls_prob_l, limit=max_num)
        bbox_l = rois[keep_indices]
        cls_prob_l = cls_prob_l[keep_indices]
        tmp_bbox.append(bbox_l)
        tmp_cls_probs.append(cls_prob_l)
        tmp_all_scores = np.concatenate((tmp_all_scores, cls_prob_l))

    if tmp_all_scores.size == 0:
        # No RoI or no foreground class: there is no score to rank, so
        # indexing the sorted scores below would fail.
        return v_bboxes, v_masks, v_labels, v_cls_probs

    # Common threshold: the max_num-th best score over all classes.
    sorted_all_scores = np.sort(tmp_all_scores)[::-1]
    keep_num = min(len(sorted_all_scores), max_num)
    thresh = max(sorted_all_scores[keep_num - 1], 1e-3)

    for label in range(1, n_class):
        bbox_l = tmp_bbox[label - 1]
        cls_prob_l = tmp_cls_probs[label - 1]
        keep_indices = np.where(cls_prob_l >= thresh)
        bbox_l = bbox_l[keep_indices]
        cls_prob_l = cls_prob_l[keep_indices]

        v_mask_l = np.empty((0, mask_size, mask_size), dtype=np.float32)
        v_bbox_l = np.empty((0, 4), dtype=np.float32)
        v_cls_prob_l = np.empty((0, ), dtype=np.float32)

        for i, bbox in enumerate(bbox_l):
            # Every RoI overlapping this box votes for its mask.
            iou = bbox_iou(rois, bbox[np.newaxis, :])
            idx = np.where(iou >= mask_merge_thresh)[0]
            mask_weights = cls_probs[idx, label]
            mask_weights = mask_weights / mask_weights.sum()
            mask_prob_l = mask_probs[idx]
            rois_l = rois[idx]
            clipped_bbox, clipped_mask = mask_aggregation(
                rois_l, mask_prob_l, mask_weights, H, W, binary_thresh)
            # mask_aggregation may return None; such boxes are dropped.
            if clipped_bbox is not None and clipped_mask is not None:
                clipped_mask = cv2.resize(
                    clipped_mask.astype(np.float32),
                    (mask_size, mask_size))
                v_mask_l = np.concatenate((v_mask_l, clipped_mask[None]))
                v_bbox_l = np.concatenate((v_bbox_l, clipped_bbox[None]))
                v_cls_prob_l = np.concatenate(
                    (v_cls_prob_l, cls_prob_l[i][None]))

        keep_indices = v_cls_prob_l > score_thresh
        v_mask_l = v_mask_l[keep_indices]
        v_bbox_l = v_bbox_l[keep_indices]
        v_cls_prob_l = v_cls_prob_l[keep_indices]

        v_label_l = np.repeat(label, v_bbox_l.shape[0])
        v_masks = np.concatenate((v_masks, v_mask_l))
        v_bboxes = np.concatenate((v_bboxes, v_bbox_l))
        v_labels = np.concatenate((v_labels, v_label_l))
        v_cls_probs = np.concatenate((v_cls_probs, v_cls_prob_l))
    return v_bboxes, v_masks, v_labels, v_cls_probs
def calc_detection_voc_prec_rec(
        pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
        gt_difficults=None,
        iou_thresh=0.5):
    """Calculate precision and recall based on evaluation code of PASCAL VOC.

    This function calculates precision and recall of
    predicted bounding boxes obtained from a dataset which has :math:`N`
    images.
    The code is based on the evaluation code used in PASCAL VOC Challenge.

    Within an image and a class, predictions are visited in order of
    decreasing score and matched to the ground truth box with the
    highest IoU. A prediction is a true positive when that IoU is at
    least :obj:`iou_thresh` and the ground truth box has not been
    matched before; it is ignored when the box is marked difficult, and
    is a false positive otherwise.

    Args:
        pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
            sets of bounding boxes.
            Its index corresponds to an index for the base dataset.
            Each element of :obj:`pred_bboxes` is a set of coordinates
            of bounding boxes. This is an array whose shape is
            :math:`(R, 4)`, where :math:`R` corresponds
            to the number of bounding boxes, which may vary among images.
            The second axis corresponds to
            :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
        pred_labels (iterable of numpy.ndarray): An iterable of labels.
            Similar to :obj:`pred_bboxes`, its index corresponds to an
            index for the base dataset. Its length is :math:`N`.
        pred_scores (iterable of numpy.ndarray): An iterable of confidence
            scores for predicted bounding boxes. Similar to
            :obj:`pred_bboxes`, its index corresponds to an index for the
            base dataset. Its length is :math:`N`.
        gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
            bounding boxes whose length is :math:`N`. An element of
            :obj:`gt_bboxes` is an array of bounding boxes whose shape is
            :math:`(R, 4)`. Note that the number of bounding boxes in
            each image does not need to be same as the number of
            corresponding predicted boxes.
        gt_labels (iterable of numpy.ndarray): An iterable of ground truth
            labels which are organized similarly to :obj:`gt_bboxes`.
        gt_difficults (iterable of numpy.ndarray): An iterable of boolean
            arrays which is organized similarly to :obj:`gt_bboxes`.
            This tells whether the corresponding ground truth bounding
            box is difficult or not. By default, this is :obj:`None`.
            In that case, and for every element that is :obj:`None`,
            this function considers all bounding boxes to be not
            difficult.
        iou_thresh (float): A prediction is correct if its Intersection
            over Union with the ground truth is at least this value.

    Returns:
        tuple of two lists:
        This function returns two lists: :obj:`prec` and :obj:`rec`.
        Both are indexed by class label and have length one more than
        the largest label seen. Both lists are empty when no label is
        seen at all.

        * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision
          for class :math:`l`. If class :math:`l` does not exist in
          either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]`
          is set to :obj:`None`.
        * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall
          for class :math:`l`. If class :math:`l` that is not marked as
          difficult does not exist in :obj:`gt_labels`, :obj:`rec[l]`
          is set to :obj:`None`.

    Raises:
        ValueError: If the input iterables differ in length.

    """
    iterators = [iter(pred_bboxes), iter(pred_labels), iter(pred_scores),
                 iter(gt_bboxes), iter(gt_labels)]
    if gt_difficults is not None:
        iterators.append(iter(gt_difficults))

    n_pos = defaultdict(int)
    score = defaultdict(list)
    match = defaultdict(list)

    # Advance every iterable by hand so that a length mismatch is always
    # noticed. zip() silently drops the element it already pulled from the
    # longer iterables when a later one runs out.
    exhausted = object()
    while True:
        items = [next(it, exhausted) for it in iterators]
        n_exhausted = sum(item is exhausted for item in items)
        if n_exhausted == len(items):
            break
        if n_exhausted:
            raise ValueError('Length of input iterables need to be same.')

        pred_bbox, pred_label, pred_score, gt_bbox, gt_label = items[:5]
        gt_difficult = items[5] if len(items) > 5 else None
        if gt_difficult is None:
            gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool)

        for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
            pred_mask_l = pred_label == l
            pred_bbox_l = pred_bbox[pred_mask_l]
            pred_score_l = pred_score[pred_mask_l]
            # sort by score
            order = pred_score_l.argsort()[::-1]
            pred_bbox_l = pred_bbox_l[order]
            pred_score_l = pred_score_l[order]

            gt_mask_l = gt_label == l
            gt_bbox_l = gt_bbox[gt_mask_l]
            gt_difficult_l = gt_difficult[gt_mask_l]

            n_pos[l] += np.logical_not(gt_difficult_l).sum()
            score[l].extend(pred_score_l)

            if len(pred_bbox_l) == 0:
                continue
            if len(gt_bbox_l) == 0:
                # Nothing to match against: every prediction is a miss.
                match[l].extend((0,) * pred_bbox_l.shape[0])
                continue

            # VOC evaluation follows integer typed bounding boxes.
            pred_bbox_l = pred_bbox_l.copy()
            pred_bbox_l[:, 2:] += 1
            gt_bbox_l = gt_bbox_l.copy()
            gt_bbox_l[:, 2:] += 1

            iou = bbox_iou(pred_bbox_l, gt_bbox_l)
            gt_index = iou.argmax(axis=1)
            # set -1 if there is no matching ground truth
            gt_index[iou.max(axis=1) < iou_thresh] = -1
            del iou

            # selec[k] becomes True once ground truth k has been claimed.
            selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
            for gt_idx in gt_index:
                if gt_idx >= 0:
                    if gt_difficult_l[gt_idx]:
                        match[l].append(-1)
                    else:
                        if not selec[gt_idx]:
                            match[l].append(1)
                        else:
                            match[l].append(0)
                    selec[gt_idx] = True
                else:
                    match[l].append(0)

    if not n_pos:
        # No image or no label at all: there is no class to report on.
        return [], []

    n_fg_class = max(n_pos.keys()) + 1
    prec = [None] * n_fg_class
    rec = [None] * n_fg_class

    for l in n_pos.keys():
        score_l = np.array(score[l])
        match_l = np.array(match[l], dtype=np.int8)

        order = score_l.argsort()[::-1]
        match_l = match_l[order]

        # Entries equal to -1 (difficult) count as neither tp nor fp.
        tp = np.cumsum(match_l == 1)
        fp = np.cumsum(match_l == 0)

        # If an element of fp + tp is 0,
        # the corresponding element of prec[l] is nan.
        prec[l] = tp / (fp + tp)
        # If n_pos[l] is 0, rec[l] is None.
        if n_pos[l] > 0:
            rec[l] = tp / n_pos[l]

    return prec, rec