def _get_iou(self, bbox1_val, bbox2_val):
    """Compute the pairwise IoU of two sets of boxes with both backends.

    The TensorFlow implementation (``bbox_overlap_tf``) is evaluated in a
    test session and the NumPy implementation (``bbox_overlap``) is run on
    the same inputs. The two results are asserted to be close before the
    TensorFlow one is returned.

    Args:
        bbox1_val: Array-like of shape (total_bbox1, 4).
        bbox2_val: Array-like of shape (total_bbox2, 4).

    Returns:
        iou: Array of shape (total_bbox1, total_bbox2), as produced by the
            TensorFlow implementation.
    """
    # Convert once; both implementations receive the very same arrays.
    boxes_a = np.array(bbox1_val)
    boxes_b = np.array(bbox2_val)

    # Graph inputs: any number of boxes, four coordinates each.
    bbox1 = tf.placeholder(tf.float32, (None, 4))
    bbox2 = tf.placeholder(tf.float32, (None, 4))
    iou = bbox_overlap_tf(bbox1, bbox2)

    with self.test_session() as sess:
        iou_val_tf = sess.run(iou, feed_dict={bbox1: boxes_a, bbox2: boxes_b})

    # Cross-check against the NumPy reference implementation.
    iou_val_np = bbox_overlap(boxes_a, boxes_b)
    self.assertAllClose(iou_val_np, iou_val_tf)

    return iou_val_tf
def get_valid_match_iou(i, j, gt_boxes, predicted_boxes, iou_threshold):
    """Check whether one groundtruth box and one predicted box match.

    Computes the IoU between ``gt_boxes[i]`` and ``predicted_boxes[j]`` with
    ``bbox_overlap`` and reports the pair only when that IoU reaches
    ``iou_threshold``.

    Args:
        i: index in groundtruth boxes list to compare
        j: index in predicted boxes list to compare with
        gt_boxes: list of 4 arrayed groundtruth bounding boxes of format
            [[xmin, ymin, xmax, ymax], [xmin1, ymin1, xmax1, ymax1], ..]
        predicted_boxes: list of 4 arrayed predicted bounding boxes of format
            [[xmin, ymin, xmax, ymax], [xmin1, ymin1, xmax1, ymax1], ..]
        iou_threshold: float, IOU threshold below which the match of the
            predicted bounding box with the ground truth box is invalid

    Returns:
        list: [index_gt, index_predicted, iou] when the IoU is greater than
        or equal to ``iou_threshold``, otherwise None
    """
    # `bbox_overlap` works on sets of boxes, so wrap each box as a (1, 4) set.
    gt_box = np.array(gt_boxes[i]).reshape(1, 4)
    predicted_box = np.array(predicted_boxes[j]).reshape(1, 4)

    # The result is a 1x1 matrix; take its only entry.
    iou = bbox_overlap(gt_box, predicted_box)[0][0]

    return [i, j, iou] if iou >= iou_threshold else None
def calculate_metrics(output_per_batch, num_classes):
    """Calculates per-class AP and AR from the detector's output.

    The metrics are computed independently for each of the 10 IoU thresholds
    ``[0.50, 0.55, ..., 0.95]``.

    The procedure for calculating the average precision for class ``C`` is as
    follows (see `VOC mAP metric`_ for more details):

    Start by ranking all the predictions (for a given image and said class) in
    order of confidence. Each of these predictions is marked as correct (true
    positive, when its IoU with its best-overlapping ground truth is greater
    or equal to the IoU threshold) or incorrect (false positive, in the other
    case). This matching is performed greedily over the confidence scores, so
    a higher-confidence prediction will be matched over another
    lower-confidence one even if the latter has better IoU. Also, each ground
    truth is matched at most once, so repeated detections are counted as
    false positives.

    We then integrate over the interpolated PR curve, thus obtaining the value
    for the class' average precision. This interpolation makes sure the
    precision curve is monotonically decreasing: the precision at each rank is
    replaced by the maximum precision found at that rank or any later one. The
    integration is performed over 101 fixed points over the curve
    (``[0.0, 0.01, ..., 1.0]``).

    The average recall of a class is the recall reached once all of its
    detections have been taken into account.

    Classes without any ground-truth box, classes without any detection and
    an empty ``output_per_batch`` all yield an AP and an AR of ``0``.

    Args:
        output_per_batch (dict): Output of the detector to calculate the
            metrics. Expects the following keys: ``bboxes``, ``classes``,
            ``scores``, ``gt_bboxes``, ``gt_classes``. Under each key, there
            should be a list of the results per batch as returned by the
            detector.
        num_classes (int): Number of classes on the dataset.

    Returns:
        (``np.ndarray``, ``ndarray``) tuple. Both arrays have shape
        (`num_classes`, 10): the first one holds the AP value per class and
        IoU threshold, while the second one holds the AR.

    .. _VOC mAP metric:
        http://host.robots.ox.ac.uk/pascal/VOC/pubs/everingham10.pdf
    """
    # `np.linspace` requires an integer number of samples; `np.round` returns
    # a float, so cast explicitly.
    iou_thresholds = np.linspace(
        0.50, 0.95, int(np.round((0.95 - 0.50) / 0.05)) + 1)
    # 101 recall levels, same as COCO evaluation.
    rec_thresholds = np.linspace(
        0.00, 1.00, int(np.round((1.00 - 0.00) / 0.01)) + 1)
    num_thresholds = len(iou_thresholds)

    # List; first by class, then by example. Each entry is a tuple of ndarrays
    # of size (D_{c,i}, num_thresholds) for tp/fp labels and (D_{c,i},) for
    # score, where D_{c,i} is the number of detected boxes for class `c` on
    # image `i`. Both arrays are ordered by decreasing score.
    tp_fp_labels_by_class = [[] for _ in range(num_classes)]
    num_examples_per_class = [0] * num_classes

    # For each image, order predictions by score and classify each as a true
    # positive or a false positive.
    num_batches = len(output_per_batch["bboxes"])
    for idx in range(num_batches):
        # Get the results of the batch.
        classes = output_per_batch["classes"][idx]  # (D_{c,i},)
        bboxes = output_per_batch["bboxes"][idx]  # (D_{c,i}, 4)
        scores = output_per_batch["scores"][idx]  # (D_{c,i},)

        gt_classes = output_per_batch["gt_classes"][idx]
        gt_bboxes = output_per_batch["gt_bboxes"][idx]

        # Analysis must be made per-class.
        for cls in range(num_classes):
            # Get the bounding boxes of `cls` only.
            cls_mask = classes == cls
            cls_bboxes = bboxes[cls_mask, :]
            cls_scores = scores[cls_mask]
            cls_gt_bboxes = gt_bboxes[gt_classes == cls, :]

            num_gt = cls_gt_bboxes.shape[0]
            num_examples_per_class[cls] += num_gt

            # Sort by score descending, so we prioritize higher-confidence
            # results when matching.
            sorted_indices = np.argsort(-cls_scores)

            # TP/FP labels for detected bboxes of (class, image), indexed like
            # `cls_bboxes`. If there are no ground truth examples for the
            # class, all predictions stay false positives.
            tp_fp_labels = np.zeros((len(sorted_indices), num_thresholds))

            if num_gt > 0:
                # Whether the ground-truth has been previously detected, per
                # IoU threshold.
                is_detected = np.zeros((num_gt, num_thresholds), dtype=bool)

                # Get the IoUs for the class' bboxes.
                ious = bbox_overlap(cls_bboxes, cls_gt_bboxes)

                # Greedily assign bboxes to ground truths (highest score
                # first), handling every IoU threshold at once.
                for bbox_idx in sorted_indices:
                    gt_match = np.argmax(ious[bbox_idx, :])
                    over_threshold = ious[bbox_idx, gt_match] >= iou_thresholds
                    # Over IoU threshold and first detection: true positive.
                    is_tp = over_threshold & ~is_detected[gt_match]
                    tp_fp_labels[bbox_idx] = is_tp
                    is_detected[gt_match] |= is_tp

            # Reorder the labels like the scores, so that every label stays
            # paired with the score of its own detection.
            tp_fp_labels_by_class[cls].append(
                (tp_fp_labels[sorted_indices], cls_scores[sorted_indices]))

    # Calculate average precision and recall per class.
    ap_per_class = np.zeros((num_classes, num_thresholds))
    ar_per_class = np.zeros((num_classes, num_thresholds))
    for cls in range(num_classes):
        per_image = tp_fp_labels_by_class[cls]
        num_examples = num_examples_per_class[cls]
        if not per_image or num_examples == 0:
            # No batches at all, or recall is undefined for the class: leave
            # the metrics at zero.
            continue

        # Flatten the tp/fp labels into a single ndarray.
        labels = np.concatenate([image_labels for image_labels, _ in per_image])
        scores = np.concatenate([image_scores for _, image_scores in per_image])
        if labels.shape[0] == 0:
            # No detections for the class.
            continue

        # Sort the tp/fp labels by decreasing confidence score and calculate
        # precision and recall at every position of this ranked output.
        ranking = np.argsort(-scores)
        true_positives = labels[ranking, :]
        false_positives = 1 - true_positives

        sum_true_positives = np.cumsum(true_positives, axis=0)
        sum_false_positives = np.cumsum(false_positives, axis=0)

        recall = sum_true_positives / num_examples
        precision = sum_true_positives / (
            sum_true_positives + sum_false_positives)

        # Interpolate the precision, making it monotonically decreasing along
        # the ranking: running maximum taken from the last rank backwards.
        precision = np.maximum.accumulate(precision[::-1], axis=0)[::-1]

        # Find AP by integrating over PR curve, with interpolated precision.
        for iou_idx in range(num_thresholds):
            p = precision[:, iou_idx]
            r = recall[:, iou_idx]

            # First rank at which each recall level is reached. Levels that
            # are never reached (index out of bounds) contribute nothing.
            inds = np.searchsorted(r, rec_thresholds)
            reached = inds[inds < len(r)]

            ap_per_class[cls, iou_idx] = p[reached].sum() / len(rec_thresholds)
            ar_per_class[cls, iou_idx] = r[-1]

    return ap_per_class, ar_per_class
def calculate_map(output_per_batch, num_classes, iou_threshold=0.5):
    """Calculates mAP@iou_threshold from the detector's output.

    The procedure for calculating the average precision for class ``C`` is as
    follows (see `VOC mAP metric`_ for more details):

    Start by ranking all the predictions (for a given image and said class) in
    order of confidence. Each of these predictions is marked as correct (true
    positive, when its IoU with its best-overlapping ground truth is greater
    or equal to `iou_threshold`) or incorrect (false positive, in the other
    case). This matching is performed greedily over the confidence scores, so
    a higher-confidence prediction will be matched over another
    lower-confidence one even if the latter has better IoU. Also, each ground
    truth is matched at most once, so repeated detections are counted as
    false positives.

    We then integrate over the interpolated PR curve, thus obtaining the value
    for the class' average precision. This interpolation makes sure the
    precision curve is monotonically decreasing; for this, at each recall
    point ``r``, the precision is the maximum precision value among all
    recalls higher than ``r``. The integration is performed over 11 fixed
    points over the curve (``[0.0, 0.1, ..., 1.0]``).

    Average the result among all the classes to obtain the final, ``mAP``,
    value.

    Classes without any ground-truth box, classes without any detection and
    an empty ``output_per_batch`` all get an AP of ``0``.

    Args:
        output_per_batch (dict): Output of the detector to calculate mAP.
            Expects the following keys: ``bboxes``, ``classes``, ``scores``,
            ``gt_bboxes``, ``gt_classes``. Under each key, there should be a
            list of the results per batch as returned by the detector.
        num_classes (int): Number of classes on the dataset.
        iou_threshold (float): IoU threshold for considering a match.

    Returns:
        (``np.float``, ``ndarray``) tuple. The first value is the mAP, while
        the second is an array of size (`num_classes`,), with the AP value per
        class.

    Note:
        The "difficult example" flag of VOC dataset is being ignored.

    Todo:
        * Use VOC2012-style for integrating the curve. That is, use all recall
          points instead of a fixed number of points like in VOC2007.

    .. _VOC mAP metric:
        http://host.robots.ox.ac.uk/pascal/VOC/pubs/everingham10.pdf
    """
    # List; first by class, then by example. Each entry is a tuple of ndarrays
    # of size (D_{c,i},), for tp/fp labels and for score, where D_{c,i} is the
    # number of detected boxes for class `c` on image `i`. Both arrays are
    # ordered by decreasing score.
    tp_fp_labels_by_class = [[] for _ in range(num_classes)]
    num_examples_per_class = [0] * num_classes

    # For each image, order predictions by score and classify each as a true
    # positive or a false positive.
    num_batches = len(output_per_batch['bboxes'])
    for idx in range(num_batches):
        # Get the results of the batch.
        classes = output_per_batch['classes'][idx]  # (D_{c,i},)
        bboxes = output_per_batch['bboxes'][idx]  # (D_{c,i}, 4)
        scores = output_per_batch['scores'][idx]  # (D_{c,i},)

        gt_classes = output_per_batch['gt_classes'][idx]
        gt_bboxes = output_per_batch['gt_bboxes'][idx]

        # Analysis must be made per-class.
        for cls in range(num_classes):
            # Get the bounding boxes of `cls` only.
            cls_mask = classes == cls
            cls_bboxes = bboxes[cls_mask, :]
            cls_scores = scores[cls_mask]
            cls_gt_bboxes = gt_bboxes[gt_classes == cls, :]

            num_gt = cls_gt_bboxes.shape[0]
            num_examples_per_class[cls] += num_gt

            # Sort by score descending, so we prioritize higher-confidence
            # results when matching.
            sorted_indices = np.argsort(-cls_scores)

            # TP/FP labels for detected bboxes of (class, image), indexed like
            # `cls_bboxes`. If there are no ground truth examples for the
            # class, all predictions stay false positives.
            tp_fp_labels = np.zeros(len(sorted_indices))

            if num_gt > 0:
                # Whether the ground-truth has been previously detected.
                is_detected = np.zeros(num_gt, dtype=bool)

                # Get the IoUs for the class' bboxes.
                ious = bbox_overlap(cls_bboxes, cls_gt_bboxes)

                # Greedily assign bboxes to ground truths (highest score
                # first).
                for bbox_idx in sorted_indices:
                    gt_match = np.argmax(ious[bbox_idx, :])
                    if (ious[bbox_idx, gt_match] >= iou_threshold
                            and not is_detected[gt_match]):
                        # Over IoU threshold and first detection: it's a true
                        # positive.
                        tp_fp_labels[bbox_idx] = 1
                        is_detected[gt_match] = True

            # Reorder the labels like the scores, so that every label stays
            # paired with the score of its own detection.
            tp_fp_labels_by_class[cls].append(
                (tp_fp_labels[sorted_indices], cls_scores[sorted_indices])
            )

    # Calculate average precision per class.
    ap_per_class = np.zeros(num_classes)
    for cls in range(num_classes):
        per_image = tp_fp_labels_by_class[cls]
        num_examples = num_examples_per_class[cls]
        if not per_image or num_examples == 0:
            # No batches at all, or recall is undefined for the class: leave
            # the AP at zero.
            continue

        # Flatten the tp/fp labels into a single ndarray.
        labels = np.concatenate([image_labels for image_labels, _ in per_image])
        scores = np.concatenate([image_scores for _, image_scores in per_image])
        if labels.size == 0:
            # No detections for the class.
            continue

        # Sort the tp/fp labels by decreasing confidence score and calculate
        # precision and recall at every position of this ranked output.
        ranking = np.argsort(-scores)
        true_positives = labels[ranking]
        false_positives = 1 - true_positives

        cum_true_positives = np.cumsum(true_positives)
        cum_false_positives = np.cumsum(false_positives)

        recall = cum_true_positives / num_examples
        precision = cum_true_positives / (
            cum_true_positives + cum_false_positives
        )

        # Find AP by integrating over PR curve, with interpolated precision.
        ap = 0
        for t in np.linspace(0, 1, 11):
            reached = recall >= t
            if not np.any(reached):
                # Recall is never higher than `t`, continue.
                continue
            ap += np.max(precision[reached]) / 11  # Interpolated.

        ap_per_class[cls] = ap

    # Finally, mAP.
    mean_ap = np.mean(ap_per_class)

    return mean_ap, ap_per_class