def evaluate(self):
    """Compute evaluation result.

    Returns:
      A named tuple with the following fields -
        average_precision: float numpy array of average precision for
            each class.
        mean_ap: mean average precision of all classes, float scalar
        precisions: List of precisions, each precision is a float numpy
            array
        recalls: List of recalls, each recall is a float numpy array
        corloc: numpy float array
        mean_corloc: Mean CorLoc score for each class, float scalar
    """
    if (self.num_gt_instances_per_class == 0).any():
      # logging.warn is a deprecated alias; logging.warning is the real API.
      logging.warning(
          'The following classes have no ground truth examples: %s',
          np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) +
          self.label_id_offset)

    if self.use_weighted_mean_ap:
      # Accumulators for a single pooled precision/recall over all classes.
      all_scores = np.array([], dtype=float)
      all_tp_fp_labels = np.array([], dtype=bool)
    for class_index in range(self.num_class):
      # Classes without ground truth contribute nothing (AP stays NaN and is
      # ignored by np.nanmean below).
      if self.num_gt_instances_per_class[class_index] == 0:
        continue
      if not self.scores_per_class[class_index]:
        scores = np.array([], dtype=float)
        # Fixed: was dtype=float, inconsistent with the boolean tp/fp labels
        # produced by the non-empty branch and with all_tp_fp_labels above.
        tp_fp_labels = np.array([], dtype=bool)
      else:
        scores = np.concatenate(self.scores_per_class[class_index])
        tp_fp_labels = np.concatenate(self.tp_fp_labels_per_class[class_index])
      if self.use_weighted_mean_ap:
        all_scores = np.append(all_scores, scores)
        all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels)
      logging.info('Scores and tpfp per class label: %d', class_index)
      logging.info(tp_fp_labels)
      logging.info(scores)
      precision, recall = metrics.compute_precision_recall(
          scores, tp_fp_labels, self.num_gt_instances_per_class[class_index])
      self.precisions_per_class.append(precision)
      self.recalls_per_class.append(recall)
      average_precision = metrics.compute_average_precision(precision, recall)
      self.average_precision_per_class[class_index] = average_precision

    self.corloc_per_class = metrics.compute_cor_loc(
        self.num_gt_imgs_per_class,
        self.num_images_correctly_detected_per_class)

    if self.use_weighted_mean_ap:
      # Weighted mAP: one AP over the pooled detections of every class.
      num_gt_instances = np.sum(self.num_gt_instances_per_class)
      precision, recall = metrics.compute_precision_recall(
          all_scores, all_tp_fp_labels, num_gt_instances)
      mean_ap = metrics.compute_average_precision(precision, recall)
    else:
      # Unweighted mAP: mean of per-class APs, skipping NaN (no-GT classes).
      mean_ap = np.nanmean(self.average_precision_per_class)
    mean_corloc = np.nanmean(self.corloc_per_class)
    return ObjectDetectionEvalMetrics(
        self.average_precision_per_class, mean_ap, self.precisions_per_class,
        self.recalls_per_class, self.corloc_per_class, mean_corloc)
Example #2
0
    def evaluate(self):
        """Computes evaluation result.

    Returns:
      A named tuple with the following fields -
        average_precision: a float number corresponding to average precision.
        precisions: an array of precisions.
        recalls: an array of recalls.
        recall@50: recall computed on 50 top-scoring samples.
        recall@100: recall computed on 100 top-scoring samples.
        median_rank@50: median rank computed on 50 top-scoring samples.
        median_rank@100: median rank computed on 100 top-scoring samples.
    """
        if self._num_gt_instances == 0:
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning('No ground truth instances')

        if not self._scores:
            scores = np.array([], dtype=float)
            tp_fp_labels = np.array([], dtype=bool)
            # Fixed: relation_field_values was only bound in the else branch,
            # so an empty evaluation raised NameError in the loop below.
            relation_field_values = np.array([], dtype=float)
        else:
            scores = np.concatenate(self._scores)
            tp_fp_labels = np.concatenate(self._tp_fp_labels)
            relation_field_values = np.concatenate(self._relation_field_values)

        # Per-relationship AP: mask the pooled detections down to one
        # relation value at a time. (.iteritems() was Python-2-only.)
        for relation_field_value, _ in (
                self._num_gt_instances_per_relationship.items()):
            precisions, recalls = metrics.compute_precision_recall(
                scores[relation_field_values == relation_field_value],
                tp_fp_labels[relation_field_values == relation_field_value],
                self._num_gt_instances_per_relationship[relation_field_value])
            self._average_precisions[
                relation_field_value] = metrics.compute_average_precision(
                    precisions, recalls)

        # list(...) is required on Python 3: np.mean over a dict_values view
        # would otherwise see a 0-d object array.
        self._mean_average_precision = np.mean(
            list(self._average_precisions.values()))

        self._precisions, self._recalls = metrics.compute_precision_recall(
            scores, tp_fp_labels, self._num_gt_instances)
        self._weighted_average_precision = metrics.compute_average_precision(
            self._precisions, self._recalls)

        self._recall_50 = (metrics.compute_recall_at_k(self._tp_fp_labels,
                                                       self._num_gt_instances,
                                                       50))
        self._median_rank_50 = (metrics.compute_median_rank_at_k(
            self._tp_fp_labels, 50))
        self._recall_100 = (metrics.compute_recall_at_k(
            self._tp_fp_labels, self._num_gt_instances, 100))
        self._median_rank_100 = (metrics.compute_median_rank_at_k(
            self._tp_fp_labels, 100))

        return VRDDetectionEvalMetrics(
            self._weighted_average_precision, self._mean_average_precision,
            self._average_precisions, self._precisions, self._recalls,
            self._recall_50, self._recall_100, self._median_rank_50,
            self._median_rank_100)
Example #3
0
    def test_compute_precision_recall(self):
        """Precision/recall must match for bool and float 0/1 labels."""
        num_gt = 10
        scores = np.array([0.4, 0.3, 0.6, 0.2, 0.7, 0.1], dtype=float)
        bool_labels = np.array([0, 1, 1, 0, 0, 1], dtype=bool)
        float_labels = bool_labels.astype(float)
        # Cumulative true-positive count in descending-score order.
        cumulative_tp = np.array([0, 1, 1, 2, 2, 3], dtype=float)
        expected_precision = cumulative_tp / np.arange(1, 7)
        expected_recall = cumulative_tp / num_gt

        for labels in (bool_labels, float_labels):
            precision, recall = metrics.compute_precision_recall(
                scores, labels, num_gt)
            self.assertAllClose(precision, expected_precision)
            self.assertAllClose(recall, expected_recall)
Example #4
0
 def test_compute_precision_recall_and_ap_no_groundtruth(self):
     """With zero ground truth, precision/recall are None and AP is NaN."""
     num_gt = 0
     scores = np.array([0.4, 0.3, 0.6, 0.2, 0.7, 0.1], dtype=float)
     labels = np.array([0, 0, 0, 0, 0, 0], dtype=bool)
     expected_precision = None
     expected_recall = None
     precision, recall = metrics.compute_precision_recall(
         scores, labels, num_gt)
     # assertEquals is a deprecated alias (removed in Python 3.12);
     # assertEqual is the canonical method.
     self.assertEqual(precision, expected_precision)
     self.assertEqual(recall, expected_recall)
     ap = metrics.compute_average_precision(precision, recall)
     self.assertTrue(np.isnan(ap))
Example #5
0
 def test_compute_precision_recall_float(self):
     """Fractional (soft) labels contribute partial true positives."""
     num_gt = 10
     scores = np.array([0.4, 0.3, 0.6, 0.2, 0.7, 0.1], dtype=float)
     soft_labels = np.array([0, 1, 1, 0.5, 0, 1], dtype=float)
     want_precision = np.array(
         [0., 0.5, 0.33333333, 0.5, 0.55555556, 0.63636364], dtype=float)
     want_recall = np.array(
         [0., 0.1, 0.1, 0.2, 0.25, 0.35], dtype=float)

     got_precision, got_recall = metrics.compute_precision_recall(
         scores, soft_labels, num_gt)

     self.assertAllClose(got_precision, want_precision)
     self.assertAllClose(got_recall, want_recall)