def evaluate(self):
    """Computes evaluation result.

    Pools the scores and true/false-positive labels accumulated on this
    evaluator, computes an average precision for every relationship that has
    ground truth, and then computes pooled precision/recall, recall@k and
    median rank@k. Every intermediate result is also stored on `self`.

    Returns:
      A VRDDetectionEvalMetrics built positionally from, in this order:
        weighted average precision: average precision computed over all
          pooled detections.
        mean average precision: mean of the per-relationship average
          precisions.
        average precisions: mapping from relationship value to its average
          precision.
        precisions: an array of precisions.
        recalls: an array of recalls.
        recall@50: recall computed on 50 top-scoring samples.
        recall@100: recall computed on 100 top-scoring samples.
        median_rank@50: median rank computed on 50 top-scoring samples.
        median_rank@100: median rank computed on 100 top-scoring samples.
    """
    if self._num_gt_instances == 0:
        # `warn` is a deprecated alias of `warning`.
        logging.warning('No ground truth instances')

    if not self._scores:
        scores = np.array([], dtype=float)
        tp_fp_labels = np.array([], dtype=bool)
        # Must be bound in this branch too: the per-relationship loop below
        # masks with it even when no detections were accumulated. Object
        # dtype keeps the `==` comparison element-wise for any value type.
        relation_field_values = np.array([], dtype=object)
    else:
        scores = np.concatenate(self._scores)
        tp_fp_labels = np.concatenate(self._tp_fp_labels)
        relation_field_values = np.concatenate(self._relation_field_values)

    # `.items()` exists on both Python 2 and 3, unlike `.iteritems()`.
    for relation_field_value, num_gt in (
            self._num_gt_instances_per_relationship.items()):
        selected = relation_field_values == relation_field_value
        precisions, recalls = metrics.compute_precision_recall(
            scores[selected], tp_fp_labels[selected], num_gt)
        self._average_precisions[relation_field_value] = (
            metrics.compute_average_precision(precisions, recalls))

    # On Python 3 `values()` is a view, which np.mean cannot reduce directly.
    self._mean_average_precision = np.mean(
        list(self._average_precisions.values()))

    self._precisions, self._recalls = metrics.compute_precision_recall(
        scores, tp_fp_labels, self._num_gt_instances)
    self._weighted_average_precision = metrics.compute_average_precision(
        self._precisions, self._recalls)

    # The @k metrics receive the un-concatenated sequence of label arrays.
    self._recall_50 = metrics.compute_recall_at_k(
        self._tp_fp_labels, self._num_gt_instances, 50)
    self._median_rank_50 = metrics.compute_median_rank_at_k(
        self._tp_fp_labels, 50)
    self._recall_100 = metrics.compute_recall_at_k(
        self._tp_fp_labels, self._num_gt_instances, 100)
    self._median_rank_100 = metrics.compute_median_rank_at_k(
        self._tp_fp_labels, 100)

    return VRDDetectionEvalMetrics(
        self._weighted_average_precision, self._mean_average_precision,
        self._average_precisions, self._precisions, self._recalls,
        self._recall_50, self._recall_100, self._median_rank_50,
        self._median_rank_100)
def evaluate(self):
    """Computes evaluation result.

    Concatenates the accumulated scores and true/false-positive labels,
    derives one average precision per relationship with ground truth, and
    then derives the pooled precision/recall curve, recall@k and median
    rank@k. All results are cached on `self` before being returned.

    Returns:
      A VRDDetectionEvalMetrics built positionally from, in this order:
        weighted average precision: average precision computed over all
          pooled detections.
        mean average precision: mean of the per-relationship average
          precisions.
        average precisions: mapping from relationship value to its average
          precision.
        precisions: an array of precisions.
        recalls: an array of recalls.
        recall@50: recall computed on 50 top-scoring samples.
        recall@100: recall computed on 100 top-scoring samples.
        median_rank@50: median rank computed on 50 top-scoring samples.
        median_rank@100: median rank computed on 100 top-scoring samples.
    """
    if self._num_gt_instances == 0:
        # `warn` is a deprecated alias of `warning`.
        logging.warning('No ground truth instances')

    if self._scores:
        scores = np.concatenate(self._scores)
        tp_fp_labels = np.concatenate(self._tp_fp_labels)
        relation_field_values = np.concatenate(self._relation_field_values)
    else:
        scores = np.array([], dtype=float)
        tp_fp_labels = np.array([], dtype=bool)
        # Bind an empty array here as well; otherwise the loop below fails
        # with an unbound local when ground truth exists but no detections
        # were added. Object dtype keeps `==` element-wise for any value.
        relation_field_values = np.array([], dtype=object)

    gt_per_relationship = self._num_gt_instances_per_relationship
    for relation_field_value, num_gt in gt_per_relationship.items():
        mask = relation_field_values == relation_field_value
        precisions, recalls = metrics.compute_precision_recall(
            scores[mask], tp_fp_labels[mask], num_gt)
        self._average_precisions[relation_field_value] = (
            metrics.compute_average_precision(precisions, recalls))

    self._mean_average_precision = np.mean(
        list(self._average_precisions.values()))

    self._precisions, self._recalls = metrics.compute_precision_recall(
        scores, tp_fp_labels, self._num_gt_instances)
    self._weighted_average_precision = metrics.compute_average_precision(
        self._precisions, self._recalls)

    # The @k metrics receive the un-concatenated sequence of label arrays.
    self._recall_50 = metrics.compute_recall_at_k(
        self._tp_fp_labels, self._num_gt_instances, 50)
    self._median_rank_50 = metrics.compute_median_rank_at_k(
        self._tp_fp_labels, 50)
    self._recall_100 = metrics.compute_recall_at_k(
        self._tp_fp_labels, self._num_gt_instances, 100)
    self._median_rank_100 = metrics.compute_median_rank_at_k(
        self._tp_fp_labels, 100)

    return VRDDetectionEvalMetrics(
        self._weighted_average_precision, self._mean_average_precision,
        self._average_precisions, self._precisions, self._recalls,
        self._recall_50, self._recall_100, self._median_rank_50,
        self._median_rank_100)
def test_compute_recall_at_k(self):
    """Checks recall@k for k in {1, 3, 5} on float labels and k=3 on the bool list."""
    num_gt = 4
    float_rows = ([1, 0, 0], [0, 1], [0, 0, 0, 0, 0])
    tp_fp = [np.array(row, dtype=float) for row in float_rows]

    # NOTE(review): only the first array is actually bool; the other two are
    # built with dtype=float. Possibly a copy-paste slip -- confirm before
    # changing, the dtypes are kept exactly as they were.
    tp_fp_bool = [
        np.array([True, False, False], dtype=bool),
        np.array([False, True], dtype=float),
        np.array([False, False, False, False, False], dtype=float),
    ]

    recall_by_k = {
        k: metrics.compute_recall_at_k(tp_fp, num_gt, k) for k in (1, 3, 5)
    }
    recall_3_bool = metrics.compute_recall_at_k(tp_fp_bool, num_gt, 3)

    self.assertAlmostEqual(recall_by_k[1], 0.25)
    self.assertAlmostEqual(recall_by_k[3], 0.5)
    self.assertAlmostEqual(recall_3_bool, 0.5)
    self.assertAlmostEqual(recall_by_k[5], 0.5)