import pytest
import torch

# Assumed import path for the function under test (e.g. MMClassification
# exposes it from mmcls.core); adjust to the actual module if it differs.
from mmcls.core import average_performance


def test_average_performance():
    target = torch.Tensor([[1, 1, 0, -1],
                           [1, 1, 0, -1],
                           [0, -1, 1, -1],
                           [0, 1, 0, -1],
                           [0, 1, 0, -1]])
    pred = torch.Tensor([[0.9, 0.8, 0.3, 0.2],
                         [0.1, 0.2, 0.2, 0.1],
                         [0.7, 0.5, 0.9, 0.3],
                         [0.8, 0.1, 0.1, 0.2],
                         [0.8, 0.1, 0.1, 0.2]])

    # target and pred should both be np.ndarray or torch.Tensor
    with pytest.raises(TypeError):
        target_list = target.tolist()
        _ = average_performance(pred, target_list)

    # target and pred should be in the same shape
    with pytest.raises(AssertionError):
        target_shorter = target[:-1]
        _ = average_performance(pred, target_shorter)

    # the default threshold is 0.5
    assert average_performance(pred, target) == average_performance(
        pred, target, thr=0.5)
    # when both thr and k are given, the result should match the thr-only call
    assert average_performance(pred, target, thr=0.5, k=2) \
        == average_performance(pred, target, thr=0.5)
    # expected tuples are (CP, CR, CF1, OP, OR, OF1)
    assert average_performance(pred, target, thr=0.3) == pytest.approx(
        (31.25, 43.75, 36.46, 33.33, 42.86, 37.50), rel=1e-2)
    assert average_performance(pred, target, k=2) == pytest.approx(
        (43.75, 50.00, 46.67, 40.00, 57.14, 47.06), rel=1e-2)
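# For reference, the six numbers asserted above are CP, CR, CF1, OP, OR and
# OF1 (class-wise / overall precision, recall and F1, in percent). The sketch
# below recomputes them with plain NumPy. It is NOT the implementation under
# test; it only encodes two assumptions that are consistent with the expected
# tuples: labels marked -1 ("difficult") are treated as negatives, and a score
# equal to `thr` counts as a positive prediction. With the `pred`/`target`
# tensors above it reproduces the thr=0.3 and k=2 tuples up to rounding.
import numpy as np


def _reference_average_performance(pred, target, thr=None, k=None):
    """Illustrative sketch of CP/CR/CF1/OP/OR/OF1, not the real implementation."""
    pred = np.asarray(pred, dtype=np.float64)
    target = np.asarray(target).copy()
    target[target == -1] = 0  # assumption: -1 ("difficult") counts as negative
    eps = np.finfo(np.float32).eps

    if thr is None and k is None:
        thr = 0.5  # assumption: 0.5 is the default threshold
    if thr is not None:
        pos = pred >= thr  # assumption: scores equal to thr are positive
    else:
        # top-k predictions per sample are marked positive
        topk = np.argsort(-pred, axis=1)[:, :k]
        pos = np.zeros_like(pred, dtype=bool)
        np.put_along_axis(pos, topk, True, axis=1)

    tp = pos & (target == 1)
    fp = pos & (target == 0)
    fn = ~pos & (target == 1)

    # class-wise: average precision/recall over classes, then combine into F1
    prec_c = tp.sum(0) / np.maximum(tp.sum(0) + fp.sum(0), eps)
    rec_c = tp.sum(0) / np.maximum(tp.sum(0) + fn.sum(0), eps)
    CP, CR = 100 * prec_c.mean(), 100 * rec_c.mean()
    CF1 = 2 * CP * CR / np.maximum(CP + CR, eps)

    # overall: pool true/false positives over all samples and classes
    OP = 100 * tp.sum() / np.maximum(pos.sum(), eps)
    OR = 100 * tp.sum() / np.maximum(tp.sum() + fn.sum(), eps)
    OF1 = 2 * OP * OR / np.maximum(OP + OR, eps)
    return CP, CR, CF1, OP, OR, OF1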
def evaluate(self,
             results,
             metric='mAP',
             metric_options=None,
             logger=None,
             **deprecated_kwargs):
    """Evaluate the dataset.

    Args:
        results (list): Testing results of the dataset.
        metric (str | list[str]): Metrics to be evaluated.
            Default value is 'mAP'. Options are 'mAP', 'CP', 'CR', 'CF1',
            'OP', 'OR' and 'OF1'.
        metric_options (dict, optional): Options for calculating metrics.
            Allowed keys are 'k' and 'thr'. Defaults to None.
        logger (logging.Logger | str, optional): Logger used for printing
            related information during evaluation. Defaults to None.
        deprecated_kwargs (dict): Used for containing deprecated arguments.

    Returns:
        dict: Evaluation results.
    """
    if metric_options is None:
        metric_options = {'thr': 0.5}

    if deprecated_kwargs != {}:
        warnings.warn('Option arguments for metrics have been changed to '
                      '`metric_options`.')
        metric_options = {**deprecated_kwargs}

    if isinstance(metric, str):
        metrics = [metric]
    else:
        metrics = metric
    allowed_metrics = ['mAP', 'CP', 'CR', 'CF1', 'OP', 'OR', 'OF1']
    eval_results = {}
    results = np.vstack(results)
    gt_labels = self.get_gt_labels()
    num_imgs = len(results)
    assert len(gt_labels) == num_imgs, \
        'dataset testing results should be of the same length as gt_labels.'

    invalid_metrics = set(metrics) - set(allowed_metrics)
    if len(invalid_metrics) != 0:
        raise ValueError(f'metric {invalid_metrics} is not supported.')

    if 'mAP' in metrics:
        mAP_value = mAP(results, gt_labels)
        eval_results['mAP'] = mAP_value
    if len(set(metrics) - {'mAP'}) != 0:
        performance_keys = ['CP', 'CR', 'CF1', 'OP', 'OR', 'OF1']
        performance_values = average_performance(results, gt_labels,
                                                 **metric_options)
        for k, v in zip(performance_keys, performance_values):
            if k in metrics:
                eval_results[k] = v

    return eval_results
def evaluate(self, results, metric='mAP', logger=None, **eval_kwargs):
    """Evaluate the dataset.

    Args:
        results (list): Testing results of the dataset.
        metric (str | list[str]): Metrics to be evaluated.
            Default value is 'mAP'. Options are 'mAP', 'CP', 'CR', 'CF1',
            'OP', 'OR' and 'OF1'.
        logger (logging.Logger | None | str): Logger used for printing
            related information during evaluation. Default: None.

    Returns:
        dict: Evaluation results.
    """
    if isinstance(metric, str):
        metrics = [metric]
    else:
        # copy so that removing 'mAP' below does not mutate the caller's list
        metrics = list(metric)
    allowed_metrics = ['mAP', 'CP', 'CR', 'CF1', 'OP', 'OR', 'OF1']
    eval_results = {}
    results = np.vstack(results)
    gt_labels = self.get_gt_labels()
    num_imgs = len(results)
    assert len(gt_labels) == num_imgs, \
        'dataset testing results should be of the same length as gt_labels.'

    invalid_metrics = set(metrics) - set(allowed_metrics)
    if len(invalid_metrics) != 0:
        raise KeyError(f'metric {invalid_metrics} is not supported.')

    if 'mAP' in metrics:
        mAP_value = mAP(results, gt_labels)
        eval_results['mAP'] = mAP_value
        metrics.remove('mAP')
    if len(metrics) != 0:
        performance_keys = ['CP', 'CR', 'CF1', 'OP', 'OR', 'OF1']
        performance_values = average_performance(results, gt_labels,
                                                 **eval_kwargs)
        for k, v in zip(performance_keys, performance_values):
            if k in metrics:
                eval_results[k] = v

    return eval_results
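# For context, a call to either `evaluate` above might look like the sketch
# below. `dataset` is a placeholder for a multi-label dataset object that
# exposes the method (and get_gt_labels() internally); the shapes are made up
# for illustration, and only the argument names come from the signatures above.
import numpy as np

# one row of per-class scores per image
results = [np.random.rand(1, 4) for _ in range(len(dataset))]

# New-style call: metric options grouped into a single dict.
scores = dataset.evaluate(results, metric=['mAP', 'CP', 'OF1'],
                          metric_options={'thr': 0.3})

# Old-style call (the **eval_kwargs variant): options passed as plain kwargs.
scores = dataset.evaluate(results, metric=['CP', 'OF1'], thr=0.3)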