Example #1
    def evaluate(self,
                 results,
                 metric='accuracy',
                 metric_options={'topk': (1, 5)},
                 logger=None):
        """Evaluate the dataset.

        Args:
            results (list): Testing results of the dataset.
            metric (str | list[str]): Metrics to be evaluated. Only a
                single metric is supported ('accuracy'); a one-element
                list is also accepted. Default value is `accuracy`.
            metric_options (dict): Options for calculating metrics. The
                allowed key is 'topk'. Default: `{'topk': (1, 5)}`.
            logger (logging.Logger | None | str): Logger used for printing
                related information during evaluation. Default: None.
        Returns:
            dict: evaluation results
        """
        if not isinstance(metric, str):
            assert len(metric) == 1
            metric = metric[0]
        allowed_metrics = ['accuracy']
        if metric not in allowed_metrics:
            raise KeyError(f'metric {metric} is not supported')

        eval_results = {}
        if metric == 'accuracy':
            topk = metric_options.get('topk')
            results = np.vstack(results)
            gt_labels = self.get_gt_labels()
            num_imgs = len(results)
            assert len(gt_labels) == num_imgs
            acc = accuracy(results, gt_labels, topk)
            eval_results = {f'top-{k}': a.item() for k, a in zip(topk, acc)}
        return eval_results
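
The `accuracy(results, gt_labels, topk)` helper is imported from the surrounding codebase and is not shown in this example. As a rough illustration of what the accuracy branch computes, here is a minimal plain-NumPy sketch of top-k accuracy; the helper name `topk_accuracy`, the sample scores, and the percentage scaling are illustrative assumptions, not the library's `accuracy` implementation.

import numpy as np

def topk_accuracy(scores, gt_labels, topk=(1, 5)):
    # Sketch only: scores has shape (num_samples, num_classes),
    # gt_labels has shape (num_samples,) with integer class indices.
    max_k = max(topk)
    # Indices of the max_k highest-scoring classes per sample, best first.
    top_inds = np.argsort(scores, axis=1)[:, ::-1][:, :max_k]
    accs = []
    for k in topk:
        correct = (top_inds[:, :k] == gt_labels[:, None]).any(axis=1)
        accs.append(float(100.0 * correct.mean()))
    return accs

scores = np.array([[0.1, 0.7, 0.2],
                   [0.5, 0.3, 0.2]])
gt_labels = np.array([1, 2])
topk = (1, 2)
print({f'top-{k}': a for k, a in zip(topk, topk_accuracy(scores, gt_labels, topk))})
# {'top-1': 50.0, 'top-2': 50.0}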
Example #2
    def evaluate(self,
                 results,
                 metric='accuracy',
                 metric_options={'topk': (1, 5)},
                 logger=None):
        """Evaluate the dataset.

        Args:
            results (list): Testing results of the dataset.
            metric (str | list[str]): Metrics to be evaluated. Allowed
                values are 'accuracy', 'precision', 'recall' and
                'f1_score'. Default value is `accuracy`.
            metric_options (dict): Options for calculating metrics. The
                allowed key is 'topk', used by the 'accuracy' metric.
                Default: `{'topk': (1, 5)}`.
            logger (logging.Logger | None | str): Logger used for printing
                related information during evaluation. Default: None.
        Returns:
            dict: evaluation results
        """
        if isinstance(metric, str):
            metrics = [metric]
        else:
            metrics = metric
        allowed_metrics = ['accuracy', 'precision', 'recall', 'f1_score']
        eval_results = {}
        # Stack per-batch results and fetch labels once; both are loop-invariant.
        results = np.vstack(results)
        gt_labels = self.get_gt_labels()
        num_imgs = len(results)
        assert len(gt_labels) == num_imgs
        for metric in metrics:
            if metric not in allowed_metrics:
                raise KeyError(f'metric {metric} is not supported.')
            if metric == 'accuracy':
                topk = metric_options.get('topk')
                acc = accuracy(results, gt_labels, topk)
                eval_result = {f'top-{k}': a.item() for k, a in zip(topk, acc)}
            elif metric == 'precision':
                precision_value = precision(results, gt_labels)
                eval_result = {'precision': precision_value}
            elif metric == 'recall':
                recall_value = recall(results, gt_labels)
                eval_result = {'recall': recall_value}
            elif metric == 'f1_score':
                f1_score_value = f1_score(results, gt_labels)
                eval_result = {'f1_score': f1_score_value}
            eval_results.update(eval_result)
        return eval_results
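
The `precision`, `recall`, and `f1_score` helpers also come from the surrounding codebase. For orientation only, the following is a small confusion-matrix sketch of macro-averaged precision, recall, and F1 over argmax predictions; the function name, the macro averaging, and the sample data are assumptions and may differ from the library's helpers.

import numpy as np

def macro_precision_recall_f1(scores, gt_labels):
    # Sketch only: per-class precision/recall/F1 from argmax predictions,
    # averaged uniformly over classes ("macro").
    pred = scores.argmax(axis=1)
    ps, rs, fs = [], [], []
    for c in range(scores.shape[1]):
        tp = np.sum((pred == c) & (gt_labels == c))
        fp = np.sum((pred == c) & (gt_labels != c))
        fn = np.sum((pred != c) & (gt_labels == c))
        p = tp / (tp + fp) if tp + fp else 0.0
        r = tp / (tp + fn) if tp + fn else 0.0
        f = 2 * p * r / (p + r) if p + r else 0.0
        ps.append(p)
        rs.append(r)
        fs.append(f)
    return float(np.mean(ps)), float(np.mean(rs)), float(np.mean(fs))

scores = np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])
gt_labels = np.array([0, 1, 1])
print(macro_precision_recall_f1(scores, gt_labels))
# (0.75, 0.75, 0.6666666666666666)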
Example #3
    def evaluate(self,
                 results,
                 metric='accuracy',
                 metric_options={'topk': (1, 5)},
                 logger=None):
        """Evaluate the dataset.

        Args:
            results (list): Testing results of the dataset.
            metric (str | list[str]): Metrics to be evaluated.
                Default value is `accuracy`.
            metric_options (dict): Options for calculating metrics. Allowed
                keys are 'topk', 'thrs' and 'average_mode'.
            logger (logging.Logger | None | str): Logger used for printing
                related information during evaluation. Default: None.
        Returns:
            dict: evaluation results
        """
        if isinstance(metric, str):
            metrics = [metric]
        else:
            metrics = metric
        allowed_metrics = [
            'accuracy', 'precision', 'recall', 'f1_score', 'support'
        ]
        eval_results = {}
        results = np.vstack(results)
        gt_labels = self.get_gt_labels()
        num_imgs = len(results)
        assert len(gt_labels) == num_imgs, 'dataset testing results should '\
            'be of the same length as gt_labels.'

        invalid_metrics = set(metrics) - set(allowed_metrics)
        if len(invalid_metrics) != 0:
            raise ValueError(f'metric {invalid_metrics} is not supported.')

        topk = metric_options.get('topk', (1, 5))
        thrs = metric_options.get('thrs')
        average_mode = metric_options.get('average_mode', 'macro')

        if 'accuracy' in metrics:
            acc = accuracy(results, gt_labels, topk=topk, thrs=thrs)
            if isinstance(topk, tuple):
                eval_results_ = {
                    f'accuracy_top-{k}': a
                    for k, a in zip(topk, acc)
                }
            else:
                eval_results_ = {'accuracy': acc}
            if isinstance(thrs, tuple):
                for key, values in eval_results_.items():
                    eval_results.update({
                        f'{key}_thr_{thr:.2f}': value.item()
                        for thr, value in zip(thrs, values)
                    })
            else:
                eval_results.update(
                    {k: v.item()
                     for k, v in eval_results_.items()})

        if 'support' in metrics:
            support_value = support(results,
                                    gt_labels,
                                    average_mode=average_mode)
            eval_results['support'] = support_value

        precision_recall_f1_keys = ['precision', 'recall', 'f1_score']
        if len(set(metrics) & set(precision_recall_f1_keys)) != 0:
            precision_recall_f1_values = precision_recall_f1(
                results, gt_labels, average_mode=average_mode, thrs=thrs)
            for key, values in zip(precision_recall_f1_keys,
                                   precision_recall_f1_values):
                if key in metrics:
                    if isinstance(thrs, tuple):
                        eval_results.update({
                            f'{key}_thr_{thr:.2f}': value
                            for thr, value in zip(thrs, values)
                        })
                    else:
                        eval_results[key] = values

        return eval_results
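
To make the returned dictionary layout concrete: when both `topk` and `thrs` are tuples, every accuracy entry is keyed by top-k and by threshold. The stand-alone sketch below reproduces only the key construction used in the accuracy branch above; the values are placeholders, not real measurements.

# Key layout produced by the accuracy branch above (placeholder values).
topk, thrs = (1, 5), (0.0, 0.5)
eval_results = {}
for k in topk:
    for thr in thrs:
        eval_results[f'accuracy_top-{k}_thr_{thr:.2f}'] = 0.0  # real code stores acc.item()
print(sorted(eval_results))
# ['accuracy_top-1_thr_0.00', 'accuracy_top-1_thr_0.50',
#  'accuracy_top-5_thr_0.00', 'accuracy_top-5_thr_0.50']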
Example #4
    def evaluate(self,
                 results,
                 metric='accuracy',
                 metric_options={'topk': (1, 5)},
                 logger=None):
        """Evaluate the dataset.

        Args:
            results (list): Testing results of the dataset.
            metric (str | list[str]): Metrics to be evaluated. Allowed
                values are 'accuracy', 'acc_and_auc' and 'all'. Default
                value is `accuracy`.
            metric_options (dict): Options for calculating metrics. The
                allowed key is 'topk'. Default: `{'topk': (1, 5)}`.
            logger (logging.Logger | None | str): Logger used for printing
                related information during evaluation. Default: None.
        Returns:
            dict: evaluation results
        """
        if not isinstance(metric, str):
            assert len(metric) == 1
            metric = metric[0]
        allowed_metrics = ['accuracy', 'acc_and_auc', 'all']
        if metric not in allowed_metrics:
            raise KeyError(f'metric {metric} is not supported')

        eval_results = {}
        if metric == 'accuracy':
            topk = metric_options.get('topk')
            results = np.vstack(results)
            gt_labels = self.get_gt_labels()
            num_imgs = len(results)
            assert len(gt_labels) == num_imgs
            acc = accuracy(results, gt_labels, topk)
            loss = cross_entropy(torch.Tensor(results),
                                 torch.Tensor(gt_labels).long()).mean()
            loss = round(float(loss), 3)
            eval_results = {f'top-{k}': a.item() for k, a in zip(topk, acc)}
            eval_results['loss'] = loss
        elif metric == 'acc_and_auc':
            topk = metric_options.get('topk')
            results = np.vstack(results)
            gt_labels = self.get_gt_labels()
            num_imgs = len(results)
            assert len(gt_labels) == num_imgs
            acc = accuracy(results, gt_labels, topk)
            loss = cross_entropy(torch.Tensor(results),
                                 torch.Tensor(gt_labels).long()).mean()
            loss = round(float(loss), 3)
            auc = roc_auc_score(gt_labels, results[:, 1])
            eval_results = {f'top-{k}': a.item() for k, a in zip(topk, acc)}
            eval_results['loss'] = loss
            eval_results['auc'] = auc
        elif metric == 'all':
            # gt_labels.shape (n,) n is num of samples
            # results.shape(n, 2) 2 is n_classes
            topk = metric_options.get('topk')
            results = np.vstack(results)
            gt_labels = self.get_gt_labels()
            num_imgs = len(results)
            assert len(gt_labels) == num_imgs
            loss = cross_entropy(torch.Tensor(results),
                                 torch.Tensor(gt_labels).long()).mean()
            loss = round(float(loss), 3)
            auc = roc_auc_score(gt_labels, results[:, 1])
            # Superseded by the get_best_metrics call below:
            # acc = accuracy(results, gt_labels, topk)
            # prec, rec, f1, _ = precision_recall_fscore_support(
            #     gt_labels, results[:, 1].round(), average='binary')
            # specificity = get_specificity(
            #     torch.Tensor(results[:, 1]), torch.Tensor(gt_labels))
            prec, rec, f1, specificity, acc = self.get_best_metrics(
                gt_labels, results[:, 1])

            #eval_results = {f'top-{k}': a.item() for k, a in zip(topk, acc)}
            eval_results['loss'] = loss
            eval_results['acc'] = acc
            eval_results['auc'] = auc
            eval_results['f1'] = f1
            eval_results['recall'] = rec
            eval_results['precision'] = prec
            eval_results['specificity'] = specificity
        return eval_results
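
This variant additionally reports a cross-entropy loss and a binary ROC AUC; `get_best_metrics` is defined elsewhere in the original class and is not shown. Assuming `cross_entropy` resolves to `torch.nn.functional.cross_entropy` and `roc_auc_score` to `sklearn.metrics.roc_auc_score`, a minimal stand-alone sketch of the loss/AUC part looks like this (the scores and labels below are made up for illustration).

import numpy as np
import torch
from sklearn.metrics import roc_auc_score
from torch.nn.functional import cross_entropy

# Shapes used by the branches above: results is (n, 2) class scores, gt_labels is (n,).
results = np.array([[2.0, -1.0],
                    [0.5, 1.5],
                    [-0.3, 0.8]])
gt_labels = np.array([0, 1, 1])

loss = cross_entropy(torch.Tensor(results), torch.Tensor(gt_labels).long()).mean()
auc = roc_auc_score(gt_labels, results[:, 1])  # score of the positive class (column 1)
print({'loss': round(float(loss), 3), 'auc': float(auc)})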