import numpy as np

# `accuracy` is assumed to be the top-k accuracy helper provided by the
# surrounding code base; it returns one score per requested k.
def evaluate(self,
             results,
             metric='accuracy',
             metric_options={'topk': (1, 5)},
             logger=None):
    """Evaluate the dataset.

    Args:
        results (list): Testing results of the dataset.
        metric (str | list[str]): Metrics to be evaluated.
            Default value is `accuracy`.
        metric_options (dict): Options for calculating metrics.
            Allowed key is 'topk'. Default: ``{'topk': (1, 5)}``.
        logger (logging.Logger | None | str): Logger used for printing
            related information during evaluation. Default: None.

    Returns:
        dict: evaluation results
    """
    if not isinstance(metric, str):
        # Only a single metric name is supported by this variant.
        assert len(metric) == 1
        metric = metric[0]
    allowed_metrics = ['accuracy']
    if metric not in allowed_metrics:
        raise KeyError(f'metric {metric} is not supported')
    eval_results = {}
    if metric == 'accuracy':
        topk = metric_options.get('topk')
        # Stack per-batch predictions into one (num_imgs, num_classes) array.
        results = np.vstack(results)
        gt_labels = self.get_gt_labels()
        num_imgs = len(results)
        assert len(gt_labels) == num_imgs
        acc = accuracy(results, gt_labels, topk)
        eval_results = {f'top-{k}': a.item() for k, a in zip(topk, acc)}
    return eval_results
import numpy as np

# `accuracy`, `precision`, `recall` and `f1_score` are assumed to be the metric
# helpers provided by the surrounding code base (e.g. its evaluation module).
def evaluate(self,
             results,
             metric='accuracy',
             metric_options={'topk': (1, 5)},
             logger=None):
    """Evaluate the dataset.

    Args:
        results (list): Testing results of the dataset.
        metric (str | list[str]): Metrics to be evaluated.
            Default value is `accuracy`.
        metric_options (dict): Options for calculating metrics.
            Allowed key is 'topk'. Default: ``{'topk': (1, 5)}``.
        logger (logging.Logger | None | str): Logger used for printing
            related information during evaluation. Default: None.

    Returns:
        dict: evaluation results
    """
    if isinstance(metric, str):
        metrics = [metric]
    else:
        metrics = metric
    allowed_metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    eval_results = {}
    # Stack per-batch predictions once, outside the metric loop (the original
    # re-stacked on every iteration, which is a no-op after the first pass).
    results = np.vstack(results)
    gt_labels = self.get_gt_labels()
    num_imgs = len(results)
    assert len(gt_labels) == num_imgs
    for metric in metrics:
        if metric not in allowed_metrics:
            raise KeyError(f'metric {metric} is not supported.')
        if metric == 'accuracy':
            topk = metric_options.get('topk')
            acc = accuracy(results, gt_labels, topk)
            eval_result = {f'top-{k}': a.item() for k, a in zip(topk, acc)}
        elif metric == 'precision':
            precision_value = precision(results, gt_labels)
            eval_result = {'precision': precision_value}
        elif metric == 'recall':
            recall_value = recall(results, gt_labels)
            eval_result = {'recall': recall_value}
        elif metric == 'f1_score':
            f1_score_value = f1_score(results, gt_labels)
            eval_result = {'f1_score': f1_score_value}
        eval_results.update(eval_result)
    return eval_results
import numpy as np

# `accuracy`, `support` and `precision_recall_f1` are assumed to be the metric
# helpers provided by the surrounding code base (e.g. its evaluation module).
def evaluate(self,
             results,
             metric='accuracy',
             metric_options={'topk': (1, 5)},
             logger=None):
    """Evaluate the dataset.

    Args:
        results (list): Testing results of the dataset.
        metric (str | list[str]): Metrics to be evaluated.
            Default value is `accuracy`.
        metric_options (dict): Options for calculating metrics.
            Allowed keys are 'topk', 'thrs' and 'average_mode'.
        logger (logging.Logger | None | str): Logger used for printing
            related information during evaluation. Default: None.

    Returns:
        dict: evaluation results
    """
    if isinstance(metric, str):
        metrics = [metric]
    else:
        metrics = metric
    allowed_metrics = [
        'accuracy', 'precision', 'recall', 'f1_score', 'support'
    ]
    eval_results = {}
    results = np.vstack(results)
    gt_labels = self.get_gt_labels()
    num_imgs = len(results)
    assert len(gt_labels) == num_imgs, \
        'dataset testing results should be of the same length as gt_labels.'

    invalid_metrics = set(metrics) - set(allowed_metrics)
    if len(invalid_metrics) != 0:
        raise ValueError(f'metric {invalid_metrics} is not supported.')

    topk = metric_options.get('topk', (1, 5))
    thrs = metric_options.get('thrs')
    average_mode = metric_options.get('average_mode', 'macro')

    if 'accuracy' in metrics:
        acc = accuracy(results, gt_labels, topk=topk, thrs=thrs)
        if isinstance(topk, tuple):
            eval_results_ = {
                f'accuracy_top-{k}': a
                for k, a in zip(topk, acc)
            }
        else:
            eval_results_ = {'accuracy': acc}
        if isinstance(thrs, tuple):
            # One entry per (top-k, threshold) pair,
            # e.g. 'accuracy_top-1_thr_0.50'.
            for key, values in eval_results_.items():
                eval_results.update({
                    f'{key}_thr_{thr:.2f}': value.item()
                    for thr, value in zip(thrs, values)
                })
        else:
            eval_results.update(
                {k: v.item() for k, v in eval_results_.items()})

    if 'support' in metrics:
        support_value = support(results, gt_labels, average_mode=average_mode)
        eval_results['support'] = support_value

    precision_recall_f1_keys = ['precision', 'recall', 'f1_score']
    if len(set(metrics) & set(precision_recall_f1_keys)) != 0:
        precision_recall_f1_values = precision_recall_f1(
            results, gt_labels, average_mode=average_mode, thrs=thrs)
        for key, values in zip(precision_recall_f1_keys,
                               precision_recall_f1_values):
            if key in metrics:
                if isinstance(thrs, tuple):
                    eval_results.update({
                        f'{key}_thr_{thr:.2f}': value
                        for thr, value in zip(thrs, values)
                    })
                else:
                    eval_results[key] = values
    return eval_results
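# --- Hedged usage sketch for the variant above -------------------------------
# Not part of the original code. DummyDataset is a hypothetical stand-in that
# only provides get_gt_labels() and reuses the module-level evaluate() defined
# directly above; the metric helpers (accuracy, precision_recall_f1, support)
# are still assumed to come from the host code base. An explicit scalar 'thrs'
# is passed to keep the call unambiguous.
import numpy as np


class DummyDataset:

    evaluate = evaluate  # bind the evaluate() defined above as a method

    def __init__(self, labels):
        self._labels = np.asarray(labels)

    def get_gt_labels(self):
        return self._labels


dataset = DummyDataset([0, 1, 1, 0])
scores = [np.random.rand(4, 2)]  # one batch of (num_imgs, num_classes) scores
print(dataset.evaluate(
    scores,
    metric=['accuracy', 'precision', 'recall'],
    metric_options={'topk': (1,), 'thrs': 0.0, 'average_mode': 'macro'}))
# Expected keys: 'accuracy_top-1', 'precision', 'recall'. Passing a tuple of
# thresholds via metric_options['thrs'] would instead produce keys such as
# 'accuracy_top-1_thr_0.50' and 'precision_thr_0.50'.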
import numpy as np
import torch
from sklearn.metrics import roc_auc_score

# Assumptions: `cross_entropy` is a torch-style cross-entropy function
# (e.g. torch.nn.functional.cross_entropy or a project wrapper), `accuracy` is
# the top-k accuracy helper of the host code base, and `self.get_best_metrics`
# is a method of the same class (not shown here) that takes the ground-truth
# labels and positive-class scores and returns
# (precision, recall, f1, specificity, accuracy).
def evaluate(self,
             results,
             metric='accuracy',
             metric_options={'topk': (1, 5)},
             logger=None):
    """Evaluate the dataset.

    Args:
        results (list): Testing results of the dataset.
        metric (str | list[str]): Metrics to be evaluated. Supported values
            are 'accuracy', 'acc_and_auc' and 'all'. Default value is
            `accuracy`.
        metric_options (dict): Options for calculating metrics.
            Allowed key is 'topk'. Default: ``{'topk': (1, 5)}``.
        logger (logging.Logger | None | str): Logger used for printing
            related information during evaluation. Default: None.

    Returns:
        dict: evaluation results
    """
    if not isinstance(metric, str):
        # Only a single metric name is supported by this variant.
        assert len(metric) == 1
        metric = metric[0]
    # allowed_metrics = ['accuracy']
    # if metric not in allowed_metrics:
    #     raise KeyError(f'metric {metric} is not supported')
    eval_results = {}
    if metric == 'accuracy':
        topk = metric_options.get('topk')
        results = np.vstack(results)
        gt_labels = self.get_gt_labels()
        num_imgs = len(results)
        assert len(gt_labels) == num_imgs
        acc = accuracy(results, gt_labels, topk)
        loss = cross_entropy(torch.Tensor(results),
                             torch.Tensor(gt_labels).long()).mean()
        loss = round(float(loss), 3)
        eval_results = {f'top-{k}': a.item() for k, a in zip(topk, acc)}
        eval_results['loss'] = loss
    elif metric == 'acc_and_auc':
        topk = metric_options.get('topk')
        results = np.vstack(results)
        gt_labels = self.get_gt_labels()
        num_imgs = len(results)
        assert len(gt_labels) == num_imgs
        acc = accuracy(results, gt_labels, topk)
        loss = cross_entropy(torch.Tensor(results),
                             torch.Tensor(gt_labels).long()).mean()
        loss = round(float(loss), 3)
        # AUC is computed on the positive-class scores, so this assumes a
        # binary classification setup with results of shape (n, 2).
        auc = roc_auc_score(gt_labels, results[:, 1])
        eval_results = {f'top-{k}': a.item() for k, a in zip(topk, acc)}
        eval_results['loss'] = loss
        eval_results['auc'] = auc
    elif metric == 'all':
        # gt_labels has shape (n,), where n is the number of samples;
        # results has shape (n, 2), where 2 is the number of classes.
        topk = metric_options.get('topk')  # unused: the top-k call below is commented out
        results = np.vstack(results)
        gt_labels = self.get_gt_labels()
        num_imgs = len(results)
        assert len(gt_labels) == num_imgs
        loss = cross_entropy(torch.Tensor(results),
                             torch.Tensor(gt_labels).long()).mean()
        loss = round(float(loss), 3)
        auc = roc_auc_score(gt_labels, results[:, 1])
        # acc = accuracy(results, gt_labels, topk)
        # prec, rec, f1, _ = precision_recall_fscore_support(
        #     gt_labels, results[:, 1].round(), average='binary')
        # specificity = get_specificity(torch.Tensor(results[:, 1]),
        #                               torch.Tensor(gt_labels))
        prec, rec, f1, specificity, acc = self.get_best_metrics(
            gt_labels, results[:, 1])
        # eval_results = {f'top-{k}': a.item() for k, a in zip(topk, acc)}
        eval_results['loss'] = loss
        eval_results['acc'] = acc
        eval_results['auc'] = auc
        eval_results['f1'] = f1
        eval_results['recall'] = rec
        eval_results['precision'] = prec
        eval_results['specificity'] = specificity
    return eval_results
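# --- Hedged shape sketch for the binary variant above -------------------------
# Not part of the original code. This stand-alone snippet only illustrates the
# loss and AUC computations used in the 'acc_and_auc' / 'all' branches, with
# plain torch.nn.functional.cross_entropy standing in for the project's
# `cross_entropy`; results[:, 1] is treated as the positive-class score.
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

gt_labels = np.array([0, 1, 1, 0])
results = np.vstack([np.random.rand(2, 2), np.random.rand(2, 2)])  # (n, 2) scores

loss = F.cross_entropy(torch.Tensor(results),
                       torch.Tensor(gt_labels).long()).mean()
loss = round(float(loss), 3)
auc = roc_auc_score(gt_labels, results[:, 1])
print({'loss': loss, 'auc': auc})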