def precision_recall_f1_support_sequence_labelling(y_true, y_pred):
    """Compute precision, recall, f1 and support for sequence labelling tasks.

    For given gold (`y_true`) and predicted (`y_pred`) sequence labels, returns the precision,
    recall, f1 and support per label, and the macro and micro average of these scores across
    labels. Expects `y_true` and `y_pred` to be a sequence of IOB1/2, IOE1/2, or IOBES formatted
    labels.

    Args:
        y_true (list): List of IOB1/2, IOE1/2, or IOBES formatted sequence labels.
        y_pred (list): List of IOB1/2, IOE1/2, or IOBES formatted sequence labels.

    Returns:
        A dictionary of scores keyed by the labels in `y_true` where each score is a 4-tuple
        containing precision, recall, f1 and support. Additionally includes the keys 'Macro avg'
        and 'Micro avg' containing the macro and micro averages across scores.
    """
    scores = {}
    # Unique labels, not including NEG
    labels = list({tag.split('-')[-1] for tag in set(y_true) if tag != OUTSIDE})
    labels.sort()  # ensures labels displayed in same order across runs / partitions

    for label in labels:
        y_true_lab = [tag if tag.endswith(label) else OUTSIDE for tag in y_true]
        y_pred_lab = [tag if tag.endswith(label) else OUTSIDE for tag in y_pred]

        # TODO (John): Open a pull request to seqeval with a new function that returns all these
        # scores in one call. There is a lot of repeated computation here.
        precision = precision_score(y_true_lab, y_pred_lab)
        recall = recall_score(y_true_lab, y_pred_lab)
        f1 = f1_score(y_true_lab, y_pred_lab)
        support = len(set(get_entities(y_true_lab)))

        scores[label] = precision, recall, f1, support

    # Get macro and micro performance metrics averages
    macro_precision = mean([v[0] for v in scores.values()])
    macro_recall = mean([v[1] for v in scores.values()])
    macro_f1 = mean([v[2] for v in scores.values()])
    total_support = sum([v[3] for v in scores.values()])

    micro_precision = precision_score(y_true, y_pred)
    micro_recall = recall_score(y_true, y_pred)
    micro_f1 = f1_score(y_true, y_pred)

    scores['Macro avg'] = macro_precision, macro_recall, macro_f1, total_support
    scores['Micro avg'] = micro_precision, micro_recall, micro_f1, total_support

    return scores
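# A minimal usage sketch for the function above, with hypothetical example data.
# It assumes OUTSIDE is the 'O' tag and that precision_score, recall_score, f1_score,
# get_entities (seqeval) and mean (statistics) are already imported at module level.
y_true = ['B-PER', 'I-PER', 'O', 'B-LOC', 'O']
y_pred = ['B-PER', 'I-PER', 'O', 'O', 'O']
scores = precision_recall_f1_support_sequence_labelling(y_true, y_pred)
for name, (precision, recall, f1, support) in scores.items():
    # each value is a (precision, recall, f1, support) 4-tuple
    print(f'{name}: P={precision:.2f} R={recall:.2f} F1={f1:.2f} support={support}')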
def evaluate(gold_data, parsed_data, fnversion=1.7):
    tic()

    gold_frames, pred_frames = [], []
    gold_args, pred_args = [], []
    gold_fulls, pred_fulls = [], []

    for i in range(len(parsed_data)):
        gold = gold_data[i]
        parsed = parsed_data[i]

        gold_frame = [i for i in gold[2] if i != '_'][0]
        pred_frame = [i for i in parsed[2] if i != '_'][0]

        gold_arg = [i for i in gold[3] if i != 'X']
        pred_arg = [i for i in parsed[3]]

        gold_frames.append(gold_frame)
        pred_frames.append(pred_frame)

        weighted_gold_frame, weighted_pred_frame, weighted_gold_arg, weighted_pred_arg = weighting(
            gold_frame, pred_frame, gold_arg, pred_arg, fnversion=fnversion)

        gold_args.append(weighted_gold_arg)
        pred_args.append(weighted_pred_arg)

        gold_full = []
        gold_full += weighted_gold_frame
        gold_full += weighted_gold_arg

        pred_full = []
        pred_full += weighted_pred_frame
        pred_full += weighted_pred_arg

        gold_fulls.append(gold_full)
        pred_fulls.append(pred_full)

    acc = accuracy_score(gold_frames, pred_frames)

    arg_f1 = f1_score(gold_args, pred_args)
    arg_precision = precision_score(gold_args, pred_args)
    arg_recall = recall_score(gold_args, pred_args)

    full_f1 = f1_score(gold_fulls, pred_fulls)
    full_precision = precision_score(gold_fulls, pred_fulls)
    full_recall = recall_score(gold_fulls, pred_fulls)

    result = (acc, arg_precision, arg_recall, arg_f1, full_precision, full_recall, full_f1)
    print('evaluation is complete:', tac())

    return result
def score(y_true, y_pred):
    """Wrapper of seqeval metrics"""
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return precision, recall, f1
def get_slot_metrics(preds, labels):
    assert len(preds) == len(labels)
    return {
        "slot_precision": precision_score(labels, preds),
        "slot_recall": recall_score(labels, preds),
        "slot_f1": f1_score(labels, preds)
    }
def _eval_end(self, outputs): "Evaluation called for both Val and Test" val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean() entity_preds = np.concatenate([x["entity_preds"] for x in outputs], axis=0) entity_preds = np.argmax(entity_preds, axis=2) intent_acc = torch.stack([x["intent_acc"] for x in outputs]).mean() out_label_ids = np.concatenate([x["entity_targets"] for x in outputs], axis=0) out_label_list = [[] for _ in range(out_label_ids.shape[0])] entity_preds_list = [[] for _ in range(out_label_ids.shape[0])] for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != FLAGS.pad_token_label_id: out_label_list[i].append(self.id2entity[out_label_ids[i][j]]) entity_preds_list[i].append(self.id2entity[entity_preds[i][j]]) results = { "val_loss": val_loss_mean, "precision": precision_score(out_label_list, entity_preds_list), "recall": recall_score(out_label_list, entity_preds_list), "f1": f1_score(out_label_list, entity_preds_list), "intent_acc": intent_acc } ret = {k: v for k, v in results.items()} ret["log"] = results return ret, entity_preds_list, out_label_list
def compute_metrics(p: EvalPrediction) -> Dict:
    preds_list, out_label_list, matrix = align_predictions(p.predictions, p.label_ids)
    return {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
        # classification_report, accuracy_score
        "accuracy": accuracy_score(out_label_list, preds_list),
        "matrix": matrix,
        # "multilabel_confusion_matrix": multilabel_confusion_matrix(multi_out_label_list, multi_preds_list, labels=["B", "I", "O"]),
        # "confusion_matrix": confusion_matrix(out_label_list, preds_list),
        # "multilabel_confusion_matrix": multilabel_confusion_matrix(out_label_list, preds_list),
        # "balanced_accuracy_score": balanced_accuracy_score(out_label_list, preds_list),
        # "roc_auc_score": roc_auc_score(out_label_list, preds_list),
        # "top_k_accuracy_score": top_k_accuracy_score(out_label_list, preds_list),
        "classification_report": classification_report(out_label_list, preds_list)
    }
def compute_pos_metrics(label_map, p: EvalPrediction) -> Dict:
    def align_predictions(predictions: np.ndarray,
                          label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
        # Token predictions need to be reduced to word predictions
        preds = np.argmax(predictions, axis=2)
        batch_size, seq_len = preds.shape
        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
    return {
        "accuracy_score": accuracy_score(out_label_list, preds_list),
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
def compute_metrics_from_preds_and_labels(cls, task, preds, labels):
    label_mask = np.stack([row["label_mask"] for row in labels])

    # Account for smart-truncate
    assert (label_mask[:, preds.shape[-1]:] == 0).all()
    label_mask = label_mask[:, :preds.shape[-1]].astype(bool)

    labels_for_eval = [label["pos_list"] for label in labels]
    preds_for_eval = []
    assert len(labels) == preds.shape[0]
    for i in range(len(labels)):
        relevant_preds = preds[i][label_mask[i]]
        relevant_preds_pos = [task.LABEL_BIMAP.b[pos_id] for pos_id in relevant_preds]
        preds_for_eval.append(relevant_preds_pos)

    minor = {
        "precision": seqeval_metrics.precision_score(labels_for_eval, preds_for_eval),
        "recall": seqeval_metrics.recall_score(labels_for_eval, preds_for_eval),
        "f1": seqeval_metrics.f1_score(labels_for_eval, preds_for_eval),
    }
    return Metrics(
        major=minor["f1"],
        minor=minor,
    )
def get_slot_metrics(preds, labels):
    '''Evaluate slot tag sequences according to the IOB1 scheme.'''
    return {
        'slot_f1': f1_score(labels, preds),
        'slot_recall': recall_score(labels, preds),
        'slot_precision': precision_score(labels, preds),
    }
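# A small usage sketch for the get_slot_metrics above, with hypothetical slot tags;
# preds and labels are lists of IOB tag sequences, which is what seqeval expects.
preds = [['O', 'B-city', 'O', 'B-date']]
labels = [['O', 'B-city', 'I-city', 'B-date']]
print(get_slot_metrics(preds, labels))  # {'slot_f1': ..., 'slot_recall': ..., 'slot_precision': ...}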
def evaluation(model, data_loader, index_to_label, vocab_dict, paras, device):
    """
    Contributor: Peng Qianqian: conlleval.pl for model evaluation.
                 Oyang Sizhuo: conlleval.pl for model evaluation.
    """
    model.eval()
    total_pred_label = []
    total_true_label = []
    with torch.no_grad():
        for step, batch in enumerate(data_loader):
            batch_data, batch_label = batch
            batch_data_list = [data.split('&&&') for data in batch_data]
            batch_label_list = [label.split('&&&') for label in batch_label]
            input_ids, mask = batch_data_processing(batch_data_list,
                                                    paras.max_length,
                                                    vocab_dict.get('[PAD]'),
                                                    vocab_dict.get('[CLS]'),
                                                    vocab_dict.get('[SEP]'))
            input_ids = input_ids.to(device)
            mask = mask.to(device)
            batch_max_length = input_ids.shape[1]

            predict_result = model(input_ids, mask)
            predict_label_list = convert_index_to_label(predict_result, index_to_label)
            true_label_list = label_truncation(batch_label_list, batch_max_length)

            if args.print_example:
                logger.debug('Example:')
                logger.debug(f'predict: {predict_label_list[0]}')
                logger.debug(f'true: {true_label_list[0]}')

            for predict_list, true_list in zip(predict_label_list, true_label_list):
                if len(predict_list) != len(true_list):
                    logger.debug('different length.')
                    logger.debug(f'predict: {len(predict_list)}, true: {len(true_list)}')
                    logger.debug(f'{predict_list}\n{true_list}')
                    continue
                total_pred_label.append(predict_list)
                total_true_label.append(true_list)

    logger.debug(f'total true_label: {len(total_true_label)}, '
                 f'total pred_label: {len(total_pred_label)}')

    acc = accuracy_score(total_true_label, total_pred_label)
    precision = precision_score(total_true_label, total_pred_label)
    recall = recall_score(total_true_label, total_pred_label)
    f1 = f1_score(total_true_label, total_pred_label)

    return acc, precision, recall, f1
def compute_metrics_ner(p: EvalPrediction, id_to_label):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {
        "accuracy_score": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "macro_precision": report["macro avg"]["precision"],
        "macro_recall": report["macro avg"]["recall"],
        "macro_f1": report["macro avg"]["f1-score"],
        "micro_precision": report["micro avg"]["precision"],
        "micro_recall": report["micro avg"]["recall"],
        "micro_f1": report["micro avg"]["f1-score"],
        "weighted_precision": report["weighted avg"]["precision"],
        "weighted_recall": report["weighted avg"]["recall"],
        "weighted_f1": report["weighted avg"]["f1-score"]
    }
def get_metric(self, reset: bool):
    if not reset:
        return {}

    all_prediction_sequence = []
    all_gold_sequence = []
    results = []
    for doc_id in self.gold_labels.keys():
        prediction = self.span_to_label_sequence(self.prediction[doc_id])
        gold = self.span_to_label_sequence(self.gold_labels[doc_id])
        all_prediction_sequence.append(prediction)
        all_gold_sequence.append(gold)
        results.append({
            "words": self.doc_id_to_words[doc_id],
            "gold": gold,
            "prediction": prediction
        })

    if self.prediction_save_path is not None:
        with open(self.prediction_save_path, "w") as f:
            json.dump(results, f)

    return dict(
        f1=f1_score(all_gold_sequence, all_prediction_sequence, scheme=IOB2),
        precision=precision_score(all_gold_sequence, all_prediction_sequence, scheme=IOB2),
        recall=recall_score(all_gold_sequence, all_prediction_sequence, scheme=IOB2),
    )
def evaluate(y_pred, id_to_word, test_words, test_tags, id_to_tag, model_name, training_time, dataset):
    """
    Evaluate current model using CoNLL script.
    """
    true_list = []
    pred_list = []
    word_true_perd = []
    print("Calculating results and printing classification report")
    print(54 * "-")
    for i in range(len(y_pred)):
        p = np.argmax(y_pred[i], axis=-1)
        for word_id, tag_id, pred in zip(test_words[i], test_tags[i], p):
            if id_to_word[word_id] != 'PAD':
                true_list.append(id_to_tag[tag_id])
                pred_list.append(id_to_tag[pred])
                word_true_perd.append([id_to_word[word_id], id_to_tag[tag_id], id_to_tag[pred]])

    today = date.today()
    current_time = today.strftime("%d%m%Y")
    save_obj(word_true_perd, 'word_true_perd' + str(current_time))

    precision = precision_score(true_list, pred_list)
    recall = recall_score(true_list, pred_list)
    f1score = f1_score(true_list, pred_list)
    print(classification_report(true_list, pred_list))

    cur_list = [model_name, dataset, training_time, precision, recall, f1score]
    return cur_list
def get_slot_metrics(preds, labels):
    assert len(preds) == len(labels)
    return OrderedDict({
        "Slot Precision": precision_score(labels, preds),
        "Slot Recall": recall_score(labels, preds),
        "Slot F1": f1_score(labels, preds)
    })
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    filtered_labels = []
    filtered_preds = []
    for i in range(labels.shape[0]):
        filtered_labels_inner = []
        filtered_preds_inner = []
        for j in range(labels.shape[1]):
            if labels[i][j] != -100:
                filtered_labels_inner.append(id_to_label(labels[i][j]))
                filtered_preds_inner.append(id_to_label(preds[i][j]))
        filtered_labels.append(filtered_labels_inner)
        filtered_preds.append(filtered_preds_inner)

    return {
        'accuracy': metrics.accuracy_score(filtered_labels, filtered_preds),
        'f1': metrics.f1_score(filtered_labels, filtered_preds),
        'precision': metrics.precision_score(filtered_labels, filtered_preds),
        'recall': metrics.recall_score(filtered_labels, filtered_preds),
    }
def _eval_end(self, outputs): "Task specific validation" val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean() preds = np.concatenate([x["pred"] for x in outputs], axis=0) preds = np.argmax(preds, axis=2) out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0) label_map = {i: label for i, label in enumerate(self.labels)} out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != self.pad_token_label_id: out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) results = { "val_loss": val_loss_mean, "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), "f1": f1_score(out_label_list, preds_list), } if self.is_logger(): logger.info(self.proc_rank) logger.info("***** Eval results *****") for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) tensorboard_logs = results ret = {k: v for k, v in results.items()} ret["log"] = tensorboard_logs return ret, preds_list, out_label_list
def compute_metrics(preds_list, out_label_list):
    return {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
def __call__(self, eval_pred: EvalPrediction) -> Dict:
    """Computes accuracy, precision, recall and f1 based on the list of IOB2 labels.

    Positions labelled with a value of -100 are filtered out of both the true labels
    and the predictions.

    Args:
        eval_pred (EvalPrediction): the predictions and targets to be matched, as np.ndarrays.

    Returns:
        (Dict): a dictionary with accuracy_score, precision, recall and f1.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)

    # Remove ignored index (special tokens)
    true_predictions = [
        [self.label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [self.label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    print("\n" + " " * 80)
    print(classification_report(true_labels, true_predictions))

    return {
        "accuracy_score": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }
def on_epoch_end(self, epoch, logs=None):
    if (epoch + 1) % self.step == 0:
        y_pred = self.kash_model.predict(self.valid_x, batch_size=self.batch_size)

        if self.kash_model.task in [macros.TaskType.LABELING, macros.TaskType.RAW_LABELING]:
            y_true = [
                seq[:len(y_pred[index])] for index, seq in enumerate(self.valid_y)
            ]
            precision = seq_metrics.precision_score(y_true, y_pred)
            recall = seq_metrics.recall_score(y_true, y_pred)
            f1 = seq_metrics.f1_score(y_true, y_pred)
        else:
            y_true = self.valid_y
            precision = metrics.precision_score(y_true, y_pred, average=self.average)
            recall = metrics.recall_score(y_true, y_pred, average=self.average)
            f1 = metrics.f1_score(y_true, y_pred, average=self.average)

        self.logs[epoch] = {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
        print(f"\nepoch: {epoch} precision: {precision:.6f}, recall: {recall:.6f}, f1: {f1:.6f}")
def eval(args, model, dataset, label_list):
    dataloader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            collate_fn=collate_fn)

    all_true_labels, all_pred_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            inputs = {k: v.to(args.device) for k, v in batch.items()}
            outputs = model(**inputs)
            # predictions = outputs[1].permute(1, 0, 2).detach().cpu().numpy()
            predictions = outputs[1]
            label_ids = batch['label_ids'].permute(1, 0).detach().cpu().numpy()

            preds_list, out_label_list = align_predictions(predictions, label_ids, label_list)

            all_true_labels += out_label_list
            all_pred_labels += preds_list

    report = classification_report(all_true_labels, all_pred_labels)
    logger.info(report)

    return {
        'precision': precision_score(all_true_labels, all_pred_labels),
        'recall': recall_score(all_true_labels, all_pred_labels),
        'f1': f1_score(all_true_labels, all_pred_labels),
    }
def all_metrics(pred_tag, true_tag):
    # seqeval expects the gold labels first and the predictions second
    print(classification_report(true_tag, pred_tag))
    print('=' * 25)
    print("Precision: \t", precision_score(true_tag, pred_tag))
    print("Recall: \t", recall_score(true_tag, pred_tag))
    print("F1: \t\t", f1_score(true_tag, pred_tag))
def test(dataloader, model, label_list):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        all_pred_y = list()
        all_y = list()
        print("testing")
        for X, y, masks in tqdm(dataloader):
            X, y = X.to(device), y.to(device)
            masks = masks.to(device)
            # output = model(X)
            # loss = 0 - crf(output, y)
            loss, pred_y = model(X, y, masks)
            test_loss += loss
            all_pred_y.extend(pred_y)
            all_y.extend(y.cpu().numpy().tolist())

    all_pred_y_label = [[label_list[t1] for t1 in t2] for t2 in all_pred_y]
    all_y_label = [[label_list[t1] for t1 in t2] for t2 in all_y]

    # seqeval expects the gold labels first and the predictions second
    print('p', precision_score(all_y_label, all_pred_y_label))
    print('r', recall_score(all_y_label, all_pred_y_label))
    print('f1', f1_score(all_y_label, all_pred_y_label))
    print('acc', accuracy_score(all_y_label, all_pred_y_label))

    test_loss /= size
    print(f"Avg loss: {test_loss:>8f} \n")
def _eval_end(self, outputs): """Evaluation called for both Val and Test""" val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() preds = np.concatenate([x['pred'] for x in outputs], axis=0) preds = np.argmax(preds, axis=2) out_label_ids = np.concatenate([x['target'] for x in outputs], axis=0) label_map = {i: label for i, label in enumerate(self.labels)} out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != self.pad_token_label_id: out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) results = { 'val_loss': val_loss_mean, 'precision': precision_score(out_label_list, preds_list), 'recall': recall_score(out_label_list, preds_list), 'f1': f1_score(out_label_list, preds_list), } ret = {k: v for k, v in results.items()} ret['log'] = results return ret, preds_list, out_label_list
def evaluate(self, data, labels):
    """Evaluate the performance of the ner model.

    Args:
        data: list of tokenized texts, like ``[['我', '是', '中', '国', '人']]``
        labels: list of list of str, the corresponding label strings
    """
    features, y = self.preprocessor.prepare_input(data, labels)
    pred_probs = self.model.predict(features)

    if self.preprocessor.use_bert:
        pred_probs = pred_probs[:, 1:-1, :]  # remove [CLS] and [SEP]

    lengths = [
        min(len(label), pred_prob.shape[0])
        for label, pred_prob in zip(labels, pred_probs)
    ]
    y_pred = self.preprocessor.label_decode(pred_probs, lengths)

    r = metrics.recall_score(labels, y_pred)
    p = metrics.precision_score(labels, y_pred)
    f1 = metrics.f1_score(labels, y_pred)

    print('Recall: {}, Precision: {}, F1: {}'.format(r, p, f1))
    print(metrics.classification_report(labels, y_pred))
    return f1
def _eval_end(self, outputs):
    val_loss_mean = T.stack([x["val_loss"] for x in outputs]).mean()
    preds = np.concatenate([x["pred"] for x in outputs], axis=0)
    preds = np.argmax(preds, axis=2)
    out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)

    label_map = {i: label for i, label in enumerate(self.labels)}
    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != self.pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    results = {
        "val_loss": val_loss_mean,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

    ret = {k: v for k, v in results.items()}
    ret["log"] = results
    return ret, preds_list, out_label_list
def weighted_voting(args, dataset, src_probs, labels, pad_token_label_id):
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    out_label_ids = None
    for batch in eval_dataloader:
        out_label_ids = batch[3] if out_label_ids is None else np.append(out_label_ids, batch[3], axis=0)

    preds = np.argmax(src_probs.cpu().numpy(), axis=-1)

    label_map = {i: label for i, label in enumerate(labels)}
    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    results = {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list)
    }

    logger.info("***** Eval results %s *****", os.path.basename(args.data_dir))
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, preds_list
def compute_metrics(p: EvalPrediction) -> Dict:
    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
    return {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
def test_epoch_end(self, outputs: List[Dict[str, torch.Tensor]]):
    preds = np.concatenate([x["pred"].detach().cpu().numpy() for x in outputs], axis=0)
    preds = np.argmax(preds, axis=2)
    target_ids = np.concatenate([x["target"].detach().cpu().numpy() for x in outputs], axis=0)

    target_list: StrListList = [[] for _ in range(target_ids.shape[0])]
    preds_list: StrListList = [[] for _ in range(target_ids.shape[0])]

    for i in range(target_ids.shape[0]):
        for j in range(target_ids.shape[1]):
            if target_ids[i][j] != PAD_TOKEN_LABEL_ID:
                target_list[i].append(self.label_ids_to_label[target_ids[i][j]])
                preds_list[i].append(self.label_ids_to_label[preds[i][j]])

    accuracy = accuracy_score(target_list, preds_list)
    precision = precision_score(target_list, preds_list, mode="strict", scheme=BILOU)
    recall = recall_score(target_list, preds_list, mode="strict", scheme=BILOU)
    f1 = f1_score(target_list, preds_list, mode="strict", scheme=BILOU)

    self.log("test_accuracy", accuracy)
    self.log("test_precision", precision)
    self.log("test_recall", recall)
    self.log("test_f1", f1)
def calculate_token_class_metrics(pred_toks, targ_toks, metric_key):
    if (metric_key == 'accuracy'):
        return seq_metrics.accuracy_score(targ_toks, pred_toks)
    if (metric_key == 'precision'):
        return seq_metrics.precision_score(targ_toks, pred_toks)
    if (metric_key == 'recall'):
        return seq_metrics.recall_score(targ_toks, pred_toks)
    if (metric_key == 'f1'):
        return seq_metrics.f1_score(targ_toks, pred_toks)
    if (metric_key == 'classification_report'):
        return seq_metrics.classification_report(targ_toks, pred_toks)
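# A hedged usage sketch for calculate_token_class_metrics above, with hypothetical
# tags; it assumes seq_metrics is seqeval.metrics imported under that alias.
# Both arguments are lists of tag sequences, predictions first and targets second.
targ_toks = [['B-PER', 'I-PER', 'O']]
pred_toks = [['B-PER', 'O', 'O']]
print(calculate_token_class_metrics(pred_toks, targ_toks, 'f1'))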
def model_evaluate(model, data, label, tag2id, batch_size, seq_len_list):
    id2tag = {value: key for key, value in tag2id.items()}
    pred_logits = model.predict(data, batch_size=batch_size)
    # pred shape [batch_size, max_len]
    preds = np.argmax(pred_logits, axis=2).tolist()
    assert len(preds) == len(seq_len_list)

    # get predicted labels
    predict_label = []
    target_label = []
    for i in range(len(preds)):
        pred = preds[i][1:]
        temp = []
        true_label = label[i][:min(seq_len_list[i], len(pred))]
        for j in range(min(seq_len_list[i], len(pred))):
            temp.append(id2tag[pred[j]])
        assert len(temp) == len(true_label)
        target_label.append(true_label)
        predict_label.append(temp)

    # compute precision, recall and f1
    precision = precision_score(target_label, predict_label, average="macro", zero_division=0)
    recall = recall_score(target_label, predict_label, average="macro", zero_division=0)
    f1 = f1_score(target_label, predict_label, average="macro", zero_division=0)

    logger.info(classification_report(target_label, predict_label))
    return precision, recall, f1