def evaluate(self, model):
    self.pad_process()
    train_X, test_X, train_y, test_y = self.train_test_split()
    test_pred = model.predict(test_X, verbose=1)
    pred_labels = self._evaluate(test_pred)
    test_labels = self._evaluate(test_y)
    print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))
    print(classification_report(test_labels, pred_labels))
def score(self, X, y, sample_weight=None):
    y_pred = self.predict(X)
    if not self.is_nested:
        score = f1_score(y, y_pred, average="macro")
    else:
        y_pred_ids = self.multi_label_to_id(y_pred)
        y_true_ids = self.multi_label_to_id(y)
        score = (y_pred_ids == y_true_ids).float().mean()
        score = score.item()
    return score
def test_evaluate(model, test_dataloader):
    global f1_min
    test_l, n = 0.0, 0
    out_epoch, label_epoch = [], []
    model.eval()
    with torch.no_grad():
        for data_x, data_y, batch_seq_len in test_dataloader:
            # `out` is the decoded path sequence, shape [10, 40]
            _, out = model(data_x.to(device), batch_seq_len)
            label = [line.numpy().tolist() for line in data_y]
            # pad each label sequence to the batch width by repeating its last tag
            for line in label:
                for i in range(data_x.shape[1] - len(line)):
                    line.append(line[len(line) - 1])
            loss = model.loss_function(data_x.to(device), label, batch_seq_len)
            # pad the predicted paths the same way
            for line in out:
                for i in range(data_x.shape[1] - len(line)):
                    line.append(line[len(line) - 1])
            label = torch.tensor(label).view(-1, 1).squeeze(-1).to(device)  # torch.Size([274])
            out = torch.tensor(out).view(-1, 1).squeeze(-1).to(device)  # torch.Size([274])
            # strip padding before accumulating test-set metrics
            out, label = processing_len(out, label, batch_seq_len)
            out_epoch.extend(out)
            label_epoch.extend(label)
            test_l += loss.item()
            n += 1
    # seqeval expects lists of tag-string sequences, so wrap the flat epoch lists
    label_epoch = [[id2label[label] for label in label_epoch]]
    out_epoch = [[id2label[label] for label in out_epoch]]
    print(classification_report(label_epoch, out_epoch, digits=6))
    # despite its name, the global f1_min tracks the best F1 seen so far
    f1 = f1_score(label_epoch, out_epoch)
    if f1 > f1_min:
        f1_min = f1
        torch.save(model.state_dict(), args.ckp)
        print("save model......")
    return test_l / n
def _predict_with_seqeval(self, sample, model):
    from seqeval.metrics import classification_report, f1_score, accuracy_score
    with torch.no_grad():
        if hasattr(model, 'tagging_heads') and 'tagging_head' in model.tagging_heads:
            logits, _ = model(
                **sample['net_input'],
                features_only=True,
                tagging_head_name='tagging_head',
            )
        else:
            logits = model(**sample['net_input'])[0]
        predictions = logits.argmax(dim=-1)
        targets = model.get_targets(sample, [logits])
        # make sure the dimensions line up
        assert predictions.size() == targets.size()
        predicted_labels = predictions.detach().cpu().numpy()
        label_ids = targets.cpu().numpy()
        y_true = []
        y_pred = []
        for i, cur_label in enumerate(label_ids):
            temp_1 = []
            temp_2 = []
            for j, m in enumerate(cur_label):
                # select target locations that are not bos/eos/pad
                if targets[i][j] not in [
                        self.target_dictionary.bos(),
                        self.target_dictionary.eos(),
                        self.target_dictionary.pad()
                ]:  # if it's a valid label
                    temp_1.append(self.target_dictionary[m])
                    temp_2.append(self.target_dictionary[predicted_labels[i][j]])
            assert len(temp_1) == len(temp_2)
            y_true.append(temp_1)
            y_pred.append(temp_2)
        f1 = f1_score(y_true, y_pred, average='macro')
        acc = accuracy_score(y_true, y_pred)
        return {
            'F1-score': f1,
            'Accuracy': acc,
            'y_true': y_true,
            'y_pred': y_pred
        }
def eval(iter_data, model):
    logger.info("starting to evaluate")
    model = model.eval()
    eval_loss = 0
    nb_eval_steps = 0
    predictions, true_labels = [], []
    for batch in tqdm(iter_data):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels, b_input_mask, b_token_type_ids, b_label_masks = batch
        with torch.no_grad():
            tmp_eval_loss, logits, reduced_labels = model(
                b_input_ids,
                token_type_ids=b_token_type_ids,
                attention_mask=b_input_mask,
                labels=b_labels,
                label_masks=b_label_masks)
        logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
        logits = logits.detach().cpu().numpy()
        reduced_labels = reduced_labels.to('cpu').numpy()
        labels_to_append = []
        predictions_to_append = []
        for prediction, r_label in zip(logits, reduced_labels):
            preds = []
            labels = []
            for pred, lab in zip(prediction, r_label):
                if lab.item() == -1:  # masked label; -1 means do not collect this label
                    continue
                preds.append(pred)
                labels.append(lab)
            predictions_to_append.append(preds)
            labels_to_append.append(labels)
        predictions.extend(predictions_to_append)
        true_labels.extend(labels_to_append)
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    logger.info("Validation loss: {}".format(eval_loss))
    # flatten the per-sentence id lists into tag strings
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_i] for l in true_labels for l_i in l]
    logger.info("Seq eval accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
    logger.info("F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
    logger.info("Classification report: -- ")
    logger.info(classification_report(valid_tags, pred_tags))
def score(self, y_true, y_pred):
    f_score = f1_score(y_true, y_pred)
    r_score = recall_score(y_true, y_pred)
    p_score = precision_score(y_true, y_pred)
    print('NER metrics > precision_score: {:04.2f} -- recall_score: {:04.2f} -- f1_score: {:04.2f}'
          .format(p_score, r_score, f_score))
    return f_score, r_score, p_score
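# Hedged usage sketch for score() above; `ner_scorer` is a hypothetical
# instance of the class that defines it. seqeval expects one list of IOB2
# tags per sentence, so both arguments are lists of tag lists:
#   y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
#   y_pred = [['B-PER', 'I-PER', 'O'], ['O', 'O']]
#   f, r, p = ner_scorer.score(y_true, y_pred)  # returns (f1, recall, precision)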
def calculate_token_class_metrics(pred_toks, targ_toks, metric_key):
    if metric_key == 'accuracy':
        return seq_metrics.accuracy_score(targ_toks, pred_toks)
    if metric_key == 'precision':
        return seq_metrics.precision_score(targ_toks, pred_toks)
    if metric_key == 'recall':
        return seq_metrics.recall_score(targ_toks, pred_toks)
    if metric_key == 'f1':
        return seq_metrics.f1_score(targ_toks, pred_toks)
    if metric_key == 'classification_report':
        return seq_metrics.classification_report(targ_toks, pred_toks)
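# Hedged usage sketch for calculate_token_class_metrics() above; the tag
# lists are illustrative, and seq_metrics is assumed to be seqeval.metrics
# imported under that alias.
def _demo_token_class_metrics():
    targs = [['B-PER', 'I-PER', 'O'], ['B-ORG', 'O']]
    preds = [['B-PER', 'I-PER', 'O'], ['O', 'O']]
    for key in ('accuracy', 'precision', 'recall', 'f1'):
        print(key, calculate_token_class_metrics(preds, targs, key))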
def compute_metrics(p: EvalPrediction) -> Dict:
    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
    report = classification_report(out_label_list, preds_list)
    output_report_file = os.path.join(training_args.output_dir, "classification_report.txt")
    with open(output_report_file, "w") as writer:
        writer.write(report)
    return {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
def evaluate(self, x_data, y_data, batch_size=None) -> Tuple[float, float, Dict]:
    y_pred = self.predict(x_data, batch_size=batch_size)
    # seqeval's default averaging is micro; pass average='weighted' explicitly
    # so the names below match what is actually computed
    weighted_f1 = f1_score(y_data, y_pred, average='weighted')
    weighted_recall = recall_score(y_data, y_pred, average='weighted')
    report = classification_report(y_data, y_pred)
    print(report)
    return weighted_f1, weighted_recall, report
def ner_eval(gold_tags, idx2tag, pred_probs):
    lengths = [
        min(len(tag), pred_prob.shape[0])
        for tag, pred_prob in zip(gold_tags, pred_probs)
    ]
    pred_tags = ner_tag_decode(idx2tag, pred_probs, lengths)
    r = metrics.recall_score(gold_tags, pred_tags)
    p = metrics.precision_score(gold_tags, pred_tags)
    f1 = metrics.f1_score(gold_tags, pred_tags)
    return r, p, f1
def scores(epoch_trues, epoch_preds):
    f1 = f1_score(epoch_trues, epoch_preds)
    rec = recall_score(epoch_trues, epoch_preds)
    prec = precision_score(epoch_trues, epoch_preds)
    acc = accuracy_score(epoch_trues, epoch_preds)
    return {
        "f1": np.around(f1, 4),
        "rec": np.around(rec, 4),
        "prec": np.around(prec, 4),
        "acc": np.around(acc, 4),
    }
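# Hedged usage sketch for scores() above (example data is illustrative):
# epoch_trues/epoch_preds are accumulated over an epoch as lists of
# per-sentence tag lists, and the result is a dict of rounded metrics.
def _demo_scores():
    epoch_trues = [['B-LOC', 'I-LOC', 'O'], ['B-PER', 'O']]
    epoch_preds = [['B-LOC', 'I-LOC', 'O'], ['O', 'O']]
    print(scores(epoch_trues, epoch_preds))  # e.g. {'f1': 0.6667, ...}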
def evaluate_ner(args, model):
    if args.onto:
        ner_examples = get_onto_ner_examples(args.ner_data)
    else:
        ner_examples = get_ner_examples(args.ner_data)
    total_pred_labels = []
    total_labels_clean = []
    start = time.time()
    model.tokenizer = Tokenizer(model.vocab)
    for exp in tqdm(ner_examples):
        words = exp[0]
        labels = exp[1]
        idxs = []
        text = ""
        for word in words:
            idxs += [len(text)]
            text += word + " "
        tokens = model(text)
        pred_ner_labels = []
        for tk in tokens:
            pred_label = tk.ent_type_
            if len(pred_label) > 0:
                pred_ner_labels.append(pred_label)
            else:
                pred_ner_labels.append("O")
        assert len(pred_ner_labels) == len(labels)
        total_labels_clean.append(labels)
        total_pred_labels.append(pred_ner_labels)
    end = time.time()
    # evaluate
    print("pred", total_pred_labels[:20])
    print("true labels", total_labels_clean[:20])
    total = sum([len(x) for x in total_pred_labels])
    print("tokens per second", total / (end - start))
    print("ner tag time cost", end - start)
    print("ner precision", precision_score(total_labels_clean, total_pred_labels))
    print("ner f1", f1_score(total_labels_clean, total_pred_labels))
def calculate_report(y_true, y_pred, transform_func):
    """Compute prediction scores."""
    for i in range(len(y_true)):
        y_true[i] = transform_func(y_true[i])
    for i in range(len(y_pred)):
        y_pred[i] = transform_func(y_pred[i])
    return (f1_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred))
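# Hedged usage sketch for calculate_report() above; id2tag is a hypothetical
# id-to-tag mapping, and seqeval's metric functions are assumed to be imported
# at module level, as the function itself assumes. transform_func converts
# each id sequence back to BIO strings in place so seqeval can group tokens
# into entities.
def _demo_calculate_report():
    id2tag = {0: 'O', 1: 'B-PER', 2: 'I-PER'}
    to_tags = lambda seq: [id2tag[i] for i in seq]
    y_true = [[1, 2, 0], [0, 0]]
    y_pred = [[1, 2, 0], [0, 0]]
    f1, precision, recall = calculate_report(y_true, y_pred, to_tags)
    print(f1, precision, recall)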
def evaluate(gold_label_list, preds_list, config):
    tag_vocab = dataset_utils.get_tag_vocab(config)
    gold_label_list = dataset_utils.tagid2tag_seq(tag_vocab, gold_label_list)
    preds_list = dataset_utils.tagid2tag_seq(tag_vocab, preds_list)
    results = {
        "precision": precision_score(gold_label_list, preds_list),
        "recall": recall_score(gold_label_list, preds_list),
        "f1": f1_score(gold_label_list, preds_list),
    }
    return results
def model_metrics(true_labels, pre_labels):
    start_time = time.time()
    acc = accuracy_score(true_labels, pre_labels)
    f1score = f1_score(true_labels, pre_labels, average='macro')
    report = classification_report(true_labels, pre_labels, digits=4)
    msg = '\nTest Acc: {0:>6.2%}, Test f1: {1:>6.2%}'
    logger.info(msg.format(acc, f1score))
    logger.info("\nPrecision, Recall and F1-Score...")
    logger.info("\n{}".format(report))
    time_dif = time.time() - start_time
    logger.info("Time usage:{0:>.6}s".format(time_dif))
def _simple_score(self, model, data_iter):
    # renamed `iter` -> `data_iter` to avoid shadowing the builtin
    model.eval()
    y_true, y_pred = [], []
    for batch in data_iter:
        predict_tags = model(batch)
        _, _, _, label_seq_tensor = batch
        y_true.extend(convert(label_seq_tensor.tolist(), self.label_dict))
        y_pred.extend(convert(predict_tags, self.label_dict))
    return (accuracy_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred))
def evaluate(model, config, val_loader):
    model.eval()
    opt = config['opt']
    pad_label_id = config['pad_label_id']
    eval_loss = 0.
    criterion = nn.CrossEntropyLoss(ignore_index=pad_label_id).to(opt.device)
    n_batches = len(val_loader)
    prog = Progbar(target=n_batches)
    preds = None
    ys = None
    with torch.no_grad():
        for i, (x, y) in enumerate(val_loader):
            x = to_device(x, opt.device)
            y = to_device(y, opt.device)
            if opt.use_crf:
                logits, prediction = model(x)
                mask = torch.sign(torch.abs(x[0])).to(torch.uint8).to(opt.device)
                log_likelihood = model.crf(logits, y, mask=mask, reduction='mean')
                loss = -1 * log_likelihood
            else:
                logits = model(x)
                loss = criterion(logits.view(-1, model.label_size), y.view(-1))
            if preds is None:
                if opt.use_crf:
                    preds = to_numpy(prediction)
                else:
                    preds = to_numpy(logits)
                ys = to_numpy(y)
            else:
                if opt.use_crf:
                    preds = np.append(preds, to_numpy(prediction), axis=0)
                else:
                    preds = np.append(preds, to_numpy(logits), axis=0)
                ys = np.append(ys, to_numpy(y), axis=0)
            eval_loss += loss.item()
            prog.update(i + 1, [('eval curr loss', loss.item())])
    eval_loss = eval_loss / n_batches
    if not opt.use_crf:
        preds = np.argmax(preds, axis=2)
    # compute measures using seqeval
    labels = model.labels
    ys_lbs = [[] for _ in range(ys.shape[0])]
    preds_lbs = [[] for _ in range(ys.shape[0])]
    for i in range(ys.shape[0]):  # for each sentence
        for j in range(ys.shape[1]):  # for each token
            if ys[i][j] != pad_label_id:
                ys_lbs[i].append(labels[ys[i][j]])
                preds_lbs[i].append(labels[preds[i][j]])
    ret = {
        "loss": eval_loss,
        "precision": precision_score(ys_lbs, preds_lbs),
        "recall": recall_score(ys_lbs, preds_lbs),
        "f1": f1_score(ys_lbs, preds_lbs),
        "report": classification_report(ys_lbs, preds_lbs, digits=4),
    }
    print(ret['report'])
    return ret
def score(self, y_true, y_pred): """Calculate f1 score. Args: y_true (list): true sequences. y_pred (list): predicted sequences. Returns: score: f1 score. """ score = f1_score(y_true, y_pred) print(' - valid_f1: {:04.2f}'.format(score * 100)) return score
def seqeval_classification_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    precision_macro = precision_score(labels, preds, average='macro')
    recall_macro = recall_score(labels, preds, average='macro')
    f1_macro = f1_score(labels, preds, average='macro')
    precision_micro = precision_score(labels, preds, average='micro')
    recall_micro = recall_score(labels, preds, average='micro')
    f1_micro = f1_score(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1_micro': f1_micro,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_macro': f1_macro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'nb_samples': len(labels),
        'classification_report': classification_report(labels, preds, digits=4),
    }
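# Hedged usage sketch for seqeval_classification_metrics() above; `pred` is a
# stand-in for an EvalPrediction-like object whose label_ids/predictions have
# already been decoded to per-sentence tag lists.
def _demo_seqeval_classification_metrics():
    from types import SimpleNamespace
    pred = SimpleNamespace(
        label_ids=[['B-PER', 'O'], ['B-LOC', 'I-LOC']],
        predictions=[['B-PER', 'O'], ['B-LOC', 'O']],
    )
    print(seqeval_classification_metrics(pred))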
def evaluate_model(model, eval_dataset, label_list, batch_size, device):
    """Evaluates an NER model on the eval_dataset provided.

    Returns:
        f1: macro-averaged F1 score on the evaluation dataset.
        report: detailed classification report.
    """
    # run prediction for the full dataset
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size)
    model.eval()  # turn off dropout
    y_true = []
    y_pred = []
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    for input_ids, label_ids, l_mask, valid_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        label_ids = label_ids.to(device)
        valid_ids = valid_ids.to(device)
        l_mask = l_mask.to(device)
        with torch.no_grad():
            logits = model(input_ids, labels=None, labels_mask=None, valid_mask=valid_ids)
        logits = torch.argmax(logits, dim=2)
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.cpu().numpy()
        for i, cur_label in enumerate(label_ids):
            temp_1 = []
            temp_2 = []
            for j, m in enumerate(cur_label):
                if valid_ids[i][j]:  # if it's a valid label
                    temp_1.append(label_map[m])
                    temp_2.append(label_map[logits[i][j]])
            assert len(temp_1) == len(temp_2)
            y_true.append(temp_1)
            y_pred.append(temp_2)
    report = classification_report(y_true, y_pred, digits=4)
    f1 = f1_score(y_true, y_pred, average='macro')  # seqeval expects lowercase 'macro'
    return f1, report
def test_epoch_end(self, outputs):
    preds = torch.cat([x["preds"] for x in outputs]).detach().cpu().numpy()
    labels = torch.cat([x["labels"] for x in outputs]).detach().cpu().numpy()
    # remove padding
    out_label_list = [[] for _ in range(labels.shape[0])]
    preds_list = [[] for _ in range(preds.shape[0])]
    assert len(out_label_list) == len(preds_list), "Prediction and Label are not matched."
    from torch.nn import CrossEntropyLoss
    pad_token_label_id = CrossEntropyLoss().ignore_index
    label_map = {i: label for i, label in enumerate(list(self.label_vocab.keys()))}
    for i in range(labels.shape[0]):
        for j in range(labels.shape[1]):
            if labels[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[labels[i][j]])
                preds_list[i].append(label_map[preds[i][j]])
    # metrics: precision, recall, F1
    result = {
        "precision": seqeval_metrics.precision_score(out_label_list, preds_list),
        "recall": seqeval_metrics.recall_score(out_label_list, preds_list),
        "f1": seqeval_metrics.f1_score(out_label_list, preds_list),
    }
    print()
    print(seqeval_metrics.classification_report(out_label_list, preds_list, digits=4))
    # dump predicted outputs
    predicted_outputs_fn = os.path.join(self.trainer.callbacks[1].dirpath, 'predicted_outputs.txt')
    with open(predicted_outputs_fn, "w", encoding='utf-8') as f:
        for output in preds_list:
            print(output, file=f)
    print("Predicted Outputs are dumped at {}".format(predicted_outputs_fn))
    return result
def __epoch_valid(self, data_loader, prefix, writer=None, unseen_entity_set: set = None,
                  entity_span_prediction: bool = False):
    """Single epoch of validation/test."""
    # aggregate predictions and true labels
    self.model.eval()
    seq_pred, seq_true = [], []
    for encode in data_loader:
        encode = {k: v.to(self.device) for k, v in encode.items()}
        labels_tensor = encode.pop('labels')
        logit = self.model(**encode, return_dict=True)['logits']
        _true = labels_tensor.cpu().detach().int().tolist()
        _pred = torch.max(logit, 2)[1].cpu().detach().int().tolist()
        for b in range(len(_true)):
            _pred_list, _true_list = [], []
            for s in range(len(_true[b])):
                if _true[b][s] != PAD_TOKEN_LABEL_ID:
                    _true_list.append(self.id_to_label[_true[b][s]])
                    if unseen_entity_set is None:
                        _pred_list.append(self.id_to_label[_pred[b][s]])
                    else:
                        __pred = self.id_to_label[_pred[b][s]]
                        if __pred in unseen_entity_set:
                            _pred_list.append('O')
                        else:
                            _pred_list.append(__pred)
            assert len(_pred_list) == len(_true_list)
            if len(_true_list) > 0:
                if entity_span_prediction:
                    # ignore the entity type and focus on the entity position
                    _true_list = [i if i == 'O' else '-'.join([i.split('-')[0], 'entity']) for i in _true_list]
                    _pred_list = [i if i == 'O' else '-'.join([i.split('-')[0], 'entity']) for i in _pred_list]
                seq_true.append(_true_list)
                seq_pred.append(_pred_list)
    # compute metrics
    metric = {
        "f1": f1_score(seq_true, seq_pred) * 100,
        "recall": recall_score(seq_true, seq_pred) * 100,
        "precision": precision_score(seq_true, seq_pred) * 100,
    }
    try:
        summary = classification_report(seq_true, seq_pred)
        logging.info('[epoch {}] ({}) \n {}'.format(self.__epoch, prefix, summary))
    except Exception:
        logging.exception('classification_report raises error')
        summary = ''
    metric['summary'] = summary
    if writer:
        writer.add_scalar('{}/f1'.format(prefix), metric['f1'], self.__epoch)
        writer.add_scalar('{}/recall'.format(prefix), metric['recall'], self.__epoch)
        writer.add_scalar('{}/precision'.format(prefix), metric['precision'], self.__epoch)
    return metric
def on_epoch_end(self, epoch, logs=None):
    # avoid a mutable default argument; Keras normally passes logs in
    logs = logs if logs is not None else {}
    label_true = []
    label_pred = []
    for i in range(len(self.seq)):
        x_true, y_true = self.seq[i]
        lengths = self.get_lengths(y_true)
        y_pred = self.model.predict_on_batch(x_true)
        y_true = self.p.inverse_transform(y_true, lengths)
        y_pred = self.p.inverse_transform(y_pred, lengths)
        label_true.extend(y_true)
        label_pred.extend(y_pred)
    valid_score = f1_score(label_true, label_pred)
    print(' - f1-valid: {:04.2f}'.format(valid_score * 100))
    print('validation report :', classification_report(label_true, label_pred))
    label_true = []
    label_pred = []
    for i in range(len(self.train_seq)):
        x_true, y_true = self.train_seq[i]
        lengths = self.get_lengths(y_true)
        y_pred = self.model.predict_on_batch(x_true)
        y_true = self.p.inverse_transform(y_true, lengths)
        y_pred = self.p.inverse_transform(y_pred, lengths)
        label_true.extend(y_true)
        label_pred.extend(y_pred)
    train_score = f1_score(label_true, label_pred)
    print(' - f1-train: {:04.2f}'.format(train_score * 100))
    print('train report :', classification_report(label_true, label_pred))
    logs['f1'] = {
        "epoch": epoch,
        "dev_score": valid_score,
        "train_score": train_score,
    }
def test_metrics_for_inv_data(self):
    acc_pred = accuracy_score(self.y_true, self.y_pred)
    p_pred = precision_score(self.y_true, self.y_pred)
    r_pred = recall_score(self.y_true, self.y_pred)
    f1_pred = f1_score(self.y_true, self.y_pred)
    acc_pred_inv = accuracy_score(self.y_true_inv, self.y_pred_inv)
    p_pred_inv = precision_score(self.y_true_inv, self.y_pred_inv, suffix=True)
    r_pred_inv = recall_score(self.y_true_inv, self.y_pred_inv, suffix=True)
    f1_pred_inv = f1_score(self.y_true_inv, self.y_pred_inv, suffix=True)
    self.assertLess(abs(acc_pred - acc_pred_inv), 1e-4)
    self.assertLess(abs(p_pred - p_pred_inv), 1e-4)
    self.assertLess(abs(r_pred - r_pred_inv), 1e-4)
    self.assertLess(abs(f1_pred - f1_pred_inv), 1e-4)
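# Hedged sketch of what the inverted fixtures above might look like: the
# *_inv data presumably uses suffix-style tags ('PER-B' rather than 'B-PER'),
# which is why those calls pass suffix=True; both encodings should then yield
# identical metrics. Illustrative values, not the actual test fixtures:
#   self.y_true     = [['B-PER', 'I-PER', 'O']]
#   self.y_true_inv = [['PER-B', 'PER-I', 'O']]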
def calculate_seqeval_metrics(predictions, labels, tags=None, binary=False):
    if tags is not None:
        # map integer ids back to tag strings before scoring
        map2label = {v: k for k, v in tags.items()}
        for i in range(len(predictions)):
            predictions[i] = [map2label[v] for v in predictions[i]]
            labels[i] = [map2label[v] for v in labels[i]]
    accuracy = seq_metrics.accuracy_score(labels, predictions)
    precision = seq_metrics.precision_score(labels, predictions)
    recall = seq_metrics.recall_score(labels, predictions)
    f1_score = seq_metrics.f1_score(labels, predictions)
    return accuracy, precision, recall, f1_score
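# Hedged usage sketch for calculate_seqeval_metrics() above; the `tags`
# mapping (label string -> id) is illustrative, and seq_metrics is assumed to
# be seqeval.metrics. With tags given, id sequences are mapped back to strings
# before seqeval scores them.
def _demo_calculate_seqeval_metrics():
    tags = {'O': 0, 'B-PER': 1, 'I-PER': 2}
    predictions = [[1, 2, 0]]
    labels = [[1, 2, 0]]
    acc, precision, recall, f1 = calculate_seqeval_metrics(predictions, labels, tags=tags)
    print(acc, precision, recall, f1)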
def acc_and_f1(preds, labels):
    acc = accuracy_score(labels, preds)
    f1 = f1_score(y_true=labels, y_pred=preds)
    p = precision_score(y_true=labels, y_pred=preds)
    r = recall_score(y_true=labels, y_pred=preds)
    report = classification_report(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
        "precision": p,
        "recall": r,
        "report": report,
    }
def on_epoch_end(self, epoch, logs=None):
    logs = logs if logs is not None else {}
    pred_probs = self.model.predict(self.valid_features)
    y_pred = self.preprocessor.label_decode(pred_probs, self.get_lengths(pred_probs))
    r = metrics.recall_score(self.valid_labels, y_pred)
    p = metrics.precision_score(self.valid_labels, y_pred)
    f1 = metrics.f1_score(self.valid_labels, y_pred)
    logs['val_r'] = r
    logs['val_p'] = p
    logs['val_f1'] = f1
    print('Epoch {}: val_r: {}, val_p: {}, val_f1: {}'.format(epoch, r, p, f1))
    print(metrics.classification_report(self.valid_labels, y_pred))
def f1_pre_rec(labels, preds, is_ner=True):
    if is_ner:
        return {
            "precision": seqeval_metrics.precision_score(labels, preds, suffix=True),
            "recall": seqeval_metrics.recall_score(labels, preds, suffix=True),
            "f1": seqeval_metrics.f1_score(labels, preds, suffix=True),
        }
    else:
        return {
            "precision": sklearn_metrics.precision_score(labels, preds, average="macro"),
            "recall": sklearn_metrics.recall_score(labels, preds, average="macro"),
            "f1": sklearn_metrics.f1_score(labels, preds, average="macro"),
        }
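# Hedged usage sketch for f1_pre_rec() above (inputs are illustrative):
# is_ner=True expects per-sentence lists of suffix-style tags for seqeval,
# while is_ner=False expects flat label arrays for sklearn's macro averaging.
def _demo_f1_pre_rec():
    ner_metrics = f1_pre_rec([['PER-B', 'PER-I', 'O']], [['PER-B', 'PER-I', 'O']])
    clf_metrics = f1_pre_rec([0, 1, 1, 2], [0, 1, 0, 2], is_ner=False)
    print(ner_metrics, clf_metrics)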
def on_epoch_end(self, epoch, logs=None):
    # self.model is set automatically by Keras
    yt, yp = [], []
    pred = np.argmax(self.smodel.predict(self.X, batch_size=32), -1)
    lengths = [x.sum() for x in self.X[1]]
    for pseq, yseq, llen in zip(pred, self.Y, lengths):
        # true tags come from yseq, predictions from pseq
        # (the original had these two swapped)
        yt.append([self.tags[z] for z in yseq[1:llen - 1]])
        yp.append([self.tags[z] for z in pseq[1:llen - 1]])
    f1 = f1_score(yt, yp)
    self.best_f1 = max(self.best_f1, f1)
    accu = accuracy_score(yt, yp)
    print('\naccu: %.4f F1: %.4f BestF1: %.4f\n' % (accu, f1, self.best_f1))
    print(classification_report(yt, yp))
def score(self, y_true, y_pred): """Calculate f1 score. Args: y_true (list): true sequences. y_pred (list): predicted sequences. Returns: score: f1 score. """ score = f1_score(y_true, y_pred) print(' - f1: {:04.2f}'.format(score * 100)) if self.digits: print(classification_report(y_true, y_pred, digits=self.digits)) return score