def compute_prf1(fname):
    """Print macro-averaged scores for a JSON prediction file.

    The file at ``fname`` is loaded with ``json.load`` and is expected to be
    an object whose values each provide ``'gold'`` and ``'pred'`` entries.
    The (gold, pred) pairs are handed to ``macro`` and one tab-separated
    summary line is printed: example count, average prediction count, and
    precision / recall / F1 scaled by 100.
    """
    with open(fname) as pred_file:
        annotations = json.load(pred_file)

    gold_pred_pairs = [(entry['gold'], entry['pred'])
                       for entry in annotations.values()]

    # The second value returned by macro() is not used in the report.
    n_examples, _, avg_n_pred, precision, recall, f_score = macro(gold_pred_pairs)
    summary = "{0}\t{1:.2f}\tP:{2:.1f}\tR:{3:.1f}\tF1:{4:.1f}".format(
        n_examples, avg_n_pred, precision * 100, recall * 100, f_score * 100)
    print(summary)
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0. when the denominator is zero."""
    return numerator / denominator if denominator else 0.


def evaluate_data(batch_num, dev_fname, model, tensorboard, val_type_name, args, elmo, bert, actual_f1=True, vocab=None):
    """Run the model over a dev file and report loss, accuracy and F1.

    The model is switched to eval mode for the duration of the call and put
    back into train mode before returning.

    Args:
        batch_num: Training step used as the x-value for tensorboard scalars
            and in the loss message.
        dev_fname: Dev data file passed to ``get_data_gen``.
        model: Callable as ``model(eval_batch, val_type_name)`` returning a
            3-tuple ``(loss, output_logits, cls_logits)``.
        tensorboard: Object exposing ``add_validation_scalar(name, value, step)``.
        val_type_name: Label used in scalar names and printed/logged messages;
            also forwarded to the model.
        args: Options object; ``mode``, ``goal`` and ``threshold`` are read here.
        elmo: Forwarded unchanged to ``get_data_gen``.
        bert: Forwarded unchanged to ``get_data_gen``.
        actual_f1: Unused in this function; kept for caller compatibility.
        vocab: Vocabulary passed to ``get_data_gen``; defaults to
            ``(constant.CHAR_DICT, None)``.

    Returns:
        ``(eval_loss, macro_f1)`` where ``eval_loss`` is the sum (not the
        mean) of the per-batch losses and ``macro_f1`` is the last value
        returned by ``eval_metric.macro``.

    Raises:
        ZeroDivisionError: if no (gold, prediction) pairs were collected.
    """
    model.eval()
    if vocab is None:
        vocab = (constant.CHAR_DICT, None)
    dev_gen = get_data_gen(dev_fname, 'test', args, vocab, args.goal, elmo=elmo, bert=bert)
    labeler_mode = args.mode in ['train_labeler', 'test_labeler']
    gold_pred = []
    eval_loss = 0.
    total_ex_count = 0
    # Bound up front so the post-loop check never hits an unbound name.
    cls_logits = None
    # Running counts for the binary classifier head (labeler modes only).
    cls_correct = 0.
    cls_total = 0.
    cls_tp = 0.
    cls_t_gold = 0.
    cls_t_pred = 0.
    for batch in dev_gen:
        total_ex_count += len(batch['y'])
        eval_batch, _ = to_torch(batch)
        if labeler_mode:
            loss, output_logits, cls_logits = model(eval_batch, val_type_name)
            if cls_logits is not None:
                # A logit above zero counts as a positive prediction.
                cls_preds = [1. if pred > 0. else 0. for pred in cls_logits]
                cls_gold = batch['y_cls']
                cls_correct += sum([pred == gold for pred, gold in zip(cls_preds, cls_gold)])
                cls_total += float(cls_logits.size()[0])
                cls_tp += sum([pred == 1. and gold == 1. for pred, gold in zip(cls_preds, cls_gold)])
                cls_t_gold += float(sum(cls_gold))
                cls_t_pred += float(sum(cls_preds))
        else:
            loss, output_logits, _ = model(eval_batch, val_type_name)
        output_index = get_output_index(output_logits, threshold=args.threshold)
        gold_pred += get_gold_pred_str(output_index, eval_batch['y'].data.cpu().clone(), args.goal)
        eval_loss += loss.clone().item()
    # Exact-match accuracy: gold and predicted label sets must be equal.
    eval_accu = sum([set(y) == set(yp) for y, yp in gold_pred]) * 1.0 / len(gold_pred)
    eval_str = get_eval_string(gold_pred)
    _, _, _, _, _, macro_f1 = eval_metric.macro(gold_pred)
    eval_loss_str = 'Eval loss: {0:.7f} at step {1:d}'.format(eval_loss, batch_num)
    tensorboard.add_validation_scalar('eval_acc_' + val_type_name, eval_accu, batch_num)
    tensorboard.add_validation_scalar('eval_loss_' + val_type_name, eval_loss, batch_num)
    print('EVAL: seen ' + repr(total_ex_count) + ' examples.')
    print(val_type_name + ":" + eval_loss_str)
    print(gold_pred[:3])
    # Classifier metrics are reported only when the last batch produced
    # classifier logits, matching the original condition.
    if labeler_mode and cls_logits is not None:
        # Guard the ratios: with no positive predictions or no gold positives
        # the denominators are zero and plain division would abort evaluation.
        cls_accuracy = _safe_ratio(cls_correct, cls_total) * 100.
        cls_precision = _safe_ratio(cls_tp, cls_t_pred)
        cls_recall = _safe_ratio(cls_tp, cls_t_gold)
        cls_f1 = f1(cls_precision, cls_recall)
        cls_str = ' CLS accuracy: {0:.2f}% P: {1:.3f} R: {2:.3f} F1: {3:.3f}'.format(cls_accuracy, cls_precision, cls_recall, cls_f1)
        print(val_type_name + ":" + eval_str + cls_str)
    else:
        print(val_type_name + ":" + eval_str)
    logging.info(val_type_name + ":" + eval_loss_str)
    logging.info(val_type_name + ":" + eval_str)
    model.train()
    # Drop the reference to the data generator before returning.
    dev_gen = None
    return eval_loss, macro_f1
def get_eval_string(true_prediction):
    """Build the one-line evaluation summary for (gold, prediction) pairs.

    The line contains the values returned by ``eval_metric.micro``, the
    precision / recall / F1 returned by ``eval_metric.macro``, and the
    exact-match accuracy (fraction of pairs whose gold and predicted label
    sets are equal) as a percentage.
    """
    micro_stats = eval_metric.micro(true_prediction)
    n_examples, n_pred, avg_n_pred, mi_p, mi_r, mi_f1 = micro_stats
    _, _, _, ma_p, ma_r, ma_f1 = eval_metric.macro(true_prediction)

    summary = "Eval: {0} {1} {2:.3f} P:{3:.3f} R:{4:.3f} F1:{5:.3f} Ma_P:{6:.3f} Ma_R:{7:.3f} Ma_F1:{8:.3f}".format(
        n_examples, n_pred, avg_n_pred, mi_p, mi_r, mi_f1, ma_p, ma_r, ma_f1)

    # Count the pairs whose label sets match exactly.
    n_exact = sum(1 for gold, pred in true_prediction if set(gold) == set(pred))
    accuracy = n_exact * 1.0 / len(true_prediction)

    return summary + '\t Dev accuracy: {0:.1f}%'.format(accuracy * 100)
def compute_length_prf1(fname, data_fname):
    """Print macro scores bucketed by mention length.

    Predictions are read from the JSON file ``fname`` (an object keyed by
    annotation id, each value providing ``'gold'`` and ``'pred'``). The
    matching examples come from ``load_augmented_input(data_fname)`` and are
    looked up by the same id. Each example is bucketed by the number of
    whitespace-separated tokens in its ``'mention_span'``; for the 20
    smallest lengths one tab-separated line of ``macro`` scores is printed.
    """
    with open(fname) as pred_file:
        annotations = json.load(pred_file)
    examples = load_augmented_input(data_fname)

    pairs_by_length = {}
    for annot_id, entry in annotations.items():
        mention = examples[annot_id]['mention_span']
        n_tokens = len(mention.strip().split())
        pairs_by_length.setdefault(n_tokens, []).append(
            (entry['gold'], entry['pred']))

    # Report only the 20 shortest mention lengths, in ascending order.
    for n_tokens in sorted(pairs_by_length)[:20]:
        n_examples, _, avg_n_pred, precision, recall, f_score = macro(
            pairs_by_length[n_tokens])
        line = "{0}\t{1:.2f}\tP:{2:.1f}\tR:{3:.1f}\tF1:{4:.1f}\tLEN:{5}".format(
            n_examples, avg_n_pred, precision * 100, recall * 100,
            f_score * 100, n_tokens)
        print(line)
def metric_dicts(true_prediction):
    """Return evaluation metrics as a dict together with a summary string.

    The precision / recall / F1 values come from ``eval_metric.micro`` and
    ``eval_metric.macro``; ``"accu"`` is the fraction of (gold, prediction)
    pairs whose label sets are equal.

    Returns:
        ``(result, output_str)`` where ``result`` maps metric names to values
        and ``output_str`` is the formatted one-line summary.
    """
    micro_stats = eval_metric.micro(true_prediction)
    n_examples, n_pred, avg_n_pred, mi_p, mi_r, mi_f1 = micro_stats
    _, _, _, ma_p, ma_r, ma_f1 = eval_metric.macro(true_prediction)

    # Count the pairs whose label sets match exactly.
    n_exact = sum(1 for gold, pred in true_prediction if set(gold) == set(pred))
    accuracy = n_exact * 1.0 / len(true_prediction)

    summary = "Eval: {0} {1} {2:.3f} P:{3:.3f} R:{4:.3f} F1:{5:.3f} Ma_P:{6:.3f} Ma_R:{7:.3f} Ma_F1:{8:.3f}".format(
        n_examples, n_pred, avg_n_pred, mi_p, mi_r, mi_f1, ma_p, ma_r, ma_f1)
    summary = summary + '\t Dev accuracy: {0:.1f}%'.format(accuracy * 100)

    metrics = {
        "precision": mi_p,
        "recall": mi_r,
        'f1': mi_f1,
        "ma_precision": ma_p,
        "ma_recall": ma_r,
        "ma_f1": ma_f1,
        "accu": accuracy,
    }
    return metrics, summary
def evaluate_data(batch_num, dev_fname, model, args, elmo, device, char_vocab, dev_type='original'):
    """Evaluate the model on a dev file and report loss and F1.

    The model is put into eval mode, run over every batch produced by
    ``get_data_gen`` (wrapped in ``tqdm``), and switched back to train mode
    before returning. The loss message and the summary from
    ``get_eval_string`` are printed and also sent to ``logging.info``.

    Args:
        batch_num: Step number shown in the loss message.
        dev_fname: Dev data file passed to ``get_data_gen``.
        model: Callable as ``model(eval_batch)`` returning a 3-tuple whose
            first two items are the loss and the output logits.
        args: Options object; ``threshold`` and ``goal`` are read here.
        elmo: Forwarded unchanged to ``get_data_gen``.
        device: Forwarded to ``to_torch`` with each batch.
        char_vocab: Vocabulary passed to ``get_data_gen``.
        dev_type: Tag included in the printed messages.

    Returns:
        ``(eval_loss, macro_f1)``; ``eval_loss`` is the sum of the per-batch
        losses and ``macro_f1`` the last value from ``eval_metric.macro``.
    """
    model.eval()
    batches = get_data_gen(dev_fname, 'test', args, char_vocab, elmo=elmo)

    total_ex_count = 0
    eval_loss = 0.
    gold_pred = []
    for batch in tqdm(batches):
        total_ex_count += len(batch['y'])
        eval_batch, _ = to_torch(batch, device)
        batch_loss, logits, _ = model(eval_batch)
        predicted = get_output_index(logits, threshold=args.threshold)
        gold_labels = eval_batch['y'].data.cpu().clone()
        gold_pred.extend(get_gold_pred_str(predicted, gold_labels, args.goal))
        eval_loss += batch_loss.clone().item()

    eval_str = get_eval_string(gold_pred)
    _, _, _, _, _, macro_f1 = eval_metric.macro(gold_pred)
    eval_loss_str = 'Eval loss: {0:.7f} at step {1:d}'.format(eval_loss, batch_num)

    tag = '==> ' + dev_type
    print(tag + ' EVAL: seen ' + repr(total_ex_count) + ' examples.')
    print(eval_loss_str)
    print(gold_pred[:3])
    print(tag + ' : ' + eval_str)

    for message in (eval_loss_str, eval_str):
        logging.info(message)

    model.train()
    return eval_loss, macro_f1
def compute_granul_prf1(fname, type_fname):
    """Print macro scores separately for three label granularities.

    Predictions are read from the JSON file ``fname`` (an object whose values
    provide ``'gold'`` and ``'pred'``). ``type_fname`` is read as one type
    name per line. Each gold and predicted label collection is split by
    ``stratify`` into three groups (called coarse, fine and finer here), and
    one tab-separated line of ``macro`` scores is printed per group, in that
    order.
    """
    with open(fname) as pred_file:
        annotations = json.load(pred_file)
    with open(type_fname) as type_file:
        types = [line.strip() for line in type_file]

    coarse_pairs, fine_pairs, finer_pairs = [], [], []
    for entry in annotations.values():
        gold_coarse, gold_fine, gold_finer = stratify(entry['gold'], types)
        pred_coarse, pred_fine, pred_finer = stratify(entry['pred'], types)
        coarse_pairs.append((gold_coarse, pred_coarse))
        fine_pairs.append((gold_fine, pred_fine))
        finer_pairs.append((gold_finer, pred_finer))

    for pairs in (coarse_pairs, fine_pairs, finer_pairs):
        n_examples, _, avg_n_pred, precision, recall, f_score = macro(pairs)
        line = "{0}\t{1:.2f}\tP:{2:.1f}\tR:{3:.1f}\tF1:{4:.1f}".format(
            n_examples, avg_n_pred, precision * 100, recall * 100,
            f_score * 100)
        print(line)