def compute_prf1_single_type(fname, type_, data=None):
    """Compute and print P/R/F1 and accuracy for a single type.

    Treats presence/absence of `type_` in each example's gold/pred type
    lists (loaded from the JSON file `fname`) as a binary classification
    problem.

    Args:
        fname: path to a JSON file mapping example id -> {'gold': [...], 'pred': [...]}.
        type_: the type label to score.
        data: optional mapping from example id to a printable example; when
            supplied, every positively-predicted example is shown via
            print_example().

    Side effects: prints a header, optionally each positive example, and a
    summary line. Returns None.
    """
    print('---------- ' + type_ + ' ----------')
    with open(fname) as f:
        total = json.load(f)
    gold_binary = []
    pred_binary = []
    for k, v in total.items():
        gold_binary.append(1. if type_ in v['gold'] else 0.)
        if type_ in v['pred']:
            pred_binary.append(1.)
            # BUG FIX: guard on `data` — previously `data[k]` was called
            # unconditionally, raising TypeError whenever the default
            # data=None was used and any prediction matched.
            if data is not None:
                print_example(data[k])
        else:
            pred_binary.append(0.)
    count = len(gold_binary)
    TP_FN_counts = sum(1. for gold in gold_binary if int(gold) == 1)
    TP_FP_counts = sum(1. for pred in pred_binary if int(pred) == 1)
    TP_counts = sum(
        1. for pred, gold in zip(pred_binary, gold_binary)
        if int(pred) == 1 and int(gold) == 1)
    # Zero-denominator cases yield 0 rather than dividing by zero.
    p = TP_counts / TP_FP_counts if TP_FP_counts > 0 else 0.
    r = TP_counts / TP_FN_counts if TP_FN_counts > 0 else 0.
    f1_ = f1(p, r)
    output_str = "Type: {0}\t#: {1} TP:{2} TP+FP:{3} TP+FN:{4} P:{5:.3f} R:{6:.3f} F1:{7:.3f}".format(
        type_, count, int(TP_counts), int(TP_FP_counts), int(TP_FN_counts), p,
        r, f1_)
    # BUG FIX: an empty input file previously raised ZeroDivisionError here.
    accuracy = (sum(pred == gold for pred, gold in zip(pred_binary, gold_binary))
                / float(count)) if count > 0 else 0.
    output_str += '\t Dev accuracy: {0:.1f}%'.format(accuracy * 100)
    print(output_str)
def evaluate_data(batch_num, dev_fname, model, tensorboard, val_type_name,
                  args, elmo, bert, actual_f1=True, vocab=None):
    """Run the model over a dev/test file and report loss, accuracy and F1.

    Puts `model` in eval mode, streams batches from `dev_fname`, accumulates
    loss and gold/pred strings, logs scalars to tensorboard, prints a
    summary, restores train mode, and returns (eval_loss, macro_f1).

    NOTE(review): `actual_f1` is accepted but never read in this body.
    """
    model.eval()
    if vocab is None:
        # Fall back to the project-wide character vocabulary.
        vocab = (constant.CHAR_DICT, None)
    dev_gen = get_data_gen(dev_fname, 'test', args, vocab, args.goal,
                           elmo=elmo, bert=bert)
    gold_pred = []
    binary_out = []  # NOTE(review): never appended to or read in this function.
    eval_loss = 0.
    total_ex_count = 0
    if args.mode in ['train_labeler', 'test_labeler']:
        # Running totals for the auxiliary binary classifier head.
        cls_correct = 0.
        cls_total = 0.
        cls_tp = 0.
        cls_t_gold = 0.
        cls_t_pred = 0.
    for n, batch in enumerate(dev_gen):
        total_ex_count += len(batch['y'])
        eval_batch, annot_ids = to_torch(batch)
        if args.mode in ['train_labeler', 'test_labeler']:
            loss, output_logits, cls_logits = model(eval_batch, val_type_name)
            if cls_logits is not None:
                # Threshold raw logits at 0 to get binary predictions,
                # then tally accuracy / TP / gold-positive / pred-positive.
                cls_correct += sum([(1. if pred > 0. else 0.) == gold
                                    for pred, gold in zip(cls_logits,
                                                          batch['y_cls'])])
                cls_total += float(cls_logits.size()[0])
                cls_tp += sum([(1. if pred > 0. else 0.) == 1. and gold == 1.
                               for pred, gold in zip(cls_logits,
                                                     batch['y_cls'])])
                cls_t_gold += float(sum(batch['y_cls']))
                cls_t_pred += float(sum([1. if pred > 0. else 0.
                                         for pred in cls_logits]))
        else:
            loss, output_logits, _ = model(eval_batch, val_type_name)
        output_index = get_output_index(output_logits,
                                        threshold=args.threshold)
        gold_pred += get_gold_pred_str(output_index,
                                       eval_batch['y'].data.cpu().clone(),
                                       args.goal)
        eval_loss += loss.clone().item()
    # NOTE(review): if dev_gen yields no batches, `cls_logits` (and `loss`)
    # below are unbound -> NameError; also len(gold_pred) == 0 divides by
    # zero. Assumes the dev file is never empty — confirm with callers.
    # Exact-set-match accuracy over (gold, pred) string pairs.
    eval_accu = sum([set(y) == set(yp)
                     for y, yp in gold_pred]) * 1.0 / len(gold_pred)
    eval_str = get_eval_string(gold_pred)
    _, _, _, _, _, macro_f1 = eval_metric.macro(gold_pred)
    eval_loss_str = 'Eval loss: {0:.7f} at step {1:d}'.format(eval_loss,
                                                              batch_num)
    tensorboard.add_validation_scalar('eval_acc_' + val_type_name, eval_accu,
                                      batch_num)
    tensorboard.add_validation_scalar('eval_loss_' + val_type_name, eval_loss,
                                      batch_num)
    print('EVAL: seen ' + repr(total_ex_count) + ' examples.')
    print(val_type_name + ":" + eval_loss_str)
    print(gold_pred[:3])
    if args.mode in ['train_labeler', 'test_labeler'] and cls_logits is not None:
        # Summarize the classifier head using the accumulated totals.
        cls_accuracy = cls_correct / cls_total * 100.
        cls_precision = cls_tp / cls_t_pred
        cls_recall = cls_tp / cls_t_gold
        cls_f1 = f1(cls_precision, cls_recall)
        cls_str = ' CLS accuracy: {0:.2f}% P: {1:.3f} R: {2:.3f} F1: {3:.3f}'.format(
            cls_accuracy, cls_precision, cls_recall, cls_f1)
        print(val_type_name + ":" + eval_str + cls_str)
    else:
        print(val_type_name + ":" + eval_str)
    logging.info(val_type_name + ":" + eval_loss_str)
    logging.info(val_type_name + ":" + eval_str)
    # Restore training mode before handing control back to the train loop.
    model.train()
    dev_gen = None  # drop the generator reference (presumably to free memory).
    return eval_loss, macro_f1
def compute_acc_by_type_freq(fname, type_bucket_count_file, types_file):
    """Print P/R/F1 per type-frequency bucket.

    Loads gold/pred type lists from the JSON file `fname`, a pickled mapping
    of bucket -> [(type, ...), ...] from `type_bucket_count_file`, and the
    type inventory from `types_file`, then tallies TP / TP+FP / TP+FN per
    bucket and prints one scoring line per bucket (highest bucket first).
    Types with no bucket are lumped into an 'unseen' slot, which is tallied
    but excluded from the printed report.
    """
    with open(fname) as f:
        total = json.load(f)
    with open(type_bucket_count_file, 'rb') as f:
        type_bucket_count = pickle.load(f)
    with open(types_file, 'r') as f:
        types = [t.strip() for t in f.readlines()]  # read for side effect; unused below
    # Invert bucket -> types into type -> bucket.
    type2bucket = {t[0]: k for k, v in type_bucket_count.items() for t in v}

    def _tally(counter, type_set):
        # Bump the bucket count for each type; unknown types share 'unseen'.
        for t in type_set:
            bucket = type2bucket.get(t, 'unseen')
            counter[bucket] = counter.get(bucket, 0.) + 1.

    TP_FP_counts = {'unseen': 0.}
    TP_FN_counts = {'unseen': 0.}
    TP_counts = {'unseen': 0.}
    for annot_id, v in total.items():
        gold = v['gold']
        pred = v['pred']
        _tally(TP_counts, set(pred).intersection(set(gold)))  # true positives
        _tally(TP_FP_counts, set(pred))                       # all predictions
        _tally(TP_FN_counts, set(gold))                       # all gold labels
    # Buckets are named like "100-500"; sort by the lower bound, descending.
    ordered_keys = sorted((k for k in TP_counts if k != 'unseen'),
                          key=lambda x: int(x.split('-')[0]),
                          reverse=True)  # + ['unseen']
    for k in ordered_keys:
        precision = TP_counts[k] / TP_FP_counts[k]
        recall = TP_counts[k] / TP_FN_counts[k]
        f1_score = f1(precision, recall)
        perf = "{0}\tCORRECT:{1}\tP:{2:.2f}\tR:{3:.2f}\tF1:{4:.2}".format(
            k, int(TP_counts[k]), precision * 100., recall * 100.,
            f1_score * 100.)
        print(perf)
def get_eval_string_binary(binary_out, y):
    """Score binary predictions against binary gold labels.

    Args:
        binary_out: sequence of predicted 0/1 values.
        y: sequence of gold 0/1 values, same length as `binary_out`.

    Returns:
        (output_str, accuracy): a formatted summary line with counts,
        precision, recall, F1 and accuracy, plus the accuracy itself.
    """
    assert len(binary_out) == len(y)
    count = len(binary_out)
    TP_FN_counts = sum(1. for gold in y if int(gold) == 1)
    TP_FP_counts = sum(1. for pred in binary_out if int(pred) == 1)
    TP_counts = sum(
        1. for pred, gold in zip(binary_out, y)
        if int(pred) == 1 and int(gold) == 1)
    # Zero-denominator cases yield 0 rather than dividing by zero.
    p = TP_counts / TP_FP_counts if TP_FP_counts > 0 else 0.
    r = TP_counts / TP_FN_counts if TP_FN_counts > 0 else 0.
    # FIX: renamed from `f1` — the local previously shadowed the
    # module-level f1() helper used elsewhere in this file.
    f1_score = eval_metric.f1(p, r)
    output_str = "Eval: {0} TP:{1} TP+FP:{2} TP+FN:{3} P:{4:.3f} R:{5:.3f} F1:{6:.3f}".format(
        count, int(TP_counts), int(TP_FP_counts), int(TP_FN_counts), p, r,
        f1_score)
    # FIX: empty inputs previously raised ZeroDivisionError here even though
    # the P/R divisions above were guarded.
    accuracy = (sum(pred == gold for pred, gold in zip(binary_out, y))
                / float(count)) if count > 0 else 0.
    output_str += '\t Dev accuracy: {0:.1f}%'.format(accuracy * 100)
    return output_str, accuracy
def _train_labeler(args):
    """Train a Labeler or Filter denoising model.

    Builds data generators, model, and optimizer from `args`, then runs an
    endless training loop that periodically logs loss/accuracy, evaluates on
    dev data, and checkpoints to constant.EXP_ROOT. Returns (saving a final
    checkpoint) when a training generator is exhausted.
    """
    if args.data_setup == 'joint':
        # Joint setup supplies its own train/val generators plus embedders.
        train_gen_list, val_gen_list, crowd_dev_gen, elmo, bert, vocab = get_joint_datasets(args)
    else:
        train_fname = args.train_data
        dev_fname = args.dev_data
        print(train_fname, dev_fname)
        data_gens, elmo = get_datasets([(train_fname, 'train', args.goal),
                                        (dev_fname, 'dev', args.goal)], args)
        train_gen_list = [(args.goal, data_gens[0])]
        val_gen_list = [(args.goal, data_gens[1])]
    # Tensorboard writers: one stream for train, one for validation.
    train_log = SummaryWriter(os.path.join(constant.EXP_ROOT, args.model_id,
                                           "log", "train"))
    validation_log = SummaryWriter(os.path.join(constant.EXP_ROOT,
                                                args.model_id, "log",
                                                "validation"))
    tensorboard = TensorboardWriter(train_log, validation_log)
    if args.model_type == 'labeler':
        print('==> Labeler')
        model = denoising_models.Labeler(args,
                                         constant.ANSWER_NUM_DICT[args.goal])
    elif args.model_type == 'filter':
        print('==> Filter')
        model = denoising_models.Filter(args,
                                        constant.ANSWER_NUM_DICT[args.goal])
    else:
        print('Invalid model type: -model_type ' + args.model_type)
        raise NotImplementedError
    model.cuda()
    total_loss = 0
    batch_num = 0
    best_macro_f1 = 0.
    start_time = time.time()  # reset each log period (for ms/batch)
    init_time = time.time()   # fixed; for total elapsed minutes
    if args.bert:
        if args.bert_param_path:
            # Optionally warm-start BERT weights from a saved state dict.
            print('==> Loading BERT from ' + args.bert_param_path)
            model.bert.load_state_dict(torch.load(args.bert_param_path,
                                                  map_location='cpu'))
        # NOTE(review): membership test is `n not in no_decay`, i.e. an exact
        # full-name match — parameters named e.g. "layer.bias" would NOT
        # match 'bias'. Presumably a substring check was intended; confirm.
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if n not in no_decay],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if n in no_decay],
             'weight_decay_rate': 0.0}
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.bert_learning_rate,
                             warmup=args.bert_warmup_proportion,
                             t_total=-1)  # TODO:
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
        #optimizer = optim.SGD(model.parameters(), lr=1., momentum=0.)
    if args.load:
        # Resume model/optimizer state from a prior checkpoint.
        load_model(args.reload_model_name, constant.EXP_ROOT, args.model_id,
                   model, optimizer)
    for idx, m in enumerate(model.modules()):
        logging.info(str(idx) + '->' + str(m))
    while True:
        batch_num += 1  # single batch composed of all train signal passed by.
        for (type_name, data_gen) in train_gen_list:
            try:
                batch = next(data_gen)
                batch, _ = to_torch(batch)
            except StopIteration:
                # Generator exhausted: save a final checkpoint and stop.
                logging.info(type_name + " finished at " + str(batch_num))
                print('Done!')
                torch.save({'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict()},
                           '{0:s}/{1:s}.pt'.format(constant.EXP_ROOT,
                                                   args.model_id))
                return
            # Standard optimization step.
            optimizer.zero_grad()
            loss, output_logits, cls_logits = model(batch, type_name)
            loss.backward()
            total_loss += loss.item()
            optimizer.step()
            if batch_num % args.log_period == 0 and batch_num > 0:
                gc.collect()
                cur_loss = float(1.0 * loss.clone().item())
                elapsed = time.time() - start_time
                train_loss_str = ('|loss {0:3f} | at {1:d}step | @ {2:.2f} ms/batch'.format(
                    cur_loss, batch_num, elapsed * 1000 / args.log_period))
                start_time = time.time()
                print(train_loss_str)
                logging.info(train_loss_str)
                tensorboard.add_train_scalar('train_loss_' + type_name,
                                             cur_loss, batch_num)
            if batch_num % args.eval_period == 0 and batch_num > 0:
                # Report train accuracy on the current batch only.
                output_index = get_output_index(output_logits,
                                                threshold=args.threshold)
                gold_pred_train = get_gold_pred_str(
                    output_index, batch['y'].data.cpu().clone(), args.goal)
                print(gold_pred_train[:10])
                accuracy = sum([set(y) == set(yp)
                                for y, yp in gold_pred_train]) * 1.0 / len(gold_pred_train)
                train_acc_str = '{1:s} Train accuracy: {0:.1f}%'.format(
                    accuracy * 100, type_name)
                if cls_logits is not None:
                    # Auxiliary binary head: threshold logits at 0.
                    cls_accuracy = sum(
                        [(1. if pred > 0. else 0.) == gold
                         for pred, gold in zip(
                             cls_logits,
                             batch['y_cls'].data.cpu().numpy())]) / float(cls_logits.size()[0])
                    cls_tp = sum(
                        [(1. if pred > 0. else 0.) == 1. and gold == 1.
                         for pred, gold in zip(
                             cls_logits,
                             batch['y_cls'].data.cpu().numpy())])
                    cls_precision = cls_tp / float(sum([1. if pred > 0. else 0.
                                                        for pred in cls_logits]))
                    cls_recall = cls_tp / float(sum(batch['y_cls'].data.cpu().numpy()))
                    cls_f1 = f1(cls_precision, cls_recall)
                    train_cls_acc_str = '{1:s} Train cls accuracy: {0:.2f}% P: {2:.3f} R: {3:.3f} F1: {4:.3f}'.format(
                        cls_accuracy * 100, type_name, cls_precision,
                        cls_recall, cls_f1)
                print(train_acc_str)
                if cls_logits is not None:
                    print(train_cls_acc_str)
                logging.info(train_acc_str)
                tensorboard.add_train_scalar('train_acc_' + type_name,
                                             accuracy, batch_num)
                if args.goal != 'onto':
                    # Quick single-batch validation on the matching val stream.
                    for (val_type_name, val_data_gen) in val_gen_list:
                        if val_type_name == type_name:
                            eval_batch, _ = to_torch(next(val_data_gen))
                            evaluate_batch(batch_num, eval_batch, model,
                                           tensorboard, val_type_name, args,
                                           args.goal)
        if batch_num % args.eval_period == 0 and batch_num > 0 and args.data_setup == 'joint':
            # Evaluate Loss on the Turk Dev dataset.
            print('---- eval at step {0:d} ---'.format(batch_num))
            crowd_eval_loss, macro_f1 = evaluate_data(batch_num,
                                                      'crowd/dev_tree.json',
                                                      model, tensorboard,
                                                      "open", args, elmo,
                                                      bert, vocab=vocab)
            if best_macro_f1 < macro_f1:
                # New best dev macro-F1: keep a "_best" checkpoint.
                best_macro_f1 = macro_f1
                save_fname = '{0:s}/{1:s}_best.pt'.format(constant.EXP_ROOT,
                                                          args.model_id)
                torch.save({'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict()}, save_fname)
                print('Total {0:.2f} minutes have passed, saving at {1:s} '.format(
                    (time.time() - init_time) / 60, save_fname))
        if batch_num % args.eval_period == 0 and batch_num > 0 and args.goal == 'onto':
            # Evaluate Loss on the Turk Dev dataset.
            print('---- OntoNotes: eval at step {0:d} ---'.format(batch_num))
            # NOTE(review): evaluate_data requires a positional `bert`
            # argument after `elmo`; this call omits it and would raise
            # TypeError if this branch runs — confirm the onto path.
            crowd_eval_loss, macro_f1 = evaluate_data(batch_num,
                                                      args.dev_data, model,
                                                      tensorboard, args.goal,
                                                      args, elmo)
        if batch_num % args.save_period == 0 and batch_num > 0:
            # Periodic step-numbered checkpoint.
            save_fname = '{0:s}/{1:s}_{2:d}.pt'.format(constant.EXP_ROOT,
                                                       args.model_id,
                                                       batch_num)
            torch.save({'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict()}, save_fname)
            print('Total {0:.2f} minutes have passed, saving at {1:s} '.format(
                (time.time() - init_time) / 60, save_fname))
    # Training finished!
    # NOTE(review): unreachable — the `while True` loop only exits via the
    # `return` in the StopIteration handler above.
    torch.save({'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()},
               '{0:s}/{1:s}.pt'.format(constant.EXP_ROOT, args.model_id))