def evaluate_model(evalparams):
    torch.manual_seed(evalparams.seed)
    random.seed(1234)
    if evalparams.cpu:
        evalparams.cuda = False
    elif evalparams.cuda:
        torch.cuda.manual_seed(evalparams.seed)

    # load opt
    print(evalparams.model_dir, evalparams.model)
    # model_file = evalparams.model_dir + "/" + evalparams.model
    model_file = 'best_model.pt'
    print("Loading model from {}".format(model_file))
    opt = torch_utils.load_config(model_file)
    model = RelationModel(opt)
    model.load(model_file)

    # load vocab
    vocab_file = evalparams.model_dir + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    assert opt['vocab_size'] == vocab.size, "Vocab size must match that in the saved model."

    # load data
    data_file = opt['data_dir'] + '/{}.json'.format(evalparams.dataset)
    print("Loading data from {} with batch size {}...".format(data_file, opt['batch_size']))
    batch = DataLoader(data_file, opt['batch_size'], opt, vocab, evaluation=True)

    helper.print_config(opt)
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])

    predictions = []
    all_probs = []
    for i, b in enumerate(batch):
        preds, probs, _ = model.predict(b)
        predictions += preds
        all_probs += probs
    predictions = [id2label[p] for p in predictions]
    p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

    # save probability scores
    if len(evalparams.out) > 0:
        helper.ensure_dir(os.path.dirname(evalparams.out))
        with open(evalparams.out, 'wb') as outfile:
            pickle.dump(all_probs, outfile)
        print("Prediction scores saved to {}.".format(evalparams.out))

    print("Evaluation ended.")
    return (batch.gold(), predictions, model)
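# A minimal invocation sketch (not part of the original script; the paths and
# values below are assumptions): evaluate_model() only reads plain attributes,
# so an argparse.Namespace is enough to drive it from another module.
from argparse import Namespace

eval_args = Namespace(seed=1234, cpu=False, cuda=True,
                      model_dir='saved_models/00', model='best_model.pt',
                      dataset='test', out='')
gold_labels, predicted_labels, loaded_model = evaluate_model(eval_args)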
def get_scores(data_file, opt, vocab, model):
    print("Loading data from {} with batch size {}...".format(data_file, opt["batch_size"]))
    batch = DataLoader(data_file, opt["batch_size"], opt, vocab, evaluation=True)
    # map ids back to relation labels (previously assumed to exist as a module-level global)
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    predictions = []
    all_probs = []
    for i, b in enumerate(batch):
        preds, probs, attn_weights, _ = model.predict(b)
        predictions += preds
        all_probs += probs
    predictions = [id2label[p] for p in predictions]
    # print("predictions")
    # for a, b in zip(batch.gold(), predictions):
    #     print(f"{a:<28} {b:<28}")
    p, r, f1 = scorer.score(batch.gold(), predictions, verbose=False)
    return p, r, f1
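# Hypothetical usage of get_scores() (the split file names and the surrounding
# opt/vocab/model objects are assumed to come from the loading code above):
# score the same trained model on several splits without re-running the full
# evaluation script.
for split_name in ('dev', 'test'):
    split_file = opt['data_dir'] + '/{}.json'.format(split_name)
    split_p, split_r, split_f1 = get_scores(split_file, opt, vocab, model)
    print("{}: P = {:.2f}, R = {:.2f}, F1 = {:.2f}".format(split_name, split_p, split_r, split_f1))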
for i, batch in enumerate(loaded):
    all_preds, ids = trainer.plural_predict(batch, args.plurality)
    for predictions, preds in zip(all_predictions, all_preds):
        predictions += preds
    all_ids += ids

all_prediction_labels = []
for predictions in all_predictions:
    prediction_labels = [id2label[p] for p in predictions]
    all_prediction_labels.append(prediction_labels)

p, r, f1 = scorer.plural_score(loaded.gold(), all_prediction_labels, verbose=True)
print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format(args.dataset, p, r, f1))

if args.trace_file is not None:
    print(f'Creating trace file "{args.trace_file}"')
    with open(args.trace_file, 'w', encoding='utf-8', newline='') as trace_file:
        csv_writer = csv.writer(trace_file)
        title = ['id', 'gold']
        title += ['prediction{0}'.format(i + 1) for i in range(len(all_prediction_labels))]
        csv_writer.writerow(title)
trainer.load(model_file)
batch = DataLoader([data_file], opt['batch_size'], opt, vocab,
                   evaluation=True, corefresolve=True)
batch_iter = tqdm(batch)
all_probs = []
samples = []
for i, b in enumerate(batch_iter):
    preds, probs, _, sample = trainer.predict(b)
    predictions += preds
    all_probs += probs
    # effsum += lab_eff
    # lab_nums += lab_num
    samples = samples + sample
key += batch.gold()
# with open('samples.json', 'w') as f:
#     json.dump(samples, f, indent=4)
predictions = [id2label[p] for p in predictions]
p, r, f1 = scorer.score(batch, predictions, verbose=True)
print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format(args.dataset, p, r, f1))
print("Evaluation ended.")
helper.print_config(opt)

if opt['scheme'] == 'iob':
    label2id = constant.TYPE_TO_ID_IOB
elif opt['scheme'] == 'iobes':
    label2id = constant.TYPE_TO_ID_IOBES
else:
    raise Exception("Tagging scheme not found: " + opt['scheme'])
id2label = dict([(v, k) for k, v in label2id.items()])

predictions = []
for i, b in enumerate(tqdm(batch)):
    preds, _ = trainer.predict(b)
    predictions += preds
predictions = [[id2label[p] for p in ps] for ps in predictions]
p, r, f1 = scorer.score_by_chunk(batch.gold(), predictions, scheme=opt['scheme'])
print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format(
    args.dataset, p * 100, r * 100, f1 * 100))

if len(args.out) > 0:
    gold = batch.gold()
    words = batch.words()
    assert len(gold) == len(words) == len(predictions), "Dataset size mismatch."
    out = args.model_dir + '/' + args.out
    with open(out, 'w') as outfile:
        for ws, gs, ps in zip(words, gold, predictions):
            assert len(ws) == len(gs) == len(ps), "Example length mismatch."
            for w, g, p in zip(ws, gs, ps):
final_predictions, inst_predictions, aux_predictions = [], [], []
all_final_probs, all_inst_probs, all_aux_probs = [], [], []
for i, b in enumerate(batch):
    final_preds, inst_preds, aux_preds, final_probs, inst_probs, aux_probs = student_model.predict_all(b)
    final_predictions += final_preds
    inst_predictions += inst_preds
    aux_predictions += aux_preds
    all_final_probs += final_probs
    all_inst_probs += inst_probs
    all_aux_probs += aux_probs

final_predictions = [id2label[p] for p in final_predictions]
inst_predictions = [id2label[p] for p in inst_predictions]
aux_predictions = [id2label[p] for p in aux_predictions]

print('\n >> Final Prediction:')
_, _, _ = scorer.score(batch.gold(), final_predictions, verbose=True)
print('\n >> Instance Prediction:')
_, _, _ = scorer.score(batch.gold(), inst_predictions, verbose=True)
print('\n >> Auxiliary Prediction:')
_, _, _ = scorer.score(batch.gold(), aux_predictions, verbose=True)

# save probability scores
# if len(args.out) > 0:
#     outfile = 'saved_models/' + args.model_id + '/' + args.out
#     with open(outfile, 'w') as fw:
#         for f_prob, i_prob, a_prob in zip(all_final_probs, all_inst_probs, all_aux_probs):
#             fw.write(json.dumps([round(p, 4) for p in f_prob]))
#             fw.write('\r\n')
#             fw.write(json.dumps([round(p, 4) for p in i_prob]))
#             fw.write('\r\n')
#             fw.write(json.dumps([round(p, 4) for p in a_prob]))
                                    opt['num_epoch'], loss, duration, current_lr))

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for i, batch in enumerate(dev_batch):
        preds, _, loss = trainer.predict(batch)
        predictions += preds
        dev_loss += loss
    # predictions = [id2label[p] for p in predictions]
    train_loss = train_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
    dev_p, dev_r, dev_f1 = scorer.p_r_f1_binary(dev_batch.gold(), predictions)
    print("Precision (micro): {:.3%}".format(dev_p))
    print("   Recall (micro): {:.3%}".format(dev_r))
    print("       F1 (micro): {:.3%}".format(dev_f1))
    print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(
        epoch, train_loss, dev_loss, dev_f1))
    dev_score = dev_r
    file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(
        epoch, train_loss, dev_loss, dev_score, max([dev_score] + dev_score_history)))

    # save
    model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
    trainer.save(model_file, epoch)
    if epoch == 1 or dev_score > max(dev_score_history):
        copyfile(model_file, model_save_dir + '/best_model.pt')
vocab = Vocab(vocab_file, load=True)
assert opt['vocab_size'] == vocab.size, "Vocab size must match that in the saved model."

# load data
data_file = opt['data_dir'] + '/test.json'
print("Loading data from {} with batch size {}...".format(data_file, opt['batch_size']))
data = read_file(data_file, vocab, opt, False)
batch = DataLoader(data, opt['batch_size'], opt, evaluation=True)

helper.print_config(opt)
label2id = constant.LABEL_TO_ID
id2label = dict([(v, k) for k, v in label2id.items()])

predictions = []
all_probs = []
cross_list = []
batch_iter = tqdm(batch)
for i, b in enumerate(batch_iter):
    cross_list += b[8]
    preds, probs, _ = trainer.predict(b)
    predictions += preds
    all_probs += probs
predictions = [id2label[p] for p in predictions]
print(predictions)
test_score, test_single_score = nary_scorer.score(batch.gold(), predictions, cross_list)
print("Test set evaluate result: cross {:.3f}, single {:.3f}".format(test_score, test_single_score))
print("Evaluation ended.")
            print(format_str.format(datetime.now(), global_step, max_steps, epoch,
                                    opt['num_epoch'], loss, duration, current_lr))

    # eval on train
    print("Evaluating on train set...")
    train_predictions = []
    train_eval_loss = 0
    for i, batch in enumerate(train_batch):
        preds, _, loss = trainer.predict(batch)
        train_predictions += preds
        train_eval_loss += loss
    train_predictions = [id2label[p] for p in train_predictions]
    train_eval_loss = train_eval_loss / train_batch.num_examples * opt['batch_size']
    train_p, train_r, train_f1 = scorer.score(train_batch.gold(), train_predictions)
    print("epoch {}: train_loss = {:.6f}, train_eval_loss = {:.6f}, train_f1 = {:.4f}"
          .format(epoch, train_loss, train_eval_loss, train_f1))
    train_score = train_f1
    # file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(epoch, train_loss, train_eval_loss, train_f1))

    # eval on dev
    print("Evaluating on dev set...")
    dev_predictions = []
    dev_loss = 0
    for i, batch in enumerate(dev_batch):
        preds, _, loss = trainer.predict(batch)
        dev_predictions += preds
        dev_loss += loss
batch = DataLoader(data_file, opt['batch_size'], opt, vocab, evaluation=True)

helper.print_config(opt)
label2id = constant.LABEL_TO_ID
id2label = dict([(v, k) for k, v in label2id.items()])

predictions = []
all_probs = []
batch_iter = tqdm(batch)
for i, b in enumerate(batch_iter):
    preds, probs, _ = trainer.predict(b)
    predictions += preds
    all_probs += probs
predictions = [id2label[p] for p in predictions]
p, r, f1 = scorer.score(batch.gold(), predictions)

fjson = open(data_file, 'r')
origin_data = json.load(fjson)
fjson.close()

with open("eval_output.txt", 'a') as f:
    f.write("True Label\tPrediction\tSubject\tObject\tSentence")
    for i in range(len(predictions)):
        if batch.gold()[i] != predictions[i]:
            ss = origin_data[i]['subj_start']
            se = origin_data[i]['subj_end']
            os = origin_data[i]['obj_start']  # 'os' here is the object start index, not the os module
            oe = origin_data[i]['obj_end']
            token = origin_data[i]['token']
            subj = " ".join(token[ss:se + 1])
def main():
    # set top-level random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    if args.cpu:
        args.cuda = False
    elif args.cuda:
        # force random seed for reproducibility
        # also apply same seed to numpy in every file
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # make opt
    opt = vars(args)
    opt['num_class'] = len(constant.LABEL_TO_ID)

    # load vocab
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)

    # in some previous experiments we saw that lower vocab size can improve performance
    # but it was in a completely different project although on the same data
    # here it seems it's much harder to get this to work
    # uncomment the following line if this is solved:
    # new_vocab_size = 30000

    opt['vocab_size'] = vocab.size
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    # load data
    print("Loading data from {} with batch size {}...".format(opt['data_dir'], opt['batch_size']))
    train_batch = DataLoader(opt['data_dir'] + '/train.json', opt['batch_size'], opt, vocab, evaluation=False)
    dev_batch = DataLoader(opt['data_dir'] + '/dev.json', opt['batch_size'], opt, vocab, evaluation=True)

    model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_save_dir = opt['save_dir'] + '/' + model_id
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
    vocab.save(model_save_dir + '/vocab.pkl')
    file_logger = helper.FileLogger(
        model_save_dir + '/' + opt['log'],
        header="# epoch\ttrain_loss\tdev_loss\tdev_p\tdev_r\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt['lr']

    global_step = 0
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    # setup the scheduler for lr decay
    # this doesn't seem to work well compared to what we already have
    # scheduler = ReduceLROnPlateau(model.optimizer, mode='min', factor=opt['lr_decay'], patience=1)

    # start training
    for epoch in range(1, opt['num_epoch'] + 1):
        # TODO: if lr warmup is used, the lr console output is not updated
        print("Current params: " + " heads-" + str(opt["n_head"]) +
              " enc_layers-" + str(opt["num_layers_encoder"]),
              " drop-" + str(opt["dropout"]) + " scaled_drop-" + str(opt["scaled_dropout"]) +
              " lr-" + str(opt["lr"]),
              " lr_decay-" + str(opt["lr_decay"]) + " max_grad_norm-" + str(opt["max_grad_norm"]))
        print(" weight_no_rel-" + str(opt["weight_no_rel"]) + " weight_rest-" + str(opt["weight_rest"]) +
              " attn-" + str(opt["attn"]) + " attn_dim-" + str(opt["attn_dim"]),
              " obj_sub_pos-" + str(opt["obj_sub_pos"]) + " new_residual-" + str(opt["new_residual"]))
        print(" use_batch_norm-" + str(opt["use_batch_norm"]) +
              " relative_positions-" + str(opt["relative_positions"]),
              " decay_epoch-" + str(opt["decay_epoch"]) + " use_lemmas-" + str(opt["use_lemmas"]),
              " hidden_self-" + str(opt["hidden_self"]))

        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = model.update(batch)
            train_loss += float(loss)
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now(), global_step, max_steps, epoch,
                                        opt['num_epoch'], loss, duration, current_lr))
            # do garbage collection,
            # as per https://discuss.pytorch.org/t/best-practices-for-maximum-gpu-utilization/13863/6
            del loss

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += float(loss)
            del loss

        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        train_loss = train_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
        print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(
            epoch, train_loss, dev_loss, dev_f1))
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_p, dev_r, dev_f1))

        # save
        model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            print("new best model saved.")
        if epoch % opt['save_epoch'] != 0:
            os.remove(model_file)

        # reduce learning rate if it stagnates by a certain decay rate and within given epoch patience
        # this for some reason works worse than the implementation we have afterwards
        # scheduler.step(dev_loss)

        if opt["optim"] != "noopt_adam" and opt["optim"] != "noopt_nadam":
            # do warm_up for sgd only instead of adam
            do_warmup_trick = False

            if do_warmup_trick:
                # print("do_warmup_trick")
                # 1 and 5 first worked kind of
                # 10 and 15
                current_lr = 10 * (360 ** (-0.5) * min(epoch ** (-0.5), epoch * 15 ** (-1.5)))
                # print("current_lr", current_lr)
                model.update_lr(current_lr)
            else:
                # decay schedule
                # 15 is best!
                # simulate patience of x epochs
                if len(dev_f1_history) > opt['decay_epoch'] and dev_f1 <= dev_f1_history[-1]:
                    current_lr *= opt['lr_decay']
                    model.update_lr(current_lr)

        # else, update the learning rate in torch_utils.py
        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
data_file = args.data_dir + '/{}.json'.format(args.dataset)
print("Loading data from {} with batch size {}...".format(data_file, opt['batch_size']))
batch = DataLoader(data_file, opt['batch_size'], opt, vocab, evaluation=True)

helper.print_config(opt)
label2id = constant.LABEL_TO_ID
id2label = dict([(v, k) for k, v in label2id.items()])

predictions = []
all_probs = []
incorrect_indices = []
batch_iter = tqdm(batch)
for i, b in enumerate(batch_iter):
    preds, probs, _ = trainer.predict(b)
    predictions += preds
    all_probs += probs
predictions = [id2label[p] for p in predictions]

is_incorrect = np.array(predictions) != np.array(batch.gold())
incorrect_data = np.array(batch.raw_data)[is_incorrect]
save_file = os.path.join(args.data_dir, 'test_incorrect.json')
with open(save_file, 'w') as handle:
    json.dump(incorrect_data.tolist(), handle)

p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)
print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format(args.dataset, p, r, f1))
print("Evaluation ended.")
            #             loss,
            #             duration,
            #             current_lr,
            #         )
            #     )

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for i, batch in tqdm(enumerate(dev_batch)):
        preds, _, _, loss = model.predict(batch)
        predictions += preds
        dev_loss += loss
    predictions = [id2label[p] for p in predictions]
    dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

    f = open("label.txt", "w+")
    f.write(str(dev_batch.gold()))
    f.close()

    train_loss = (train_loss / train_batch.num_examples * opt["batch_size"])  # avg loss per batch
    train_nll_loss = train_nll_loss / train_batch.num_examples * opt["batch_size"]
    train_rat_loss = train_rat_loss / train_batch.num_examples * opt["batch_size"]
    train_nonrat_loss = train_nonrat_loss / train_batch.num_examples * opt["batch_size"]
            'text': text,
            'label': label,
            'prediction': prediction,
            'posterior': posterior,
            'attention': attention,
            'id': 'sample{}_{}'.format(i, j)
        }
        data_visual.append(jason_dict)
    predictions += preds
    # all_probs += probs

# print(data_visual[0:4])
predictions = [id2label[p] for p in predictions]
p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

# save probability scores
if len(args.out) > 0:
    helper.ensure_dir(os.path.dirname(args.out))
    # with open(args.out + 'test_temp.pkl', 'wb') as outfile:
    #     pickle.dump(all_probs, outfile)
    # print("Prediction scores saved to {}.".format(args.out))
    with open(args.out + 'predictions.pkl', 'wb') as outfile:
        pickle.dump(predictions, outfile)
    print("Prediction saved to {}.".format(args.out))
    with open(args.out + 'gold.pkl', 'wb') as outfile:
        pickle.dump(batch.gold(), outfile)
    print("True label saved to {}.".format(args.out))
        train_loss += loss
        if global_step % opt['log_step'] == 0:
            duration = time.time() - start_time
            print(format_str.format(datetime.now(), global_step, max_steps, epoch,
                                    opt['num_epoch'], loss, duration, current_lr))

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for i, batch in enumerate(dev_batch):
        preds, _, loss = model.predict(batch)
        predictions += preds
        dev_loss += loss
    predictions = [id2label[p] for p in predictions]
    dev_p, dev_r, dev_f1, mistake = scorer.score(dev_batch.gold(), predictions)

    train_loss = train_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
    print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(
        epoch, train_loss, dev_loss, dev_f1))
    file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(epoch, train_loss, dev_loss, dev_f1))

    # save
    model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
    model.save(model_file, epoch)
    if epoch == 1 or dev_f1 > max(dev_f1_history):
        bad_count = 0
        copyfile(model_file, model_save_dir + '/best_model.pt')
        print("new best model saved.")
    elif dev_f1 < max(dev_f1_history):
batch = DataLoader(data_file, opt['batch_size'], opt, vocab, data_type='test')

helper.print_config(opt)
label2id = constant.LABEL_TO_ID[type_pair_id]
id2label = dict([(v, k) for k, v in label2id.items()])

predictions = []
all_probs = []
batch_iter = tqdm(batch)
for i, b in enumerate(batch_iter):
    preds, probs, _ = trainer.predict(b)
    predictions += preds
    all_probs += probs
predictions = [id2label[p] for p in predictions]
gold = batch.gold()
p, r, f1 = scorer.score(gold, predictions, verbose=True)
print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format(args.dataset, p, r, f1))

ids = batch.ids()
out_file = args.model_dir + '/gold.txt'
out_f = open(out_file, 'w')
for i, g in zip(ids, gold):
    out_f.write('%d %s\n' % (i, g))
out_file = args.model_dir + '/predictions.txt'
out_f = open(out_file, 'w')
for i, p in zip(ids, predictions):
    out_f.write('%d %s\n' % (i, p))
predictions = []
dev_loss = 0
for i, batch in enumerate(dev_batch):
    preds, _, loss, _ = trainer.predict(batch)
    predictions += preds
    dev_loss += loss
predictions = [[id2label[l + 1]] for p in predictions for l in p]

train_loss = train_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
train_sent_loss = train_sent_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
train_dep_path_loss = train_dep_path_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions, method='macro')
print("epoch {}: train_loss = {:.6f}, train_sent_loss = {:.6f}, train_dep_path_loss = {:.6f}, "
      "dev_loss = {:.6f}, dev_f1 = {:.4f}".format(
          epoch, train_loss, train_sent_loss, train_dep_path_loss, dev_loss, dev_f1))
dev_score = dev_f1
file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(
    epoch, train_loss, train_sent_loss, train_dep_path_loss, dev_loss,
    dev_score, max([dev_score] + dev_score_history)))

# save
model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
trainer.save(model_file, epoch)
if epoch == 1 or dev_score > max(dev_score_history):
    copyfile(model_file, model_save_dir + '/best_model.pt')
data_file = opt['data_dir'] + '/test.json'
print("Loading data from {} with batch size {}...".format(data_file, opt['batch_size']))
batch = DataLoader(data_file, opt['batch_size'], opt, vocab, evaluation=True)

helper.print_config(opt)
id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])

predictions = []
all_probs = []
for i, b in enumerate(batch):
    preds, probs, _ = model.predict(b)
    predictions += preds
    all_probs += probs
predictions = [id2label[p] for p in predictions]
metrics, other_data = scorer.score(batch.gold(), predictions, verbose=True)
p = metrics['precision']
r = metrics['recall']
f1 = metrics['f1']
wrong_indices = other_data['wrong_indices']
correct_indices = other_data['correct_indices']
wrong_predictions = other_data['wrong_predictions']

raw_data = np.array(batch.raw_data)
wrong_data = raw_data[wrong_indices]
correct_data = raw_data[correct_indices]
wrong_ids = [d['id'] for d in wrong_data]
correct_ids = [d['id'] for d in correct_data]
            duration = time.time() - start_time
            print(format_str.format(datetime.now(), global_step, max_steps, epoch,
                                    opt['num_epoch'], loss, duration, current_lr))

    # eval on dev
    print("Evaluating on dev set...")
    dev_predictions = []
    dev_loss = 0
    for i, batch in enumerate(dev_batch):
        preds, _, loss = trainer.predict(batch)
        dev_predictions += preds
        dev_loss += loss
    dev_predictions = [id2label[p] for p in dev_predictions]
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']

    dev_metrics, _ = scorer.score(dev_batch.gold(), dev_predictions)
    dev_p, dev_r, dev_f1 = dev_metrics['precision'], dev_metrics['recall'], dev_metrics['f1']
    print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(
        epoch, train_loss, dev_loss, dev_f1))
    dev_score = dev_f1
    file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(
        epoch, train_loss, dev_loss, dev_score, max([dev_score] + dev_score_history)))
    current_dev_metrics = {'f1': dev_f1, 'precision': dev_p, 'recall': dev_r}

    # eval on test
    print("Evaluating on test set...")
    test_predictions = []
    for i, batch in enumerate(test_batch):
        preds, _, loss = trainer.predict(batch)
id2label = dict([(v, k) for k, v in label2id.items()])
id2label[0] = 'no_relation'

predictions = []
all_probs = []
all_ids = []
batch_iter = tqdm(batch)
for i, b in enumerate(batch_iter):
    preds, probs, _, ids = trainer.predict(b)
    predictions += preds
    all_probs += probs
    all_ids += ids
predictions = [id2label[p] for p in predictions]
p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)
print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format(args.dataset, p, r, f1))

if args.trace_file_for_misses is not None:
    print(f'Preparing miss information and writing it to "{args.trace_file_for_misses}"')
    with open(args.trace_file_for_misses, 'w', encoding='utf-8', newline='') as trace_file_for_misses:
        csv_writer = csv.writer(trace_file_for_misses)
        csv_writer.writerow(['id', 'gold', 'predicted'])
        for gold, prediction, id in zip(batch.gold(), predictions, all_ids):
            if gold != prediction:
                csv_writer.writerow([id, gold, prediction])
            duration = time.time() - start_time
            print(format_str.format(datetime.now(), global_step, max_steps, epoch,
                                    opt['num_epoch'], loss, duration, current_lr))

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for _, batch in enumerate(dev_batch):
        preds, _, loss = base_model.predict(batch)
        predictions += preds
        dev_loss += loss
    predictions = [id2label[p] for p in predictions]
    dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)
    max_dev_f1, max_dev_id = (dev_f1, epoch) if max_dev_f1 < dev_f1 else (max_dev_f1, max_dev_id)

    train_loss = train_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']

    # eval on test
    print("Evaluating on test set...")
    predictions = []
    test_loss = 0
    for _, batch in enumerate(test_batch):
        preds, _, loss = base_model.predict(batch)
        predictions += preds
all_probs = []
sent_predictions = []
batch_iter = tqdm(batch)
for i, b in enumerate(batch_iter):
    preds, probs, _, sent_preds = trainer.predict(b)
    predictions += preds
    all_probs += probs
    sent_predictions += sent_preds
lens = [len(p) for p in predictions]
predictions = [[id2label[l + 1]] for p in predictions for l in p]
sent_predictions = [sent_id2label[p] for p in sent_predictions]
# print(len(predictions))
# print(len(batch.gold()))
p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True,
                        verbose_output=(args.per_class == 1))

print('scores from sklearn: ')
macro_f1 = f1_score(batch.gold(), predictions, average='macro')
micro_f1 = f1_score(batch.gold(), predictions, average='micro')
macro_p = precision_score(batch.gold(), predictions, average='macro')
micro_p = precision_score(batch.gold(), predictions, average='micro')
macro_r = recall_score(batch.gold(), predictions, average='macro')
micro_r = recall_score(batch.gold(), predictions, average='micro')
print('micro scores: ')
print('micro P: ', micro_p)
print('micro R: ', micro_r)
print('micro F1: ', micro_f1)
               newline='')
writer = csv.writer(csvfile)
writer.writerow(["sentence", "idx", "predict", "gold"])
for i in tqdm(range(len(error))):
    for j in range(len(error[i])):
        inside = 0
        for k in range(len(error[i][j])):
            inside = 1
            sentence = " ".join([vocab.id2word[g] for g in error[i][j][k]["token"]])
            sub = []
            obj = []
            for it, g in enumerate(error[i][j][k]["sub_pos"]):
                if g == 0 and it < len(error[i][j][k]["token"]):
                    sub.append(vocab.id2word[error[i][j][k]["token"][it]])
            for it, g in enumerate(error[i][j][k]["obj_pos"]):
                if g == 0 and it < len(error[i][j][k]["token"]):
                    obj.append(vocab.id2word[error[i][j][k]["token"][it]])
            predict = id2label[error[i][j][k]["preds"]]
            gold = id2label[error[i][j][k]["label"]]
            writer.writerow([sentence, idx, predict, gold])
        if inside == 1:
            writer.writerow("")

predictions = [id2label[p] for p in predictions]
p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)
print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format(args.dataset, p, r, f1))
print("Evaluation ended.")
helper.print_config(opt)
label2id = constant.LABEL_TO_ID
id2label = dict([(v, k) for k, v in label2id.items()])

predictions = []
all_probs = []
all_ids = []
for i, b in enumerate(loaded):
    preds, probs, _, ids = trainer.predict_with_confidence(b)
    predictions += preds
    all_probs += probs
    all_ids += ids
predictions = [id2label[p] for p in predictions]
p, r, f1 = scorer.score(loaded.gold(), predictions, verbose=True)
print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format(args.dataset, p, r, f1))

if args.trace_file is not None:
    print(f'Creating trace file "{args.trace_file}"')
    with open(args.trace_file, 'w', encoding='utf-8', newline='') as trace_file:
        csv_writer = csv.writer(trace_file)
        csv_writer.writerow(['id', 'gold', 'predicted', 'probability'])
        for id, gold, prediction, probability in zip(all_ids, loaded.gold(), predictions, all_probs):
            csv_writer.writerow([id, gold, prediction, probability])
        train_loss += loss
        if global_step % opt['log_step'] == 0:
            duration = time.time() - start_time
            print(format_str.format(datetime.now(), global_step, max_steps, epoch,
                                    opt['num_epoch'], loss, duration, current_lr))

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for i, batch in enumerate(dev_batch):
        preds, _, loss = model.predict(batch)
        predictions += preds
        dev_loss += loss
    predictions = [id2label[p] for p in predictions]
    current_dev_metrics, _ = scorer.score(dev_batch.gold(), predictions)
    dev_f1 = current_dev_metrics['f1']

    train_loss = train_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
    print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(
        epoch, train_loss, dev_loss, dev_f1))
    file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(epoch, train_loss, dev_loss, dev_f1))

    print("Evaluating on test set...")
    predictions = []
    test_loss = 0
    test_preds = []
    for i, batch in enumerate(test_batch):
                                    opt['num_epoch'], loss, duration, current_lr))

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for i, batch in enumerate(dev_batch):
        preds, _, loss = trainer.predict(batch)
        predictions += preds
        dev_loss += loss
    predictions = [id2label[p] for p in predictions]

    train_loss = train_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
    dev_score, _ = nary_scorer.score(dev_batch.gold(), predictions)
    print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_score = {:.4f}".format(
        epoch, train_loss, dev_loss, dev_score))

    # save
    model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
    trainer.save(model_file, epoch)
    if epoch == 1 or dev_score > max(dev_score_history):
        copyfile(model_file, model_save_dir + '/best_model.pt')
        print("new best model saved.")
    if epoch % opt['save_epoch'] != 0:
        os.remove(model_file)

    # lr schedule
    if len(dev_score_history) > opt['decay_epoch'] and dev_score <= dev_score_history[-1] and \
            opt['optim'] in ['sgd', 'adagrad', 'adadelta']:
# load data
data_file = opt['data_dir'] + '/{}.json'.format(args.dataset)
print("Loading data from {} with batch size {}...".format(data_file, opt['batch_size']))
batch = DataLoader(data_file, opt['batch_size'], opt, vocab, evaluation=True)

helper.print_config(opt)

predictions = []
all_probs = []
batch_iter = tqdm(batch)
for i, b in enumerate(batch_iter):
    preds, probs, _ = trainer.predict(b)
    predictions += preds
    all_probs += probs

p, r, f1 = scorer.p_r_f1_binary(batch.gold(), predictions)
print("Precision (micro): {:.3%}".format(p))
print("   Recall (micro): {:.3%}".format(r))
print("       F1 (micro): {:.3%}".format(f1))
print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format(args.dataset, p, r, f1))

id2label = {0: 'no_relation', 1: ''}
predictions = [id2label[p] for p in predictions]
out_file = args.model_dir + '/predictions.txt'
out_f = open(out_file, 'w')
for i, p in enumerate(predictions):
    out_f.write('%d %s\n' % (i, p))
print("Evaluation ended.")
def train_unbiased_model(args, biased_batch_probs):
    # make opt
    opt = vars(args)
    opt["num_class"] = len(constant.LABEL_TO_ID)

    # load vocab
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    opt['vocab_size'] = vocab.size
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    # load data
    print("Loading data from {} with batch size {}...".format(opt["data_dir"], opt["batch_size"]))
    train_batch = DataLoader(
        opt["data_dir"] + "/" + args.data_name,
        opt["batch_size"],
        opt,
        vocab,
        evaluation=False,
    )
    dev_batch = DataLoader(opt["data_dir"] + "/dev.json", opt["batch_size"], opt, vocab, evaluation=True)

    model_id = opt["id"] if len(opt["id"]) > 1 else "0" + opt["id"]
    model_save_dir = opt["save_dir"] + "/" + model_id
    opt["model_save_dir"] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + "/config.json", verbose=True)
    vocab.save(model_save_dir + "/vocab.pkl")
    file_logger = helper.FileLogger(
        model_save_dir + "/" + opt["log"],
        header="# epoch\ttrain_loss\tdev_loss\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt["lr"]

    global_step = 0
    global_start_time = time.time()
    format_str = "{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}"
    max_steps = len(train_batch) * opt["num_epoch"]

    # start training
    for epoch in range(1, opt["num_epoch"] + 1):
        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = model.update(batch, torch.tensor(biased_batch_probs[i]).cuda())
            train_loss += loss
            if global_step % opt["log_step"] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now(), global_step, max_steps, epoch,
                                        opt["num_epoch"], loss, duration, current_lr))

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += loss
        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        f = open("label.txt", "w+")
        f.write(str(dev_batch.gold()))
        f.close()

        train_loss = (train_loss / train_batch.num_examples * opt["batch_size"])  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt["batch_size"]
        print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(
            epoch, train_loss, dev_loss, dev_f1))
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(epoch, train_loss, dev_loss, dev_f1))

        # save
        model_file = model_save_dir + "/checkpoint_epoch_{}.pt".format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + "/best_model.pt")
            print("new best model saved.")
        if epoch % opt["save_epoch"] != 0:
            os.remove(model_file)

        # lr schedule
        if (len(dev_f1_history) > 10 and dev_f1 <= dev_f1_history[-1]
                and opt["optim"] in ["sgd", "adagrad"]):
            current_lr *= opt["lr_decay"]
            model.update_lr(current_lr)
        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
                                    opt['num_epoch'], loss, duration, current_lr))

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for i, batch in enumerate(dev_batch):
        preds, _, loss = trainer.predict(batch)
        predictions += preds
        dev_loss += loss
    predictions = [id2label[p] for p in predictions]

    train_loss = train_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
    dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)
    print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(
        epoch, train_loss, dev_loss, dev_f1))
    dev_score = dev_f1
    file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(
        epoch, train_loss, dev_loss, dev_score, max([dev_score] + dev_score_history)))

    # save
    model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
    trainer.save(model_file, epoch)
    if epoch == 1 or dev_score > max(dev_score_history):
        copyfile(model_file, model_save_dir + '/best_model.pt')
        print("new best model saved.")
        file_logger.log("new best model saved at epoch {}: {:.2f}\t{:.2f}\t{:.2f}".format(
            epoch, dev_p * 100, dev_r * 100, dev_score * 100))
            continue
        train_loss += loss
        if global_step % opt['log_step'] == 0:
            duration = time.time() - start_time
            print(format_str.format(datetime.now(), global_step, max_steps, epoch,
                                    opt['num_epoch'], loss, duration, current_lr))

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    final_gold = []
    for i, batch in enumerate(dev_batch):
        try:
            preds, _, loss = trainer.predict(batch)
            final_gold.append(dev_batch.gold()[i])
        except:
            print("lost dev epoch %d" % i)
            continue
        predictions += preds
        dev_loss += loss
    predictions = [id2label[p] for p in predictions]

    train_loss = train_loss / train_batch.num_examples * opt['batch_size']  # avg loss per batch
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
    dev_p, dev_r, dev_f1 = scorer.score(final_gold, predictions)
    print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(
        epoch, train_loss, dev_loss, dev_f1))
    dev_score = dev_f1
    file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(