def main():
    """Build the token vocabulary and GloVe embedding matrix for the jsonl splits."""
    args = parse_args()

    # input files
    wv_file = "{}/{}".format(args.glove_dir, args.wv_file)
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load tokenized data
    print("loading files...")
    train_tokens = load_tokens(args.data_dir + '/train.jsonl')
    dev_tokens = load_tokens(args.data_dir + '/dev.jsonl')
    if args.lower:
        train_tokens = [t.lower() for t in train_tokens]
        dev_tokens = [t.lower() for t in dev_tokens]

    # load the pretrained GloVe vocabulary
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    # vocab is built from train + dev tokens
    print("building vocab...")
    token_vocab = build_vocab(train_tokens + dev_tokens, glove_vocab, args.min_freq)

    # report out-of-vocabulary rates per split
    print("calculating oov...")
    for dname, tokens in (('train', train_tokens), ('dev', dev_tokens)):
        total, oov = count_oov(tokens, token_vocab)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    if args.random:
        print("using random initialization...")
        embedding = random_embedding(token_vocab, wv_dim)
    else:
        embedding = vocab.build_embedding(wv_file, token_vocab, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(token_vocab, outfile)
    np.save(emb_file, embedding)
    print("all done.")
def evaluate_model(evalparams):
    """Evaluate a saved RelationModel checkpoint.

    Loads the model config, weights and vocab from ``evalparams.model_dir``,
    runs prediction over the ``evalparams.dataset`` split, prints P/R/F1 and
    optionally dumps prediction probabilities to ``evalparams.out``.

    Returns:
        Tuple of (gold labels, string predictions, loaded model).
    """
    torch.manual_seed(evalparams.seed)
    random.seed(1234)
    if evalparams.cpu:
        evalparams.cuda = False
    # BUG FIX: was `evalparams.cud`, which raises AttributeError, and seeded
    # with the undefined `args.seed` instead of evalparams' own seed.
    elif evalparams.cuda:
        torch.cuda.manual_seed(evalparams.seed)

    # load opt
    print(evalparams.model_dir, evalparams.model)
    # BUG FIX: use the configured checkpoint path instead of a hard-coded
    # 'best_model.pt' in the current working directory.
    model_file = evalparams.model_dir + "/" + evalparams.model
    print("Loading model from {}".format(model_file))
    opt = torch_utils.load_config(model_file)
    model = RelationModel(opt)
    model.load(model_file)

    # load vocab; it must match the one the checkpoint was trained with
    vocab_file = evalparams.model_dir + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    assert opt['vocab_size'] == vocab.size, "Vocab size must match that in the saved model."

    # load data
    data_file = opt['data_dir'] + '/{}.json'.format(evalparams.dataset)
    print("Loading data from {} with batch size {}...".format(data_file, opt['batch_size']))
    batch = DataLoader(data_file, opt['batch_size'], opt, vocab, evaluation=True)

    helper.print_config(opt)
    # invert LABEL_TO_ID so numeric predictions map back to label strings
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])

    predictions = []
    all_probs = []
    for i, b in enumerate(batch):
        preds, probs, _ = model.predict(b)
        predictions += preds
        all_probs += probs
    predictions = [id2label[p] for p in predictions]
    p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

    # save probability scores
    if len(evalparams.out) > 0:
        helper.ensure_dir(os.path.dirname(evalparams.out))
        with open(evalparams.out, 'wb') as outfile:
            pickle.dump(all_probs, outfile)
        print("Prediction scores saved to {}.".format(evalparams.out))

    print("Evaluation ended.")
    return (batch.gold(), predictions, model)
def prepare_vocab(data_dir, vocab_dir, spacy_model, glove_dir="dataset/glove",
                  wv_file="glove.840B.300d.txt", wv_dim=300, min_freq=0, lower=True):
    """Build the vocab and GloVe embedding matrix from train/dev/test json files.

    Args:
        data_dir: directory with train.json / dev.json / test.json.
        vocab_dir: output directory for vocab.pkl and embedding.npy.
        spacy_model: tokenizer model forwarded to load_tokens().
        glove_dir, wv_file, wv_dim: location and dimension of GloVe vectors.
        min_freq: minimum token frequency kept in the vocab.
        lower: lowercase all tokens when True.
    """
    # input files
    train_file = data_dir + '/train.json'
    dev_file = data_dir + '/dev.json'
    test_file = data_dir + '/test.json'
    wv_file = glove_dir + '/' + wv_file

    # output files
    helper.ensure_dir(vocab_dir)
    vocab_file = vocab_dir + '/vocab.pkl'
    emb_file = vocab_dir + '/embedding.npy'

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file, spacy_model)
    # pass spacy_model for dev too, consistent with train/test (was omitted)
    dev_tokens = load_tokens(dev_file, spacy_model)
    test_tokens = load_tokens(test_file, spacy_model)
    if lower:
        # BUG FIX: the original unpacked three lowered lists into only two
        # names (`train_tokens, test_tokens = ...`), which raises ValueError
        # and would have assigned the lowered dev tokens to test_tokens.
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in
                                                 (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    # vocab is built from training tokens only
    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
def viz_att(words, attn, name, label):
    """Render an attention matrix as a heatmap and save it to svgs/<label>/<name>.svg."""
    sns.set()
    fig, axis = plt.subplots(figsize=(20, 20))
    frame = pd.DataFrame(attn, index=words, columns=words)
    sns.heatmap(frame, xticklabels=words, yticklabels=words, cmap="YlGnBu", ax=axis)
    axis.set_title(name)
    # rotate tick labels so long tokens remain readable
    plt.setp(axis.get_yticklabels(), rotation=360, horizontalalignment='right')
    plt.setp(axis.get_xticklabels(), rotation=90, horizontalalignment='right')
    out_dir = "svgs/" + str(label)
    ensure_dir(out_dir)
    fig.savefig(out_dir + "/" + name + '.svg', format='svg', bbox_inches='tight')
def prepare_voabulary(vocab_params):
    """Build the vocabulary and embedding matrix described by ``vocab_params``.

    Reads the train/dev/test json files, builds a vocab from the training
    tokens (restricted by GloVe coverage and ``min_freq``), reports OOV rates,
    and dumps vocab.pkl plus embedding.npy into ``vocab_params.vocab_dir``.

    Returns:
        The built vocab object.
    """
    # input files
    train_file = vocab_params.data_dir + '/train.json'
    dev_file = vocab_params.data_dir + '/dev.json'
    test_file = vocab_params.data_dir + '/test.json'
    wv_file = vocab_params.glove_dir + '/' + vocab_params.glove_text_file
    wv_dim = vocab_params.emb_dim

    # output files
    helper.ensure_dir(vocab_params.vocab_dir)
    # NOTE(review): assumes vocab_file/embed_file carry a leading path
    # separator — confirm against the config defaults
    vocab_file = vocab_params.vocab_dir + vocab_params.vocab_file
    emb_file = vocab_params.vocab_dir + vocab_params.embed_file

    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    test_tokens = load_tokens(test_file)
    if vocab_params.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in
                                                 (train_tokens, dev_tokens, test_tokens)]

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, vocab_params.min_freq)

    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
    # BUG FIX: return the vocab that was just built; the original
    # `return(vocab)` returned the imported `vocab` module instead.
    return v
def main():
    """Prepare and return a UccaEmbedding built from the train/dev/test splits."""
    args = parse_args()

    # the three dataset splits the embedding index is built from
    data_files = [args.data_dir + '/' + split + '.json'
                  for split in ('train', 'dev', 'test')]

    embedding_file = args.ucca_embedding_dir + '/' + args.ucca_embedding_file
    index_file = args.ucca_embedding_dir + '/' + args.ucca_embedding_index_file

    helper.ensure_dir(args.ucca_embedding_dir)
    UccaEmbedding.prepare(args.ucca_embedding_dim, data_files, index_file,
                          embedding_file, args.ucca_embedding_source)
    return UccaEmbedding(args.ucca_embedding_dim, index_file, embedding_file)
def split_test_data(coarse_name):
    """Extract the test rows whose coarse-level prediction equals ``coarse_name``
    and save them (plus an index mapping and the gold labels) for the
    second-level classifier.
    """
    import ast  # local import: only needed for parsing the prediction file

    data = read_tsv('dataset/test.tsv')

    # save dir
    res_dir = 'result/multi/' + coarse_name
    helper.ensure_dir(res_dir)

    # select test data according to coarse predictions
    coarse_id = constant.COARSE_TO_ID[coarse_name]
    # SECURITY/IDIOM FIX: the file holds a plain Python literal, so parse it
    # with ast.literal_eval (cannot execute code) instead of eval(); also use
    # a context manager instead of leaking the file handle.
    with open(constant.BEST_PRED_COARSE_FILE) as f:
        coarse_prediction = ast.literal_eval(f.read())

    tmp_list, index_rec, labels = [], {}, []
    for i, p in enumerate(coarse_prediction):
        if p == coarse_id:
            tmp_list.append(data[i])
            # index_rec maps original test.tsv row -> row in the filtered set
            index_rec[i] = len(tmp_list) - 1
            labels.append(data[i]['label'])

    # save input data of test
    print("\nsaving data...")
    helper.ensure_dir('dataset/multi/' + coarse_name + '/eval/')
    input_path = os.path.join('dataset/multi/' + coarse_name + '/eval/', 'test.tsv')
    # one 'w' open replaces the original truncate-then-append open pair
    with open(input_path, 'w') as f:
        for i, p in enumerate(tmp_list):
            f.write('\t'.join([str(p['label']), p['text_a']]) + '\n')
    print("test input file saved to file {}".format(input_path))

    # save index relation
    index_rela_path = os.path.join(res_dir, 'index_relation')
    with open(index_rela_path, 'w') as f:
        f.write(str(index_rec))
    print(
        "index relation between multi test set and test.tsv saved to file {}".
        format(index_rela_path))

    # save corresponding labels
    labels_save_path = os.path.join(res_dir, 'labels')
    with open(labels_save_path, 'w') as f:
        f.write(str(labels))
    print("corresponding labels saved to file {}".format(labels_save_path) + "\n")
def main():
    """Build the vocab and embedding matrix; the dev split reuses the test tokens."""
    args = parse_args()

    # paths
    wv_file = '{}/{}'.format(args.glove_dir, args.wv_file)
    wv_dim = args.wv_dim
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load tokens; there is no separate dev set, so dev aliases test
    print("loading files...")
    train_tokens = load_tokens(args.data_dir + '/train.json')
    test_tokens = load_tokens(args.data_dir + '/test.json')
    dev_tokens = test_tokens

    # load glove
    print("loading glove...")
    glove_vocab = load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab)

    print("calculating oov...")
    for dname, tokens in [('train', train_tokens), ('dev', dev_tokens), ('test', test_tokens)]:
        total, oov = count_oov(tokens, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    embedding = build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
# Evaluation tail: run `model` over `batch`, score the predictions, and
# optionally dump probability scores. Relies on `opt`, `batch`, `model`,
# `scorer` and `args` being defined earlier in the surrounding scope.
helper.print_config(opt)
# invert LABEL_TO_ID so numeric predictions map back to label strings
id2label = dict([(v,k) for k,v in constant.LABEL_TO_ID.items()])
predictions = []
all_probs = []
for i, b in enumerate(batch):
    preds, probs, _ = model.predict(b)
    predictions += preds
    all_probs += probs
predictions = [id2label[p] for p in predictions]
p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

# save probability scores
if len(args.out) > 0:
    helper.ensure_dir(os.path.dirname(args.out))
    with open(args.out, 'wb') as outfile:
        pickle.dump(all_probs, outfile)
    print("Prediction scores saved to {}.".format(args.out))
print("Evaluation ended.")
NER.build_vocab(dataset_vocab) PST.build_vocab(dataset_vocab) opt["num_class"] = len(RELATION.vocab) opt["vocab_pad_id"] = TOKEN.vocab.stoi["<pad>"] opt["pos_pad_id"] = POS.vocab.stoi["<pad>"] opt["ner_pad_id"] = NER.vocab.stoi["<pad>"] opt["pe_pad_id"] = PST.vocab.stoi["<pad>"] opt["vocab_size"] = len(TOKEN.vocab) opt["pos_size"] = len(POS.vocab) opt["ner_size"] = len(NER.vocab) opt["pe_size"] = len(PST.vocab) opt["rel_stoi"] = RELATION.vocab.stoi opt["rel_itos"] = RELATION.vocab.itos helper.ensure_dir(opt["p_dir"], verbose=True) helper.ensure_dir(opt["s_dir"], verbose=True) TOKEN.vocab.load_vectors("glove.840B.300d", cache="./dataset/.vectors_cache") if TOKEN.vocab.vectors is not None: opt["emb_dim"] = TOKEN.vocab.vectors.size(1) def load_best_model(model_dir, model_type="predictor"): model_file = model_dir + "/best_model.pt" print("Loading model from {}".format(model_file)) model_opt = torch_utils.load_config(model_file) if model_type == "predictor": predictor = Predictor(model_opt) model = Trainer(model_opt, predictor, model_type=model_type) else:
def main():
    """Build vocab/embeddings and dump schema + character vocab files for an
    SPO (subject-predicate-object) extraction dataset.
    """
    args = parse_args()
    # input files
    train_file = args.data_dir + '/train.json'
    dev_file = args.data_dir + '/dev.json'
    test_file = args.data_dir + '/test.json'
    schema_file = args.data_dir + '/schemas.json'
    wv_file = args.emb_dir + '/' + args.wv_file
    wv_dim = args.wv_dim
    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    char_file = args.vocab_dir + '/chars.json'
    emb_file = args.vocab_dir + '/embedding.npy'
    # load files
    print("loading files...")
    train_tokens = load_tokens(train_file)
    dev_tokens = load_tokens(dev_file)
    test_tokens = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in
                                                 (train_tokens, dev_tokens, test_tokens)]
    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))
    # vocab is built from training tokens only
    print("building vocab...")
    v = build_vocab(train_tokens, glove_vocab, args.min_freq)
    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))
    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))
    print("dumping embeddings to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    # print("all done.")
    print("building schemas...")
    # collect predicate / subject-type / object-type inventories plus
    # character and POS-tag counts from the training set
    all_schemas = set()
    subj_type = set()
    obj_type = set()
    min_count = 2  # characters seen fewer than this many times are dropped
    pos_tags = set()
    chars = defaultdict(int)
    with open(train_file) as f:
        a = json.load(f)
        for ins in a:
            for spo in ins['spo_details']:
                # spo_details layout by index: 2 = subject type, 3 = predicate,
                # 6 = object type — TODO confirm against the dataset spec
                all_schemas.add(spo[3])
                subj_type.add(spo[2])
                obj_type.add(spo[6])
            for pos in ins['pos_tags']:
                pos_tags.add(pos)
            for token in ins['tokens']:
                for char in token:
                    chars[char] += 1
    # ids start at 1; id 0 denotes the termination class
    id2predicate = {i + 1: j for i, j in enumerate(all_schemas)}
    predicate2id = {j: i for i, j in id2predicate.items()}
    id2subj_type = {i + 1: j for i, j in enumerate(subj_type)}  # 0 denotes the termination class
    subj_type2id = {j: i for i, j in id2subj_type.items()}
    id2obj_type = {i + 1: j for i, j in enumerate(obj_type)}  # 0 denotes the termination class
    obj_type2id = {j: i for i, j in id2obj_type.items()}
    with codecs.open(schema_file, 'w', encoding='utf-8') as f:
        json.dump([
            id2predicate, predicate2id, id2subj_type, subj_type2id,
            id2obj_type, obj_type2id
        ], f, indent=4, ensure_ascii=False)
    print("dumping chars to files...")
    with codecs.open(char_file, 'w', encoding='utf-8') as f:
        chars = {i: j for i, j in chars.items() if j >= min_count}
        # id 0 is padding and id 1 is unk, so real entries start at 2
        id2char = {i + 2: j for i, j in enumerate(chars)}  # padding: 0, unk: 1
        char2id = {j: i for i, j in id2char.items()}
        id2pos = {i + 2: j for i, j in enumerate(pos_tags)}  # padding: 0, unk: 1
        pos2id = {j: i for i, j in id2pos.items()}
        json.dump([id2char, char2id, id2pos, pos2id], f, indent=4, ensure_ascii=False)
def train_unbiased_model(args, biased_batch_probs):
    """Train a RelationModel whose per-batch loss is reweighted by the output
    probabilities of a previously-trained biased model (debiasing setup).

    Args:
        args: parsed argparse namespace with training options.
        biased_batch_probs: sequence aligned with the training batches;
            biased_batch_probs[i] is passed to model.update() for batch i.
    """
    # make opt
    opt = vars(args)
    opt["num_class"] = len(constant.LABEL_TO_ID)

    # load vocab; the embedding matrix must agree with it
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    opt['vocab_size'] = vocab.size
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    # load data
    print("Loading data from {} with batch size {}...".format(
        opt["data_dir"], opt["batch_size"]))
    train_batch = DataLoader(
        opt["data_dir"] + "/" + args.data_name,
        opt["batch_size"],
        opt,
        vocab,
        evaluation=False,
    )
    dev_batch = DataLoader(opt["data_dir"] + "/dev.json", opt["batch_size"],
                           opt, vocab, evaluation=True)

    # zero-pad single-character ids so save directories sort consistently
    model_id = opt["id"] if len(opt["id"]) > 1 else "0" + opt["id"]
    model_save_dir = opt["save_dir"] + "/" + model_id
    opt["model_save_dir"] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + "/config.json", verbose=True)
    vocab.save(model_save_dir + "/vocab.pkl")
    file_logger = helper.FileLogger(
        model_save_dir + "/" + opt["log"],
        header="# epoch\ttrain_loss\tdev_loss\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt["lr"]
    global_step = 0
    format_str = (
        "{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}"
    )
    max_steps = len(train_batch) * opt["num_epoch"]

    # start training
    for epoch in range(1, opt["num_epoch"] + 1):
        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            # weight this batch's loss with the biased model's probabilities
            loss = model.update(batch, torch.tensor(biased_batch_probs[i]).cuda())
            train_loss += loss
            if global_step % opt["log_step"] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now(), global_step, max_steps,
                                        epoch, opt["num_epoch"], loss, duration,
                                        current_lr))

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += loss
        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        # dump gold labels for inspection; context manager guarantees the
        # handle is closed (was a bare open()/close() pair)
        with open("label.txt", "w+") as f:
            f.write(str(dev_batch.gold()))

        train_loss = (train_loss / train_batch.num_examples *
                      opt["batch_size"])  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt["batch_size"]
        print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}"
              .format(epoch, train_loss, dev_loss, dev_f1))
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_f1))

        # save a checkpoint every epoch; keep a copy of the best one by dev F1
        model_file = model_save_dir + "/checkpoint_epoch_{}.pt".format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + "/best_model.pt")
            print("new best model saved.")
        if epoch % opt["save_epoch"] != 0:
            os.remove(model_file)

        # lr schedule: decay once dev F1 stops improving (sgd/adagrad only)
        if (len(dev_f1_history) > 10 and dev_f1 <= dev_f1_history[-1]
                and opt["optim"] in ["sgd", "adagrad"]):
            current_lr *= opt["lr_decay"]
            model.update_lr(current_lr)
        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
def trainmodel(config=None):
    """Train the ABSA model, keep the best checkpoint by validation accuracy,
    then reload it and evaluate on the test set.

    Args:
        config: optional dict (e.g. from a hyper-parameter search) overriding
            batch size, seeds and input dropout on the module-level ``args``.
    """
    if config is not None:
        args.batch_size = config["bsz"]
        args.seed = config["npseed"]
        args.npseed = config["npseed"]
        args.input_dropout = config["inp_drop"]

    # seed every RNG for reproducibility
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    helper.print_arguments(args)
    train_batch, valid_batch, test_batch = get_dataloaders(args, vocab)
    trainer = ABSATrainer(args, emb_matrix=word_emb)
    print(trainer.model)
    print("Total parameters:", _totally_parameters(trainer.model))

    best_path = args.save_dir
    helper.ensure_dir(best_path, verbose=True)
    print("Training Set: {}".format(len(train_batch)))
    print("Valid Set: {}".format(len(valid_batch)))
    print("Test Set: {}".format(len(test_batch)))

    train_acc_history, train_loss_history = [], []
    # histories are seeded with 0.0 so max() below is defined on epoch 1
    val_acc_history, val_loss_history, val_f1_score_history = [0.0], [0.0], [0.0]
    patience = 0
    epoch = 0
    for _ in range(1, args.num_epoch + 1):
        epoch += 1
        print("Epoch {}".format(epoch) + "-" * 60)
        train_loss, train_acc, train_step = 0.0, 0.0, 0
        for i, batch in enumerate(train_batch):
            loss, acc = trainer.update(batch)
            train_loss += loss
            train_acc += acc
            train_step += 1
            if train_step % args.log_step == 0:
                print("{}/{} train_loss: {:.6f}, train_acc: {:.6f}".format(
                    i, len(train_batch), train_loss / train_step,
                    train_acc / train_step))

        val_loss, val_acc, val_f1 = evaluate(trainer, valid_batch)
        print(
            "End of {} train_loss: {:.4f}, train_acc: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}, f1_score: {:.4f}"
            .format(
                epoch,
                train_loss / train_step,
                train_acc / train_step,
                val_loss,
                val_acc,
                val_f1,
            ))
        train_acc_history.append(train_acc / train_step)
        train_loss_history.append(train_loss / train_step)
        val_loss_history.append(val_loss)

        # save best model
        if epoch == 1 or float(val_acc) > max(val_acc_history):
            patience = 0
            torch.save(trainer, best_path + '/best_checkpoint.pt')
            print("new best model saved.")
        else:
            # BUG FIX: the counter was never incremented, so the early-stop
            # condition below (`patience >= 20`) could never fire.
            patience += 1
        val_acc_history.append(float(val_acc))
        val_f1_score_history.append(val_f1)
        if patience >= 20:
            print('Reach the max patience, stopping...')
            break
    print("Training ended with {} epochs.".format(epoch))

    # reload the best checkpoint and report test-set performance
    print("Loading best checkpoints from", best_path + '/best_checkpoint.pt')
    trainer = torch.load(best_path + '/best_checkpoint.pt')
    test_loss, test_acc, test_f1 = evaluate(trainer, test_batch)
    print("Evaluation Results: test_loss:{}, test_acc:{}, test_f1:{}".format(
        test_loss, test_acc, test_f1))
# Configure save/result directories, label set and logging for the current
# run type. Relies on `opt` (the option dict) being defined in this scope.
opt['save_dir'] = "saved_models/" + opt['type'] + "/"
opt['res_dir'] = "result/" + opt['type'] + "/"
label2id = get_current_label2id(opt)
if opt['type'] == 'multi':
    # second-level ('multi') runs get a per-coarse-class subdirectory
    opt['save_dir'] = "saved_models/" + opt['type'] + "/" + opt['coarse_name']
    opt['res_dir'] = "result/" + opt['type'] + "/" + opt['coarse_name'] + "/"
else:
    opt['coarse_name'] = ''
opt['num_class'] = len(label2id)
# print opt
helper.print_config(opt)
# invert label2id so numeric predictions map back to label strings
id2label = dict([(v, k) for k, v in label2id.items()])
# model save dir
helper.ensure_dir(opt['save_dir'], verbose=True)
helper.ensure_dir(opt['res_dir'], verbose=True)
# save config
helper.save_config(opt, os.path.join(opt['save_dir'], 'config.json'), verbose=True)
file_logger = helper.FileLogger(
    opt['save_dir'] + '/' + opt['log'],
    header="# epoch\ttrain_loss\tdev_loss\ttrain_ACC\ttest_ACC\tF1")
# load data
if opt['type'] == 'multi':
    # split train set into new train set and test set, used in the second level
    split_save_dir = 'dataset/multi/' + opt['coarse_name']
    helper.ensure_dir(split_save_dir)
# Seed RNGs, build the option dict, set up logging, and create the data
# loaders. Relies on `args` being parsed earlier in the surrounding scope.
random.seed(1234)
if args.cuda:
    torch.cuda.manual_seed(args.seed)
# make opt
opt = vars(args)
label2id = constant.LABEL_TO_ID
opt['num_class'] = len(label2id)
# print opt
helper.print_config(opt)
label2id = constant.LABEL_TO_ID  # NOTE(review): redundant re-assignment
# invert label2id so numeric predictions map back to label strings
id2label = dict([(v, k) for k, v in label2id.items()])
# model save dir
helper.ensure_dir(opt['save_dir'], verbose=True)
# save config
helper.save_config(opt, opt['save_dir'] + '/config.json', verbose=True)
file_logger = helper.FileLogger(
    opt['save_dir'] + '/' + opt['log'],
    header="# epoch\ttrain_loss\tdev_loss\ttrain_ACC\ttest_ACC\tF1")
# load data
print("Loading data from {} with batch size {} ...".format(
    opt['data_dir'], opt['batch_size']))
train_batch = DataLoader(opt['data_dir'] + '/train.tsv', opt['batch_size'], opt)
test_batch = DataLoader(opt['data_dir'] + '/test.tsv', opt['batch_size'], opt)
# build model
def main():
    """Build word (and optionally character) vocabularies plus the GloVe
    embedding matrix for the train/testa/testb jsonl splits.
    """
    args = parse_args()
    # input files
    train_file = args.data_dir + '/train.jsonl'
    dev_file = args.data_dir + '/testa.jsonl'   # dev split is named 'testa'
    test_file = args.data_dir + '/testb.jsonl'  # test split is named 'testb'
    wv_file = args.glove_dir + '/' + args.wv_file
    wv_dim = args.wv_dim
    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    char_vocab_file = args.vocab_dir + '/vocab_char.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'
    # load files; load_tokens returns (tokens, chars) pairs here
    print("loading files...")
    train_tokens, train_chars = load_tokens(train_file)
    dev_tokens, dev_chars = load_tokens(dev_file)
    test_tokens, test_chars = load_tokens(test_file)
    if args.lower:
        train_tokens, dev_tokens, test_tokens = [[t.lower() for t in tokens] for tokens in
                                                 (train_tokens, dev_tokens, test_tokens)]
    if args.char_lower and train_chars:
        train_chars, dev_chars, test_chars = [[c.lower() for c in chars] for chars in
                                              (train_chars, dev_chars, test_chars)]
    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))
    print("building vocab...")
    # with --all, dev/test tokens also contribute to the vocab
    if args.all:
        all_tokens = train_tokens + dev_tokens + test_tokens
    else:
        all_tokens = train_tokens
    v = build_vocab(all_tokens, glove_vocab, args.min_freq)
    if train_chars:
        print("building vocab for chars...")
        all_chars = train_chars + dev_chars + test_chars
        char_counter = Counter(all_chars)
        # chars are sorted alphabetically (not by frequency) after the
        # special-token prefix
        #char_vocab = constant.VOCAB_PREFIX + sorted(char_counter.keys(), key=char_counter.get, reverse=True)
        char_vocab = constant.VOCAB_PREFIX + sorted(list(char_counter.keys()))
        print("vocab built with {} chars.".format(len(char_vocab)))
    else:
        char_vocab = None
    print("calculating oov...")
    datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
    for dname, d in datasets.items():
        total, oov = count_oov(d, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))
    print("building embeddings...")
    if args.random:
        print("using random initialization...")
        embedding = random_embedding(v, wv_dim)
    else:
        embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))
    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    if char_vocab:
        with open(char_vocab_file, 'wb') as outfile:
            pickle.dump(char_vocab, outfile)
    np.save(emb_file, embedding)
    print("all done.")
def train_model(vocab_params, train_params, train_batch, dev_batch, model_id=-1):
    """Train a RelationModel and checkpoint the best epoch by dev F1.

    Args:
        vocab_params: namespace with vocab_dir, emb_dim, save_dir, etc.
        train_params: namespace with training hyper-parameters (seed, lr, ...);
            merged on top of vocab_params into a single opt dict.
        train_batch, dev_batch: batch loaders exposing .num_examples and .gold().
        model_id: saved-model id; -1 derives it from opt['id'].
    """
    # seed all RNGs for reproducibility
    torch.manual_seed(train_params.seed)
    np.random.seed(train_params.seed)
    random.seed(train_params.seed)
    if train_params.cpu:
        train_params.cuda = False
    elif train_params.cuda:
        torch.cuda.manual_seed(train_params.seed)

    # make opt: vocab params first, training params merged on top
    opt = vars(vocab_params)
    print(constant.LABEL_TO_ID)
    print(opt)
    opt['num_class'] = len(constant.LABEL_TO_ID)
    # Combine all the parameters together
    opt.update(vars(train_params))

    # load vocab; the embedding matrix must agree with it
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    opt['vocab_size'] = vocab.size
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    if (model_id == -1):
        # zero-pad single-character ids so save directories sort consistently
        model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_save_dir = opt['save_dir'] + '/' + model_id
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
    vocab.save(model_save_dir + '/vocab.pkl')
    file_logger = helper.FileLogger(
        model_save_dir + '/' + opt['log'],
        header="# epoch\ttrain_loss\tdev_loss\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    # invert LABEL_TO_ID so numeric predictions map back to label strings
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt['lr']
    global_step = 0
    global_start_time = time.time()
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    # start training
    for epoch in range(1, opt['num_epoch'] + 1):
        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = model.update(batch)
            train_loss += loss
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now(), global_step, max_steps, epoch,\
                    opt['num_epoch'], loss, duration, current_lr))

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += loss
        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        train_loss = train_loss / train_batch.num_examples * opt[
            'batch_size']  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
        print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(epoch,\
            train_loss, dev_loss, dev_f1))
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_f1))

        # save a checkpoint every epoch; keep a copy of the best one by dev F1
        model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            print("new best model saved.")
        if epoch % opt['save_epoch'] != 0:
            os.remove(model_file)

        # lr schedule: decay when dev F1 stops improving (sgd/adagrad only)
        if len(dev_f1_history) > 10 and dev_f1 <= dev_f1_history[-1] and \
                opt['optim'] in ['sgd', 'adagrad']:
            current_lr *= opt['lr_decay']
            model.update_lr(current_lr)
        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
def main():
    """Build the vocab and GloVe embedding matrix from the rationale_* splits."""
    args = parse_args()

    # the six dataset splits, all named rationale_<split>.json
    splits = ["train", "dev", "un", "wl", "cts", "bc"]
    wv_file = args.glove_dir + "/" + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + "/vocab.pkl"
    emb_file = args.vocab_dir + "/embedding.npy"

    # load files
    print("loading files...")
    tokens_by_split = {
        s: load_tokens(args.data_dir + "/rationale_" + s + ".json") for s in splits
    }
    if args.lower:
        tokens_by_split = {
            s: [t.lower() for t in toks] for s, toks in tokens_by_split.items()
        }

    # load glove
    print("loading glove...")
    glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    # vocab is built from training tokens only
    print("building vocab...")
    v = build_vocab(tokens_by_split["train"], glove_vocab, args.min_freq)

    print("calculating oov...")
    for dname in splits:
        total, oov = count_oov(tokens_by_split[dname], v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total,
                                               oov * 100.0 / total))

    print("building embeddings...")
    embedding = vocab.build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    print("dumping to files...")
    with open(vocab_file, "wb") as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    print("all done.")
def main():
    """Train a RelationModel: seed RNGs, load vocab/embeddings/data, then run
    the epoch loop with dev-set evaluation, checkpointing and lr decay.

    NOTE(review): relies on a module-level `args` namespace (argparse result);
    it is not parsed inside this function — confirm `args` exists at call time.
    """
    # set top-level random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cpu:
        # --cpu wins over --cuda
        args.cuda = False
    elif args.cuda:
        # force random seed for reproducibility
        # also apply same seed to numpy in every file
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # make opt: all CLI flags become the experiment config dict
    opt = vars(args)
    opt['num_class'] = len(constant.LABEL_TO_ID)

    # load vocab
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    # in some previous experiments we saw that lower vocab size can improve performance
    # but it was in a completely different project although on the same data
    # here it seems it's much harder to get this to work
    # uncomment the following line if this is solved:
    # new_vocab_size = 30000
    opt['vocab_size'] = vocab.size

    # load the pre-built embedding matrix; its rows must line up with the vocab
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    # load data
    print("Loading data from {} with batch size {}...".format(
        opt['data_dir'], opt['batch_size']))
    train_batch = DataLoader(opt['data_dir'] + '/train.json',
                             opt['batch_size'],
                             opt,
                             vocab,
                             evaluation=False)
    dev_batch = DataLoader(opt['data_dir'] + '/dev.json',
                           opt['batch_size'],
                           opt,
                           vocab,
                           evaluation=True)

    # single-character ids are zero-padded for stable directory names
    model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_save_dir = opt['save_dir'] + '/' + model_id
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
    vocab.save(model_save_dir + '/vocab.pkl')
    file_logger = helper.FileLogger(
        model_save_dir + '/' + opt['log'],
        header="# epoch\ttrain_loss\tdev_loss\tdev_p\tdev_r\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    # inverse label map for turning predicted ids back into label strings
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt['lr']

    global_step = 0
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    # setup the scheduler for lr decay
    # this doesn't seem to work well compared to what we already have
    # scheduler = ReduceLROnPlateau(model.optimizer, mode='min', factor=opt['lr_decay'], patience=1)

    # start training
    for epoch in range(1, opt['num_epoch'] + 1):
        # TODO: if lr warmup is used, the lr console output is not updated
        print(
            "Current params: " + " heads-" + str(opt["n_head"]) +
            " enc_layers-" + str(opt["num_layers_encoder"]),
            " drop-" + str(opt["dropout"]) + " scaled_drop-" +
            str(opt["scaled_dropout"]) + " lr-" + str(opt["lr"]),
            " lr_decay-" + str(opt["lr_decay"]) + " max_grad_norm-" +
            str(opt["max_grad_norm"]))
        print(
            " weight_no_rel-" + str(opt["weight_no_rel"]) + " weight_rest-" +
            str(opt["weight_rest"]) + " attn-" + str(opt["attn"]) +
            " attn_dim-" + str(opt["attn_dim"]),
            " obj_sub_pos-" + str(opt["obj_sub_pos"]) + " new_residual-" +
            str(opt["new_residual"]))
        print(
            " use_batch_norm-" + str(opt["use_batch_norm"]) +
            " relative_positions-" + str(opt["relative_positions"]),
            " decay_epoch-" + str(opt["decay_epoch"]) + " use_lemmas-" +
            str(opt["use_lemmas"]),
            " hidden_self-" + str(opt["hidden_self"]))

        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = model.update(batch)
            # float() drops the reference to the graph-attached loss value
            train_loss += float(loss)
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(
                    format_str.format(datetime.now(), global_step, max_steps,
                                      epoch, opt['num_epoch'], loss, duration,
                                      current_lr))
            # do garbage collection,
            # as per https://discuss.pytorch.org/t/best-practices-for-maximum-gpu-utilization/13863/6
            del loss

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += float(loss)
            del loss
        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        train_loss = train_loss / train_batch.num_examples * opt[
            'batch_size']  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
        print(
            "epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(epoch, \
            train_loss, dev_loss, dev_f1)
        )
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_p, dev_r, dev_f1))

        # save: checkpoint every epoch; keep a copy of the best one by dev F1
        model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            print("new best model saved.")
        if epoch % opt['save_epoch'] != 0:
            # only keep checkpoints at save_epoch intervals
            os.remove(model_file)

        # reduce learning rate if it stagnates by a certain decay rate and within given epoch patience
        # this for some reason works worse than the implementation we have afterwards
        # scheduler.step(dev_loss)
        if opt["optim"] != "noopt_adam" and opt["optim"] != "noopt_nadam":
            # do warm_up_for sgd only instead of adam
            do_warmup_trick = False
            if do_warmup_trick:
                # print("do_warmup_trick")
                # 1 and 5 first worked kind of
                # 10 and 15
                current_lr = 10 * (360**(-0.5) *
                                   min(epoch**(-0.5), epoch * 15**(-1.5)))
                # print("current_lr", current_lr)
                model.update_lr(current_lr)
            else:
                # decay schedule
                # 15 is best!
                # simulate patience of x epochs
                if len(dev_f1_history
                       ) > opt['decay_epoch'] and dev_f1 <= dev_f1_history[-1]:
                    current_lr *= opt['lr_decay']
                    model.update_lr(current_lr)
        # else, update the learning rate in torch_utils.py

        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
errors[i]["prop_mentions"] = ( errors[i]["real_mentions"] / len(errors[i]["mentions"]) if len(errors[i]["mentions"]) > 0 else 1 ) # Convert the bootleg_emmental entity QIDs to the wikidata mentions def save_csv(obj, name): cols = obj[0].keys() print(cols) csv_columns = cols f = open(name + ".csv", "w") w = csv.DictWriter(f, fieldnames=csv_columns) w.writeheader() for k, v in obj.items(): w.writerow(v) print("Wrote to file!") # save probability scores if len(args.out) > 0: helper.ensure_dir(args.out) save_csv(errors, "{}/{}_{}".format(args.out, timestamp, args.dataset)) filename_probs = "{}/{}_{}".format(args.out, timestamp, "probs.pkl") with open(filename_probs, "wb") as outfile: pickle.dump(all_probs, outfile) print("Prediction scores saved to {}.".format(args.out)) print("Evaluation ended.")
torch.backends.cudnn.benchmark = False seed = args.seed seed_everything(seed) if args.cpu: args.use_cuda = False elif args.use_cuda: torch.cuda.manual_seed(args.seed) opt = vars(args) # print model info helper.print_config(opt) helper.ensure_dir(opt["model_save_dir"], verbose=True) # save model config helper.save_config(opt, opt["model_save_dir"] + "/" + opt["id"] + '.config', verbose=True) # record training log file_logger = helper.FileLogger( opt["model_save_dir"] + '/' + opt['id'] + ".log", header="# epoch\ttrain_loss\tprecision5\tNDCG5\tMAP5\tprecision7" "\tNDCG7\tMAP7\tprecision10\tNDCG10\tMAP10") preprocess = Preprocess(opt) print("Preprocess is done.") print("Create model TaNP...") opt['uf_dim'] = preprocess.uf_dim
def transre_search(ffn, connect, hidden_dim, trans_layers, multi_heads,
                   ffn_ex_size, initial, final):
    """Train one hyper-parameter configuration of the Trans model and return
    the best result.

    The given hyper-parameters are written into the shared `opt` dict, a
    per-config save directory is derived from them, and a full training run
    (epoch loop, dev evaluation, checkpointing, lr decay) is executed.

    Returns:
        (model_name, dev_score) tuple for the best epoch, or the string
        "unknown" if no epoch ever improved.
        NOTE(review): mixed return types (str vs tuple) — callers must handle both.

    NOTE(review): depends on module-level globals: opt, vocab, emb_matrix,
    train_batch, dev_batch, label2id, scorer — confirm they are initialized
    before calling.
    """
    # overwrite the shared experiment config with this configuration
    opt['weighted'] = False
    opt['rnn'] = False
    opt['ffn'] = ffn
    opt['connect'] = connect
    opt['hidden_dim'] = hidden_dim
    opt['trans_layers'] = trans_layers
    opt['multi_heads'] = multi_heads
    opt['ffn_ex_size'] = ffn_ex_size
    opt['initial'] = initial
    opt['final'] = final

    # single-character ids are zero-padded (NOTE: `id` shadows the builtin)
    id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    # encode the whole configuration into the model/run name
    model_name = str(opt['optim']) + '_' + str(opt['lr']) + str(ffn) + '_' + str(connect) + "_" \
        + str(hidden_dim) + '_' + str(trans_layers) + '_' + str(multi_heads) + '_' + \
        str(ffn_ex_size) + '_' + str(initial) + '_' + str(final)
    # NOTE(review): '' concatenation is a no-op; memo is appended without separator
    model_name = model_name + '' + str(opt['memo'])
    model_name = str(id) + "_" + model_name
    model_save_dir = opt['save_dir'] + '/' + model_name
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
    vocab.save(model_save_dir + '/vocab.pkl')
    file_logger = helper.FileLogger(
        model_save_dir + '/' + opt['log'],
        header="# epoch\ttrain_loss\tdev_loss\tdev_score\tbest_dev_score")
    # NOTE(review): the per-epoch log line below writes 6 columns (incl. acc)
    # but this header names only 5 — confirm intended log schema.
    helper.print_config(opt)

    # build a fresh trainer, or resume from a pre-trained checkpoint
    if not opt['load']:
        trainer = TransTrainer(opt, emb_matrix=emb_matrix)
    else:
        # load pre-train model; only the optimizer choice is overridden
        model_file = opt['model_file']
        print("Loading model from {}".format(model_file))
        model_opt = torch_utils.load_config(model_file)
        model_opt['optim'] = opt['optim']
        trainer = TransTrainer(model_opt)
        trainer.load(model_file)

    id2label = dict([(v, k) for k, v in label2id.items()
                     ])  # the classification result
    dev_score_history = []
    dev_loss_history = []
    current_lr = opt['lr']
    global_step = 0
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']
    best_result = "unknown"
    file_logger.log(str(opt['memo']))

    for epoch in range(1, opt['num_epoch'] + 1):
        train_loss = 0
        epoch_start_time = time.time()
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss, norm = trainer.update(batch)
            train_loss += loss
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(
                    format_str.format(datetime.now(), global_step, max_steps,
                                      epoch, opt['num_epoch'], loss, duration,
                                      current_lr))

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss, _ = trainer.predict(batch)
            predictions += preds
            dev_loss += loss
        predictions = [id2label[p] for p in predictions]
        train_loss = train_loss / train_batch.num_examples * opt[
            'batch_size']  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
        acc, dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)
        print(
            "epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}"
            .format(epoch, train_loss, dev_loss, dev_f1))
        # model selection is driven by dev F1
        dev_score = dev_f1
        file_logger.log("{}\t{:.3f}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(
            epoch, acc, train_loss, dev_loss, dev_score,
            max([dev_score] + dev_score_history)))

        # save: checkpoint every epoch; keep a copy of the best by dev score
        model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
        trainer.save(model_file, epoch)
        if epoch == 1 or dev_score > max(dev_score_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            best_result = (model_name, dev_score)
            print("new best model saved.")
            file_logger.log(
                "new best model saved at epoch {}: {:.2f}\t{:.2f}\t{:.2f}".
                format(epoch, dev_p * 100, dev_r * 100, dev_score * 100))
        if epoch % opt['save_epoch'] != 0:
            # only keep checkpoints at save_epoch intervals
            os.remove(model_file)

        # lr schedule: decay when dev score stagnates (SGD-family optims only)
        if len(dev_score_history
               ) > opt['decay_epoch'] and dev_score <= dev_score_history[
                   -1] and opt['optim'] in ['sgd', 'adagrad', 'adadelta']:
            current_lr *= opt['lr_decay']
            trainer.update_lr(current_lr)

        dev_score_history += [dev_score]
        dev_loss_history += [dev_loss]
        epoch_end_time = time.time()
        print("epoch time {:.3f}".format(epoch_end_time - epoch_start_time))

    return best_result
def main():
    """Build vocab, word embeddings, and averaged aspect embeddings."""
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.list'
    test_file = args.data_dir + '/test.list'
    wv_file = args.w2v_dir + '/' + args.wv_file
    wv_dim = args.wv_dim

    # output files
    helper.ensure_dir(args.vocab_dir)
    vocab_file = args.vocab_dir + '/vocab.pkl'
    emb_file = args.vocab_dir + '/embedding.npy'

    # load token lists for both splits
    print("loading files...")
    train_tokens = load_tokens(train_file)
    test_tokens = load_tokens(test_file)

    # load pre-trained word vectors
    print("loading word vector...")
    glove_vocab = load_glove_vocab(wv_file, wv_dim)
    print("{} words loaded from glove.".format(len(glove_vocab)))

    # vocab covers both splits plus the special aspect token(s)
    print("building vocab...")
    v = build_vocab(train_tokens + test_tokens + constant.ASP_TOKEN, glove_vocab)

    # report out-of-vocabulary rates per split
    print("calculating oov...")
    for dname, toks in (('train', train_tokens), ('test', test_tokens)):
        total, oov = count_oov(toks, v)
        print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

    print("building embeddings...")
    embedding = build_embedding(wv_file, v, wv_dim)
    print("embedding size: {} x {}".format(*embedding.shape))

    # an aspect embedding is the mean of the embeddings of its segmented words;
    # unknown pieces fall back to the <UNK> vector (and are printed for inspection)
    print("building asp embeddings...")
    w2id = {word: idx for idx, word in enumerate(v)}
    ASP_TO_ID = constant.ASP_TO_ID
    asp_emb = np.random.uniform(-1, 1, (len(ASP_TO_ID), wv_dim))
    for phrase, row in ASP_TO_ID.items():
        pieces = list(jieba.cut(phrase))
        acc = np.zeros(wv_dim)
        for piece in pieces:
            if piece in w2id:
                acc += embedding[w2id[piece]]
            else:
                acc += embedding[w2id['<UNK>']]
                print(piece)
        asp_emb[row] = acc / len(pieces)
    print("embedding size: {} x {}".format(*asp_emb.shape))

    # persist vocab, word embeddings, and aspect embeddings
    print("dumping to files...")
    with open(vocab_file, 'wb') as outfile:
        pickle.dump(v, outfile)
    np.save(emb_file, embedding)
    np.save(args.vocab_dir + '/asp_embedding.npy', asp_emb)
    print("all done.")
opt['data_dir'], opt['batch_size'])) train_batch = DataLoader(opt['data_dir'] + '/train.json', opt['batch_size'], opt, vocab, evaluation=False) dev_batch = DataLoader(opt['data_dir'] + '/dev.json', opt['batch_size'], opt, vocab, evaluation=True) model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id'] model_save_dir = opt['save_dir'] + '/' + model_id opt['model_save_dir'] = model_save_dir helper.ensure_dir(model_save_dir, verbose=True) # save config helper.save_config(opt, model_save_dir + '/config.json', verbose=True) vocab.save(model_save_dir + '/vocab.pkl') file_logger = helper.FileLogger( model_save_dir + '/' + opt['log'], header="# epoch\ttrain_loss\tdev_loss\tdev_score\tbest_dev_score") # print model info helper.print_config(opt) # model if not opt['load']: trainer = GCNTrainer(opt, emb_matrix=emb_matrix) else:
def main():
    """Train the LSTM classifier.

    Seeds all RNGs, loads data and embeddings, then runs the epoch loop:
    per-epoch dev evaluation (accuracy + ROC-AUC), checkpointing with a
    `best_model.pt` copy, and learning-rate decay on stagnation.
    """
    args = get_parser()

    # set seed and prepare for training
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cpu:
        # --cpu wins over --cuda
        args.cuda = False
    elif args.cuda:
        torch.cuda.manual_seed(args.seed)
    init_time = time.time()

    # make opt
    opt = vars(args)

    # BUG FIX: the device was hard-coded to 'cuda:0' even when --cpu was
    # given; derive it from the (possibly cpu-forced) cuda flag instead.
    device = 'cuda:0' if opt['cuda'] else 'cpu'
    TEXT, train_batch, dev_batch = load_data(opt['batch_size'], device=device)
    vocab = TEXT.vocab
    opt['vocab_size'] = len(vocab.stoi)
    # pre-trained vectors must line up with the vocab
    emb_matrix = vocab.vectors
    assert emb_matrix.shape[0] == opt['vocab_size']
    assert emb_matrix.shape[1] == opt['emb_dim']

    # single-character ids are zero-padded for stable directory names
    model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_save_dir = opt['save_dir'] + '/' + str(model_id)
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    path = os.path.join(model_save_dir, 'config.json')
    helper.save_config(opt, path, verbose=True)
    # vocab.save(os.path.join(model_save_dir, 'vocab.pkl'))
    file_logger = helper.FileLogger(
        os.path.join(model_save_dir, opt['log']),
        header="# epoch\ttrain_loss\tdev_loss\tdev_score\tbest_dev_score")

    # print model info
    helper.print_config(opt)

    # Build Model: fresh trainer, or resume from a checkpoint with --load
    if not opt['load']:
        trainer = LSTMTrainer(opt, emb_matrix)
    else:
        # load a pre-trained model; only the optimizer choice is overridden
        model_file = opt['model_file']
        print("Loading model from {}".format(model_file))
        model_opt = torch_utils.load_config(model_file)
        model_opt['optim'] = opt['optim']
        trainer = LSTMTrainer(model_opt)
        trainer.load(model_file)

    dev_score_history = []
    current_lr = opt['lr']
    global_step = 0
    global_start_time = time.time()
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    # start training
    for epoch in range(1, opt['num_epoch'] + 1):
        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch)
            train_loss += loss
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now(), global_step, max_steps, epoch, \
                                        opt['num_epoch'], loss, duration, current_lr))

        # eval on dev
        print("Evaluating on dev set ...")
        predictions = []
        golds = []
        dev_loss = 0.0
        for i, batch in enumerate(dev_batch):
            preds, probs, labels, loss = trainer.predict(batch)
            predictions += preds
            golds += labels
            dev_loss += loss
        train_loss = train_loss / len(train_batch)
        dev_loss = dev_loss / len(dev_batch)

        print(accuracy_score(golds, predictions))
        # NOTE(review): ROC-AUC is computed from hard predictions; passing the
        # positive-class probabilities (`probs`) would give a smoother,
        # threshold-free score — confirm which was intended.
        dev_roc = roc_auc_score(golds, predictions)
        print(
            "epoch {}: train loss = {:.6f}, dev loss = {:.6f}, dev roc = {:.4f}"
            .format(epoch, train_loss, dev_loss, dev_roc))
        # model selection is driven by dev ROC-AUC
        dev_score = dev_roc
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_score,
            max([dev_score] + dev_score_history)))

        # save model
        # BUG FIX: checkpoints were written with a '.py' suffix; use '.pt'
        # like the best-model copy below and the other trainers in this file.
        model_file = os.path.join(model_save_dir,
                                  "checkpoint_epoch_{}.pt".format(epoch))
        trainer.save(model_file, epoch)
        if epoch == 1 or dev_score > max(dev_score_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            print("new best model saved.")
            file_logger.log("new best model saved at epoch {}: {:.2f}"\
                            .format(epoch, dev_score*100))
        if epoch % opt['save_epoch'] != 0:
            # only keep checkpoints at save_epoch intervals
            os.remove(model_file)

        # decay lr when the dev score stagnates (SGD-family optimizers only)
        if len(dev_score_history) > opt['decay_epoch'] and dev_score <= dev_score_history[-1] and \
                opt['optim'] in ['sgd', 'adagrad', 'adadelta']:
            current_lr *= opt['lr_decay']
            trainer.update_lr(current_lr)

        dev_score_history += [dev_score]
        print("")

    print("Training ended with {} epochs.".format(epoch))