def load_model(train_file=os.path.join(CURRENT_DIR, "../data/ere_filtered_train.txt"),
               eval_file=os.path.join(CURRENT_DIR, "../data/ere_filtered_test.txt"),
               model_path=os.path.join(CURRENT_DIR, "../data/filter_ere_dp_mask_5000.636.pkl")):
    # Rebuild the word and entity-type dictionaries from the train/eval splits,
    # then load the trained model onto CPU and switch it to eval mode.
    data = data_pro.load_data(train_file)
    e_data = data_pro.load_data(eval_file)
    word_dict = data_pro.build_dict(data[0] + e_data[0])
    entype_dict = data_pro.buildTypeDict(data[4] + e_data[4])
    neural_model = torch.load(model_path, map_location={'cuda:0': 'cpu'})
    neural_model.eval()
    return word_dict, entype_dict, neural_model
def load_uk(train_file=os.path.join(CURRENT_DIR, "../data/convert_ere_uk_train.txt"),
            eval_file=os.path.join(CURRENT_DIR, "../data/convert_ere_uk_test.txt"),
            model_path=os.path.join(CURRENT_DIR, "../data/uk_new_piece_5000.682.pkl")):
    data = data_pro.load_data(train_file)
    e_data = data_pro.load_data(eval_file)
    word_dict = data_pro.build_dict(data[0] + e_data[0])
    entype_dict = data_pro.buildTypeDict(data[4] + e_data[4])
    neural_model = torch.load(model_path, map_location={'cuda:0': 'cpu'})
    neural_model.eval()
    return word_dict, entype_dict, neural_model
def extract_relations(model, word_dict, type_dic,
                      test_file=os.path.join(CURRENT_DIR, "temp/AIDA_plain_text.txt"),
                      batch_size=64, T=1.0,
                      result_path=os.path.join(CURRENT_DIR, "temp/AIDA_results.txt"),
                      depend_out_path=os.path.join(CURRENT_DIR, "temp/dp.pkl")):
    # Vectorize the test file and pack all features into one integer matrix per example.
    t_data = data_pro.load_data(test_file)
    t_x, t_y, t_e1, t_e2, t_dist1, t_dist2, t_en_type_vec, t_dp_vec, t_pool_mask_e1, \
        t_pool_mask, t_pool_mask_e2 = data_pro.vectorize_full(t_data, word_dict, type_dic,
                                                              dp_fpath=depend_out_path)
    t_y = np.array(t_y).astype(np.int64)
    t_np_cat = np.concatenate(
        (t_x, np.array(t_dist1), np.array(t_dist2), np.array(t_en_type_vec),
         np.array(t_dp_vec), np.array(t_pool_mask_e1), np.array(t_pool_mask),
         np.array(t_pool_mask_e2)), 1)
    test = torch.from_numpy(t_np_cat.astype(np.int64))
    t_y_tensor = torch.from_numpy(t_y)
    test_datasets = D.TensorDataset(test, t_y_tensor)
    test_dataloader = D.DataLoader(test_datasets, batch_size, False, num_workers=1)

    results = []
    confidence_score = []
    with torch.no_grad():
        for (b_x_cat, b_y) in test_dataloader:
            bx, bd1, bd2, ben, bdp, bmask1, bmask, bmask2, by = data_unpack_full(
                b_x_cat, b_y)
            logits = model(bx, bd1, bd2, ben, bdp, bmask1, bmask, bmask2, False)
            # Temperature-scaled softmax gives a confidence score for each predicted label.
            score = torch.nn.functional.softmax(logits / T, 1).data
            predict = torch.max(logits, 1)[1].data
            temp = []
            for idx in range(predict.size()[0]):
                temp.append(score[idx][predict[idx]].item())
            results.append(predict.tolist())
            confidence_score.append(temp)

    # with open("temp/AIDA_results.txt", "w") as fmodel:
    with open(result_path, "w", encoding="utf-8") as fmodel:
        for result, score in zip(results, confidence_score):
            for idx, rel in enumerate(result):
                fmodel.write("{}\t{}\n".format(rel, score[idx]))
                # fmodel.write("{}\t{}\n".format(rel.item(), score[idx].item()))
    print("test done!")
    return True
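# Usage sketch (assumption; the demo_extract() helper below is not part of the original
# file): load_model() returns the vocab/type dictionaries plus the trained CPU model, and
# extract_relations() then runs batched inference and writes one "label<TAB>confidence"
# line per test instance to result_path. Default paths are the ones defined above.
def demo_extract():
    word_dict, entype_dict, neural_model = load_model()
    extract_relations(neural_model, word_dict, entype_dict, batch_size=64, T=1.0)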
import numpy as np
import torch.nn.functional as F
from sklearn.model_selection import KFold

import data_pro as pro

# Hyperparameters
DW = 100          # word embedding size
N = 123           # max sentence length
DP = 25           # position embedding size
NP = 123          # number of position embeddings
NR = 19           # number of relation classes
DC = 1000         # number of convolution filters
KP = 0.6          # dropout keep probability
K = 3             # convolution window size
LR = 0.2
BATCH_SIZE = 50
epochs = 100

data = pro.load_data('./data/train.txt')
t_data = pro.load_data('./data/test.txt')
word_dict = pro.build_dict(data[0])

# Training split: pack sentence ids, entity indices and position features side by side.
x, y, e1, e2, dist1, dist2 = pro.vectorize(data, word_dict, N)
y = np.array(y).astype(np.int64)
np_cat = np.concatenate((x, np.array(e1).reshape(-1, 1),
                         np.array(e2).reshape(-1, 1),
                         np.array(dist1), np.array(dist2)), 1)

# Evaluation split, packed the same way.
e_x, e_y, e_e1, e_e2, e_dist1, e_dist2 = pro.vectorize(t_data, word_dict, N)
e_y = np.array(e_y).astype(np.int64)
eval_cat = np.concatenate(
    (e_x, np.array(e_e1).reshape(-1, 1), np.array(e_e2).reshape(-1, 1),
     np.array(e_dist1), np.array(e_dist2)), 1)

tx, ty, te1, te2, td1, td2 = pro.vectorize(t_data, word_dict, N)

embed_file = './data/embedding/senna/embeddings.txt'
vac_file = './data/embedding/senna/words.lst'
embedding = pro.load_embedding(embed_file, vac_file, word_dict)
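# Sketch (assumption; split_features is not part of the original script): the columns of
# np_cat / eval_cat follow the concatenation order above (sentence | e1 | e2 | dist1 | dist2,
# with widths N | 1 | 1 | N | N), so a batch of rows can be split back into its parts at
# the corresponding column offsets.
def split_features(cat_rows):
    # np.split on axis 1 at offsets N, N+1, N+2, 2N+2 yields five blocks of widths
    # N, 1, 1, N and N respectively.
    x_b, e1_b, e2_b, d1_b, d2_b = np.split(cat_rows, [N, N + 1, N + 2, 2 * N + 2], 1)
    return x_b, e1_b, e2_b, d1_b, d2_b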
def main():
    print('\n---------------------------------------------- Setup -----------------------------------------------')
    parser = ArgumentParser(description='')
    parser.add_argument('--max_len', type=int, metavar='<MAX_LEN>', default=123, help='max_len')
    parser.add_argument('--pos_embed_size', type=int, metavar='<POS_EMBED_SIZE>', default=70, help='position_embedding_size')
    parser.add_argument('--n_pos_embed', type=int, metavar='<N_POS_EMBED>', default=123, help='position_embedding_num')
    parser.add_argument('--window', type=int, metavar='<WINDOW>', default=3, help='slide_window')
    parser.add_argument('--n_filters', type=int, metavar='<n_filters>', default=1000, help='num_filters')
    parser.add_argument('--p_dropout', type=float, metavar='<p_dropout>', default=0.5, help='keep_prob')
    parser.add_argument('--epochs', type=int, metavar='<EPOCHS>', default=50, help='number of epochs')
    parser.add_argument('--lr', type=float, metavar='<LR>', default=0.001, help='learning_rate')
    parser.add_argument('--decay', type=float, metavar='<decay>', default=0, help='weight_decay')
    parser.add_argument('--batch_size', type=int, metavar='<BATCH_SIZE>', default=32, help='batch_size')
    parser.add_argument('--opt', type=str, metavar='<OPT>', default='adam', help='optimizer: adam or sgd')
    A = parser.parse_args()

    N_CLASS = 19  # class_num
    N_EPOCHS = A.epochs
    MAX_LEN = A.max_len  # max_len
    POS_EMBED_SIZE = A.pos_embed_size  # position_embedding_size
    N_POS_EMBED = A.n_pos_embed  # position_embedding_num
    WINDOW = A.window  # slide_window
    BATCH_SIZE = A.batch_size
    n_filters = A.n_filters  # num_filters
    p_dropout = A.p_dropout  # keep_prob
    LR = A.lr  # learning_rate
    DECAY = A.decay  # weight decay
    OPT = A.opt
    TIMESTAMP = time.strftime("%Y%m%d-%H%M")
    FPATH_BEST_MODEL = 'saved_models/20190122/crcnn_opt-{}_epoch-{}_lr-{}_decay-{}_{}.pkl'.format(
        OPT, N_EPOCHS, LR, DECAY, TIMESTAMP)
    print('Parameters:\n{}'.format(
        dict(MAX_LEN=MAX_LEN, POS_EMBED_SIZE=POS_EMBED_SIZE, N_POS_EMBED=N_POS_EMBED,
             N_CLASS=N_CLASS, n_filters=n_filters, p_dropout=p_dropout, WINDOW=WINDOW,
             LR=LR, DECAY=DECAY, BATCH_SIZE=BATCH_SIZE, EPOCHS=N_EPOCHS, OPT=OPT,
             TIMESTAMP=TIMESTAMP)))

    # print('\n---------------------------------------------- Load Data -----------------------------------------------')
    data_train_valid = pro.load_data('data/nine_train.txt')
    concat = list(zip(data_train_valid[0], data_train_valid[1],
                      data_train_valid[2], data_train_valid[3]))
    data_train, data_validation = train_test_split(concat, test_size=0.2, random_state=0)
    # print(data_train[0])
    new_data_train = [i for i in zip(*data_train)]
    new_data_validation = [i for i in zip(*data_validation)]
    word_dict = pro.build_dict(new_data_train[0] + new_data_validation[0])  # word_dict: 19215 words and their id
    print('len(word_dict): ', len(word_dict))

    sent_train, y_train, dist1_train, dist2_train = pro.vectorize(new_data_train, word_dict, MAX_LEN)
    y_train = np.array(y_train).astype(np.int64)
    X_train = np.concatenate((sent_train, np.array(dist1_train), np.array(dist2_train)), 1)
    print('Data shape: X_train={}, y_train={}'.format(X_train.shape, y_train.shape))

    sent_valid, y_valid, dist1_valid, dist2_valid = pro.vectorize(new_data_validation, word_dict, MAX_LEN)
    y_valid = np.array(y_valid).astype(np.int64)
    X_valid = np.concatenate((sent_valid, np.array(dist1_valid), np.array(dist2_valid)), 1)
    print('Data shape: X_valid={}, y_valid={}'.format(X_valid.shape, y_valid.shape))

    # fpath_embedding = '../relation-extraction-ly-dev/data/pre_trained_embeddings/glove.6B.300d.txt'
    # embedding_matrix = pro.load_glove_embeddings(fpath_embedding, word_dict)
    # print('Pre-trained embeddings loaded from <{}>.'.format(fpath_embedding))
    # np.save('data/embedding_matrix.npy', embedding_matrix)
    embedding_matrix = np.load('data/embedding_matrix.npy')

    print('\n---------------------------------------------- Build Model -----------------------------------------------')
    model = CR_CNN(MAX_LEN, embedding_matrix, POS_EMBED_SIZE, N_POS_EMBED, WINDOW,
                   N_CLASS, n_filters, p_dropout).cuda()
    print(model)
    loss_func = PairwiseRankingLoss(N_CLASS)
    if OPT == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=DECAY)
    elif OPT == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=LR, weight_decay=DECAY)
    # end if

    print('\n------------------------------------------------- Train --------------------------------------------------')

    def data_unpack(cat_data, target):
        # Split the packed row back into sentence ids and the two position-feature blocks.
        list_x = np.split(cat_data.numpy(), [MAX_LEN, MAX_LEN + N_POS_EMBED], 1)
        batch_x = Variable(torch.from_numpy(list_x[0])).cuda()
        batch_d1 = Variable(torch.from_numpy(list_x[1])).cuda()
        batch_d2 = Variable(torch.from_numpy(list_x[2])).cuda()
        target = Variable(target).cuda()
        return batch_x, batch_d1, batch_d2, target

    def prediction(sc, y):
        '''
        Calculate the f1 score for y_true and y_predict.
        c_target_dict: 19 relation label -> 19 relation name
        tr_target_dict: 19 relation name -> 10 relation label
        '''
        y_true = y.cpu().data.numpy()
        y_predict = torch.max(sc, 1)[1].long().cpu().data.numpy()
        f1 = f1_score(y_true, y_predict, average='micro')
        return f1 * 100
    # end def

    best_score = 0
    patience = 0
    for i in range(1, N_EPOCHS + 1):
        patience += 1
        # train over batches
        tensor_x_train = torch.from_numpy(X_train.astype(np.int64))
        tensor_y_train = torch.LongTensor(y_train)
        train_datasets = D.TensorDataset(data_tensor=tensor_x_train, target_tensor=tensor_y_train)
        train_dataloader = D.DataLoader(dataset=train_datasets, batch_size=BATCH_SIZE,
                                        shuffle=True, num_workers=2)
        score_train = 0
        loss = 0
        n_trained_batch = 0
        for (batch_x_cat, batch_y) in train_dataloader:
            n_trained_batch += 1
            batch_x, batch_d1, batch_d2, batch_y = data_unpack(batch_x_cat, batch_y)
            # print('batch_x: ', batch_x.shape)
            # print('batch_d1: ', batch_d1.shape)
            # print('batch_d2: ', batch_d2.shape)
            weight_o = model(batch_x, batch_d1, batch_d2)
            loss_per_batch = loss_func(weight_o, batch_y)
            optimizer.zero_grad()
            loss_per_batch.backward()
            optimizer.step()
            loss += loss_per_batch
            score_train += prediction(weight_o, batch_y)
        # end for
        loss = loss.cpu().data.numpy()[0] / n_trained_batch
        score_train = score_train / n_trained_batch

        # evaluate over batches
        tensor_X_valid = torch.from_numpy(X_valid.astype(np.int64))
        tensor_y_valid = torch.LongTensor(y_valid)
        valid_datasets = D.TensorDataset(data_tensor=tensor_X_valid, target_tensor=tensor_y_valid)
        valid_dataloader = D.DataLoader(dataset=valid_datasets, batch_size=BATCH_SIZE,
                                        shuffle=True, num_workers=2)
        score_val = 0
        n_eval_batch = 0
        for (batch_x_cat, batch_y) in valid_dataloader:
            batch_x, batch_d1, batch_d2, batch_y = data_unpack(batch_x_cat, batch_y)
            weight_o = model(batch_x, batch_d1, batch_d2, False)
            score_val += prediction(weight_o, batch_y)
            n_eval_batch += 1
        # end for
        score_val = score_val / n_eval_batch

        # if i % 10 == 0:
        print('Epoch [{}/{}]\t train_loss: {:4f}\t train_f1: {:.3f}\t test_f1: {:.3f}'.format(
            i, N_EPOCHS, loss, score_train, score_val))

        # save best model
        current_score = score_val
        if current_score > best_score:
            patience = 0
            best_score = current_score
            torch.save(model.state_dict(), FPATH_BEST_MODEL)
            print('Model saved to <{}>'.format(FPATH_BEST_MODEL))
        if patience >= 10:
            print('Earlystopping: patience = {}'.format(patience))
            break
    # end for

    with open('saved_models/20190122/results_20190122.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow([TIMESTAMP, round(best_score, 3), OPT, BATCH_SIZE, N_EPOCHS, LR,
                         DECAY, n_filters, p_dropout, MAX_LEN, POS_EMBED_SIZE,
                         N_POS_EMBED, WINDOW, N_CLASS])
        f.close()

    print('\n------------------------------------------------- Test --------------------------------------------------\n')
    # model = torch.load('saved_models/20190122/crcnn_opt-adam_epoch-50_lr-0.001_decay-0_20190122-2049.pkl')

    # test
    data_test = pro.load_data('data/nine_test.txt')
    sent_test, y_test, dist1_test, dist2_test = pro.vectorize(data_test, word_dict, MAX_LEN)
    y_test = np.array(y_test).astype(np.int64)
    X_test = np.concatenate((sent_test, np.array(dist1_test), np.array(dist2_test)), 1)
    print('Data shape: X_test={}, y_test={}'.format(X_test.shape, y_test.shape))

    # evaluate on test set
    tensor_X_test = torch.from_numpy(X_test.astype(np.int64))
    tensor_y_test = torch.LongTensor(y_test)
    test_datasets = D.TensorDataset(data_tensor=tensor_X_test, target_tensor=tensor_y_test)
    test_dataloader = D.DataLoader(dataset=test_datasets, batch_size=BATCH_SIZE,
                                   shuffle=True, num_workers=2)
    score_test = 0
    n_test_batch = 0
    y_predict_test = []
    y_true_test = []
    for (batch_x_cat, batch_y) in test_dataloader:
        batch_x, batch_d1, batch_d2, batch_y = data_unpack(batch_x_cat, batch_y)
        weight_o = model(batch_x, batch_d1, batch_d2, False)
        y_true_test.extend(list(batch_y.cpu().data.numpy()))
        y_predict_test.extend(list(torch.max(weight_o, 1)[1].long().cpu().data.numpy()))
        score_test += prediction(weight_o, batch_y)
        n_test_batch += 1
    # end for
    score_test = score_test / n_test_batch
    print('score_test={:.3f}'.format(score_test))

    # save y_predict to txt file and run official scorer
    target_dict = json.load(open('data/target_dict.txt', 'r', encoding='utf-8'))  # 19 classes
    c_target_dict = {value: key for key, value in target_dict.items()}  # label -> name
    y_predict_test_names = [c_target_dict[i] for i in y_predict_test]
    y_true_test_names = [c_target_dict[i] for i in y_true_test]
    FPATH_Y_PRED_TXT = 'saved_models/20190122/y_predict_{}.txt'.format(TIMESTAMP)
    FPATH_Y_TRUE_TXT = 'saved_models/20190122/y_true_{}.txt'.format(TIMESTAMP)
    with open(FPATH_Y_PRED_TXT, 'w') as f:
        for i, p in enumerate(y_predict_test_names):
            f.write('{}\t{}'.format(i, p))
            f.write('\n')
        f.close()
    with open(FPATH_Y_TRUE_TXT, 'w') as f:
        for i, t in enumerate(y_true_test_names):
            f.write('{}\t{}'.format(i, t))
            f.write('\n')
        f.close()
    print('TXT files saved to <{}> and <{}>'.format(FPATH_Y_PRED_TXT, FPATH_Y_TRUE_TXT))

    PERL_PATH = 'data/semeval2010_task8_scorer-v1.2.pl'
    process = subprocess.Popen(["perl", PERL_PATH, FPATH_Y_PRED_TXT, FPATH_Y_TRUE_TXT],
                               stdout=subprocess.PIPE)
    for line in process.communicate()[0].decode("utf-8").split("\n"):
        print(line)

    print('\n------------------------------------------------- END --------------------------------------------------\n\n\n')
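# Assumed entry point (not shown in the original excerpt): run main() behind the usual
# guard so the script trains, evaluates and invokes the official scorer when executed.
if __name__ == '__main__':
    main()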
def extract_relations_cl(model, word_dict, type_dic,
                         test_file=os.path.join(CURRENT_DIR, "temp/AIDA_plain_text.txt"),
                         batch_size=64, T=1.0,
                         result_path=os.path.join(CURRENT_DIR, "temp/AIDA_results.txt")):
    t_data = data_pro.load_data(test_file)
    t_x, t_y, t_e1, t_e2, t_dist1, t_dist2, t_en_type_vec, t_pool_mask_e1, \
        t_pool_mask, t_pool_mask_e2 = data_pro.vectorize_cl(t_data, word_dict, type_dic)
    t_y = np.array(t_y).astype(np.int64)
    t_np_cat = np.concatenate(
        (t_x, np.array(t_dist1), np.array(t_dist2), np.array(t_en_type_vec),
         np.array(t_pool_mask_e1), np.array(t_pool_mask),
         np.array(t_pool_mask_e2)), 1)
    test = torch.from_numpy(t_np_cat.astype(np.int64))
    t_y_tensor = torch.from_numpy(t_y)
    test_datasets = D.TensorDataset(test, t_y_tensor)
    test_dataloader = D.DataLoader(test_datasets, batch_size, False, num_workers=1)

    results = []
    confidence_score = []
    # count = 0
    bad_count = 0
    with torch.no_grad():
        for (b_x_cat, b_y) in test_dataloader:
            bx, bd1, bd2, ben, bmask1, bmask, bmask2, by = data_unpack_cl(b_x_cat, b_y)
            try:
                logits = model(bx, bd1, bd2, ben, bmask1, bmask, bmask2, False)
            except RuntimeError as e:
                print("BAD" + "=" * 50)
                print("b_x_cat ({}) = {}".format(b_x_cat.size(), b_x_cat))
                print("b_y ({}) = {}".format(b_y.size(), b_y))
                print("bx ({}) = {}".format(bx.size(), bx))
                print("bd1 ({}) = {}".format(bd1.size(), bd1))
                print("bd2 ({}) = {}".format(bd2.size(), bd2))
                print("ben ({}) = {}".format(ben.size(), ben))
                print("bmask1 ({}) = {}".format(bmask1.size(), bmask1))
                print("bmask ({}) = {}".format(bmask.size(), bmask))
                print("bmask2 ({}) = {}".format(bmask2.size(), bmask2))
                print("by ({}) = {}".format(by.size(), by))
                print("BAD" + "=" * 50)
                print("\n\n\n\n")
                # logits = torch.empty(b_x_cat.size(0), model.nr, dtype=bmask1.dtype, device=bmask1.device).fill_(1e-8)
                # logits[:, 32] = 1.0
                # bad_count += 1
                # continue
                raise RuntimeError(e)
            # if count < 1:
            #     print("GOOD" + "=" * 50)
            #     print("b_x_cat ({}) = {}".format(b_x_cat.size(), b_x_cat))
            #     print("b_y ({}) = {}".format(b_y.size(), b_y))
            #     print("bx ({}) = {}".format(bx.size(), bx))
            #     print("bd1 ({}) = {}".format(bd1.size(), bd1))
            #     print("bd2 ({}) = {}".format(bd2.size(), bd2))
            #     print("ben ({}) = {}".format(ben.size(), ben))
            #     print("bmask1 ({}) = {}".format(bmask1.size(), bmask1))
            #     print("bmask ({}) = {}".format(bmask.size(), bmask))
            #     print("bmask2 ({}) = {}".format(bmask2.size(), bmask2))
            #     print("by ({}) = {}".format(by.size(), by))
            #     print(logits.size())
            #     print("\n\n\n\n")
            #     count += 1
            score = torch.nn.functional.softmax(logits / T, 1).tolist()
            predict = torch.max(logits, 1)[1].tolist()
            temp = []
            for idx in range(len(predict)):
                temp.append(score[idx][predict[idx]])
            results.append(predict)
            confidence_score.append(temp)

    # with open("temp/results_post_sponsor.txt", "w") as fmodel:
    with open(result_path, "w", encoding="utf-8") as fmodel:
        for result, score in zip(results, confidence_score):
            for idx, rel in enumerate(result):
                fmodel.write("{}\t{}\n".format(rel, score[idx]))
                # fmodel.write("{}\t{}\n".format(rel.item(), score[idx].item()))
    print("test done!")
    print("BAD batches: {} (batch size = {})".format(bad_count, batch_size))
    return True
# encoding:utf-8
import data_pro as pro
import numpy as np
import torch
import lstm
import torch.utils.data as D
from torch.autograd import Variable
import torch.nn.functional as F
import random
from sklearn import cross_validation

'''training data'''
train_data = pro.load_data('train_pad.txt')
word_dict = {'unk': 0}
word_dict = pro.build_dict(train_data, word_dict)
train_tag = pro.load_data('tag.txt')
tag_dict = {}
tag_dict = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4}
# tag_dict=pro.build_dict(train_tag,tag_dict)

import argparse

parser = argparse.ArgumentParser(description='question classification')
parser.add_argument('-embed_dim', type=int, default=50)
parser.add_argument('-embed_num', type=int, default=len(word_dict))
parser.add_argument('-dropout', type=float, default=0.5)
parser.add_argument('-hidden_size', type=int, default=100)
parser.add_argument('-batch_size', type=int, default=20)
parser.add_argument('-epochs', type=int, default=300)
parser.add_argument('-t_size', type=int, default=100)
parser.add_argument('-class_num', type=int, default=len(tag_dict))