def load_model(
        train_file=os.path.join(CURRENT_DIR, "../data/ere_filtered_train.txt"),
        eval_file=os.path.join(CURRENT_DIR, "../data/ere_filtered_test.txt"),
        model_path=os.path.join(CURRENT_DIR,
                                "../data/filter_ere_dp_mask_5000.636.pkl")):
    data = data_pro.load_data(train_file)
    e_data = data_pro.load_data(eval_file)
    word_dict = data_pro.build_dict(data[0] + e_data[0])
    entype_dict = data_pro.buildTypeDict(data[4] + e_data[4])
    neural_model = torch.load(model_path, map_location={'cuda:0': 'cpu'})
    neural_model.eval()
    return word_dict, entype_dict, neural_model
def load_uk(train_file=os.path.join(CURRENT_DIR,
                                    "../data/convert_ere_uk_train.txt"),
            eval_file=os.path.join(CURRENT_DIR,
                                   "../data/convert_ere_uk_test.txt"),
            model_path=os.path.join(CURRENT_DIR,
                                    "../data/uk_new_piece_5000.682.pkl")):
    data = data_pro.load_data(train_file)
    e_data = data_pro.load_data(eval_file)
    word_dict = data_pro.build_dict(data[0] + e_data[0])
    entype_dict = data_pro.buildTypeDict(data[4] + e_data[4])
    neural_model = torch.load(model_path, map_location={'cuda:0': 'cpu'})
    neural_model.eval()
    return word_dict, entype_dict, neural_model
def extract_relations(model,
                      word_dict,
                      type_dic,
                      test_file=os.path.join(CURRENT_DIR,
                                             "temp/AIDA_plain_text.txt"),
                      batch_size=64,
                      T=1.0,
                      result_path=os.path.join(CURRENT_DIR,
                                               "temp/AIDA_results.txt"),
                      depend_out_path=os.path.join(CURRENT_DIR,
                                                   "temp/dp.pkl")):
    t_data = data_pro.load_data(test_file)

    t_x, t_y, t_e1, t_e2, t_dist1, t_dist2, t_en_type_vec, t_dp_vec, t_pool_mask_e1, \
    t_pool_mask, t_pool_mask_e2 = data_pro.vectorize_full(t_data, word_dict, type_dic, dp_fpath=depend_out_path)
    t_y = np.array(t_y).astype(np.int64)
    t_np_cat = np.concatenate(
        (t_x, np.array(t_dist1), np.array(t_dist2), np.array(t_en_type_vec),
         np.array(t_dp_vec), np.array(t_pool_mask_e1), np.array(t_pool_mask),
         np.array(t_pool_mask_e2)), 1)
    test = torch.from_numpy(t_np_cat.astype(np.int64))
    t_y_tensor = torch.from_numpy(t_y)
    test_datasets = D.TensorDataset(test, t_y_tensor)
    test_dataloader = D.DataLoader(test_datasets,
                                   batch_size,
                                   False,
                                   num_workers=1)

    results = []
    confidence_score = []

    with torch.no_grad():
        for (b_x_cat, b_y) in test_dataloader:
            bx, bd1, bd2, ben, bdp, bmask1, bmask, bmask2, by = data_unpack_full(
                b_x_cat, b_y)
            logits = model(bx, bd1, bd2, ben, bdp, bmask1, bmask, bmask2,
                           False)
            score = torch.nn.functional.softmax(logits / T, 1).data
            predict = torch.max(logits, 1)[1].data
            temp = []
            for idx in range(predict.size()[0]):
                temp.append(score[idx][predict[idx]].item())
            results.append(predict.tolist())
            confidence_score.append(temp)
    # with open("temp/AIDA_results.txt", "w") as fmodel:
    with open(result_path, "w", encoding="utf-8") as fmodel:
        for result, score in zip(results, confidence_score):
            for idx, rel in enumerate(result):
                fmodel.write("{}\t{}\n".format(rel, score[idx]))
                # fmodel.write("{}\t{}\n".format(rel.item(), score[idx].item()))

    print("test done!")
    return True
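

# A minimal usage sketch (assumes the default data and model paths above exist):
#   word_dict, entype_dict, model = load_model()
#   extract_relations(model, word_dict, entype_dict)
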
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.data as D
from sklearn.model_selection import KFold
import data_pro as pro

DW = 100  # word embedding dimension
N = 123  # max sentence length
DP = 25  # position embedding dimension
NP = 123  # number of distinct relative positions
NR = 19  # number of relation classes
DC = 1000  # number of convolution filters
KP = 0.6  # dropout keep probability
K = 3  # convolution (slide) window size
LR = 0.2  # learning rate
BATCH_SIZE = 50
epochs = 100
data = pro.load_data('./data/train.txt')
t_data = pro.load_data('./data/test.txt')
word_dict = pro.build_dict(data[0])
x, y, e1, e2, dist1, dist2 = pro.vectorize(data, word_dict, N)
y = np.array(y).astype(np.int64)
np_cat = np.concatenate((x, np.array(e1).reshape(-1, 1), np.array(e2).reshape(-1, 1), np.array(dist1), np.array(dist2)),
                        1)
e_x, e_y, e_e1, e_e2, e_dist1, e_dist2 = pro.vectorize(t_data, word_dict, N)
e_y = np.array(e_y).astype(np.int64)
eval_cat = np.concatenate(
    (e_x, np.array(e_e1).reshape(-1, 1), np.array(e_e2).reshape(-1, 1), np.array(e_dist1), np.array(e_dist2)), 1)

tx, ty, te1, te2, td1, td2 = pro.vectorize(t_data, word_dict, N)
embed_file = './data/embedding/senna/embeddings.txt'
vac_file = './data/embedding/senna/words.lst'
embedding = pro.load_embedding(embed_file, vac_file, word_dict)
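# embedding: pre-trained SENNA vectors looked up for the words in word_dict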
def main():
    print(
        '\n---------------------------------------------- Setup -----------------------------------------------'
    )

    parser = ArgumentParser(description='')
    parser.add_argument('--max_len',
                        type=int,
                        metavar='<MAX_LEN>',
                        default=123,
                        help='max_len')
    parser.add_argument('--pos_embed_size',
                        type=int,
                        metavar='<POS_EMBED_SIZE>',
                        default=70,
                        help='position_embedding_size')
    parser.add_argument('--n_pos_embed',
                        type=int,
                        metavar='<N_POS_EMBED>',
                        default=123,
                        help='position_embedding_num')
    parser.add_argument('--window',
                        type=int,
                        metavar='<WINDOW>',
                        default=3,
                        help='slide_window')
    parser.add_argument('--n_filters',
                        type=int,
                        metavar='<n_filters>',
                        default=1000,
                        help='num_filters')
    parser.add_argument('--p_dropout',
                        type=float,
                        metavar='<p_dropout>',
                        default=0.5,
                        help='keep_prob')
    parser.add_argument('--epochs',
                        type=int,
                        metavar='<EPOCHS>',
                        default=50,
                        help='number of epochs')
    parser.add_argument('--lr',
                        type=float,
                        metavar='<LR>',
                        default=0.001,
                        help='learning_rate')
    parser.add_argument('--decay',
                        type=float,
                        metavar='<decay>',
                        default=0,
                        help='weight_decay')
    parser.add_argument('--batch_size',
                        type=int,
                        metavar='<BATCH_SIZE>',
                        default=32,
                        help='batch_size')
    parser.add_argument('--opt',
                        type=str,
                        metavar='<OPT>',
                        default='adam',
                        help='optimizer: adam or sgd')
    A = parser.parse_args()

    N_CLASS = 19  # class_num
    N_EPOCHS = A.epochs
    MAX_LEN = A.max_len  # max_len
    POS_EMBED_SIZE = A.pos_embed_size  # position_embedding_size
    N_POS_EMBED = A.n_pos_embed  # position_embedding_num
    WINDOW = A.window  # slide_window
    BATCH_SIZE = A.batch_size
    n_filters = A.n_filters  # num_filters
    p_dropout = A.p_dropout  # keep_prob
    LR = A.lr  # learning_rate
    DECAY = A.decay  # learning rate decay
    OPT = A.opt
    TIMESTAMP = time.strftime("%Y%m%d-%H%M")
    FPATH_BEST_MODEL = 'saved_models/20190122/crcnn_opt-{}_epoch-{}_lr-{}_decay-{}_{}.pkl'.format(
        OPT, N_EPOCHS, LR, DECAY, TIMESTAMP)

    print('Parameters:\n{}'.format(
        dict(MAX_LEN=MAX_LEN,
             POS_EMBED_SIZE=POS_EMBED_SIZE,
             N_POS_EMBED=N_POS_EMBED,
             N_CLASS=N_CLASS,
             n_filters=n_filters,
             p_dropout=p_dropout,
             WINDOW=WINDOW,
             LR=LR,
             DECAY=DECAY,
             BATCH_SIZE=BATCH_SIZE,
             EPOCHS=N_EPOCHS,
             OPT=OPT,
             TIMESTAMP=TIMESTAMP)))

    # print('\n---------------------------------------------- Load Data -----------------------------------------------')

    data_train_valid = pro.load_data('data/nine_train.txt')

    concat = list(
        zip(data_train_valid[0], data_train_valid[1], data_train_valid[2],
            data_train_valid[3]))
    data_train, data_validation = train_test_split(concat,
                                                   test_size=0.2,
                                                   random_state=0)
    # print(data_train[0])

    new_data_train = [i for i in zip(*data_train)]
    new_data_validation = [i for i in zip(*data_validation)]

    word_dict = pro.build_dict(
        new_data_train[0] +
        new_data_validation[0])  # word_dict: 19215 words and their id
    print('len(word_dict): ', len(word_dict))

    sent_train, y_train, dist1_train, dist2_train = pro.vectorize(
        new_data_train, word_dict, MAX_LEN)
    y_train = np.array(y_train).astype(np.int64)
    X_train = np.concatenate(
        (sent_train, np.array(dist1_train), np.array(dist2_train)), 1)
    print('Data shape: X_train={}, y_train={}'.format(X_train.shape,
                                                      y_train.shape))

    sent_valid, y_valid, dist1_valid, dist2_valid = pro.vectorize(
        new_data_validation, word_dict, MAX_LEN)
    y_valid = np.array(y_valid).astype(np.int64)
    X_valid = np.concatenate(
        (sent_valid, np.array(dist1_valid), np.array(dist2_valid)), 1)
    print('Data shape: X_valid={}, y_valid={}'.format(X_valid.shape,
                                                      y_valid.shape))

    # fpath_embedding = '../relation-extraction-ly-dev/data/pre_trained_embeddings/glove.6B.300d.txt'
    # embedding_matrix = pro.load_glove_embeddings(fpath_embedding, word_dict)
    # print('Pre-trained embeddings loaded from <{}>.'.format(fpath_embedding))
    # np.save('data/embedding_matrix.npy', embedding_matrix)
    embedding_matrix = np.load('data/embedding_matrix.npy')

    print(
        '\n---------------------------------------------- Build Model -----------------------------------------------'
    )

    model = CR_CNN(MAX_LEN, embedding_matrix, POS_EMBED_SIZE, N_POS_EMBED,
                   WINDOW, N_CLASS, n_filters, p_dropout).cuda()
    print(model)

    loss_func = PairwiseRankingLoss(N_CLASS)
    if OPT == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=LR,
                                     weight_decay=DECAY)
    elif OPT == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=LR,
                                    weight_decay=DECAY)
    # end if

    print(
        '\n------------------------------------------------- Train --------------------------------------------------'
    )

    def data_unpack(cat_data, target):
        list_x = np.split(cat_data.numpy(), [MAX_LEN, MAX_LEN + N_POS_EMBED],
                          1)
        batch_x = Variable(torch.from_numpy(list_x[0])).cuda()
        batch_d1 = Variable(torch.from_numpy(list_x[1])).cuda()
        batch_d2 = Variable(torch.from_numpy(list_x[2])).cuda()
        target = Variable(target).cuda()
        return batch_x, batch_d1, batch_d2, target

    def prediction(sc, y):
        '''
        Calculate the F1 score between y_true and y_predict.

        c_target_dict: maps the 19 relation labels to relation names
        tr_target_dict: maps the 19 relation names to 10 relation labels

        '''
        y_true = y.cpu().data.numpy()
        y_predict = torch.max(sc, 1)[1].long().cpu().data.numpy()
        f1 = f1_score(y_true, y_predict, average='micro')
        return f1 * 100

    # end def

    best_score = 0
    patience = 0
    for i in range(1, N_EPOCHS + 1):
        patience += 1

        # train over batches
        tensor_x_train = torch.from_numpy(X_train.astype(np.int64))
        tensor_y_train = torch.LongTensor(y_train)
        train_datasets = D.TensorDataset(tensor_x_train, tensor_y_train)
        train_dataloader = D.DataLoader(dataset=train_datasets,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                        num_workers=2)
        score_train = 0
        loss = 0
        n_trained_batch = 0
        for (batch_x_cat, batch_y) in train_dataloader:
            n_trained_batch += 1
            batch_x, batch_d1, batch_d2, batch_y = data_unpack(
                batch_x_cat, batch_y)
            # print('batch_x: ', batch_x.shape)
            # print('batch_d1: ', batch_d1.shape)
            # print('batch_d2: ', batch_d2.shape)
            weight_o = model(batch_x, batch_d1, batch_d2)
            loss_per_batch = loss_func(weight_o, batch_y)
            optimizer.zero_grad()
            loss_per_batch.backward()
            optimizer.step()
            loss += loss_per_batch.item()  # accumulate as a float; avoids holding the autograd graph
            score_train += prediction(weight_o, batch_y)
        # end for
        loss = loss / n_trained_batch
        score_train = score_train / n_trained_batch

        # evaluate over batches
        tensor_X_valid = torch.from_numpy(X_valid.astype(np.int64))
        tensor_y_valid = torch.LongTensor(y_valid)
        valid_datasets = D.TensorDataset(tensor_X_valid, tensor_y_valid)
        valid_dataloader = D.DataLoader(dataset=valid_datasets,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                        num_workers=2)
        score_val = 0
        n_eval_batch = 0
        for (batch_x_cat, batch_y) in valid_dataloader:
            batch_x, batch_d1, batch_d2, batch_y = data_unpack(
                batch_x_cat, batch_y)
            weight_o = model(batch_x, batch_d1, batch_d2, False)
            score_val += prediction(weight_o, batch_y)
            n_eval_batch += 1
        # end for
        score_val = score_val / n_eval_batch
        # if i % 10 == 0:
        print(
            'Epoch [{}/{}]\t train_loss: {:.4f}\t train_f1: {:.3f}\t test_f1: {:.3f}'
            .format(i, N_EPOCHS, loss, score_train, score_val))

        # save best model
        current_score = score_val
        if current_score > best_score:
            patience = 0
            best_score = current_score
            torch.save(model.state_dict(), FPATH_BEST_MODEL)
            print('Model saved to <{}>'.format(FPATH_BEST_MODEL))

        if patience >= 10:
            print('Early stopping: patience = {}'.format(patience))
            break
        # end for

    with open('saved_models/20190122/results_20190122.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow([
            TIMESTAMP,
            round(best_score,
                  3), OPT, BATCH_SIZE, N_EPOCHS, LR, DECAY, n_filters,
            p_dropout, MAX_LEN, POS_EMBED_SIZE, N_POS_EMBED, WINDOW, N_CLASS
        ])

    print(
        '\n------------------------------------------------- Test --------------------------------------------------\n'
    )

    # model = torch.load('saved_models/20190122/crcnn_opt-adam_epoch-50_lr-0.001_decay-0_20190122-2049.pkl')

    # test
    data_test = pro.load_data('data/nine_test.txt')
    sent_test, y_test, dist1_test, dist2_test = pro.vectorize(
        data_test, word_dict, MAX_LEN)
    y_test = np.array(y_test).astype(np.int64)
    X_test = np.concatenate(
        (sent_test, np.array(dist1_test), np.array(dist2_test)), 1)
    print('Data shape: X_test={}, y_test={}'.format(X_test.shape,
                                                    y_test.shape))

    # evaluate on test set
    tensor_X_test = torch.from_numpy(X_test.astype(np.int64))
    tensor_y_test = torch.LongTensor(y_test)
    test_datasets = D.TensorDataset(tensor_X_test, tensor_y_test)
    test_dataloader = D.DataLoader(dataset=test_datasets,
                                   batch_size=BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=2)
    score_test = 0
    n_test_batch = 0
    y_predict_test = []
    y_true_test = []
    for (batch_x_cat, batch_y) in test_dataloader:
        batch_x, batch_d1, batch_d2, batch_y = data_unpack(
            batch_x_cat, batch_y)
        weight_o = model(batch_x, batch_d1, batch_d2, False)

        y_true_test.extend(list(batch_y.cpu().data.numpy()))
        y_predict_test.extend(
            list(torch.max(weight_o, 1)[1].long().cpu().data.numpy()))

        score_test += prediction(weight_o, batch_y)
        n_test_batch += 1
    # end for
    score_test = score_test / n_test_batch
    print('score_test={:.3f}'.format(score_test))

    # save y_predict to txt file and run official scorer
    target_dict = json.load(open('data/target_dict.txt', 'r',
                                 encoding='utf-8'))  # 19 classes
    c_target_dict = {value: key
                     for key, value in target_dict.items()}  # label -> name

    y_predict_test_names = [c_target_dict[i] for i in y_predict_test]
    y_true_test_names = [c_target_dict[i] for i in y_true_test]

    FPATH_Y_PRED_TXT = 'saved_models/20190122/y_predict_{}.txt'.format(
        TIMESTAMP)
    FPATH_Y_TRUE_TXT = 'saved_models/20190122/y_true_{}.txt'.format(TIMESTAMP)
    with open(FPATH_Y_PRED_TXT, 'w') as f:
        for i, p in enumerate(y_predict_test_names):
            f.write('{}\t{}'.format(i, p))
            f.write('\n')

    with open(FPATH_Y_TRUE_TXT, 'w') as f:
        for i, t in enumerate(y_true_test_names):
            f.write('{}\t{}'.format(i, t))
            f.write('\n')

    print('TXT files saved to <{}> and <{}>'.format(FPATH_Y_PRED_TXT,
                                                    FPATH_Y_TRUE_TXT))

    PERL_PATH = 'data/semeval2010_task8_scorer-v1.2.pl'
    process = subprocess.Popen(
        ["perl", PERL_PATH, FPATH_Y_PRED_TXT, FPATH_Y_TRUE_TXT],
        stdout=subprocess.PIPE)
    for line in process.communicate()[0].decode("utf-8").split("\n"):
        print(line)

    print(
        '\n------------------------------------------------- END --------------------------------------------------\n\n\n'
    )
def extract_relations_cl(model,
                         word_dict,
                         type_dic,
                         test_file=os.path.join(CURRENT_DIR,
                                                "temp/AIDA_plain_text.txt"),
                         batch_size=64,
                         T=1.0,
                         result_path=os.path.join(CURRENT_DIR,
                                                  "temp/AIDA_results.txt")):
    t_data = data_pro.load_data(test_file)
    t_x, t_y, t_e1, t_e2, t_dist1, t_dist2, t_en_type_vec, t_pool_mask_e1, \
    t_pool_mask, t_pool_mask_e2 = data_pro.vectorize_cl(t_data, word_dict, type_dic)
    t_y = np.array(t_y).astype(np.int64)
    t_np_cat = np.concatenate(
        (t_x, np.array(t_dist1), np.array(t_dist2), np.array(t_en_type_vec),
         np.array(t_pool_mask_e1), np.array(t_pool_mask),
         np.array(t_pool_mask_e2)), 1)
    test = torch.from_numpy(t_np_cat.astype(np.int64))
    t_y_tensor = torch.from_numpy(t_y)
    test_datasets = D.TensorDataset(test, t_y_tensor)
    test_dataloader = D.DataLoader(test_datasets,
                                   batch_size,
                                   False,
                                   num_workers=1)

    results = []
    confidence_score = []

    # count = 0
    bad_count = 0
    with torch.no_grad():
        for (b_x_cat, b_y) in test_dataloader:
            bx, bd1, bd2, ben, bmask1, bmask, bmask2, by = data_unpack_cl(
                b_x_cat, b_y)
            try:
                logits = model(bx, bd1, bd2, ben, bmask1, bmask, bmask2, False)
            except RuntimeError as e:
                print("BAD" + "=" * 50)
                print("b_x_cat ({}) = {}".format(b_x_cat.size(), b_x_cat))
                print("b_y ({}) = {}".format(b_y.size(), b_y))
                print("bx ({}) = {}".format(bx.size(), bx))
                print("bd1 ({}) = {}".format(bd1.size(), bd1))
                print("bd2 ({}) = {}".format(bd2.size(), bd2))
                print("ben ({}) = {}".format(ben.size(), ben))
                print("bmask1 ({}) = {}".format(bmask1.size(), bmask1))
                print("bmask ({}) = {}".format(bmask.size(), bmask))
                print("bmask2 ({}) = {}".format(bmask2.size(), bmask2))
                print("by ({}) = {}".format(by.size(), by))
                print("BAD" + "=" * 50)
                print("\n\n\n\n")

                # logits = torch.empty(b_x_cat.size(0), model.nr, dtype=bmask1.dtype, device=bmask1.device).fill_(1e-8)
                # logits[:, 32] = 1.0
                #
                bad_count += 1
                # continue
                raise RuntimeError(e)

            # if count < 1:
            #     print("GOOD" + "=" * 50)
            #     print("b_x_cat ({}) = {}".format(b_x_cat.size(), b_x_cat))
            #     print("b_y ({}) = {}".format(b_y.size(), b_y))
            #     print("bx ({}) = {}".format(bx.size(), bx))
            #     print("bd1 ({}) = {}".format(bd1.size(), bd1))
            #     print("bd2 ({}) = {}".format(bd2.size(), bd2))
            #     print("ben ({}) = {}".format(ben.size(), ben))
            #     print("bmask1 ({}) = {}".format(bmask1.size(), bmask1))
            #     print("bmask ({}) = {}".format(bmask.size(), bmask))
            #     print("bmask2 ({}) = {}".format(bmask2.size(), bmask2))
            #     print("by ({}) = {}".format(by.size(), by))
            #     print(logits.size())
            #     print("\n\n\n\n")
            #     count += 1

            score = torch.nn.functional.softmax(logits / T, 1).tolist()
            predict = torch.max(logits, 1)[1].tolist()
            temp = []
            for idx in range(len(predict)):
                temp.append(score[idx][predict[idx]])
            results.append(predict)
            confidence_score.append(temp)

    # with open("temp/results_post_sponsor.txt", "w") as fmodel:
    with open(result_path, "w", encoding="utf-8") as fmodel:
        for result, score in zip(results, confidence_score):
            for idx, rel in enumerate(result):
                fmodel.write("{}\t{}\n".format(rel, score[idx]))
                # fmodel.write("{}\t{}\n".format(rel.item(), score[idx].item()))

    print("test done!")
    print("BAD batches: {} (batch size = {})".format(bad_count, batch_size))
    return True
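

# A minimal usage sketch (assumes a model trained on this feature layout; load_uk() is shown
# here only as an illustration of obtaining word_dict/type_dic and a model):
#   word_dict, entype_dict, model = load_uk()
#   extract_relations_cl(model, word_dict, entype_dict)
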
# encoding:utf-8
import data_pro as pro
import numpy as np
import torch
import lstm
import torch.utils.data as D
from torch.autograd import Variable
import torch.nn.functional as F
import random
from sklearn import cross_validation  # note: removed in scikit-learn >= 0.20; use sklearn.model_selection instead

'''training data'''
train_data = pro.load_data('train_pad.txt')
word_dict = {'unk': 0}
word_dict = pro.build_dict(train_data, word_dict)
train_tag = pro.load_data('tag.txt')
tag_dict = {}
tag_dict = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4}
# tag_dict=pro.build_dict(train_tag,tag_dict)

import argparse

parser = argparse.ArgumentParser(description='question classification')
parser.add_argument('-embed_dim', type=int, default=50)
parser.add_argument('-embed_num', type=int, default=len(word_dict))
parser.add_argument('-dropout', type=float, default=0.5)
parser.add_argument('-hidden_size', type=int, default=100)
parser.add_argument('-batch_size', type=int, default=20)
parser.add_argument('-epochs', type=int, default=300)
parser.add_argument('-t_size', type=int, default=100)
parser.add_argument('-class_num', type=int, default=len(tag_dict))