Example #1
    def testLoad(self):
        cfg = self.cfg
        entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
        word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
        relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

        train_documents = load_documents(cfg['data_folder'] + cfg['train_documents'])
        train_document_entity_indices, train_document_texts = index_document_entities(train_documents, word2id,
                                                                                      entity2id,
                                                                                      cfg['max_document_word'])
        train_data = DataLoader(cfg['data_folder'] + cfg['train_data'], train_documents, train_document_entity_indices,
                                train_document_texts, word2id, relation2id, entity2id, cfg['max_query_word'],
                                cfg['max_document_word'], cfg['use_kb'], cfg['use_doc'], cfg['use_inverse_relation'])
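Every example on this page starts by building id maps with load_dict, whose body is not shown. A minimal sketch, assuming each vocabulary file lists one entry per line and the line order defines the id (the file format is an assumption, not confirmed by these snippets):

def load_dict(filename):
    # Hypothetical sketch: map each line of a vocabulary file to its line index.
    word2id = {}
    with open(filename, encoding='utf-8') as f_in:
        for line in f_in:
            word2id[line.strip()] = len(word2id)  # ids assigned in file order
    return word2id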
Example #2
def test(cfg):
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    test_documents = load_documents(cfg['data_folder'] + cfg['test_documents'])
    test_document_entity_indices, test_document_texts = index_document_entities(
        test_documents, word2id, entity2id, cfg['max_document_word'])
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'],
                           test_documents, test_document_entity_indices,
                           test_document_texts, word2id, relation2id,
                           entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'],
                           cfg['use_doc'], cfg['use_inverse_relation'])

    my_model = get_model(cfg, test_data.num_kb_relation, len(entity2id),
                         len(word2id)).to(device)
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)
    return test_acc


def train(cfg):
    tf_logger = SummaryWriter('tf_logs/' + cfg['model_id'])

    # train and test share the same set of documents
    documents = load_documents(cfg['data_folder'] +
                               cfg['{}_documents'.format(cfg['mode'])])

    # train data
    train_data = DataLoader(cfg, documents)
    valid_data = DataLoader(cfg, documents, mode='dev')

    model = KAReader(cfg)
    model = model.to(torch.device('cuda'))

    trainable = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(trainable, lr=cfg['learning_rate'])

    if cfg['lr_schedule']:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optim, [30],
                                                         gamma=0.5)

    model.train()
    best_val_f1 = 0
    best_val_hits = 0
    for epoch in range(cfg['num_epoch']):
        batcher = train_data.batcher(shuffle=True)
        train_loss = []
        for feed in batcher:
            loss, pred, pred_dist = model(feed)
            train_loss.append(loss.item())
            # acc, max_acc = cal_accuracy(pred, feed['answers'].cpu().numpy())
            # train_acc.append(acc)
            # train_max_acc.append(max_acc)
            optim.zero_grad()
            loss.backward()
            if cfg['gradient_clip'] != 0:
                torch.nn.utils.clip_grad_norm_(trainable, cfg['gradient_clip'])
            optim.step()
        tf_logger.add_scalar('avg_batch_loss', np.mean(train_loss), epoch)

        val_f1, val_hits = test(model, valid_data, cfg['eps'])
        if cfg['lr_schedule']:
            scheduler.step()
        tf_logger.add_scalar('eval_f1', val_f1, epoch)
        tf_logger.add_scalar('eval_hits', val_hits, epoch)
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
        if val_hits > best_val_hits:
            best_val_hits = val_hits
            torch.save(
                model.state_dict(),
                'model/{}/{}_best.pt'.format(cfg['name'], cfg['model_id']))
        print('evaluation best f1:{} current:{}'.format(best_val_f1, val_f1))
        print('evaluation best hits:{} current:{}'.format(
            best_val_hits, val_hits))

    print('save final model')
    torch.save(model.state_dict(),
               'model/{}/{}_final.pt'.format(cfg['name'], cfg['model_id']))

    # model_save_path = 'model/{}/{}_best.pt'.format(cfg['name'], cfg['model_id'])
    # model.load_state_dict(torch.load(model_save_path))

    print('\n..........Finished training, start testing.......')

    test_data = DataLoader(cfg, documents, mode='test')
    model.eval()
    print('finished training, testing final model...')
    test(model, test_data, cfg['eps'])
    print('how many eval samples......', len(f1s))
    print('avg_f1', np.mean(f1s))
    print('avg_hits', np.mean(hits))

    model.train()
    return np.mean(f1s), np.mean(hits)


if __name__ == "__main__":
    # config_file = sys.argv[2]
    cfg = get_config()
    random.seed(cfg['seed'])
    np.random.seed(cfg['seed'])
    torch.manual_seed(cfg['seed'])
    torch.cuda.manual_seed_all(cfg['seed'])
    if cfg['mode'] == 'train':
        train(cfg)
    elif cfg['mode'] == 'test':
        documents = load_documents(cfg['data_folder'] +
                                   cfg['{}_documents'.format(cfg['mode'])])
        test_data = DataLoader(cfg, documents, mode='test')
        model = KAReader(cfg)
        model = model.to(torch.device('cuda'))
        model_save_path = 'model/{}/{}_best.pt'.format(cfg['name'],
                                                       cfg['model_id'])
        model.load_state_dict(torch.load(model_save_path))
        model.eval()
        test(model, test_data, cfg['eps'])
    else:
        assert False, "--train or --test?"
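The train function in this example calls a test(model, data, eps) helper that is not part of the snippet and then reads module-level f1s and hits lists. A hedged sketch of what such a helper might look like; the feed['answers'] field is taken from the commented-out line inside train, while compute_f1 and compute_hits are hypothetical metric helpers, not functions from the original repository:

import numpy as np
import torch

f1s, hits = [], []  # module-level lists that train() reads after calling test()

def test(model, data, eps):
    # Hypothetical sketch: one evaluation pass collecting per-sample F1 and hits@1.
    f1s.clear()
    hits.clear()
    model.eval()
    with torch.no_grad():
        for feed in data.batcher(shuffle=False):
            _, pred, pred_dist = model(feed)
            for dist, answers in zip(pred_dist, feed['answers']):
                f1s.append(compute_f1(dist, answers, eps))  # hypothetical helper
                hits.append(compute_hits(dist, answers))    # hypothetical helper
    model.train()  # restore training mode for the next epoch
    return np.mean(f1s), np.mean(hits)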
Example #5
def train(cfg):
    print("training ...")

    # prepare data
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    train_documents = load_documents(cfg['data_folder'] +
                                     cfg['train_documents'])
    train_document_entity_indices, train_document_texts = index_document_entities(
        train_documents, word2id, entity2id, cfg['max_document_word'])
    train_data = DataLoader(cfg['data_folder'] + cfg['train_data'],
                            train_documents, train_document_entity_indices,
                            train_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    if cfg['dev_documents'] != cfg['train_documents']:
        valid_documents = load_documents(cfg['data_folder'] +
                                         cfg['dev_documents'])
        valid_document_entity_indices, valid_document_texts = index_document_entities(
            valid_documents, word2id, entity2id, cfg['max_document_word'])
    else:
        valid_documents = train_documents
        valid_document_entity_indices, valid_document_texts = train_document_entity_indices, train_document_texts
    valid_data = DataLoader(cfg['data_folder'] + cfg['dev_data'],
                            valid_documents, valid_document_entity_indices,
                            valid_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    if cfg['test_documents'] != cfg['dev_documents']:
        test_documents = load_documents(cfg['data_folder'] +
                                        cfg['test_documents'])
        test_document_entity_indices, test_document_texts = index_document_entities(
            test_documents, word2id, entity2id, cfg['max_document_word'])
    else:
        test_documents = valid_documents
        test_document_entity_indices, test_document_texts = valid_document_entity_indices, valid_document_texts
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'],
                           test_documents, test_document_entity_indices,
                           test_document_texts, word2id, relation2id,
                           entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'],
                           cfg['use_doc'], cfg['use_inverse_relation'])

    # create model & set parameters
    my_model = get_model(cfg, train_data.num_kb_relation, len(entity2id),
                         len(word2id)).to(device)
    trainable_parameters = [
        p for p in my_model.parameters() if p.requires_grad
    ]
    optimizer = torch.optim.Adam(trainable_parameters, lr=cfg['learning_rate'])

    best_dev_acc = 0.0
    for epoch in range(cfg['num_epoch']):
        try:
            print('epoch', epoch)
            train_data.reset_batches(is_sequential=cfg['is_debug'])
            # Train
            my_model.train()
            train_loss, train_acc, train_max_acc = [], [], []
            for iteration in tqdm(
                    range(train_data.num_data // cfg['batch_size'])):
                batch = train_data.get_batch(iteration, cfg['batch_size'],
                                             cfg['fact_dropout'])
                loss, pred, _ = my_model(batch)
                pred = pred.data.cpu().numpy()
                acc, max_acc = cal_accuracy(pred, batch[-1])
                train_loss.append(float(loss.data))
                train_acc.append(acc)
                train_max_acc.append(max_acc)
                # backpropagate
                my_model.zero_grad()
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(my_model.parameters(),
                                               cfg['gradient_clip'])
                optimizer.step()
            print('avg_training_loss', sum(train_loss) / len(train_loss))
            print('max_training_acc', sum(train_max_acc) / len(train_max_acc))
            print('avg_training_acc', sum(train_acc) / len(train_acc))

            print("validating ...")
            eval_acc = inference(my_model, valid_data, entity2id, cfg)
            if eval_acc > best_dev_acc and cfg['to_save_model']:
                print("saving model to", cfg['save_model_file'])
                torch.save(my_model.state_dict(), cfg['save_model_file'])
                best_dev_acc = eval_acc

        except KeyboardInterrupt:
            break

    # Test set evaluation
    print("evaluating on test")
    print('loading model from ...', cfg['save_model_file'])
    my_model.load_state_dict(torch.load(cfg['save_model_file']))
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)

    return test_acc
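cal_accuracy is used in the training loop above but not defined in the snippet. A hedged sketch, assuming pred holds one argmax entity index per sample and batch[-1] is an answer distribution whose non-zero entries mark gold answers (both assumptions):

def cal_accuracy(pred, answer_dist):
    # Hypothetical sketch: accuracy of argmax predictions, plus the ceiling
    # set by how many samples have any gold answer among the candidates.
    num_correct = 0.0
    num_answerable = 0.0
    for i, p in enumerate(pred):
        num_correct += (answer_dist[i, p] != 0)        # argmax hit a gold answer
        num_answerable += (answer_dist[i].sum() != 0)  # sample is answerable at all
    return num_correct / len(pred), num_answerable / len(pred)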
Example #6
    def testLoadDoc(self):
        print(load_documents(self.cfg["data_folder"] + self.cfg['train_documents'])[:3])
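load_documents, the function all of these examples revolve around, is likewise not shown anywhere on this page. A minimal sketch, assuming the documents file stores one JSON object per line (the JSON-lines format is an assumption):

import json

def load_documents(document_file):
    # Hypothetical sketch: parse a JSON-lines file into a list of document dicts.
    documents = []
    with open(document_file, encoding='utf-8') as f_in:
        for line in f_in:
            documents.append(json.loads(line))
    return documents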
Example #7
def train(cfg):
    print("training ...")

    # prepare data
    # files such as entities.txt, vocab.txt, relations.txt
    """
        entity2id is a dict mapping each entity to its id.
        word2id is likewise a dict mapping each word to its id.
        relation2id is the same for relations, i.e. predicates such as has_tags.
        The three files above are fairly small.
        The train_documents file is larger: a list whose elements are dicts
        with document, title, and tokens fields.

        Under document there are text and entities fields. text is the raw text;
        entities is a list recording each entity in the text along with its kb_id.
        title is structured like document, but holds the title text.
        tokens is presumably the entity list of document and title.

        index_document_entities is presumably
    """
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    # train_document.json
    train_documents = load_documents(cfg['data_folder'] +
                                     cfg['train_documents'])

    train_document_entity_indices, train_document_texts = index_document_entities(
        train_documents, word2id, entity2id, cfg['max_document_word'])
    train_data = DataLoader(cfg['data_folder'] + cfg['train_data'],
                            train_documents, train_document_entity_indices,
                            train_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    if cfg['dev_documents'] != cfg['train_documents']:
        valid_documents = load_documents(cfg['data_folder'] +
                                         cfg['dev_documents'])
        valid_document_entity_indices, valid_document_texts = index_document_entities(
            valid_documents, word2id, entity2id, cfg['max_document_word'])
    else:
        valid_documents = train_documents
        valid_document_entity_indices, valid_document_texts = train_document_entity_indices, train_document_texts
    valid_data = DataLoader(cfg['data_folder'] + cfg['dev_data'],
                            valid_documents, valid_document_entity_indices,
                            valid_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    if cfg['test_documents'] != cfg['dev_documents']:
        test_documents = load_documents(cfg['data_folder'] +
                                        cfg['test_documents'])
        test_document_entity_indices, test_document_texts = index_document_entities(
            test_documents, word2id, entity2id, cfg['max_document_word'])
    else:
        test_documents = valid_documents
        test_document_entity_indices, test_document_texts = valid_document_entity_indices, valid_document_texts
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'],
                           test_documents, test_document_entity_indices,
                           test_document_texts, word2id, relation2id,
                           entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'],
                           cfg['use_doc'], cfg['use_inverse_relation'])

    # create model & set parameters
    my_model = get_model(cfg, train_data.num_kb_relation, len(entity2id),
                         len(word2id), "train")
    trainable_parameters = [
        p for p in my_model.parameters() if p.requires_grad
    ]
    optimizer = torch.optim.Adam(trainable_parameters, lr=cfg['learning_rate'])

    best_dev_acc = 0.0
    for epoch in range(cfg['num_epoch']):
        try:
            print('epoch', epoch)
            train_data.reset_batches(is_sequential=cfg['is_debug'])
            # Train
            my_model.train()
            train_loss, train_acc, train_max_acc = [], [], []
            for iteration in tqdm(
                    range(train_data.num_data // cfg['batch_size'])):
                batch = train_data.get_batch(iteration, cfg['batch_size'],
                                             cfg['fact_dropout'])
                loss, pred, _ = my_model(batch)
                pred = pred.data.cpu().numpy()
                acc, max_acc = cal_accuracy(pred, batch[-1])
                train_loss.append(float(loss.data))  # store a Python float, not a tensor
                train_acc.append(acc)
                train_max_acc.append(max_acc)
                # backpropagate
                my_model.zero_grad()
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(my_model.parameters(),
                                               cfg['gradient_clip'])
                optimizer.step()
            print('avg_training_loss', sum(train_loss) / len(train_loss))
            print('max_training_acc', sum(train_max_acc) / len(train_max_acc))
            print('avg_training_acc', sum(train_acc) / len(train_acc))

            print("validating ...")
            eval_acc = inference(my_model, valid_data, entity2id, cfg)
            if eval_acc > best_dev_acc and cfg['to_save_model']:
                print("saving model to", cfg['save_model_file'])
                torch.save(my_model.state_dict(), cfg['save_model_file'])
                best_dev_acc = eval_acc

        except KeyboardInterrupt:
            break

    # Test set evaluation
    print("evaluating on test")
    print('loading model from ...', cfg['save_model_file'])
    my_model.load_state_dict(torch.load(cfg['save_model_file']))
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)

    return test_acc
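index_document_entities, called in examples #1, #2, #5, and #7, returns a (document_entity_indices, document_texts) pair. A hedged sketch of the shape implied by the call sites and the docstring above, treating the tokens field as the document's word tokens (the docstring itself is unsure); keying by list position, the zero id for unknown words, and the start/end span fields are all assumptions:

def index_document_entities(documents, word2id, entity2id, max_document_word):
    # Hypothetical sketch: pad each document's token ids to max_document_word
    # and record the ids and (assumed) token spans of in-vocabulary entities.
    document_entity_indices = {}
    document_texts = {}
    for doc_id, document in enumerate(documents):
        token_ids = [word2id.get(tok, 0)  # 0 as the unknown-word id is an assumption
                     for tok in document['tokens'][:max_document_word]]
        token_ids += [0] * (max_document_word - len(token_ids))
        document_texts[doc_id] = token_ids
        spans = []
        for entity in document['document']['entities']:
            if entity['kb_id'] in entity2id:
                span = list(range(entity.get('start', 0), entity.get('end', 0)))
                spans.append((entity2id[entity['kb_id']], span))
        document_entity_indices[doc_id] = spans
    return document_entity_indices, document_texts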
Example #8
from collections import namedtuple
import sys
from sklearn import svm
import numpy as np
from nltk.corpus import wordnet as wn
from util import load_documents
from util import load_mention_pairs
from kernels import load_labels


documents = load_documents()


Feature = namedtuple('Feature', ['word', 'pos', 'cpos', 'chunktag', 'hypernym', 'netypes'])
Instance = namedtuple('Instance', ['tree', 'sent_index', 'filename'])


def features(t, sent_index, filename):
    """
    Get the features of a dep tree
    features include:
    word, POS, Collapsed_POS, ChunkTag, WordNet_Hypernym
    """
    word = t.token.lower()
    parse_tree = documents[filename].parsed_sents[sent_index]
    pos = parse_tree.pos()[t.index]
    collapsed = pos[0]
    treeposition = parse_tree.treeposition_spanning_leaves(t.index, t.index+1)[:-2]
    chunktag = parse_tree[treeposition].label()
    if chunktag.startswith("N"): chunktag = 'NP'
    elif chunktag.startswith("V"): chunktag = "VP"