Example #1
    def testLoad(self):
        cfg = self.cfg
        entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
        word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
        relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

        train_documents = load_documents(cfg['data_folder'] + cfg['train_documents'])
        train_document_entity_indices, train_document_texts = index_document_entities(train_documents, word2id,
                                                                                      entity2id,
                                                                                      cfg['max_document_word'])
        train_data = DataLoader(cfg['data_folder'] + cfg['train_data'], train_documents, train_document_entity_indices,
                                train_document_texts, word2id, relation2id, entity2id, cfg['max_query_word'],
                                cfg['max_document_word'], cfg['use_kb'], cfg['use_doc'], cfg['use_inverse_relation'])
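A minimal sketch of the test fixture this method appears to assume (the class name, file names, and limits below are hypothetical; only the config keys are taken from the snippet):

import unittest

class LoaderTest(unittest.TestCase):  # hypothetical class name
    def setUp(self):
        # Hypothetical config; file names and limits are placeholders.
        self.cfg = {
            'data_folder': 'datasets/example/',
            'entity2id': 'entities.txt',
            'word2id': 'vocab.txt',
            'relation2id': 'relations.txt',
            'train_documents': 'train_documents.json',
            'train_data': 'train.json',
            'max_document_word': 100,
            'max_query_word': 10,
            'use_kb': True,
            'use_doc': True,
            'use_inverse_relation': False,
        }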
Example #2
def test(cfg):
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    test_documents = load_documents(cfg['data_folder'] + cfg['test_documents'])
    test_document_entity_indices, test_document_texts = index_document_entities(
        test_documents, word2id, entity2id, cfg['max_document_word'])
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'],
                           test_documents, test_document_entity_indices,
                           test_document_texts, word2id, relation2id,
                           entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'],
                           cfg['use_doc'], cfg['use_inverse_relation'])

    my_model = get_model(cfg, test_data.num_kb_relation, len(entity2id),
                         len(word2id)).to(device)
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)
    return test_acc
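For context, a minimal sketch of driving test(cfg), assuming the config is a plain dict loaded from a JSON file (the file name and loading step are assumptions; the module-level device that the snippet references is defined here as well):

import json
import torch

# test() above references a module-level `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if __name__ == '__main__':
    # Hypothetical config path; the file must supply every key test() reads.
    with open('config/example.json') as f:
        cfg = json.load(f)
    print('test accuracy:', test(cfg))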
Example #3
import torch
from tqdm import tqdm
# Assumed to be in scope from the surrounding repo: load_dict, load_documents,
# index_document_entities, DataLoader, get_model, inference, cal_accuracy,
# and a module-level `device`.
def train(cfg):
    print("training ...")

    # prepare data
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    train_documents = load_documents(cfg['data_folder'] +
                                     cfg['train_documents'])
    train_document_entity_indices, train_document_texts = index_document_entities(
        train_documents, word2id, entity2id, cfg['max_document_word'])
    train_data = DataLoader(cfg['data_folder'] + cfg['train_data'],
                            train_documents, train_document_entity_indices,
                            train_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    if cfg['dev_documents'] != cfg['train_documents']:
        valid_documents = load_documents(cfg['data_folder'] +
                                         cfg['dev_documents'])
        valid_document_entity_indices, valid_document_texts = index_document_entities(
            valid_documents, word2id, entity2id, cfg['max_document_word'])
    else:
        valid_documents = train_documents
        valid_document_entity_indices, valid_document_texts = train_document_entity_indices, train_document_texts
    valid_data = DataLoader(cfg['data_folder'] + cfg['dev_data'],
                            valid_documents, valid_document_entity_indices,
                            valid_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    if cfg['test_documents'] != cfg['dev_documents']:
        test_documents = load_documents(cfg['data_folder'] +
                                        cfg['test_documents'])
        test_document_entity_indices, test_document_texts = index_document_entities(
            test_documents, word2id, entity2id, cfg['max_document_word'])
    else:
        test_documents = valid_documents
        test_document_entity_indices, test_document_texts = valid_document_entity_indices, valid_document_texts
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'],
                           test_documents, test_document_entity_indices,
                           test_document_texts, word2id, relation2id,
                           entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'],
                           cfg['use_doc'], cfg['use_inverse_relation'])

    # create model & set parameters
    my_model = get_model(cfg, train_data.num_kb_relation, len(entity2id),
                         len(word2id)).to(device)
    trainable_parameters = [
        p for p in my_model.parameters() if p.requires_grad
    ]
    optimizer = torch.optim.Adam(trainable_parameters, lr=cfg['learning_rate'])

    best_dev_acc = 0.0
    for epoch in range(cfg['num_epoch']):
        try:
            print('epoch', epoch)
            train_data.reset_batches(is_sequential=cfg['is_debug'])
            # Train
            my_model.train()
            train_loss, train_acc, train_max_acc = [], [], []
            for iteration in tqdm(
                    range(train_data.num_data // cfg['batch_size'])):
                batch = train_data.get_batch(iteration, cfg['batch_size'],
                                             cfg['fact_dropout'])
                loss, pred, _ = my_model(batch)
                pred = pred.data.cpu().numpy()
                acc, max_acc = cal_accuracy(pred, batch[-1])
                train_loss.append(float(loss.data))
                train_acc.append(acc)
                train_max_acc.append(max_acc)
                # backpropagate with gradient clipping
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(my_model.parameters(),
                                               cfg['gradient_clip'])
                optimizer.step()
            print('avg_training_loss', sum(train_loss) / len(train_loss))
            print('max_training_acc', sum(train_max_acc) / len(train_max_acc))
            print('avg_training_acc', sum(train_acc) / len(train_acc))

            print("validating ...")
            eval_acc = inference(my_model, valid_data, entity2id, cfg)
            if eval_acc > best_dev_acc and cfg['to_save_model']:
                print("saving model to", cfg['save_model_file'])
                torch.save(my_model.state_dict(), cfg['save_model_file'])
                best_dev_acc = eval_acc

        except KeyboardInterrupt:
            break

    # Test set evaluation
    print("evaluating on test")
    print('loading model from ...', cfg['save_model_file'])
    my_model.load_state_dict(torch.load(cfg['save_model_file']))
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)

    return test_acc
Example #4
import torch
from tqdm import tqdm
# Assumed to be in scope from the surrounding repo: load_dict, load_documents,
# index_document_entities, DataLoader, get_model, inference, cal_accuracy.
def train(cfg):
    print("training ...")

    # prepare data
    # entities.txt, vocab.txt, relations.txt, and similar files
    """
        entity2id is a dict mapping each entity to its id.
        word2id is likewise a dict mapping each word to its id.
        relation2id does the same for relations, i.e. predicates such as has_tags.
        The three files above are relatively small.
        train_documents is a larger file: a list whose elements are dicts with
        document, title, and tokens fields.

        Under document there are text and entities fields. text is the raw
        text; entities is a list recording each entity in the text along with
        its kb_id.
        title is structured like document, but holds the title text.
        tokens presumably holds the entity lists for document and title.

        index_document_entities presumably maps each document's entities to
        their token positions and returns the padded document texts (up to
        max_document_word tokens).
    """
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    # train_document.json
    train_documents = load_documents(cfg['data_folder'] +
                                     cfg['train_documents'])

    train_document_entity_indices, train_document_texts = index_document_entities(
        train_documents, word2id, entity2id, cfg['max_document_word'])
    train_data = DataLoader(cfg['data_folder'] + cfg['train_data'],
                            train_documents, train_document_entity_indices,
                            train_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    if cfg['dev_documents'] != cfg['train_documents']:
        valid_documents = load_documents(cfg['data_folder'] +
                                         cfg['dev_documents'])
        valid_document_entity_indices, valid_document_texts = index_document_entities(
            valid_documents, word2id, entity2id, cfg['max_document_word'])
    else:
        valid_documents = train_documents
        valid_document_entity_indices, valid_document_texts = train_document_entity_indices, train_document_texts
    valid_data = DataLoader(cfg['data_folder'] + cfg['dev_data'],
                            valid_documents, valid_document_entity_indices,
                            valid_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    if cfg['test_documents'] != cfg['dev_documents']:
        test_documents = load_documents(cfg['data_folder'] +
                                        cfg['test_documents'])
        test_document_entity_indices, test_document_texts = index_document_entities(
            test_documents, word2id, entity2id, cfg['max_document_word'])
    else:
        test_documents = valid_documents
        test_document_entity_indices, test_document_texts = valid_document_entity_indices, valid_document_texts
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'],
                           test_documents, test_document_entity_indices,
                           test_document_texts, word2id, relation2id,
                           entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'],
                           cfg['use_doc'], cfg['use_inverse_relation'])

    # create model & set parameters
    my_model = get_model(cfg, train_data.num_kb_relation, len(entity2id),
                         len(word2id), "train")
    trainable_parameters = [
        p for p in my_model.parameters() if p.requires_grad
    ]
    optimizer = torch.optim.Adam(trainable_parameters, lr=cfg['learning_rate'])

    best_dev_acc = 0.0
    for epoch in range(cfg['num_epoch']):
        try:
            print('epoch', epoch)
            train_data.reset_batches(is_sequential=cfg['is_debug'])
            # Train
            my_model.train()
            train_loss, train_acc, train_max_acc = [], [], []
            for iteration in tqdm(
                    range(train_data.num_data // cfg['batch_size'])):
                batch = train_data.get_batch(iteration, cfg['batch_size'],
                                             cfg['fact_dropout'])
                loss, pred, _ = my_model(batch)
                pred = pred.data.cpu().numpy()
                acc, max_acc = cal_accuracy(pred, batch[-1])
                train_loss.append(loss.item())
                train_acc.append(acc)
                train_max_acc.append(max_acc)
                # backpropagate with gradient clipping
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(my_model.parameters(),
                                               cfg['gradient_clip'])
                optimizer.step()
            print('avg_training_loss', sum(train_loss) / len(train_loss))
            print('max_training_acc', sum(train_max_acc) / len(train_max_acc))
            print('avg_training_acc', sum(train_acc) / len(train_acc))

            print("validating ...")
            eval_acc = inference(my_model, valid_data, entity2id, cfg)
            if eval_acc > best_dev_acc and cfg['to_save_model']:
                print("saving model to", cfg['save_model_file'])
                torch.save(my_model.state_dict(), cfg['save_model_file'])
                best_dev_acc = eval_acc

        except KeyboardInterrupt:
            break

    # Test set evaluation
    print("evaluating on test")
    print('loading model from ...', cfg['save_model_file'])
    my_model.load_state_dict(torch.load(cfg['save_model_file']))
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)

    return test_acc