Example #1
def make_para_dataset():
    embedding_file = "../glove.840B.300d.txt"
    embedding = "./embedding.pkl"
    src_word2idx_file = "./word2idx.pkl"

    train_squad = "../squad/train-v1.1.json"
    dev_squad = "../squad/dev-v1.1.json"

    train_src_file = "../squad/para-train.txt"
    train_trg_file = "../squad/tgt-train.txt"
    dev_src_file = "../squad/para-dev.txt"
    dev_trg_file = "../squad/tgt-dev.txt"

    test_src_file = "../squad/para-test.txt"
    test_trg_file = "../squad/tgt-test.txt"

    # pre-process training data
    train_examples, counter = process_file(train_squad)
    make_conll_format(train_examples, train_src_file, train_trg_file)
    word2idx = make_vocab_from_squad(src_word2idx_file, counter,
                                     config.vocab_size)
    make_embedding(embedding_file, embedding, word2idx)

    # split dev into dev and test
    dev_test_examples, _ = process_file(dev_squad)
    # random.shuffle(dev_test_examples)
    num_dev = len(dev_test_examples) // 2
    dev_examples = dev_test_examples[:num_dev]
    test_examples = dev_test_examples[num_dev:]
    make_conll_format(dev_examples, dev_src_file, dev_trg_file)
    make_conll_format(test_examples, test_src_file, test_trg_file)
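The helpers used above (process_file, make_conll_format, make_vocab_from_squad, make_embedding) and the config module come from the surrounding project and are not shown here. A minimal driver, with the import paths being assumptions only, could look like:

# Minimal sketch; the module names in the imports are assumptions, not part of the example.
import config                                    # hypothetical module exposing vocab_size
from data_utils import (process_file, make_conll_format,
                        make_vocab_from_squad, make_embedding)

if __name__ == "__main__":
    make_para_dataset()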
Example #2
def make_para_dataset():
    embedding_file = "./glove.840B.300d.txt"
    embedding = "./embedding.pkl"
    src_word2idx_file = "./word2idx.pkl"
    ent2idx_file = "./ent2idx.pkl"
    rel2idx_file = "./rel2idx.pkl"
    entity_embedding = "./entity.pkl"
    relation_embedding = "./relation.pkl"

    train_squad = "../squad/train-v1.1.json"
    dev_squad = "../squad/dev-v1.1.json"

    train_src_file = "../squad/para-train.txt"
    train_trg_file = "../squad/tgt-train.txt"
    train_cs_file = "./paracs-train.json"
    dev_src_file = "../squad/para-dev.txt"
    dev_trg_file = "../squad/tgt-dev.txt"
    dev_cs_file = "./paracs-dev.json"

    test_src_file = "../squad/para-test.txt"
    test_trg_file = "../squad/tgt-test.txt"
    test_cs_file = "./paracs-test.json"
    ent_vector = "./entity_transE.txt"
    rel_vector = "./relation_transE.txt"
    ent_file = "./entity.txt"
    rel_file = "./relation.txt"
    cs_file = "./resource.json"

    database = dict()
    with open(cs_file, "r") as f:
        d = json.load(f)
        if d["dict_csk"] is not None:
            database = d["dict_csk"]

    # process the graph vector through the static attention mechanism
    _, _, ent2idx, rel2idx = make_graph_vector(entity_embedding,
                                               relation_embedding,
                                               ent_vector,
                                               ent_file,
                                               rel_vector,
                                               rel_file,
                                               ent2idx_file,
                                               rel2idx_file
                                               )
    # pre-process training data
    train_examples, counter, num = process_file(train_squad, ent2idx, rel2idx, database)
    make_conll_format(train_examples, train_src_file, train_trg_file, train_cs_file, num)
    word2idx = make_vocab_from_squad(src_word2idx_file, counter, config.vocab_size)
    make_embedding(embedding_file, embedding, word2idx)

    # split dev into dev and test
    dev_test_examples, _, num = process_file(dev_squad, ent2idx, rel2idx, database)
    # random.shuffle(dev_test_examples)
    num_dev = len(dev_test_examples) // 2
    dev_examples = dev_test_examples[:num_dev]
    test_examples = dev_test_examples[num_dev:]
    make_conll_format(dev_examples, dev_src_file, dev_trg_file, dev_cs_file, num)
    make_conll_format(test_examples, test_src_file, test_trg_file, test_cs_file, num)
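This second variant additionally loads TransE entity/relation vectors and a commonsense resource file. The loader above only requires that resource.json contain a top-level "dict_csk" key; a minimal sketch of such a file, where the inner triple layout is an assumption, could be:

import json

# Hypothetical contents: only the "dict_csk" key is taken from the example above;
# the head -> [relation, tail] layout shown here is an assumption.
sample = {"dict_csk": {"apple": [["IsA", "fruit"], ["AtLocation", "tree"]]}}
with open("./resource.json", "w") as f:
    json.dump(sample, f)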
Example #3
def train():
    train_config = get_train_config()
    _, word_to_id = read_vocab(train_config['vocab_file'])
    _, cat_to_id = read_category()
    # set up the logger
    logger = get_logger(os.path.join(FLAGS.log_path, 'train.log'))
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    with tf.Session(config=config) as sess:
        # create the model
        model = Model(train_config)
        # load the training data
        x_train_data, y_train_data = process_file(train_config['train_file'],
                                                  word_to_id, cat_to_id,
                                                  train_config['seq_len'])
        # load the validation data
        x_val_data, y_val_data = process_file(train_config['val_file'],
                                              word_to_id, cat_to_id,
                                              train_config['seq_len'])
        # initialize variables
        sess.run(tf.global_variables_initializer())

        len_data = len(y_train_data)  # number of training samples
        start_time = time.time()
        total_batch = 0  # total batch counter
        best_acc_val = 0.0  # best validation accuracy so far
        last_improved = 0  # batch at which the last improvement happened
        require_improvement = 1000  # stop early if no improvement for more than 1000 batches
        flag = False  # whether to stop training
        # num_epochs: iterate over the training samples several times so that
        # features learned early on are not lost
        for i in range(train_config['num_epochs']):
            for x_input, y_output in batch_iter(x_train_data, y_train_data,
                                                train_config['batch_size']):
                total_batch += 1

                step, acc, loss = model.run_step(sess, x_input, y_output)
                # evaluate the model every FLAGS.evaluate_every batches
                if total_batch % FLAGS.evaluate_every == 0:
                    time_dif = get_time_dif(start_time)
                    logger.info(
                        "train: iterator{}: step:{}/{} acc:{} loss:{} time:{}".
                        format(i + 1, step % len_data, len_data, acc, loss,
                               time_dif))
                    val_acc, val_loss = evaluate(sess, model, x_val_data,
                                                 y_val_data)
                    logger.info("val: acc:{} loss:{} ".format(
                        val_acc, val_loss))
                    # save the model when validation accuracy improves
                    if acc > 0.5 and val_acc > 0.5 and val_acc > best_acc_val:
                        last_improved = total_batch
                        best_acc_val = val_acc
                        checkpoint_path = os.path.join(FLAGS.checkpoints_path,
                                                       'checkpoints')
                        saver = tf.train.Saver(
                            tf.global_variables(),
                            max_to_keep=FLAGS.num_checkpoints)
                        saver.save(sess, checkpoint_path, global_step=step)
                if total_batch - last_improved > require_improvement:
                    # validation accuracy has not improved for a long time; stop training early
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break  # exit the batch loop

            if flag:
                time_dif = get_time_dif(start_time)
                logger.info('training finished, total time: {}'.format(time_dif))
                break
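train() reads several command-line flags (FLAGS.log_path, FLAGS.evaluate_every, FLAGS.checkpoints_path, FLAGS.num_checkpoints) that are defined elsewhere in the project. A sketch of matching TF1.x flag definitions, with placeholder defaults rather than the project's actual settings:

import tensorflow as tf

# Assumed flag definitions; the names follow the FLAGS used in train(),
# the default values are placeholders.
flags = tf.app.flags
flags.DEFINE_string("log_path", "./logs", "directory for train.log")
flags.DEFINE_string("checkpoints_path", "./checkpoints", "directory for saved checkpoints")
flags.DEFINE_integer("evaluate_every", 100, "evaluate on the validation set every N batches")
flags.DEFINE_integer("num_checkpoints", 5, "maximum number of checkpoints to keep")
FLAGS = flags.FLAGS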