def make_para_dataset(): embedding_file = "../glove.840B.300d.txt" embedding = "./embedding.pkl" src_word2idx_file = "./word2idx.pkl" train_squad = "../squad/train-v1.1.json" dev_squad = "../squad/dev-v1.1.json" train_src_file = "../squad/para-train.txt" train_trg_file = "../squad/tgt-train.txt" dev_src_file = "../squad/para-dev.txt" dev_trg_file = "../squad/tgt-dev.txt" test_src_file = "../squad/para-test.txt" test_trg_file = "../squad/tgt-test.txt" # pre-process training data train_examples, counter = process_file(train_squad) make_conll_format(train_examples, train_src_file, train_trg_file) word2idx = make_vocab_from_squad(src_word2idx_file, counter, config.vocab_size) make_embedding(embedding_file, embedding, word2idx) # split dev into dev and test dev_test_examples, _ = process_file(dev_squad) # random.shuffle(dev_test_examples) num_dev = len(dev_test_examples) // 2 dev_examples = dev_test_examples[:num_dev] test_examples = dev_test_examples[num_dev:] make_conll_format(dev_examples, dev_src_file, dev_trg_file) make_conll_format(test_examples, test_src_file, test_trg_file)
def make_para_dataset(): embedding_file = "./glove.840B.300d.txt" embedding = "./embedding.pkl" src_word2idx_file = "./word2idx.pkl" ent2idx_file = "./ent2idx.pkl" rel2idx_file = "./rel2idx.pkl" entity_embedding = "./entity.pkl" relation_embedding = "./relation.pkl" train_squad = "../squad/train-v1.1.json" dev_squad = "../squad/dev-v1.1.json" train_src_file = "../squad/para-train.txt" train_trg_file = "../squad/tgt-train.txt" train_cs_file = "./paracs-train.json" dev_src_file = "../squad/para-dev.txt" dev_trg_file = "../squad/tgt-dev.txt" dev_cs_file = "./paracs-dev.json" test_src_file = "../squad/para-test.txt" test_trg_file = "../squad/tgt-test.txt" test_cs_file = "./paracs-test.json" ent_vector = "./entity_transE.txt" rel_vector = "./relation_transE.txt" ent_file = "./entity.txt" rel_file = "./relation.txt" cs_file = "./resource.json" database = dict() with open(cs_file, "r") as f: d = json.load(f) if d["dict_csk"] is not None: database = d["dict_csk"] # process the graph vector through the static attention mechanism _, _, ent2idx, rel2idx = make_graph_vector(entity_embedding, relation_embedding, ent_vector, ent_file, rel_vector, rel_file, ent2idx_file, rel2idx_file ) # pre-process training data train_examples, counter, num = process_file(train_squad, ent2idx, rel2idx, database) make_conll_format(train_examples, train_src_file, train_trg_file, train_cs_file, num) word2idx = make_vocab_from_squad(src_word2idx_file, counter, config.vocab_size) make_embedding(embedding_file, embedding, word2idx) # split dev into dev and test dev_test_examples, _, num = process_file(dev_squad, ent2idx, rel2idx, database) # random.shuffle(dev_test_examples) num_dev = len(dev_test_examples) // 2 dev_examples = dev_test_examples[:num_dev] test_examples = dev_test_examples[num_dev:] make_conll_format(dev_examples, dev_src_file, dev_trg_file, dev_cs_file, num) make_conll_format(test_examples, test_src_file, test_trg_file, test_cs_file, num)
def train():
    train_config = get_train_config()
    _, word_to_id = read_vocab(train_config['vocab_file'])
    _, cat_to_id = read_category()

    # set up logging
    logger = get_logger(os.path.join(FLAGS.log_path, 'train.log'))

    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    with tf.Session(config=config) as sess:
        # build the model
        model = Model(train_config)

        # load the training data
        x_train_data, y_train_data = process_file(train_config['train_file'],
                                                  word_to_id, cat_to_id,
                                                  train_config['seq_len'])
        # load the validation data
        x_val_data, y_val_data = process_file(train_config['val_file'],
                                              word_to_id, cat_to_id,
                                              train_config['seq_len'])
        # initialize variables
        sess.run(tf.global_variables_initializer())

        len_data = len(y_train_data)  # number of training samples
        start_time = time.time()
        total_batch = 0               # total batches processed so far
        best_acc_val = 0.0            # best validation accuracy so far
        last_improved = 0             # batch at which the last improvement happened
        require_improvement = 1000    # stop early after 1000 batches without improvement
        flag = False                  # whether to stop training

        # num_epochs: iterate over the samples repeatedly so features learned
        # early on are not forgotten
        for i in range(train_config['num_epochs']):
            for x_input, y_output in batch_iter(x_train_data, y_train_data,
                                                train_config['batch_size']):
                total_batch += 1
                step, acc, loss = model.run_step(sess, x_input, y_output)

                # evaluate the model every FLAGS.evaluate_every batches
                if total_batch % FLAGS.evaluate_every == 0:
                    time_dif = get_time_dif(start_time)
                    logger.info(
                        "train: iterator{}: step:{}/{} acc:{} loss:{} time:{}".format(
                            i + 1, step % len_data, len_data, acc, loss, time_dif))
                    val_acc, val_loss = evaluate(sess, model, x_val_data, y_val_data)
                    logger.info("test: acc:{} loss:{} ".format(val_acc, val_loss))

                    # save a checkpoint when validation accuracy improves
                    if acc > 0.5 and val_acc > 0.5 and val_acc > best_acc_val:
                        last_improved = total_batch
                        best_acc_val = val_acc
                        checkpoint_path = os.path.join(FLAGS.checkpoints_path, 'checkpoints')
                        saver = tf.train.Saver(tf.global_variables(),
                                               max_to_keep=FLAGS.num_checkpoints)
                        saver.save(sess, checkpoint_path, global_step=step)

                if total_batch - last_improved > require_improvement:
                    # validation accuracy has not improved for a long time; stop early
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break  # break out of the batch loop
            if flag:
                time_dif = get_time_dif(start_time)
                logger.info('Training finished: {}'.format(time_dif))
                break
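train() relies on a batch_iter helper that is not shown above. A minimal sketch of such a mini-batch generator, assuming x_train_data and y_train_data are NumPy arrays and that the data should be reshuffled on each call; the repo's actual helper may differ in name and details.

import numpy as np

def batch_iter(x, y, batch_size=64):
    # Shuffle the data once, then yield consecutive mini-batches.
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))
    x_shuffled = x[indices]
    y_shuffled = y[indices]
    for i in range(num_batch):
        start = i * batch_size
        end = min((i + 1) * batch_size, data_len)
        yield x_shuffled[start:end], y_shuffled[start:end]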