import numpy as np
import tensorflow as tf

tf.flags.DEFINE_integer("seed", 123, "random seed")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

MAXLEN = 30
np.random.seed(FLAGS.seed)

train_file = "dataset/train.tsv"
valid_file = "dataset/valid.tsv"
test_file = "dataset/test.tsv"

# statement, credit_history, topic, speaker, job, state, party, location, label

train_statement, train_ch, train_topic, train_speaker, train_job, train_state, \
train_party, train_location, train_y = load_data(train_file)

valid_statement, valid_ch, valid_topic, valid_speaker, valid_job, valid_state, \
valid_party, valid_location, valid_y = load_data(valid_file)

test_statement, test_ch, test_topic, test_speaker, test_job, test_state, \
test_party, test_location, test_y = load_data(test_file)

train_location = [clean_text(lc) for lc in train_location]
valid_location = [clean_text(lc) for lc in valid_location]
test_location = [clean_text(lc) for lc in test_location]

train_location = texts_to_tokens(train_location)
valid_location = texts_to_tokens(valid_location)
test_location = texts_to_tokens(test_location)
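
# The helpers clean_text and texts_to_tokens used above are not included in this
# example. A minimal sketch of what they might look like, assuming a regex-based
# cleaner and a plain whitespace tokenizer (the names match the calls above, but
# the bodies are assumptions, not the original implementation):
import re

def clean_text(text):
    # assumed behaviour: lowercase, strip punctuation, collapse whitespace
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

def texts_to_tokens(texts):
    # assumed behaviour: split each cleaned string into a list of word tokens
    return [t.split() for t in texts]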
Example #2
import json

import numpy as np
import tensorflow as tf


def do_train(config):
    train, dev, test = load_data(config)  # load the train/dev/test data
    word_to_id, id_to_word, tag_to_id, id_to_tag = create_maps(train, config)  # create or load the vocab and tag maps

    # record the vocabulary and tag sizes in the config, then save it
    config["num_chars"] = len(word_to_id)  # total number of characters
    config["num_tags"] = len(tag_to_id)  # total number of tags
    with open(config["config_file"], "w") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)

    # convert the raw examples into model-ready features
    train_data = prepare_dataset(train, word_to_id, tag_to_id, config["lower"])
    dev_data = prepare_dataset(dev, word_to_id, tag_to_id, config["lower"])
    test_data = prepare_dataset(test, word_to_id, tag_to_id, config["lower"])

    print("train/dev/test 句子数:{} / {} / {}".format(len(train_data), len(dev_data), len(test_data)))

    # split the data into batches
    train_manager = BatchManager(train_data, config["batch_size"])
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    steps_per_epoch = train_manager.len_data  # number of steps (batches) per epoch

    # create the required output directories
    make_path(config)

    # logger
    logger = get_logger(config["log_file"])

    # limit GPU memory usage: allocate memory on demand
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        # build the model; an existing parameter configuration can be supplied
        model = Model(config)

        ckpt = tf.train.get_checkpoint_state(config["ckpt_path"])  # look up the latest checkpoint in the model directory
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):  # an existing model was found
            logger.info("Restoring existing model...")
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("新建模型...")
            sess.run(tf.global_variables_initializer())  # 不使用预训练的embeddings

            # if pre-trained embeddings are enabled, overwrite the character lookup table
            if config["pre_emb"]:
                emb_weights = sess.run(model.char_lookup.read_value())
                emb_weights = load_word2vec(config["emb_file"], id_to_word, config["char_dim"], emb_weights)
                sess.run(model.char_lookup.assign(emb_weights))
                logger.info("Load pre-trained embedding.")

        logger.info("开始训练...")
        loss = []
        for i in range(config["max_epoch"]):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)

                if step % config["steps_check"] == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))

                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger, config)
            if best:
                save_model(sess, model, config["ckpt_path"], logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger, config)