Example #1
def train():
    # Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # Convert the tagging scheme
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # Create word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(
                train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare the data
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id,
                                             tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id,
                                           tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id,
                                            tag_to_id)

    # Split the data into batches
    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)

    # Create any missing directories
    model_utils.make_path(FLAGS)

    # Load the config file if it exists, otherwise create it
    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    # Configure the logger
    log_path = os.path.join('log', FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True

    step_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                   config, id_to_word, logger)
        logger.info('start training')
        loss = []
        start = time.time()
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.setps_chech == 0:
                    iteration = step // step_per_epoch + 1
                    logger.info(
                        "iteration{}: step{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % step_per_epoch, step_per_epoch,
                            np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, 'dev', dev_manager, id_to_tag, logger)

            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, 'test', test_manager, id_to_tag, logger)
        t = time.time() - start
        logger.info('cost time: %f' % t)
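
Every example on this page builds batches with data_utils.BatchManager(data, batch_size), consumes them with iter_batch(shuffle=True), and (as above) reads len_data for the number of batches per epoch. The data_utils module itself is not shown, so the following is only a minimal sketch of the interface these snippets assume; the sort-by-length step, padding value and field layout are assumptions, not the repository's actual implementation.

import math
import random


class BatchManager(object):
    """Minimal sketch of the batch manager assumed by the examples above."""

    def __init__(self, data, batch_size):
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)  # number of batches per epoch

    def sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / float(batch_size)))
        sorted_data = sorted(data, key=lambda x: len(x[0]))  # group similar lengths
        return [self.pad_data(sorted_data[i * batch_size:(i + 1) * batch_size])
                for i in range(num_batch)]

    @staticmethod
    def pad_data(data):
        # Each item is [word_list, word_id_list, segs, tag_id_list]
        # (see prepare_dataset below); pad every field to the longest
        # sentence in the batch, assuming 0 is the padding id.
        words, word_ids, segs, tags = [], [], [], []
        max_len = max(len(item[0]) for item in data)
        for word_list, word_id_list, seg_list, tag_id_list in data:
            padding = [0] * (max_len - len(word_list))
            words.append(word_list + padding)
            word_ids.append(word_id_list + padding)
            segs.append(seg_list + padding)
            tags.append(tag_id_list + padding)
        return [words, word_ids, segs, tags]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch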
Example #3
# NOTE: this snippet starts mid-function; the signature below is inferred from
# the __main__ usage further down, and the default for train is an assumption.
def prepare_dataset(sentences, word_to_id, tag_to_id, train=True):
    """
    :param sentences:
    :param word_to_id:
    :param tag_to_id:
    :param train:
    :return:
    """
    none_index = tag_to_id['O']

    data = []
    for s in sentences:
        word_list = [w[0] for w in s]
        word_id_list = [
            word_to_id[w if w in word_to_id else '<UNK>'] for w in word_list
        ]
        segs = data_utils.get_seg_features("".join(word_list))
        if train:
            tag_id_list = [tag_to_id[w[-1]] for w in s]
        else:
            tag_id_list = [none_index for w in s]
        data.append([word_list, word_id_list, segs, tag_id_list])

    return data


if __name__ == "__main__":
    path = "data/ner.dev"
    sentences = load_sentences(path)
    update_tag_scheme(sentences, "BIOES")
    _, word_to_id, id_to_word = word_mapping(sentences)
    _, tag_to_id, id_to_tag = tag_mapping(sentences)
    dev_data = prepare_dataset(sentences, word_to_id, tag_to_id)
    data_utils.BatchManager(dev_data, 120)
def train():
    # Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # Convert the tag scheme from BIO to BIOES
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # Create word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable(
                        [[w[0] for w in s] for s in test_sentences]
                    )
                )
            )
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)

        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)

        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    train_data = data_loader.prepare_dataset(
        train_sentences, word_to_id, tag_to_id
    )

    dev_data = data_loader.prepare_dataset(
        dev_sentences, word_to_id, tag_to_id
    )

    test_data = data_loader.prepare_dataset(
        test_sentences, word_to_id, tag_to_id
    )

    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)

    print('train_data_num %i, dev_data_num %i, test_data_num %i' % (len(train_data), len(dev_data), len(test_data)))

    model_utils.make_path(FLAGS)

    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_word, logger)
        logger.info("开始训练")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.setps_chech == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess,model,"dev", dev_manager, id_to_tag, logger)

            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #5
def train():
    # load data sets
    # sentences look like: ['在', 'O'], ['厦', 'B-LOC'], ['门', 'I-LOC']
    # train_sentences = loader.load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # dev_sentences = loader.load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    # test_sentences = loader.load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    train_sentences = loader.load_folder_sentences(FLAGS.train_file,
                                                   FLAGS.lower, FLAGS.zeros)
    dev_sentences = loader.load_folder_sentences(FLAGS.dev_file, FLAGS.lower,
                                                 FLAGS.zeros)
    test_sentences = loader.load_folder_sentences(FLAGS.test_file, FLAGS.lower,
                                                  FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # the sentences change very little after update_tag_scheme
    loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # Create maps if they do not exist: if maps.pkl is missing, read the
    # training data to build char_to_id and tag_to_id.
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = loader.char_mapping(train_sentences,
                                                   FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = loader.augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = loader.char_mapping(
                train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = loader.tag_mapping(train_sentences)

        print('tag_to_id: ', tag_to_id)

        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    print('tag_to_id: ', tag_to_id)
    # prepare data, get a collection of list containing index
    train_data = loader.prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                        FLAGS.lower)
    dev_data = loader.prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                                      FLAGS.lower)
    test_data = loader.prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                       FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, 100)
    test_manager = data_utils.BatchManager(test_data, 100)

    # make paths for storing the log and model if they do not exist
    utils.make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = utils.load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        utils.save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)  # ./log/train.log
    logger = utils.get_logger(log_path)
    utils.print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = utils.create_model(sess, Model, FLAGS.ckpt_path,
                                   data_utils.load_word2vec, config,
                                   id_to_char, logger)
        logger.info("start training")
        loss = []

        for i in range(FLAGS.iterations):
            # for i in range(10):
            logger.info('epoch: {}'.format(i))
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #6
def train():
    train_batch_manager = data_utils.BatchManager("data/train.txt.id40000.in",
                                                   config.batch_size)
    test_batch_manager = data_utils.BatchManager("data/test.txt.id40000.in",
                                                 config.batch_size)

    with tf.Session() as sess:
        graph_writer = tf.summary.FileWriter(config.model_dir,
                                             graph=sess.graph)
        model_obj = model.Seq2SeqModel('train')
        model_obj.model_restore(sess)

        #outputTensors = []
        #print(model_obj.decoder_pred_decode.name.replace(":0",""))
        #outputTensors.append(model_obj.decoder_pred_decode.name.replace(":0",""))

        #output_graph_with_weight = tf.graph_util.convert_variables_to_constants(sess,sess.graph_def,outputTensors)
        #with tf.gfile.FastGFile(os.path.join(config.model_dir, "weight_seq2seq.pb"),
        #                        'wb') as gf:
        #    gf.write(output_graph_with_weight.SerializeToString())

        print("开始训练")
        loss = 0.0
        start_time = time.time()
        best_loss = 10000.0
        for epoch_id in range(config.max_epochs):
            for step, train_batch in enumerate(train_batch_manager.iterbatch()):
                if train_batch['encode'] is None:
                    continue
                print("开始第%d轮第%d次训练,globa_step %d" %
                      (epoch_id + 1, step + 1, model_obj.global_step.eval()))

                # Execute a single training step
                step_loss, summary = model_obj.train(
                    sess,
                    encoder_inputs=train_batch['encode'],
                    decoder_inputs=train_batch['decode'],
                    encoder_inputs_length=train_batch['encode_lengths'],
                    decoder_inputs_length=train_batch['decode_lengths'])
                loss += float(step_loss) / config.display_freq

                if (model_obj.global_step.eval() +
                        1) % config.display_freq == 0:
                    if loss < best_loss:
                        best_loss = loss
                        print("保存模型。。。。。。。")
                        checkpoint_path = model_obj.mode_save_path
                        model_obj.saver.save(sess,
                                             checkpoint_path,
                                             global_step=model_obj.global_step)

                    avg_perplexity = math.exp(
                        float(loss)) if loss < 300 else float("inf")

                    # compute elapsed time
                    time_cost = time.time() - start_time
                    step_time = time_cost / config.display_freq
                    print(
                        'epoch %d, step %d, loss %.2f, perplexity %.2f, %f s per step'
                        % (epoch_id, model_obj.global_step.eval(), loss,
                           avg_perplexity, step_time))
                    loss = 0.0
                    start_time = time.time()

                    # Record training summary for the current batch
                    graph_writer.add_summary(summary,
                                             model_obj.global_step.eval())

                # validate the model
                if (model_obj.global_step.eval() + 1) % config.valid_freq == 0:
                    print("验证模型。。。。。")
                    valid_loss = 0.0
                    total_sentences = 0
                    for test_batch in test_batch_manager.iterbatch():
                        step_loss, summary = model_obj.eval(
                            sess,
                            encoder_inputs=test_batch['encode'],
                            decoder_inputs=test_batch['decode'],
                            encoder_inputs_length=test_batch['encode_lengths'],
                            decoder_inputs_length=test_batch['decode_lengths'])
                        batch_size = test_batch['encode_lengths'].shape[0]
                        valid_loss += step_loss * batch_size
                        total_sentences += batch_size
                    valid_loss = valid_loss / total_sentences
                    print("validation loss %.2f, perplexity %.2f" %
                          (valid_loss, math.exp(valid_loss)))

                if (model_obj.global_step.eval() + 1) % config.save_freq == 0:
                    print("保存模型。。。。。。。")
                    checkpoint_path = model_obj.mode_save_path
                    model_obj.saver.save(sess,
                                         checkpoint_path,
                                         global_step=model_obj.global_step)
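
Example #6 relies on a different BatchManager variant: it is constructed from a token-id file and its iterbatch() yields a dict with 'encode', 'decode', 'encode_lengths' and 'decode_lengths' entries. That module is also not shown; the sketch below is a guess at such an interface, and the file format (one "source<TAB>target" id-sequence pair per line) and padding id are assumptions taken only from how the batches are consumed above.

import random

import numpy as np

PAD_ID = 0  # assumed padding id


class BatchManager(object):
    """Sketch of the seq2seq batch manager assumed by Example #6."""

    def __init__(self, path, batch_size):
        self.pairs = []
        with open(path, encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) != 2:
                    continue
                src = [int(t) for t in parts[0].split()]
                tgt = [int(t) for t in parts[1].split()]
                self.pairs.append((src, tgt))
        self.batch_size = batch_size

    def iterbatch(self):
        random.shuffle(self.pairs)
        for i in range(0, len(self.pairs), self.batch_size):
            chunk = self.pairs[i:i + self.batch_size]
            src_lens = np.array([len(s) for s, _ in chunk], dtype=np.int32)
            tgt_lens = np.array([len(t) for _, t in chunk], dtype=np.int32)
            encode = np.full((len(chunk), src_lens.max()), PAD_ID, dtype=np.int32)
            decode = np.full((len(chunk), tgt_lens.max()), PAD_ID, dtype=np.int32)
            for j, (s, t) in enumerate(chunk):
                encode[j, :len(s)] = s
                decode[j, :len(t)] = t
            yield {'encode': encode, 'decode': decode,
                   'encode_lengths': src_lens, 'decode_lengths': tgt_lens}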