Example #1
def train():
    # load the training data and build trainable examples
    train_sor_data, train_mub_data = load_sentences(FLAGS.train_sor_path,
                                                    FLAGS.train_mub_path)
    # split the training data into N batches
    train_manager = BatchManager(train_sor_data, train_mub_data,
                                 FLAGS.batch_size)
    # GPU session settings
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # load the FLAGS-based model configuration
    config = config_model()
    logger = get_logger(config["logger_path"])
    # load the vocabulary and compute the number of batches per epoch
    word2id, id2word = load_sor_vocab()
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model_and_embedding(sess, Model, FLAGS.model_path,
                                           config, True)
        logger.info("start training")
        loss = []
        with tf.device('/gpu:0'):
            for i in range(FLAGS.num_of_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info(
                            "iteration:{} step:{}/{},chatbot loss:{:>9.6f}".
                            format(iteration, step % steps_per_epoch,
                                   steps_per_epoch, np.mean(loss)))
                        loss = []
                if i % 10 == 0:
                    save_model(sess, model, FLAGS.model_path, logger)
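
All of these examples drive training through the same BatchManager interface: a constructor taking the prepared data and a batch size, a len_data attribute giving the number of batches per epoch, and an iter_batch(shuffle=...) generator. The class below is only a minimal, self-contained sketch of that interface; the way real batches are padded and laid out is not shown here and is an assumption.

import random

class SimpleBatchManager:
    """Minimal stand-in for the BatchManager interface the examples assume."""

    def __init__(self, data, batch_size):
        self.batch_data = [data[i:i + batch_size]
                           for i in range(0, len(data), batch_size)]
        self.len_data = len(self.batch_data)  # number of batches per epoch

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch

manager = SimpleBatchManager(list(range(10)), batch_size=4)
for batch in manager.iter_batch(shuffle=True):
    print(batch)
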
Example #2
def train():
    train_sentences, dico, char_to_id, id_to_char = load_sentence(
        FLAGS.train_file)
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico.copy(),
                FLAGS.emb_file,
            )
        else:
            sentences, dico, char_to_id, id_to_char = load_sentence(
                FLAGS.train_file)
        print(train_sentences[0])
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([char_to_id, id_to_char], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char = pickle.load(f)

    train_data, test_data, dev_data = prepare_dataset(train_sentences,
                                                      char_to_id)
    print(train_data[0])
    print(test_data[0])
    print(dev_data[0])
    print(len(train_data), len(dev_data), len(test_data))
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    test_manager = BatchManager(test_data, 100)
    dev_manager = BatchManager(dev_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True

    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        best = 0
        # sess.graph.finalize()
        for i in range(50):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{},".format(
                        iteration, step % steps_per_epoch, steps_per_epoch))
                    loss = []
            Acc_result = evaluate(sess, model, "dev", dev_manager, logger)
            logger.info("Acc{}".format(Acc_result))
            logger.info("test")
            # precision, recall, f1_score = model.evaluete_(sess,test_manager)
            # logger.info("P, R, F,{},{},{}".format(precision, recall, f1_score))
            test_result = evaluate(sess, model, "test", test_manager, logger)
            if test_result > best:
                best = test_result
                save_model(sess, model, FLAGS.ckpt_path, logger)
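
Example #2 caches the character maps with pickle and the model configuration with load_config/save_config so later runs can reuse them. The snippet below is a self-contained sketch of that load-or-build pattern; the JSON-backed load_config/save_config bodies and the 'maps.pkl' path are assumptions for illustration, not the project's own helpers.

import json
import os
import pickle

def save_config(config, path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)

def load_config(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)

# load-or-build: reuse cached character maps when the file already exists
if os.path.isfile('maps.pkl'):
    with open('maps.pkl', 'rb') as f:
        char_to_id, id_to_char = pickle.load(f)
else:
    char_to_id = {'<PAD>': 0, '<UNK>': 1}
    id_to_char = {i: c for c, i in char_to_id.items()}
    with open('maps.pkl', 'wb') as f:
        pickle.dump([char_to_id, id_to_char], f)
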
Example #3
def train(conf):
    train_sentences = load_sentences(conf.train_file, conf.zeros)
    dev_sentences = load_sentences(conf.dev_file, conf.zeros)
    test_sentences = load_sentences(conf.test_file, conf.zeros)

    dico_chars_train = char_mapping(train_sentences, conf.lower)[0]
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        dico_chars_train.copy(), conf.emb_file,
        list(
            itertools.chain.from_iterable([[w[0] for w in s]
                                           for s in test_sentences])))
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 conf.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               conf.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                conf.lower)

    # load pretrained word embeddings
    all_word_embeds = {}
    for i, line in enumerate(codecs.open(conf.emb_file, 'r', 'utf-8')):
        s = line.strip().split()
        if len(s) == conf.embedding_dim + 1:
            all_word_embeds[s[0]] = np.array([float(v) for v in s[1:]])
    word_embeds_dict = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06),
                                         (len(char_to_id), conf.embedding_dim))
    for w in char_to_id:
        if w in all_word_embeds:
            word_embeds_dict[char_to_id[w]] = all_word_embeds[w]
        elif w.lower() in all_word_embeds:
            word_embeds_dict[char_to_id[w]] = all_word_embeds[w.lower()]
    print('Loaded %i pretrained embeddings.' % len(all_word_embeds))

    train_manager = BatchManager(train_data, conf.batch_size)

    model = BiLSTM_CRF(conf, tag_to_id, char_to_id, word_embeds_dict)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=conf.learning_rate,
                                weight_decay=1e-4)
    dev_f1_ = 0
    for epoch in range(1, conf.epochs + 1):
        print(f'train on epoch {epoch}')
        j = 1
        for batch in train_manager.iter_batch(shuffle=True):
            batch_loss = 0.0
            sentences = batch[1]
            tags = batch[-1]
            for i, index in enumerate(np.random.permutation(len(sentences))):
                model.zero_grad()
                sentence_in = sentences[index]
                tags_in = tags[index]
                loss = model.neg_log_likelihood(sentence_in, tags_in)
                loss.backward()
                optimizer.step()
                batch_loss += loss.data
            print(
                f'[batch {j},batch size:{conf.batch_size}] On this batch loss: {batch_loss}'
            )
            j = j + 1
        print(f'Begin validating on the [epoch {epoch}] dev dataset ...')
        dev_results = get_predictions(model, dev_data, id_to_tag)
        dev_f1 = evaluate_ner(dev_results, conf)
        if dev_f1 > dev_f1_:
            dev_f1_ = dev_f1
            torch.save(model, conf.model_file)
            print('save model success.')
        test_results = get_predictions(model, test_data, id_to_tag)
        test_f1 = evaluate_ner(test_results, conf)
        print(f'[epoch {epoch}] f1 on test dataset: {test_f1:.3f}')
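
Example #3 calls optimizer.step() once per sentence, which amounts to SGD with batch size 1 inside each BatchManager batch. A common alternative is to accumulate one loss over the batch and step once. The toy model below is only a sketch of that pattern under assumed shapes; it is not the BiLSTM_CRF from the example.

import torch
import torch.nn as nn

model = nn.Linear(8, 1)                    # toy stand-in for the real model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = nn.MSELoss()

batch_x = torch.randn(16, 8)
batch_y = torch.randn(16, 1)

model.zero_grad()
loss = criterion(model(batch_x), batch_y)  # one loss over the whole batch
loss.backward()
optimizer.step()                           # a single update per batch
print(f'batch loss: {loss.item():.4f}')
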
Example #4
def create_input(data, parameters, add_label, singletons=None):
    # Assumed header and first two assignments: the snippet begins mid-function,
    # so the function name and the data['words']/data['chars'] reads are
    # reconstructed from how the rest of the body uses them.
    words = data['words']
    chars = data['chars']
    if singletons is not None:
        words = insert_singletons(words, singletons)
    if parameters['cap_dim']:
        caps = data['caps']
    char_for, char_rev, char_pos = pad_word_chars(chars)
    input = []
    if parameters['word_dim']:
        input.append(words)
    if parameters['char_dim']:
        input.append(char_for)
        if parameters['char_bidirect']:
            input.append(char_rev)
        input.append(char_pos)
    if parameters['cap_dim']:
        input.append(caps)
    if add_label:
        input.append(data['tags'])
    return input


if __name__ == "__main__":
    train_sentences = load_sentences("./data/input.train", True)
    print(train_sentences)
    # create maps if not exist
    _c, char_to_id, id_to_char = char_mapping(train_sentences, True)
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, True)
    train_manager = BatchManager(train_data, 100)
    for batch in train_manager.iter_batch(shuffle=True):
        print(batch[0])
        print(batch[-1])
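
Example #4 relies on char_mapping and tag_mapping to turn raw sentences into id dictionaries. The snippet below is a rough, self-contained sketch of what such frequency-based mappings typically produce; it is an assumption about their behaviour, not the project's implementation.

from collections import Counter

# toy sentences in a CoNLL-style layout: one [char, tag] pair per token
sentences = [[['我', 'O'], ['在', 'O'], ['北', 'B-LOC'], ['京', 'I-LOC']]]

char_counts = Counter(ch for s in sentences for ch, _ in s)
tag_counts = Counter(tag for s in sentences for _, tag in s)

# more frequent items get smaller ids
char_to_id = {c: i for i, (c, _) in enumerate(char_counts.most_common())}
id_to_char = {i: c for c, i in char_to_id.items()}
tag_to_id = {t: i for i, (t, _) in enumerate(tag_counts.most_common())}
id_to_tag = {i: t for t, i in tag_to_id.items()}

print(char_to_id)
print(tag_to_id)
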
Example #5
def main(_):
    if not os.path.isdir(FLAGS.log_path):
        os.makedirs(FLAGS.log_path)
    if not os.path.isdir(FLAGS.model_path):
        os.makedirs(FLAGS.model_path)
    if not os.path.isdir(FLAGS.result_path):
        os.makedirs(FLAGS.result_path)
    tag_to_id = {
        "O": 0,
        "B-LOC": 1,
        "I-LOC": 2,
        "B-PER": 3,
        "I-PER": 4,
        "B-ORG": 5,
        "I-ORG": 6
    }
    # load data
    id_to_word, id_to_tag, train_data, dev_data, test_data = load_data(
        FLAGS, tag_to_id)
    train_manager = BatchManager(train_data, len(id_to_tag),
                                 FLAGS.word_max_len, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, len(id_to_tag), FLAGS.word_max_len,
                               FLAGS.valid_batch_size)
    test_manager = BatchManager(test_data, len(id_to_tag), FLAGS.word_max_len,
                                FLAGS.valid_batch_size)
    with tf.Session() as sess:
        model = create_model(sess, id_to_word, id_to_tag)
        loss = 0
        best_test_f1 = 0
        steps_per_epoch = len(train_data) // FLAGS.batch_size + 1
        for _ in range(FLAGS.max_epoch):
            iteration = (model.global_step.eval()) // steps_per_epoch + 1
            train_manager.shuffle()
            for batch in train_manager.iter_batch():
                global_step = model.global_step.eval()
                step = global_step % steps_per_epoch
                batch_loss = model.run_step(sess, True, batch)
                loss += batch_loss / FLAGS.steps_per_checkpoint
                if global_step % FLAGS.steps_per_checkpoint == 0:
                    model.logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step, steps_per_epoch, loss))
                    loss = 0

            model.logger.info("validating ner")
            ner_results = model.predict(sess, dev_manager)
            eval_lines = test_ner(ner_results, FLAGS.result_path)
            for line in eval_lines:
                model.logger.info(line)
            # eval_lines[1] carries the overall dev scores; its last field is the F1
            test_f1 = float(eval_lines[1].strip().split()[-1])
            if test_f1 > best_test_f1:
                best_test_f1 = test_f1
                model.logger.info("new best f1 score:{:>.3f}".format(test_f1))
                model.logger.info("saving model ...")
                checkpoint_path = os.path.join(FLAGS.model_path,
                                               "translate.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)
        # test model
        model.logger.info("testing ner")
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_path)
        model.logger.info("Reading model parameters from %s" %
                          ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        ner_results = model.predict(sess, test_manager)
        eval_lines = test_ner(ner_results, FLAGS.result_path)
        for line in eval_lines:
            model.logger.info(line)