Example #1
def train():
    # Load the training data and build trainable examples
    train_sor_data, train_mub_data = load_sentences(FLAGS.train_sor_path,
                                                    FLAGS.train_mub_path)
    # Split the training data into batches
    train_manager = BatchManager(train_sor_data, train_mub_data,
                                 FLAGS.batch_size)
    # Configure GPU options
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # Load the model configuration from FLAGS
    config = config_model()
    logger = get_logger(config["logger_path"])
    # Load the vocabulary and compute the number of batches per epoch
    word2id, id2word = load_sor_vocab()
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model_and_embedding(sess, Model, FLAGS.model_path,
                                           config, True)
        logger.info("start training")
        loss = []
        with tf.device('/gpu:0'):
            for i in range(FLAGS.num_of_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info(
                            "iteration:{} step:{}/{},chatbot loss:{:>9.6f}".
                            format(iteration, step % steps_per_epoch,
                                   steps_per_epoch, np.mean(loss)))
                        loss = []
                if i % 10 == 0:
                    save_model(sess, model, FLAGS.model_path, logger)
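
The listing does not include the BatchManager class that Example #1 drives through len_data and iter_batch(shuffle=True). Below is a minimal sketch of that implied interface; the pairing of source/target sentences and the batch layout are assumptions for illustration, not the original implementation.

import random

class BatchManager:
    """Minimal sketch of the interface Example #1 relies on; details are assumed."""

    def __init__(self, sor_data, mub_data, batch_size):
        # Pair source and target sequences and cut them into fixed-size batches.
        pairs = list(zip(sor_data, mub_data))
        self.batch_data = [pairs[i:i + batch_size]
                           for i in range(0, len(pairs), batch_size)]
        self.len_data = len(self.batch_data)  # number of batches per epoch

    def iter_batch(self, shuffle=False):
        # Optionally shuffle the batch order, then yield one batch at a time.
        order = list(range(self.len_data))
        if shuffle:
            random.shuffle(order)
        for idx in order:
            yield self.batch_data[idx]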
Example #2
    def train(self):
        batch_manager = BatchManager(self.encoder_vec, self.decoder_vec, self.batch_size)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        loss_track = []
        total_time = 0
        nums_batch = len(batch_manager.batch_data)
        for epoch in range(self.max_epoch):
            print "[->] epoch {}".format(epoch)   
            batch_index = 0
            for batch in batch_manager.batch():
                batch_index += 1
                # Build the feed dict, shaped [time_steps, batch_size]
                fd = self.get_fd(batch, self.model)
                _, loss, logits, labels = self.sess.run([self.model.train_op, 
                                    self.model.loss,
                                    self.model.logits,
                                    self.model.decoder_labels], fd)
                loss_track.append(loss)
                if batch_index % self.show_batch == 0:
                    print "\tstep: {}/{}".format(batch_index, nums_batch)
                    print '\tloss: {}'.format(loss)
                    print "\t"+"-"*50
                checkpoint_path = self.model_path+"chatbot_seq2seq.ckpt"
                # Save the model checkpoint
                self.model.saver.save(self.sess, checkpoint_path, global_step=self.model.global_step)
Example #3
    def train(self):
        print("++++++++train+++++++")
        batch_manager = BatchManager(self.encoder_vec, self.decoder_vec,
                                     self.batch_size)

        # Configure the TensorFlow session (GPU or CPU usage)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # Track the cross-entropy loss values
        loss_track = []
        total_time = 0
        # Number of batches, used to report progress
        nums_batch = len(batch_manager.batch_data)
        for epoch in range(self.max_epoch):
            print("[->] epoch {}".format(epoch))
            batch_index = 0
            for batch in batch_manager.batch():
                batch_index += 1
                # Build the feed dict, shaped [time_steps, batch_size]
                fd = self.get_fd(batch, self.model)
                # Run the graph: evaluate the model tensors and apply one optimizer step
                _, loss, logits, labels = self.sess.run([
                    self.model.train_op, self.model.loss, self.model.logits,
                    self.model.decoder_labels
                ], fd)
                loss_track.append(loss)
                if batch_index % self.show_batch == 0:
                    print("\tstep: {}/{}".format(batch_index, nums_batch))
                    print('\tloss: {}'.format(loss))
                    print("\t" + "-" * 50)
                checkpoint_path = self.model_path + "chatbot_seq2seq.ckpt"
                # Save the model checkpoint
                self.model.saver.save(self.sess,
                                      checkpoint_path,
                                      global_step=self.model.global_step)
Example #4
def main():
    print(args)

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    # emb_file = '/home/tiankeke/workspace/embeddings/giga-vec1.bin'
    # vocab, embeddings = load_word2vec_embedding(emb_file)

    max_src_len = 60
    max_tgt_len = 20

    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid

    vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)

    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 4, 256,
    #                     64, 64, 1024, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 6, 300,
    #                     50, 50, 1200, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # set elmo_requires_grad=True after epoch 3
    model = ElmoTransformer(max_src_len, len(vocab), 2, 8, 64, 64, 256, 512, 2048,
                            dropout=0.5, elmo_requires_grad=False).cuda()

    # print(model)
    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
    scheduler.step()  # last_epoch=-1, so this first step() call does not update the lr

    # eval_model(valid_x, valid_y, vocab, model)
    train(train_x, train_y, valid_x, valid_y, model,
          optimizer, vocab, scheduler, args.n_epochs, saved_state['epoch'])
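
Example #4 (and several later examples) resumes training from a checkpoint dict holding 'state_dict', 'epoch', and 'lr'. The saving side is not shown in the listing; below is a hedged sketch of how such a checkpoint could be written from inside the (unshown) train loop, with save_checkpoint being a hypothetical helper name.

import torch

def save_checkpoint(model, optimizer, epoch, ckpt_file):
    # Sketch only: persist the fields the loading code above expects.
    state = {
        'state_dict': model.state_dict(),
        'epoch': epoch + 1,                     # epoch to resume from
        'lr': optimizer.param_groups[0]['lr'],  # current learning rate
    }
    torch.save(state, ckpt_file)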
Example #5
def main():
    print(args)

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size

    data_dir = args.data_dir
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    """
	vocab_file = os.path.join(data_dir, "vocab.json")
	if not os.path.exists(vocab_file):
		utils.build_vocab([TRAIN_X, TRAIN_Y], vocab_file, n_vocab=80000)
	vocab = json.load(open(vocab_file))
	"""

    embedding_path = '/home/kaiying/coco/embeddings/giga-256d.bin'
    vocab, embeddings = utils.load_word2vec_embedding(embedding_path)
    print(len(vocab), embeddings.shape)

    train_x = BatchManager(load_data(TRAIN_X, vocab, N_TRAIN), BATCH_SIZE)
    train_y = BatchManager(load_data(TRAIN_Y, vocab, N_TRAIN), BATCH_SIZE)

    valid_x = BatchManager(load_data(VALID_X, vocab, N_VALID), BATCH_SIZE)
    valid_y = BatchManager(load_data(VALID_Y, vocab, N_VALID), BATCH_SIZE)

    model = Model(vocab, emb_dim=256, hid_dim=512,
                  embeddings=embeddings).cuda()
    # model.embedding_look_up.to(torch.device("cpu"))

    ckpt_file = args.ckpt_file
    saved_state = {'lr': 0.001, 'epoch': 0}
    if os.path.exists(ckpt_file):
        saved_state = torch.load(ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % ckpt_file)

    optimizer = torch.optim.Adam(model.parameters(), lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=1,
                                                gamma=0.5)
    scheduler.step()

    train(train_x, train_y, valid_x, valid_y, model, optimizer, scheduler,
          saved_state['epoch'], N_EPOCHS)
Example #6
def main():
    # if not os.path.exists(args.ckpt_file):
    #     raise FileNotFoundError("model file not found")

    data_dir = '~/Textsum/textsum-transformer-master/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y],
                                  small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 60
    max_tgt_len = 20

    test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test),
                          args.batch_size, small_vocab)

    model = TransformerShareEmbedding(len(small_vocab), max_src_len, 1, 6, 300,
                                      50, 50, 1200, False).cuda()

    # saved_state = torch.load(args.ckpt_file)
    # model.load_state_dict(saved_state['state_dict'])
    # print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
Example #7
def get_train_data():
    normal_train, normal_test = get_sentence(args.train_data, args.test_data)
    transfer_train, transfer_test = get_sentence(args.transfer_train_data, args.transfer_test_data)
    char2id, id2char, tag2id, id2tag, transfer_tag2id, transfer_id2tag = get_transform(normal_train + transfer_train,
                                                                                       args.map_path,
                                                                                       args.tag2label_path,
                                                                                       args.transfer_tag2label_path)
    train_data = preprocess_data(normal_train, char2id, tag2id)
    train_manager = BatchManager(train_data, args.batch_size)
    test_data = preprocess_data(normal_test, char2id, tag2id)
    test_manager = BatchManager(test_data, args.batch_size)
    transfer_train_data = preprocess_data(transfer_train, char2id, transfer_tag2id)
    transfer_train_manager = BatchManager(transfer_train_data, args.batch_size)
    transfer_test_data = preprocess_data(transfer_test, char2id, transfer_tag2id)
    transfer_test_manager = BatchManager(transfer_test_data, args.batch_size)

    return train_manager, test_manager, transfer_train_manager, transfer_test_manager, id2char, id2tag, transfer_id2tag
Example #8
def main():
    vocab, max_src_len, max_tgt_len, inputs, targets = load_data('vocab.json',
                                                                 n_data=850)
    inputs, targets = shuffle(inputs, targets)

    # set d_model = d_word_vec
    model = Transformer(n_src_vocab=len(vocab),
                        n_tgt_vocab=len(vocab),
                        max_src_len=max_src_len,
                        max_tgt_len=max_tgt_len,
                        d_word_vec=32,
                        N=6,
                        n_head=4,
                        d_q=16,
                        d_k=16,
                        d_v=16,
                        d_model=32,
                        d_inner=64)
    model.cuda()
    # model = Encoder(len(vocab), max_src_len, d_src_emb=32, N=3, n_head=4,
    #                 d_q=16, d_k=16, d_v=16, d_model=32, d_inner=32)

    model_file = 'models/params_transformer.pkl'
    if os.path.exists(model_file):
        print("Loading parameters from %s" % model_file)
        model.load_state_dict(torch.load(model_file))

    train_idx = int(len(inputs) * 0.90)
    valid_idx = int(len(inputs) * 0.95)

    train_x = BatchManager(inputs[:train_idx], 32)
    train_y = BatchManager(targets[:train_idx], 32)

    valid_x = BatchManager(inputs[train_idx:valid_idx], 64)
    valid_y = BatchManager(targets[train_idx:valid_idx], 64)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = Adam(parameters, lr=0.0001)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=1)

    # train(train_x, train_y, valid_x, valid_y, model, optimizer, n_epochs=100, scheduler=scheduler)
    eval(model, vocab, inputs[train_idx:], targets[train_idx:], out_len=12)
Example #9
def main():
    if not os.path.exists(args.ckpt_file):
        raise FileNotFoundError("model file not found")

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y],
                                  small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47

    test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test),
                          args.batch_size, small_vocab)

    model = Transformer(len(small_vocab),
                        len(small_vocab),
                        max_src_len,
                        d_word_vec=300,
                        d_model=300,
                        d_inner=1200,
                        n_layers=1,
                        n_head=6,
                        d_k=50,
                        d_v=50,
                        dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)
    model.eval()

    saved_state = torch.load(args.ckpt_file)
    model.load_state_dict(saved_state['state_dict'])
    print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
Example #10
def main():
    if not os.path.exists(args.ckpt_file):
        raise FileNotFoundError("model file not found")

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y],
                                  small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 60
    max_tgt_len = 20

    bs = args.batch_size
    n_test = args.n_test

    vocab = small_vocab
    test_x = BatchManager(load_data(TEST_X, max_src_len, n_test), bs, vocab)

    model = ElmoTransformer(max_src_len,
                            len(vocab),
                            2,
                            8,
                            64,
                            64,
                            256,
                            512,
                            2048,
                            dropout=0.5,
                            elmo_requires_grad=False).cuda()

    saved_state = torch.load(args.ckpt_file)
    model.load_state_dict(saved_state['state_dict'])
    print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
Example #11
def main():
    print(args)

    data_dir = 'data/'
    TRAIN_X = os.path.join(data_dir, 'train/in.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/out.txt')
    VALID_X = os.path.join(data_dir, 'dev/in.txt')
    VALID_Y = os.path.join(data_dir, 'dev/out.txt')
    EVAL_X = os.path.join(data_dir, 'test/in.txt')
    EVAL_Y = os.path.join(data_dir, 'test/out.txt')

    small_vocab_file = os.path.join(data_dir, 'vocab.json')
    if os.path.exists(small_vocab_file):
        print("Vocab exists!")
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=800000)

    max_src_len = 34
    max_tgt_len = 34

    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid
    n_eval = args.n_eval

    vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)
    eval_x = BatchManager(load_data(EVAL_X, max_src_len, n_eval), bs, vocab)
    eval_y = BatchManager(load_data(EVAL_Y, max_tgt_len, n_eval), bs, vocab)
    print("vocab length is: "+ str(len(vocab)))
    model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 6, 8, 256, 64, 64, 1024, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    scheduler.step()  # last_epoch=-1, so this first step() call does not update the lr

    # myeval(valid_x, valid_y, vocab, model)
    # train(train_x, train_y, valid_x, valid_y, model, optimizer, vocab, scheduler, args.n_epochs, saved_state['epoch'])
    myeval(eval_x, eval_y, vocab, model)
Example #12
def main():
    # if not os.path.exists(args.ckpt_file):
    #     raise FileNotFoundError("model file not found")

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y],
                                  small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 100
    max_tgt_len = 40
    vocab = small_vocab

    test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test),
                          args.batch_size, small_vocab)

    model = Transformer(len(vocab),
                        len(vocab),
                        200,
                        200,
                        2,
                        4,
                        256,
                        1024,
                        src_tgt_emb_share=True,
                        tgt_prj_wt_share=True).cuda()

    saved_state = torch.load(args.ckpt_file)
    model.load_state_dict(saved_state['state_dict'])
    print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
Example #13
def main():

    N_TEST = args.n_test
    BATCH_SIZE = args.batch_size

    # vocab = json.load(open('sumdata/vocab.json'))

    embedding_path = '/home/kaiying/coco/embeddings/giga-256d.bin'
    vocab, embeddings = utils.load_word2vec_embedding(embedding_path)

    test_x = BatchManager(load_data(args.input_file, vocab, N_TEST),
                          BATCH_SIZE)
    # model = Seq2SeqAttention(len(vocab), EMB_DIM, HID_DIM, BATCH_SIZE, vocab, max_trg_len=25).cuda()
    model = Model(vocab, emb_dim=256, hid_dim=512,
                  embeddings=embeddings).cuda()
    model.eval()

    file = args.ckpt_file
    if os.path.exists(file):
        saved_state = torch.load(file)
        model.load_state_dict(saved_state['state_dict'])
        print('Load model parameters from %s' % file)

        my_test(test_x, model)
Example #14
def train(inputs, targets, model, optimizer, batch_size=32, epochs=200):
    inputs_batch_manager = BatchManager(inputs, batch_size)
    targets_batch_manager = BatchManager(targets, batch_size)
    steps = inputs_batch_manager.steps

    for epoch in range(epochs):
        for i in range(steps):
            optimizer.zero_grad()
            batch_inputs = torch.tensor(inputs_batch_manager.next_batch(),
                                        dtype=torch.long)
            batch_targets = torch.tensor(targets_batch_manager.next_batch(),
                                         dtype=torch.long)
            logits = model(batch_inputs, batch_targets)  # exclude start token
            loss = model.loss_layer(logits.transpose(1, 2),
                                    batch_targets[:, 1:])
            loss.backward()
            optimizer.step()
        print(loss)

    torch.save(model.state_dict(), os.path.join("models", "params.pkl"))
Example #15
def eval(model, vocab, inputs, targets, out_len=12):
    model.eval()
    batch_x = BatchManager(inputs, 32)
    batch_y = BatchManager(targets, 32)
    hits = 0
    total = 0
    for i in range(batch_x.steps):
        x = torch.tensor(batch_x.next_batch(), dtype=torch.long).cuda()
        y = torch.tensor(batch_y.next_batch(), dtype=torch.long).cuda()

        tgt_seq = torch.ones(x.shape[0], out_len, dtype=torch.long).cuda()
        tgt_seq *= vocab['<pad>']
        tgt_seq[:, 0] = vocab['<s>']
        for j in range(1, out_len):
            logits = model(x, tgt_seq)
            last_word = torch.argmax(logits[:, j - 1, :], dim=-1).view(-1, 1)
            tgt_seq[:, j] = last_word.squeeze()
            if j != out_len - 1:
                tgt_seq[:, j + 1] = vocab['</s>']
        hits += visualize(x, y, tgt_seq, vocab)
        total += x.shape[0]

    print('%d/%d, accuracy=%f' % (hits, total, hits / total))
    model.train()
Example #16
def main():
    print(args)

    data_dir = '/home/disk3/tiankeke/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    src_vocab_file = 'sumdata/src_vocab.txt'
    if not os.path.exists(src_vocab_file):
        build_vocab([TRAIN_X], src_vocab_file)
    src_vocab = load_vocab(src_vocab_file, vocab_size=90000)

    tgt_vocab_file = 'sumdata/tgt_vocab.txt'
    if not os.path.exists(tgt_vocab_file):
        build_vocab([TRAIN_Y], tgt_vocab_file)
    tgt_vocab = load_vocab(tgt_vocab_file)

    # emb_file = '/home/tiankeke/workspace/embeddings/giga-vec1.bin'
    # vocab, embeddings = load_word2vec_embedding(emb_file)

    max_src_len = 100
    max_tgt_len = 40
    max_pos = 200

    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs,
                           src_vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs,
                           tgt_vocab)
    train_x, train_y = utils.shuffle(train_x, train_y)

    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs,
                           src_vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs,
                           tgt_vocab)
    valid_x, valid_y = utils.shuffle(valid_x, valid_y)
    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 4, 256,
    #                     64, 64, 1024, src_tgt_emb_share=True, tgt_prj_wt_share=True).cuda()
    model = Transformer(len(src_vocab),
                        len(tgt_vocab),
                        max_pos,
                        max_pos,
                        2,
                        4,
                        256,
                        1024,
                        src_tgt_emb_share=False,
                        tgt_prj_wt_share=True).cuda()
    # model = TransformerShareEmbedding(len(vocab), max_src_len, 2, 4,
    #                                   256, 1024, False, True).cuda()

    # print(model)
    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=1,
                                                gamma=0.3)
    scheduler.step()  # last_epoch=-1, so this first step() call does not update the lr

    # eval_model(valid_x, valid_y, vocab, model)
    train(train_x, train_y, valid_x, valid_y, model, optimizer, tgt_vocab,
          scheduler, args.n_epochs, saved_state['epoch'])
Example #17
        with tf.Session(config=config) as sess:
            print('============= demo =============')
            saver.restore(sess, ckpt_file)

            while True:
                print('Please input your sentence (or key \'exit\' to exit):')
                demo_sent = input().strip()
                demo_sent = demo_sent.replace(" ", "")
                # if demo_sent == '' or demo_sent.isspace():
                if demo_sent == 'exit':
                    print('See you next time!')
                    break
                else:
                    demo_transfer_test = load_input_sentence(demo_sent)
                    demo_transfer_test_data = preprocess_data(demo_transfer_test, char2id, transfer_tag2id)
                    demo_transfer_test_manager = BatchManager(demo_transfer_test_data, args.batch_size)
                    demo_data = model.evaluate(sess, demo_transfer_test_manager, transfer_id2tag)
                    """
                        demo_data format:
                            [
                               [
                                   'char <O> <pred>', ...,
                               ]
                            ]
                        Notes:
                            char
                            <O>: default tag (no meaning)
                            <pred>: predicted transfer tag
                    """

                    ret = { "product_name": [],
Example #18
def train(conf):
    train_sentences = load_sentences(conf.train_file, conf.zeros)
    dev_sentences = load_sentences(conf.dev_file, conf.zeros)
    test_sentences = load_sentences(conf.test_file, conf.zeros)

    dico_chars_train = char_mapping(train_sentences, conf.lower)[0]
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        dico_chars_train.copy(), conf.emb_file,
        list(
            itertools.chain.from_iterable([[w[0] for w in s]
                                           for s in test_sentences])))
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 conf.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               conf.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                conf.lower)

    # Load pretrained word embeddings
    all_word_embeds = {}
    for i, line in enumerate(codecs.open(conf.emb_file, 'r', 'utf-8')):
        s = line.strip().split()
        if len(s) == conf.embedding_dim + 1:
            all_word_embeds[s[0]] = np.array([float(i) for i in s[1:]])
    word_embeds_dict = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06),
                                         (len(char_to_id), conf.embedding_dim))
    for w in char_to_id:
        if w in all_word_embeds:
            word_embeds_dict[char_to_id[w]] = all_word_embeds[w]
        elif w.lower() in all_word_embeds:
            word_embeds_dict[char_to_id[w]] = all_word_embeds[w.lower()]
    print('Loaded %i pretrained embeddings.' % len(all_word_embeds))

    train_manager = BatchManager(train_data, conf.batch_size)

    model = BiLSTM_CRF(conf, tag_to_id, char_to_id, word_embeds_dict)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=conf.learning_rate,
                                weight_decay=1e-4)
    epoch = conf.epochs
    dev_f1_ = 0
    for epoch in range(1, epoch + 1):
        print(f'train on epoch {epoch}')
        j = 1
        for batch in train_manager.iter_batch(shuffle=True):
            batch_loss = 0.0
            sentences = batch[1]
            tags = batch[-1]
            for i, index in enumerate(np.random.permutation(len(sentences))):
                model.zero_grad()
                sentence_in = sentences[index]
                tags_in = tags[index]
                loss = model.neg_log_likelihood(sentence_in, tags_in)
                loss.backward()
                optimizer.step()
                batch_loss += loss.data
            print(
                f'[batch {j},batch size:{conf.batch_size}] On this batch loss: {batch_loss}'
            )
            j = j + 1
        print(f'Begin validating results on [epoch {epoch}] valid dataset ...')
        dev_results = get_predictions(model, dev_data, id_to_tag)
        dev_f1 = evaluate_ner(dev_results, conf)
        if dev_f1 > dev_f1_:
            dev_f1_ = dev_f1  # track the best dev F1 so far
            torch.save(model, conf.model_file)
            print('save model success.')
        test_results = get_predictions(model, test_data, id_to_tag)
        test_f1 = evaluate_ner(test_results, conf)
        print(f'[epoch {epoch}] On test dataset f1: {test_f1:.3f}')
Example #19
def train():
    train_sentences, dico, char_to_id, id_to_char = load_sentence(
        FLAGS.train_file)
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico.copy(),
                FLAGS.emb_file,
            )
        else:
            sentences, dico, char_to_id, id_to_char = load_sentence(
                FLAGS.train_file)
        print(train_sentences[0])
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([char_to_id, id_to_char], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char = pickle.load(f)

    train_data, test_data, dev_data = prepare_dataset(train_sentences,
                                                      char_to_id)
    print(train_data[0])
    print(test_data[0])
    print(dev_data[0])
    print(len(train_data), len(dev_data), len(test_data))
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    test_manager = BatchManager(test_data, 100)
    dev_manager = BatchManager(dev_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True

    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        best = 0
        # sess.graph.finalize()
        for i in range(50):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{},".format(
                        iteration, step % steps_per_epoch, steps_per_epoch))
                    loss = []
            Acc_result = evaluate(sess, model, "dev", dev_manager, logger)
            logger.info("Acc{}".format(Acc_result))
            logger.info("test")
            # precision, recall, f1_score = model.evaluete_(sess,test_manager)
            # logger.info("P, R, F,{},{},{}".format(precision, recall, f1_score))
            test_result = evaluate(sess, model, "test", test_manager, logger)
            if test_result > best:
                best = test_result
                save_model(sess, model, FLAGS.ckpt_path, logger)
Example #20
def main():
    print(args)

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    src_vocab, tgt_vocab = get_vocab(TRAIN_X, TRAIN_Y)

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y],
                                  small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47
    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid

    vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)

    model = Transformer(len(vocab),
                        len(vocab),
                        max_src_len,
                        d_word_vec=300,
                        d_model=300,
                        d_inner=1200,
                        n_layers=1,
                        n_head=6,
                        d_k=50,
                        d_v=50,
                        dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=1,
                                                gamma=0.3)
    scheduler.step()  # last_epoch=-1, so this first step() call does not update the lr

    train(train_x, train_y, valid_x, valid_y, model, optimizer, scheduler,
          args.n_epochs, saved_state['epoch'])
Example #21
def main(_):
    if not os.path.isdir(FLAGS.log_path):
        os.makedirs(FLAGS.log_path)
    if not os.path.isdir(FLAGS.model_path):
        os.makedirs(FLAGS.model_path)
    if not os.path.isdir(FLAGS.result_path):
        os.makedirs(FLAGS.result_path)
    tag_to_id = {
        "O": 0,
        "B-LOC": 1,
        "I-LOC": 2,
        "B-PER": 3,
        "I-PER": 4,
        "B-ORG": 5,
        "I-ORG": 6
    }
    # load data
    id_to_word, id_to_tag, train_data, dev_data, test_data = load_data(
        FLAGS, tag_to_id)
    train_manager = BatchManager(train_data, len(id_to_tag),
                                 FLAGS.word_max_len, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, len(id_to_tag), FLAGS.word_max_len,
                               FLAGS.valid_batch_size)
    test_manager = BatchManager(test_data, len(id_to_tag), FLAGS.word_max_len,
                                FLAGS.valid_batch_size)
    with tf.Session() as sess:
        model = create_model(sess, id_to_word, id_to_tag)
        loss = 0
        best_test_f1 = 0
        steps_per_epoch = len(train_data) // FLAGS.batch_size + 1
        for _ in range(FLAGS.max_epoch):
            iteration = (model.global_step.eval()) // steps_per_epoch + 1
            train_manager.shuffle()
            for batch in train_manager.iter_batch():
                global_step = model.global_step.eval()
                step = global_step % steps_per_epoch
                batch_loss = model.run_step(sess, True, batch)
                loss += batch_loss / FLAGS.steps_per_checkpoint
                if global_step % FLAGS.steps_per_checkpoint == 0:
                    model.logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step, steps_per_epoch, loss))
                    loss = 0

            model.logger.info("validating ner")
            ner_results = model.predict(sess, dev_manager)
            eval_lines = test_ner(ner_results, FLAGS.result_path)
            for line in eval_lines:
                model.logger.info(line)
            test_f1 = float(eval_lines[1].strip().split()[-1])
            if test_f1 > best_test_f1:
                best_test_f1 = test_f1
                model.logger.info("new best f1 score:{:>.3f}".format(test_f1))
                model.logger.info("saving model ...")
                checkpoint_path = os.path.join(FLAGS.model_path,
                                               "translate.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)
        # test model
        model.logger.info("testing ner")
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_path)
        model.logger.info("Reading model parameters from %s" %
                          ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        ner_results = model.predict(sess, test_manager)
        eval_lines = test_ner(ner_results, FLAGS.result_path)
        for line in eval_lines:
            model.logger.info(line)
Example #22
    if singletons is not None:
        words = insert_singletons(words, singletons)
    if parameters['cap_dim']:
        caps = data['caps']
    char_for, char_rev, char_pos = pad_word_chars(chars)
    input = []
    if parameters['word_dim']:
        input.append(words)
    if parameters['char_dim']:
        input.append(char_for)
        if parameters['char_bidirect']:
            input.append(char_rev)
        input.append(char_pos)
    if parameters['cap_dim']:
        input.append(caps)
    if add_label:
        input.append(data['tags'])
    return input


if __name__ == "__main__":
    train_sentences = load_sentences("./data/input.train", True)
    print(train_sentences)
    # create maps if not exist
    _c, char_to_id, id_to_char = char_mapping(train_sentences, True)
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, True)
    train_manager = BatchManager(train_data, 100)
    for batch in train_manager.iter_batch(shuffle=True):
        print(batch[0])
        print(batch[-1])
Example #23
def main():
    print(args)

    # local
    """
    data_dir = 'sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    EVAL_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    EVAL_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    """

    # server
    data_dir = 'sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article_01_new.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title_01_new.txt')
    VALID_X = os.path.join(data_dir, 'train/train.article_000_new.txt')
    VALID_Y = os.path.join(data_dir, 'train/train.title_000_new.txt')
    EVAL_X = os.path.join(data_dir, 'train/train.article_001_new.txt')
    EVAL_Y = os.path.join(data_dir, 'train/train.title_001_new.txt')

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    # bert embeddings
    emb_file = 'sumdata/bert-large-uncased.30522.1024d.vec'
    vocab, embeddings = load_word2vec_embedding(emb_file)

    max_src_len = 101
    max_tgt_len = 47

    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid
    n_eval = args.n_eval

    # vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)
    eval_x = BatchManager(load_data(EVAL_X, max_src_len, n_eval), bs, vocab)
    eval_y = BatchManager(load_data(EVAL_Y, max_tgt_len, n_eval), bs, vocab)
    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 4, 256,
    #                     64, 64, 1024, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 6, 300,
    #                     50, 50, 1200, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = TransformerShareEmbedding(len(vocab), max_src_len, 1, 6, 300, 50, 50, 1200, False).cuda()
    model = TransformerShareEmbedding(len(vocab), max_src_len, 1, 6, 1024,
                                      50, 50, 1200, False,
                                      embeddings=embeddings).cuda()

    # print(model)
    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    scheduler.step()  # last_epoch=-1, so this first step() call does not update the lr

    # eval_model(valid_x, valid_y, vocab, model)
    # train(train_x, train_y, valid_x, valid_y, model, optimizer, vocab, scheduler, args.n_epochs, saved_state['epoch'])
    myeval(eval_x, eval_y, vocab, model)