def main():
    """Entry point.
    """
    # Load data
    train_data, dev_data, test_data = data_utils.load_data_numpy(
        config_data.input_dir, config_data.filename_prefix)
    with open(config_data.vocab_file, 'rb') as f:
        id2w = pickle.load(f)
    eos_token_id = 2  # token-id convention used in these examples (BOS=1, EOS=2)

    beam_width = getattr(config_model, "beam_width", 1)

    # Create logging
    tx.utils.maybe_create_dir(args.model_dir)
    logging_file = os.path.join(args.model_dir, 'logging.txt')
    logger = utils.get_logger(logging_file)
    print(f"logging file is saved in: {logging_file}")

    model = Transformer(config_model, config_data)
    if torch.cuda.is_available():
        model = model.cuda()
        device = torch.cuda.current_device()
    else:
        device = None

    best_results = {'score': 0, 'epoch': -1}
    lr_config = config_model.lr_config
    if lr_config["learning_rate_schedule"] == "static":
        init_lr = lr_config["static_lr"]
        scheduler_lambda = lambda x: 1.0
    else:
        init_lr = lr_config["lr_constant"]
        scheduler_lambda = functools.partial(
            utils.get_lr_multiplier, warmup_steps=lr_config["warmup_steps"])
    optim = torch.optim.Adam(
        model.parameters(), lr=init_lr, betas=(0.9, 0.997), eps=1e-9)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optim, scheduler_lambda)
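    # With the non-static schedule the effective lr is
    # lr_constant * get_lr_multiplier(step); the multiplier presumably does
    # linear warm-up followed by inverse-square-root decay (see the
    # `_example_get_lr_multiplier` sketch after this function).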

    def _eval_epoch(epoch, mode):
        torch.cuda.empty_cache()
        if mode == 'eval':
            eval_data = dev_data
        elif mode == 'test':
            eval_data = test_data
        else:
            raise ValueError("`mode` should be either \"eval\" or \"test\".")

        references, hypotheses = [], []
        bsize = config_data.test_batch_size
        for i in tqdm(range(0, len(eval_data), bsize)):
            sources, targets = zip(*eval_data[i:i + bsize])
            with torch.no_grad():
                x_block = data_utils.source_pad_concat_convert(
                    sources, device=device)
                predictions = model(
                    encoder_input=x_block,
                    is_train_mode=False,
                    beam_width=beam_width)
                if beam_width == 1:
                    decoded_ids = predictions[0].sample_id
                else:
                    decoded_ids = predictions["sample_id"][:, :, 0]

                hypotheses.extend(h.tolist() for h in decoded_ids)
                references.extend(r.tolist() for r in targets)
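                # Strip the EOS token and everything after it from each
                # sequence (assumed behavior of `utils.list_strip_eos`).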
                hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
                references = utils.list_strip_eos(references, eos_token_id)

        if mode == 'eval':
            # Writes results to files to evaluate BLEU
            # For 'eval' mode, the BLEU is based on token ids (rather than
            # text tokens) and serves only as a surrogate metric to monitor
            # the training process
            # TODO: Use texar.evals.bleu
            fname = os.path.join(args.model_dir, 'tmp.eval')
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append([str(y) for y in hyp])
                rwords.append([str(y) for y in ref])
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_fn, ref_fn = tx.utils.write_paired_text(
                hwords, rwords, fname, mode='s',
                src_fname_suffix='hyp', tgt_fname_suffix='ref')
            eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
            eval_bleu = 100. * eval_bleu
            logger.info("epoch: %d, eval_bleu %.4f", epoch, eval_bleu)
            print(f"epoch: {epoch:d}, eval_bleu {eval_bleu:.4f}")

            if eval_bleu > best_results['score']:
                logger.info("epoch: %d, best bleu: %.4f", epoch, eval_bleu)
                best_results['score'] = eval_bleu
                best_results['epoch'] = epoch
                model_path = os.path.join(args.model_dir, args.model_fn)
                logger.info("Saving model to %s", model_path)
                print(f"Saving model to {model_path}")

                states = {
                    'model': model.state_dict(),
                    'optimizer': optim.state_dict(),
                    'scheduler': scheduler.state_dict(),
                }
                torch.save(states, model_path)

        elif mode == 'test':
            # For 'test' mode, together with the cmds in README.md, BLEU
            # is evaluated based on text tokens, which is the standard metric.
            fname = os.path.join(args.model_dir, 'test.output')
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append([id2w[y] for y in hyp])
                rwords.append([id2w[y] for y in ref])
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_fn, ref_fn = tx.utils.write_paired_text(
                hwords, rwords, fname, mode='s',
                src_fname_suffix='hyp', tgt_fname_suffix='ref')
            logger.info("Test output written to file: %s", hyp_fn)
            print(f"Test output written to file: {hyp_fn}")

    def _train_epoch(epoch: int):
        torch.cuda.empty_cache()
        random.shuffle(train_data)
        train_iter = data.iterator.pool(
            train_data,
            config_data.batch_size,
            key=lambda x: (len(x[0]), len(x[1])),
            # `key` is unused here because `sort_within_batch` defaults to False
            batch_size_fn=utils.batch_size_fn,
            random_shuffler=data.iterator.RandomShuffler())
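        # `pool` chunks the shuffled data and yields batches whose effective
        # size comes from `batch_size_fn`, i.e. a padded-token budget rather
        # than a fixed number of examples (see the `_example_batch_size_fn`
        # sketch after this function).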

        for _, train_batch in tqdm(enumerate(train_iter)):
            optim.zero_grad()
            in_arrays = data_utils.seq2seq_pad_concat_convert(
                train_batch, device=device)
            loss = model(
                encoder_input=in_arrays[0],
                is_train_mode=True,
                decoder_input=in_arrays[1],
                labels=in_arrays[2],
            )
            loss.backward()

            optim.step()
            scheduler.step()

            step = scheduler.last_epoch
            if step % config_data.display_steps == 0:
                logger.info('step: %d, loss: %.4f', step, loss.item())
                lr = optim.param_groups[0]['lr']
                print(f"lr: {lr}, step: {step}, loss: {loss.item():.4f}")
            if step and step % config_data.eval_steps == 0:
                _eval_epoch(epoch, mode='eval')

    if args.run_mode == 'train_and_evaluate':
        logger.info("Begin running with train_and_evaluate mode")
        model_path = os.path.join(args.model_dir, args.model_fn)
        if os.path.exists(model_path):
            logger.info("Restore latest checkpoint in", model_path)
            ckpt = torch.load(model_path)
            model.load_state_dict(ckpt['model'])
            optim.load_state_dict(ckpt['optimizer'])
            scheduler.load_state_dict(ckpt['scheduler'])
            _eval_epoch(0, mode='test')

        for epoch in range(config_data.max_train_epoch):
            _train_epoch(epoch)
            _eval_epoch(epoch, mode='eval')

    elif args.run_mode == 'eval':
        logger.info("Begin running with evaluate mode")
        model_path = os.path.join(args.model_dir, args.model_fn)
        logger.info("Restore latest checkpoint in %s", model_path)
        ckpt = torch.load(model_path)
        model.load_state_dict(ckpt['model'])
        _eval_epoch(0, mode='eval')

    elif args.run_mode == 'test':
        logger.info("Begin running with test mode")
        model_path = os.path.join(args.model_dir, args.model_fn)
        logger.info("Restore latest checkpoint in", model_path)
        ckpt = torch.load(model_path)
        model.load_state_dict(ckpt['model'])
        _eval_epoch(0, mode='test')

    else:
        raise ValueError(f"Unknown mode: {args.run_mode}")

# Example #2
def main():
    """Entrypoint.
    """
    # Load data
    print('Loading data ...')
    train_data, dev_data, test_data = data_utils.load_data_numpy(
        config_data.input_dir, config_data.filename_prefix)
    print('Load data done')
    with open(config_data.vocab_file, 'rb') as f:
        id2w = pickle.load(f)
    vocab_size = len(id2w)
    print('vocab_size {}'.format(vocab_size))
    bos_token_id, eos_token_id = 1, 2

    beam_width = config_model.beam_width

    # Create logging
    tx.utils.maybe_create_dir(FLAGS.model_dir)
    logging_file = os.path.join(FLAGS.model_dir, 'logging.txt')
    logger = utils.get_logger(logging_file)
    print('logging file is saved in: %s' % logging_file)

    # Build model graph
    encoder_input = tf.placeholder(tf.int64, shape=(None, None))
    decoder_input = tf.placeholder(tf.int64, shape=(None, None))
    # (text sequence length excluding padding)
    encoder_input_length = tf.reduce_sum(
        1 - tf.to_int32(tf.equal(encoder_input, 0)), axis=1)
    decoder_input_length = tf.reduce_sum(
        1 - tf.to_int32(tf.equal(decoder_input, 0)), axis=1)

    labels = tf.placeholder(tf.int64, shape=(None, None))
    is_target = tf.to_float(tf.not_equal(labels, 0))

    global_step = tf.Variable(0, dtype=tf.int64, trainable=False)
    learning_rate = tf.placeholder(tf.float64, shape=(), name='lr')

    embedder = tx.modules.WordEmbedder(vocab_size=vocab_size,
                                       hparams=config_model.emb)
    encoder = TransformerEncoder(hparams=config_model.encoder)

    encoder_output = encoder(inputs=embedder(encoder_input),
                             sequence_length=encoder_input_length)

    # The decoder ties the input word embedding with the output logit layer.
    # Because the decoder masks out <PAD>'s embedding, <PAD> effectively has
    # an all-zero embedding, so here we explicitly set <PAD>'s embedding to
    # all zeros.
    tgt_embedding = tf.concat(
        [tf.zeros(shape=[1, embedder.dim]), embedder.embedding[1:, :]], axis=0)
    decoder = TransformerDecoder(embedding=tgt_embedding,
                                 hparams=config_model.decoder)
    # For training
    outputs = decoder(memory=encoder_output,
                      memory_sequence_length=encoder_input_length,
                      inputs=embedder(decoder_input),
                      sequence_length=decoder_input_length,
                      decoding_strategy='train_greedy',
                      mode=tf.estimator.ModeKeys.TRAIN)

    # Label-smoothed cross-entropy: the target distribution places
    # `loss_label_confidence` on the gold token and spreads the remainder
    # uniformly over the rest of the vocabulary; averaging over non-PAD
    # positions gives a per-token loss.
    mle_loss = transformer_utils.smoothing_cross_entropy(
        outputs.logits, labels, vocab_size, config_model.loss_label_confidence)
    mle_loss = tf.reduce_sum(mle_loss * is_target) / tf.reduce_sum(is_target)

    train_op = tx.core.get_train_op(mle_loss,
                                    learning_rate=learning_rate,
                                    global_step=global_step,
                                    hparams=config_model.opt)

    tf.summary.scalar('lr', learning_rate)
    tf.summary.scalar('mle_loss', mle_loss)
    summary_merged = tf.summary.merge_all()

    # For inference
    start_tokens = tf.fill([tx.utils.get_batch_size(encoder_input)],
                           bos_token_id)
    predictions = decoder(memory=encoder_output,
                          memory_sequence_length=encoder_input_length,
                          decoding_strategy='infer_greedy',
                          beam_width=beam_width,
                          alpha=config_model.alpha,
                          start_tokens=start_tokens,
                          end_token=eos_token_id,
                          max_decoding_length=config_data.max_decoding_length,
                          mode=tf.estimator.ModeKeys.PREDICT)
    if beam_width <= 1:
        inferred_ids = predictions[0].sample_id
    else:
        # Uses the best sample by beam search
        inferred_ids = predictions['sample_id'][:, :, 0]

    saver = tf.train.Saver(max_to_keep=5)
    best_results = {'score': 0, 'epoch': -1}

    def _eval_epoch(sess, epoch, mode):
        if mode == 'eval':
            eval_data = dev_data
        elif mode == 'test':
            eval_data = test_data
        else:
            raise ValueError('`mode` should be either "eval" or "test".')

        references, hypotheses = [], []
        bsize = config_data.test_batch_size
        for i in range(0, len(eval_data), bsize):
            #print("eval {}/{}".format(i, len(eval_data)))
            sources, targets = zip(*eval_data[i:i + bsize])
            x_block = data_utils.source_pad_concat_convert(sources)
            feed_dict = {
                encoder_input: x_block,
                tx.global_mode(): tf.estimator.ModeKeys.EVAL,
            }
            fetches = {
                'inferred_ids': inferred_ids,
            }
            fetches_ = sess.run(fetches, feed_dict=feed_dict)

            hypotheses.extend(h.tolist() for h in fetches_['inferred_ids'])
            references.extend(r.tolist() for r in targets)
            hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
            references = utils.list_strip_eos(references, eos_token_id)

        if mode == 'eval':
            # Writes results to files to evaluate BLEU
            # For 'eval' mode, the BLEU is based on token ids (rather than
            # text tokens) and serves only as a surrogate metric to monitor
            # the training process
            fname = os.path.join(FLAGS.model_dir, 'tmp.eval')
            hypotheses = tx.utils.str_join(hypotheses)
            references = tx.utils.str_join(references)
            hyp_fn, ref_fn = tx.utils.write_paired_text(hypotheses,
                                                        references,
                                                        fname,
                                                        mode='s')
            eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
            eval_bleu = 100. * eval_bleu
            logger.info('epoch: %d, eval_bleu %.4f', epoch, eval_bleu)
            print('epoch: %d, eval_bleu %.4f' % (epoch, eval_bleu))

            if eval_bleu > best_results['score']:
                logger.info('epoch: %d, best bleu: %.4f', epoch, eval_bleu)
                best_results['score'] = eval_bleu
                best_results['epoch'] = epoch
                model_path = os.path.join(FLAGS.model_dir, 'best-model.ckpt')
                logger.info('saving model to %s', model_path)
                print('saving model to %s' % model_path)
                saver.save(sess, model_path)

        elif mode == 'test':
            # For 'test' mode, together with the cmds in README.md, BLEU
            # is evaluated based on text tokens, which is the standard metric.
            fname = os.path.join(FLAGS.model_dir, 'test.output')
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append([id2w[y] for y in hyp])
                rwords.append([id2w[y] for y in ref])
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_fn, ref_fn = tx.utils.write_paired_text(hwords,
                                                        rwords,
                                                        fname,
                                                        mode='s')
            logger.info('Test output written to file: %s', hyp_fn)
            print('Test output written to file: %s' % hyp_fn)

    def _train_epoch(sess, epoch, step, smry_writer):
        random.shuffle(train_data)
        train_iter = data.iterator.pool(
            train_data,
            config_data.batch_size,
            key=lambda x: (len(x[0]), len(x[1])),
            batch_size_fn=utils.batch_size_fn,
            random_shuffler=data.iterator.RandomShuffler())

        for _, train_batch in enumerate(train_iter):
            if len(train_batch) == 0:
                continue
            in_arrays = data_utils.seq2seq_pad_concat_convert(train_batch)
            feed_dict = {
                encoder_input: in_arrays[0],
                decoder_input: in_arrays[1],
                labels: in_arrays[2],
                learning_rate: utils.get_lr(step, config_model.lr)
            }
            fetches = {
                'step': global_step,
                'train_op': train_op,
                'smry': summary_merged,
                'loss': mle_loss,
            }

            fetches_ = sess.run(fetches, feed_dict=feed_dict)

            step, loss = fetches_['step'], fetches_['loss']
            if step and step % config_data.display_steps == 0:
                logger.info('step: %d, loss: %.4f', step, loss)
                print('step: %d, loss: %.4f' % (step, loss))
                smry_writer.add_summary(fetches_['smry'], global_step=step)

            if step and step % config_data.eval_steps == 0:
                _eval_epoch(sess, epoch, mode='eval')
        return step

    # Run the graph
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        smry_writer = tf.summary.FileWriter(FLAGS.model_dir, graph=sess.graph)

        if FLAGS.run_mode == 'train_and_evaluate':
            step = 0
            for epoch in range(config_data.max_train_epoch):
                step = _train_epoch(sess, epoch, step, smry_writer)

        elif FLAGS.run_mode == 'test':
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.model_dir))
            _eval_epoch(sess, 0, mode='test')
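
# A minimal TF1-style sketch of label-smoothed cross-entropy in the spirit of
# `transformer_utils.smoothing_cross_entropy` used above (assumed form, not
# the project's exact implementation):
def _example_smoothing_cross_entropy(logits, labels, vocab_size, confidence):
    # Spread (1 - confidence) uniformly over the non-gold vocabulary entries
    low_confidence = (1.0 - confidence) / (vocab_size - 1)
    soft_targets = tf.one_hot(
        tf.cast(labels, tf.int32), depth=vocab_size,
        on_value=confidence, off_value=low_confidence)
    return tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=soft_targets, logits=logits)


# Example #3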
def main():
    """Entrypoint.
    """
    # Load data
    train_data, dev_data, test_data = data_utils.load_data_numpy(
        config_data.input_dir, config_data.filename_prefix)
    with open(config_data.vocab_file, 'rb') as f:
        id2w = pickle.load(f)
    vocab_size = len(id2w)
    bos_token_id, eos_token_id = 1, 2

    beam_width = config_model.beam_width

    # Create logging
    tx.utils.maybe_create_dir(FLAGS.model_dir)
    logging_file = os.path.join(FLAGS.model_dir, 'logging.txt')
    logger = utils.get_logger(logging_file)
    print('logging file is saved in: %s' % logging_file)

    # Build model graph
    encoder_input = tf.placeholder(tf.int64, shape=(None, None))
    decoder_input = tf.placeholder(tf.int64, shape=(None, None))
    batch_size = tf.shape(encoder_input)[0]
    # (text sequence length excluding padding)
    encoder_input_length = tf.reduce_sum(
        1 - tf.cast(tf.equal(encoder_input, 0), tf.int32), axis=1)

    labels = tf.placeholder(tf.int64, shape=(None, None))
    is_target = tf.cast(tf.not_equal(labels, 0), tf.float32)

    global_step = tf.Variable(0, dtype=tf.int64, trainable=False)
    learning_rate = tf.placeholder(tf.float64, shape=(), name='lr')

    # Source word embedding
    src_word_embedder = tx.modules.WordEmbedder(vocab_size=vocab_size,
                                                hparams=config_model.emb)
    src_word_embeds = src_word_embedder(encoder_input)
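    # Scale word embeddings by sqrt(d_model) before adding position
    # embeddings, following "Attention Is All You Need".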
    src_word_embeds = src_word_embeds * config_model.hidden_dim**0.5

    # Position embedding (shared b/w source and target)
    pos_embedder = tx.modules.SinusoidsPositionEmbedder(
        position_size=config_data.max_decoding_length,
        hparams=config_model.position_embedder_hparams)
    src_seq_len = tf.ones([batch_size], tf.int32) * tf.shape(encoder_input)[1]
    src_pos_embeds = pos_embedder(sequence_length=src_seq_len)

    src_input_embedding = src_word_embeds + src_pos_embeds

    encoder = TransformerEncoder(hparams=config_model.encoder)
    encoder_output = encoder(inputs=src_input_embedding,
                             sequence_length=encoder_input_length)

    # The decoder ties the input word embedding with the output logit layer.
    # Because the decoder masks out <PAD>'s embedding, <PAD> effectively has
    # an all-zero embedding, so here we explicitly set <PAD>'s embedding to
    # all zeros.
    tgt_embedding = tf.concat(
        [tf.zeros(shape=[1, src_word_embedder.dim]),
         src_word_embedder.embedding[1:, :]],
        axis=0)
    tgt_embedder = tx.modules.WordEmbedder(tgt_embedding)
    tgt_word_embeds = tgt_embedder(decoder_input)
    tgt_word_embeds = tgt_word_embeds * config_model.hidden_dim**0.5

    tgt_seq_len = tf.ones([batch_size], tf.int32) * tf.shape(decoder_input)[1]
    tgt_pos_embeds = pos_embedder(sequence_length=tgt_seq_len)

    tgt_input_embedding = tgt_word_embeds + tgt_pos_embeds

    _output_w = tf.transpose(tgt_embedder.embedding, (1, 0))
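    # Weight tying: the output logit layer reuses the transposed target
    # embedding matrix as its projection weights.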

    decoder = TransformerDecoder(vocab_size=vocab_size,
                                 output_layer=_output_w,
                                 hparams=config_model.decoder)
    # For training
    outputs = decoder(memory=encoder_output,
                      memory_sequence_length=encoder_input_length,
                      inputs=tgt_input_embedding,
                      decoding_strategy='train_greedy',
                      mode=tf.estimator.ModeKeys.TRAIN)
    # Graph matching in Transformer: build soft target embeddings from the
    # decoder logits for the optimal-transport losses below.
    _tgt_embedding = tgt_embedder(soft_ids=outputs.logits)

    src_words = tf.nn.l2_normalize(src_word_embeds, 2, epsilon=1e-12)
    tgt_words = tf.nn.l2_normalize(_tgt_embedding, 2, epsilon=1e-12)

    cosine_cost = 1 - tf.einsum('aij,ajk->aik', src_words,
                                tf.transpose(tgt_words, [0, 2, 1]))
    # NOTE: prune -- zero out costs within `_beta` of the minimum (relative
    # to the min-max range) and shift the rest down by the threshold.
    _beta = 0.2
    minval = tf.reduce_min(cosine_cost)
    maxval = tf.reduce_max(cosine_cost)
    threshold = minval + _beta * (maxval - minval)
    cosine_cost = tf.nn.relu(cosine_cost - threshold)

    # TODO: Gromov-Wasserstein distance
    Cs = 1 - tf.einsum('aij,ajk->aik', src_words,
                       tf.transpose(src_words, [0, 2, 1]))
    Ct = 1 - tf.einsum('aij,ajk->aik', tgt_words,
                       tf.transpose(tgt_words, [0, 2, 1]))
    Css = OT.prune(Cs)
    Ctt = OT.prune(Ct)

    # OT_loss = tf.reduce_mean(OT.IPOT_distance2(cosine_cost))
    # GW_loss = tf.reduce_mean(OT.GW_distance(Css, Ctt))
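    # Fused Gromov-Wasserstein (FGW): presumably combines the structural cost
    # over the pruned intra-domain matrices (Css, Ctt) with the feature
    # transport cost over the source-target cosine matrix; the two terms are
    # mixed as 0.1 * GW_loss + 1 * W_loss below.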
    GW_loss, W_loss = OT.FGW_distance(Css, Ctt, cosine_cost)
    FGW_loss = tf.reduce_mean(0.1 * GW_loss + 1 * W_loss)

    mle_loss = transformer_utils.smoothing_cross_entropy(
        outputs.logits, labels, vocab_size, config_model.loss_label_confidence)
    mle_loss = tf.reduce_sum(mle_loss * is_target) / tf.reduce_sum(is_target)

    total_loss = mle_loss + FGW_loss * 0.1

    train_op = tx.core.get_train_op(total_loss,
                                    learning_rate=learning_rate,
                                    global_step=global_step,
                                    hparams=config_model.opt)

    tf.summary.scalar('lr', learning_rate)
    tf.summary.scalar('mle_loss', mle_loss)
    summary_merged = tf.summary.merge_all()

    # For inference (beam-search)
    start_tokens = tf.fill([batch_size], bos_token_id)

    def _embedding_fn(x, y):
        # x: token ids, y: position ids; mirror the training-time embedding
        # (scaled word embedding + position embedding).
        x_w_embed = tgt_embedder(x)
        y_p_embed = pos_embedder(y)
        return x_w_embed * config_model.hidden_dim**0.5 + y_p_embed

    predictions = decoder(memory=encoder_output,
                          memory_sequence_length=encoder_input_length,
                          beam_width=beam_width,
                          length_penalty=config_model.length_penalty,
                          start_tokens=start_tokens,
                          end_token=eos_token_id,
                          embedding=_embedding_fn,
                          max_decoding_length=config_data.max_decoding_length,
                          mode=tf.estimator.ModeKeys.PREDICT)
    # Uses the best sample by beam search
    beam_search_ids = predictions['sample_id'][:, :, 0]

    saver = tf.train.Saver(max_to_keep=5)
    best_results = {'score': 0, 'epoch': -1}

    def _eval_epoch(sess, epoch, mode):
        if mode == 'eval':
            eval_data = dev_data
        elif mode == 'test':
            eval_data = test_data
        else:
            raise ValueError('`mode` should be either "eval" or "test".')

        references, hypotheses = [], []
        bsize = config_data.test_batch_size
        for i in range(0, len(eval_data), bsize):
            sources, targets = zip(*eval_data[i:i + bsize])
            x_block = data_utils.source_pad_concat_convert(sources)
            feed_dict = {
                encoder_input: x_block,
                tx.global_mode(): tf.estimator.ModeKeys.EVAL,
            }
            fetches = {
                'beam_search_ids': beam_search_ids,
            }
            fetches_ = sess.run(fetches, feed_dict=feed_dict)

            hypotheses.extend(h.tolist() for h in fetches_['beam_search_ids'])
            references.extend(r.tolist() for r in targets)
            hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
            references = utils.list_strip_eos(references, eos_token_id)

        if mode == 'eval':
            # Writes results to files to evaluate BLEU
            # For 'eval' mode, the BLEU is based on token ids (rather than
            # text tokens) and serves only as a surrogate metric to monitor
            # the training process
            fname = os.path.join(FLAGS.model_dir, 'tmp.eval')
            hypotheses = tx.utils.str_join(hypotheses)
            references = tx.utils.str_join(references)
            hyp_fn, ref_fn = tx.utils.write_paired_text(hypotheses,
                                                        references,
                                                        fname,
                                                        mode='s')
            eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
            eval_bleu = 100. * eval_bleu
            logger.info('epoch: %d, eval_bleu %.4f', epoch, eval_bleu)
            print('epoch: %d, eval_bleu %.4f' % (epoch, eval_bleu))

            if eval_bleu > best_results['score']:
                logger.info('epoch: %d, best bleu: %.4f', epoch, eval_bleu)
                best_results['score'] = eval_bleu
                best_results['epoch'] = epoch
                model_path = os.path.join(FLAGS.model_dir, 'best-model.ckpt')
                logger.info('saving model to %s', model_path)
                print('saving model to %s' % model_path)
                saver.save(sess, model_path)

        elif mode == 'test':
            # For 'test' mode, together with the cmds in README.md, BLEU
            # is evaluated based on text tokens, which is the standard metric.
            fname = os.path.join(FLAGS.model_dir, 'test.output')
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append([id2w[y] for y in hyp])
                rwords.append([id2w[y] for y in ref])
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_fn, ref_fn = tx.utils.write_paired_text(hwords,
                                                        rwords,
                                                        fname,
                                                        mode='s',
                                                        src_fname_suffix='hyp',
                                                        tgt_fname_suffix='ref')
            logger.info('Test output written to file: %s', hyp_fn)
            print('Test output written to file: %s' % hyp_fn)

    def _train_epoch(sess, epoch, step, smry_writer):
        random.shuffle(train_data)
        train_iter = data.iterator.pool(
            train_data,
            config_data.batch_size,
            key=lambda x: (len(x[0]), len(x[1])),
            batch_size_fn=utils.batch_size_fn,
            random_shuffler=data.iterator.RandomShuffler())

        for _, train_batch in enumerate(train_iter):
            in_arrays = data_utils.seq2seq_pad_concat_convert(train_batch)
            feed_dict = {
                encoder_input: in_arrays[0],
                decoder_input: in_arrays[1],
                labels: in_arrays[2],
                learning_rate: utils.get_lr(step, config_model.lr)
            }
            fetches = {
                'step': global_step,
                'train_op': train_op,
                'smry': summary_merged,
                'loss': mle_loss,
            }

            fetches_ = sess.run(fetches, feed_dict=feed_dict)

            step, loss = fetches_['step'], fetches_['loss']
            if step and step % config_data.display_steps == 0:
                logger.info('step: %d, loss: %.4f', step, loss)
                print('step: %d, loss: %.4f' % (step, loss))
                smry_writer.add_summary(fetches_['smry'], global_step=step)

            if step and step % config_data.eval_steps == 0:
                _eval_epoch(sess, epoch, mode='eval')
        return step

    # Run the graph
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        smry_writer = tf.summary.FileWriter(FLAGS.model_dir, graph=sess.graph)

        if FLAGS.run_mode == 'train_and_evaluate':
            logger.info('Begin running with train_and_evaluate mode')

            if tf.train.latest_checkpoint(FLAGS.model_dir) is not None:
                logger.info('Restore latest checkpoint in %s' %
                            FLAGS.model_dir)
                saver.restore(sess,
                              tf.train.latest_checkpoint(FLAGS.model_dir))

            step = 0
            for epoch in range(config_data.max_train_epoch):
                step = _train_epoch(sess, epoch, step, smry_writer)

        elif FLAGS.run_mode == 'test':
            logger.info('Begin running with test mode')

            logger.info('Restore latest checkpoint in %s' % FLAGS.model_dir)
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.model_dir))

            _eval_epoch(sess, 0, mode='test')

        else:
            raise ValueError('Unknown mode: {}'.format(FLAGS.run_mode))
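
# A plausible sketch of `OT.prune` as used above, mirroring the inline
# relative-threshold pruning applied to `cosine_cost` (an assumption about
# the OT module, not its actual implementation):
def _example_prune(cost, beta=0.2):
    """Zero out costs within `beta` of the minimum, relative to the range."""
    minval = tf.reduce_min(cost)
    maxval = tf.reduce_max(cost)
    threshold = minval + beta * (maxval - minval)
    return tf.nn.relu(cost - threshold)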