Example #1
# Standard-library and third-party imports used below; project-specific
# helpers (MODE, TIME_MAJOR, build_parallel_char_inputs,
# build_attention_model, optimistic_restore) come from the surrounding
# project.
import os
import time
from datetime import datetime

import numpy
import tensorflow as tf


def train(src_vocab,
          src_data_file,
          trg_vocab,
          trg_data_file,
          params,
          batch_size=1,
          max_step=300,
          train_steps=200000,
          lr_rate=0.0005,
          clip_gradient_norm=5.,
          check_every_step=500,
          model_dir='models/',
          burn_in_step=500,
          increment_step=1000,
          mode=MODE.TRAIN):
    # ------------------------------------
    # prepare data
    # ------------------------------------

    # load parallel data
    parallel_data_generator = \
        build_parallel_char_inputs(src_vocab, trg_vocab,
                                   src_data_file, trg_data_file,
                                   batch_size=batch_size, buffer_size=36,
                                   mode=MODE.TRAIN)

    # ------------------------------------
    # build model
    # ------------------------------------

    # placeholder
    source_ids = tf.placeholder(tf.int32,
                                shape=(None, None),
                                name='source_ids')
    source_seq_length = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='source_seq_length')
    source_sample_matrix = tf.placeholder(tf.float32,
                                          shape=(None, None, None),
                                          name='source_sample_matrix')
    source_word_seq_length = tf.placeholder(tf.int32,
                                            shape=(None, ),
                                            name='source_word_seq_length')

    target_ids = tf.placeholder(tf.int32,
                                shape=(None, None),
                                name='target_ids')
    target_seq_length = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='target_seq_length')

    source_placeholders = {
        'src': source_ids,
        'src_len': source_seq_length,
        'src_sample_matrix': source_sample_matrix,
        'src_word_len': source_word_seq_length
    }
    target_placeholders = {'trg': target_ids, 'trg_len': target_seq_length}

    # Creates a variable to hold the global_step.
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')

    # attention model for training
    _, total_loss_avg, entropy_loss_avg, reward_loss_rmse, reward_predicted = \
        build_attention_model(params, src_vocab, trg_vocab,
                              source_placeholders,
                              target_placeholders,
                              mode=mode,
                              burn_in_step=burn_in_step,
                              increment_step=increment_step,
                              max_step=max_step)

    # attention model for evaluating
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        decoder_output_eval, _ = \
            build_attention_model(params, src_vocab, trg_vocab,
                                  source_placeholders,
                                  target_placeholders,
                                  mode=MODE.EVAL,
                                  max_step=max_step)

    # optimizer
    optimizer = tf.train.GradientDescentOptimizer(lr_rate)

    gradients, variables = zip(*optimizer.compute_gradients(total_loss_avg))
    gradients_norm = tf.global_norm(gradients)

    # clip with the precomputed global norm so that clipping and the
    # 'gradients_norm' summary below see the same value
    gradients, _ = tf.clip_by_global_norm(gradients,
                                          clip_gradient_norm,
                                          use_norm=gradients_norm)
    train_op = optimizer.apply_gradients(zip(gradients, variables),
                                         global_step=global_step_tensor)

    # record loss curve
    tf.summary.scalar('total_loss', total_loss_avg)
    tf.summary.scalar('entropy_loss_avg', entropy_loss_avg)
    tf.summary.scalar('reward_predicted', reward_predicted)
    tf.summary.scalar('reward_loss_rmse', reward_loss_rmse)
    tf.summary.scalar('gradients_norm', gradients_norm)

    # Create a saver object which will save all the variables
    saver_var_list = tf.trainable_variables()
    saver_var_list.append(global_step_tensor)
    saver = tf.train.Saver(var_list=saver_var_list, max_to_keep=3)

    # GPU config
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = False

    # ------------------------------------
    # training
    # ------------------------------------

    with tf.Session(config=config) as sess:
        # init
        init = tf.global_variables_initializer()
        sess.run(init)

        model_path = os.path.join(model_dir, 'model.ckpt')
        last_ckpt = tf.train.latest_checkpoint(model_dir)

        # Merge all the summaries and write them out
        summary_merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(model_dir + '/train', sess.graph)
        if last_ckpt:
            optimistic_restore(sess, last_ckpt)

        tf.logging.info('Train model ...')

        # start training
        start_time = time.time()
        for step in range(1, train_steps):
            current_input = next(parallel_data_generator)
            current_input_dict = current_input._asdict()
            feed_dict = {}
            for key in source_placeholders.keys():
                feed_dict[source_placeholders[key]] = current_input_dict[key]
            for key in target_placeholders.keys():
                feed_dict[target_placeholders[key]] = current_input_dict[key]

            _, total_loss_avg_np, summary, gradients_norm_np, gradients_np, reward_predicted_np, global_step = \
                sess.run([train_op, total_loss_avg, summary_merged, gradients_norm, gradients,
                          reward_predicted, global_step_tensor],
                         feed_dict=feed_dict)
            train_writer.add_summary(summary, global_step)

            if numpy.isnan(gradients_norm_np):
                print(gradients_norm_np, gradients_np)
                break

            if step % check_every_step == 0:
                tf.logging.info('start_time: {}, {} steps / sec'.format(
                    datetime.fromtimestamp(start_time).strftime('%Y-%m-%d '
                                                                '%H:%M:%S'),
                    check_every_step / (time.time() - start_time)))
                tf.logging.info(
                    'global_step: {}, step: {}, total_loss: {}'.format(
                        global_step, step, total_loss_avg_np))
                start_time = time.time()

                saver.save(sess, model_path, global_step=global_step)
                predicted_ids_np = \
                    sess.run(decoder_output_eval.predicted_ids,
                             feed_dict=feed_dict)

                # print eval results
                for i in range(min(10, batch_size)):
                    pids = predicted_ids_np[:, i].tolist()
                    if TIME_MAJOR:
                        sids = current_input_dict['src'][:, i].tolist()
                        tids = current_input_dict['trg'][:, i].tolist()
                    else:
                        sids = current_input_dict['src'][i, :].tolist()
                        tids = current_input_dict['trg'][i, :].tolist()
                    print('src:', src_vocab.id_to_token(sids))
                    print('prd:', trg_vocab.id_to_token(pids))
                    print('trg:', trg_vocab.id_to_token(tids))
                    print('---------------------------------')
                print('---------------------------------')
                print('---------------------------------')
                print('\n')
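
A minimal invocation sketch for this entry point. The paths, embedding
dimension, and batch size below are hypothetical placeholders, and
`build_vocab` / `params` come from the surrounding project (see Examples #3
and #4 for how the vocabularies are built):

# Hypothetical usage sketch -- adjust paths, dims, and params to your setup.
src_vocab = build_vocab('data/src.vocab.txt', 128, ' ')
trg_vocab = build_vocab('data/trg.vocab.txt', 128, '')
train(src_vocab, 'data/train.src', trg_vocab, 'data/train.trg',
      params, batch_size=32, model_dir='models/')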
Example #2
# Imports used below; project-specific helpers (MODE, TIME_MAJOR,
# build_source_char_inputs, build_attention_model, optimistic_restore)
# come from the surrounding project.
import os
from shutil import copyfile

import numpy
import tensorflow as tf


def infer(src_vocab,
          src_data_file,
          trg_vocab,
          params,
          beam_size=1,
          batch_size=1,
          max_step=100,
          output_file='test.out',
          model_dir='models/'):

    save_output_dir = 'dev_outputs/'
    if not os.path.exists(save_output_dir):
        os.makedirs(save_output_dir)

    # ------------------------------------
    # prepare data (source side only; no
    # target file is needed for inference)
    # ------------------------------------

    # load source data
    parallel_data_generator = \
        build_source_char_inputs(src_vocab, src_data_file,
                                 batch_size=batch_size, buffer_size=96,
                                 mode=MODE.INFER)

    # ------------------------------------
    # build model
    # ------------------------------------

    # placeholder
    source_ids = tf.placeholder(tf.int32,
                                shape=(None, None),
                                name='source_ids')
    source_seq_length = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='source_seq_length')
    source_sample_matrix = tf.placeholder(tf.float32,
                                          shape=(None, None, None),
                                          name='source_sample_matrix')
    source_word_seq_length = tf.placeholder(tf.int32,
                                            shape=(None, ),
                                            name='source_word_seq_length')

    target_ids = None
    target_seq_length = None

    source_placeholders = {
        'src': source_ids,
        'src_len': source_seq_length,
        'src_sample_matrix': source_sample_matrix,
        'src_word_len': source_word_seq_length
    }
    target_placeholders = {'trg': target_ids, 'trg_len': target_seq_length}


    decoder_output_eval, decoder_final_state = \
        build_attention_model(params, src_vocab, trg_vocab,
                              source_placeholders,
                              target_placeholders,
                              beam_size=beam_size, mode=MODE.INFER,
                              max_step=max_step)

    # GPU config
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        last_ckpt = tf.train.latest_checkpoint(model_dir)
        if last_ckpt:
            optimistic_restore(sess, last_ckpt)
        else:
            raise Exception('No checkpoint found ...')

        output_file_name = os.path.join(save_output_dir,
                                        output_file + last_ckpt.split('-')[-1])
        output_ = open(output_file_name, 'w')

        for step, current_input in enumerate(parallel_data_generator):
            current_input_dict = current_input._asdict()
            feed_dict = {}
            for key in source_placeholders.keys():
                feed_dict[source_placeholders[key]] = current_input_dict[key]

            # beam_ids_np: [seq_len, beam_size]
            # predicted_ids_np: [seq_len, beam_size]
            predicted_ids_np, beam_ids_np, log_probs_np = sess.run(
                [
                    decoder_output_eval.predicted_ids,
                    decoder_output_eval.beam_ids, decoder_final_state.log_probs
                ],
                feed_dict=feed_dict)

            src_len_np = current_input_dict['src_len']
            data_batch_size = len(src_len_np)

            # Backtrack through the beam-search history: at each time step,
            # reorder the sequences gathered so far by their parent-beam
            # pointers (beam_ids_np), then append that step's predictions.
            gathered_pred_ids = numpy.zeros_like(beam_ids_np)
            for idx in range(beam_ids_np.shape[0]):
                gathered_pred_ids = gathered_pred_ids[:, beam_ids_np[idx] %
                                                      beam_ids_np.shape[1]]
                gathered_pred_ids[idx, :] = predicted_ids_np[idx]

            # Effective length of each hypothesis: up to and including the
            # first EOS token, or the full sequence if no EOS was produced.
            seq_lens = []
            for idx in range(beam_ids_np.shape[1]):
                pred_ids_list = gathered_pred_ids[:, idx].tolist()
                if trg_vocab.eos_id in pred_ids_list:
                    seq_lens.append(pred_ids_list.index(trg_vocab.eos_id) + 1)
                else:
                    seq_lens.append(len(pred_ids_list))

            # Length-normalize the per-beam log probabilities, then pick the
            # highest-scoring hypothesis within each batch entry's beam.
            log_probs_np = log_probs_np / numpy.array(seq_lens)
            log_probs_np_list = numpy.split(log_probs_np,
                                            data_batch_size,
                                            axis=0)
            each_max_idx = [
                numpy.argmax(log_prob) + b * beam_size
                for b, log_prob in enumerate(log_probs_np_list)
            ]

            pids = gathered_pred_ids[:, each_max_idx]

            for b in range(data_batch_size):
                p = trg_vocab.id_to_token(pids[:, b].tolist())
                if TIME_MAJOR:
                    s = src_vocab.id_to_token(
                        current_input_dict['src'][:, b].tolist())
                else:
                    s = src_vocab.id_to_token(
                        current_input_dict['src'][b, :].tolist())
                print('src:', s)
                print('prd:', p)
                print('---------------------------')
                print('\n')
                output_.write(p + '\n')
            output_.flush()
        output_.close()
        copyfile(output_file_name, output_file)
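
A hypothetical call for this beam-search decoder (placeholder paths and
settings; the vocab objects and `params` come from the surrounding project):

# Hypothetical usage sketch: decodes dev data, writes per-checkpoint output
# under dev_outputs/ and copies the final file to output_file.
infer(src_vocab, 'data/dev.src', trg_vocab, params,
      beam_size=5, batch_size=32, output_file='dev.out',
      model_dir='models/')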
Example #3
# Imports used below; project-specific helpers (MODE, build_vocab,
# build_parallel_inputs, build_attention_model, optimistic_restore)
# come from the surrounding project.
import numpy
import tensorflow as tf


def infer(src_vocab_file,
          src_embedding_dim,
          src_data_file,
          trg_vocab_file,
          trg_embedding_dim,
          trg_data_file,
          params,
          output_file,
          beam_size=1,
          batch_size=1,
          model_dir='models/'):
    # ------------------------------------
    # prepare data
    # trg_data_file may be empty.
    # ------------------------------------

    # load vocab
    src_vocab = build_vocab(src_vocab_file, src_embedding_dim, ' ')
    trg_vocab = build_vocab(trg_vocab_file, trg_embedding_dim, '')

    # load parallel data
    parallel_data_generator = \
        build_parallel_inputs(src_vocab, trg_vocab,
                              src_data_file, trg_data_file,
                              batch_size=batch_size, buffer_size=96,
                              mode=MODE.INFER)

    # ------------------------------------
    # build model
    # ------------------------------------

    # placeholder
    source_ids = tf.placeholder(tf.int32,
                                shape=(None, None),
                                name='source_ids')
    source_seq_length = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='source_seq_length')

    target_ids = tf.placeholder(tf.int32,
                                shape=(None, None),
                                name='target_ids')
    target_seq_length = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='target_seq_length')

    decoder_output_eval, decoder_final_state = \
        build_attention_model(params, src_vocab, trg_vocab, source_ids,
                              source_seq_length, target_ids, target_seq_length,
                              beam_size=beam_size, mode=MODE.INFER,
                              max_step=100)

    # GPU config
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        last_ckpt = tf.train.latest_checkpoint(model_dir)
        if last_ckpt:
            optimistic_restore(sess, last_ckpt)
        else:
            raise Exception('No checkpoint found ...')

        output_ = open(output_file, 'w')
        for step, curr_data in enumerate(parallel_data_generator):
            src_np, src_len_np, trg_np, trg_len_np = curr_data
            # beam_ids_np: [seq_len, beam_size]
            # predicted_ids_np: [seq_len, beam_size]
            predicted_ids_np, beam_ids_np, log_probs_np = sess.run(
                [
                    decoder_output_eval.predicted_ids,
                    decoder_output_eval.beam_ids, decoder_final_state.log_probs
                ],
                feed_dict={
                    source_ids: src_np,
                    source_seq_length: src_len_np,
                    target_ids: trg_np,
                    target_seq_length: trg_len_np
                })

            data_batch_size = len(src_len_np)

            # Backtrack through the beam-search history (same gather as in
            # Example #2): reorder by parent-beam pointers, then append each
            # step's predicted tokens.
            gathered_pred_ids = numpy.zeros_like(beam_ids_np)
            for idx in range(beam_ids_np.shape[0]):
                gathered_pred_ids = gathered_pred_ids[:, beam_ids_np[idx] %
                                                      beam_ids_np.shape[1]]
                gathered_pred_ids[idx, :] = predicted_ids_np[idx]

            # Effective length of each hypothesis: up to and including the
            # first EOS token, or the full sequence if no EOS was produced.
            seq_lens = []
            for idx in range(beam_ids_np.shape[1]):
                pred_ids_list = gathered_pred_ids[:, idx].tolist()
                if trg_vocab.eos_id in pred_ids_list:
                    seq_lens.append(pred_ids_list.index(trg_vocab.eos_id) + 1)
                else:
                    seq_lens.append(len(pred_ids_list))

            # Length-normalize, then pick the best hypothesis per batch entry.
            log_probs_np = log_probs_np / numpy.array(seq_lens)
            log_probs_np_list = numpy.split(log_probs_np,
                                            data_batch_size,
                                            axis=0)
            each_max_idx = [
                numpy.argmax(log_prob) + b * beam_size
                for b, log_prob in enumerate(log_probs_np_list)
            ]

            pids = gathered_pred_ids[:, each_max_idx]

            for b in range(data_batch_size):
                p = trg_vocab.id_to_token(pids[:, b].tolist())
                tf.logging.info(p)
                output_.write(p + '\n')
            output_.flush()
        output_.close()
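
A hypothetical call (placeholder paths and embedding dimensions; `params`
comes from the surrounding project; the vocabularies are built inside
infer() here):

# Hypothetical usage sketch.
infer('data/src.vocab.txt', 128, 'data/dev.src',
      'data/trg.vocab.txt', 128, 'data/dev.trg',
      params, 'dev.out', beam_size=5, batch_size=32)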
Example #4
# Imports used below; project-specific helpers (MODE, TIME_MAJOR,
# build_vocab, build_parallel_inputs, build_attention_model) come from
# the surrounding project.
import os
import time
from datetime import datetime

import tensorflow as tf


def train(src_vocab_file,
          src_embedding_dim,
          src_data_file,
          trg_vocab_file,
          trg_embedding_dim,
          trg_data_file,
          params,
          train_step=200000,
          lr_rate=0.0005,
          batch_size=1,
          check_every_step=500,
          model_dir='models/'):
    # ------------------------------------
    # prepare data
    # ------------------------------------

    # load vocab
    src_vocab = build_vocab(src_vocab_file, src_embedding_dim, ' ')
    trg_vocab = build_vocab(trg_vocab_file, trg_embedding_dim, '')

    # load parallel data
    parallel_data_generator = \
        build_parallel_inputs(src_vocab, trg_vocab,
                              src_data_file, trg_data_file,
                              batch_size=batch_size, buffer_size=96,
                              mode=MODE.TRAIN)

    # ------------------------------------
    # build model
    # ------------------------------------

    # placeholder
    source_ids = tf.placeholder(tf.int32,
                                shape=(None, None),
                                name='source_ids')
    source_seq_length = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='source_seq_length')

    target_ids = tf.placeholder(tf.int32,
                                shape=(None, None),
                                name='target_ids')
    target_seq_length = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='target_seq_length')

    # attention model for training
    decoder_output, losses = build_attention_model(params,
                                                   src_vocab,
                                                   trg_vocab,
                                                   source_ids,
                                                   source_seq_length,
                                                   target_ids,
                                                   target_seq_length,
                                                   mode=MODE.TRAIN)

    # attention model for evaluating
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        decoder_output_eval, _ = build_attention_model(params,
                                                       src_vocab,
                                                       trg_vocab,
                                                       source_ids,
                                                       source_seq_length,
                                                       target_ids,
                                                       target_seq_length,
                                                       mode=MODE.EVAL)

    # Calculate the average log perplexity in each batch
    loss_avg = tf.reduce_sum(losses) / tf.to_float(
        tf.reduce_sum(target_seq_length))

    # record loss curve
    tf.summary.scalar('loss', loss_avg)

    # optimizer
    optimizer = tf.train.AdamOptimizer(lr_rate)
    gradients, variables = zip(*optimizer.compute_gradients(loss_avg))
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    train_op = optimizer.apply_gradients(zip(gradients, variables))

    # Create a saver object which will save all the variables
    saver = tf.train.Saver(max_to_keep=3)

    # GPU config
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # ------------------------------------
    # training
    # ------------------------------------

    with tf.Session(config=config) as sess:
        # init
        init = tf.global_variables_initializer()
        global_step = 0
        model_path = os.path.join(model_dir, 'model.ckpt')
        last_ckpt = tf.train.latest_checkpoint(model_dir)

        # Merge all the summaries and write them out
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(model_dir + '/train', sess.graph)
        if last_ckpt:
            global_step = int(last_ckpt.split('-')[-1])
            saver.restore(sess, last_ckpt)
        else:
            sess.run(init)

        # start training
        start_time = time.time()
        for step in range(train_step):
            src_np, src_len_np, trg_np, trg_len_np = next(
                parallel_data_generator)

            _, loss_avg_np, summary = \
                sess.run([train_op, loss_avg, merged],
                         feed_dict={source_ids: src_np,
                                    source_seq_length: src_len_np,
                                    target_ids: trg_np,
                                    target_seq_length: trg_len_np})
            train_writer.add_summary(summary, global_step)

            if step % check_every_step == 0:
                tf.logging.info('start_time: {}, {} steps / sec'.format(
                    datetime.fromtimestamp(start_time).strftime(
                        '%Y-%m-%d %H:%M:%S'),
                    check_every_step / (time.time() - start_time)))
                tf.logging.info('step: {}, loss: {}'.format(step, loss_avg_np))
                start_time = time.time()

                saver.save(sess, model_path, global_step=global_step)
                predicted_ids_np = \
                    sess.run(decoder_output_eval.predicted_ids,
                             feed_dict={source_ids: src_np,
                                        source_seq_length: src_len_np,
                                        target_ids: trg_np,
                                        target_seq_length: trg_len_np})

                # print eval results
                for i in range(min(10, batch_size)):
                    pids = predicted_ids_np[:, i].tolist()
                    if TIME_MAJOR:
                        tids = trg_np[:, i].tolist()
                    else:
                        tids = trg_np[i, :].tolist()
                    print(trg_vocab.id_to_token(pids))
                    print(trg_vocab.id_to_token(tids))
                    print('----------------------------------')

            global_step += 1
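
A hypothetical call (placeholder paths and dimensions; `params` comes from
the surrounding project). Checkpoints and TensorBoard summaries are written
under model_dir:

# Hypothetical usage sketch.
train('data/src.vocab.txt', 128, 'data/train.src',
      'data/trg.vocab.txt', 128, 'data/train.trg',
      params, batch_size=32, model_dir='models/')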
Example #5
# Imports used below; project-specific helpers (MODE,
# build_parallel_char_inputs, build_attention_model) come from the
# surrounding project. Several parameters (lr_rate, clip_gradient_norm,
# check_every_step, model_dir) are unused because this variant builds no
# optimizer.
import time

import tensorflow as tf


def train(src_vocab,
          src_data_file,
          trg_vocab,
          trg_data_file,
          params,
          batch_size=1,
          max_step=300,
          train_steps=200000,
          lr_rate=0.0005,
          clip_gradient_norm=5.,
          check_every_step=100,
          model_dir='models/',
          burn_in_step=500,
          increment_step=1000,
          mode=MODE.TRAIN):
    # ------------------------------------
    # prepare data
    # ------------------------------------

    # load parallel data
    parallel_data_generator = \
        build_parallel_char_inputs(src_vocab, trg_vocab,
                                   src_data_file, trg_data_file,
                                   batch_size=batch_size, buffer_size=36,
                                   mode=MODE.TRAIN)

    # ------------------------------------
    # build model
    # ------------------------------------

    # placeholder
    source_ids = tf.placeholder(tf.int32,
                                shape=(None, None),
                                name='source_ids')
    source_seq_length = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='source_seq_length')
    source_sample_matrix = tf.placeholder(tf.float32,
                                          shape=(None, None, None),
                                          name='source_sample_matrix')

    target_ids = tf.placeholder(tf.int32,
                                shape=(None, None),
                                name='target_ids')
    target_seq_length = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='target_seq_length')

    source_placeholders = {
        'src': source_ids,
        'src_len': source_seq_length,
        'src_sample_matrix': source_sample_matrix
    }
    target_placeholders = {'trg': target_ids, 'trg_len': target_seq_length}

    # Creates a variable to hold the global_step.
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')

    # attention model for training; the result is only run and its shape
    # printed in the loop below
    train_output = \
        build_attention_model(params, src_vocab, trg_vocab,
                              source_placeholders,
                              target_placeholders,
                              mode=mode,
                              burn_in_step=burn_in_step,
                              increment_step=increment_step,
                              max_step=max_step)

    # GPU config
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # ------------------------------------
    # training
    # ------------------------------------

    with tf.Session(config=config) as sess:
        # init
        init = tf.global_variables_initializer()
        sess.run(init)

        tf.logging.info('Train model ...')

        # start training
        start_time = time.time()
        for step in range(1, train_steps):
            current_input = next(parallel_data_generator)
            current_input_dict = current_input._asdict()
            feed_dict = {}
            for key in source_placeholders.keys():
                feed_dict[source_placeholders[key]] = current_input_dict[key]
            for key in target_placeholders.keys():
                feed_dict[target_placeholders[key]] = current_input_dict[key]

            train_output_np, global_step = \
                sess.run([train_output, global_step_tensor],
                         feed_dict=feed_dict)
            print(global_step, train_output_np.shape)
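
Note: unlike Example #1, this variant builds no optimizer or train_op, so
global_step_tensor is never incremented; it simply runs the forward pass and
prints the shape of the returned tensor, serving as a smoke test of the
graph rather than a real training loop.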