def train(src_vocab,
          src_data_file,
          trg_vocab,
          trg_data_file,
          params,
          batch_size=1,
          max_step=300,
          train_steps=200000,
          lr_rate=0.0005,
          clip_gradient_norm=5.,
          check_every_step=500,
          model_dir='models/',
          burn_in_step=500,
          increment_step=1000,
          mode=MODE.TRAIN):
    # ------------------------------------
    # prepare data
    # ------------------------------------

    # load parallel data
    parallel_data_generator = \
        build_parallel_char_inputs(src_vocab,
                                   trg_vocab,
                                   src_data_file,
                                   trg_data_file,
                                   batch_size=batch_size,
                                   buffer_size=36,
                                   mode=MODE.TRAIN)

    # ------------------------------------
    # build model
    # ------------------------------------

    # placeholders
    source_ids = tf.placeholder(tf.int32, shape=(None, None),
                                name='source_ids')
    source_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                       name='source_seq_length')
    source_sample_matrix = tf.placeholder(tf.float32,
                                          shape=(None, None, None),
                                          name='source_sample_matrix')
    source_word_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                            name='source_word_seq_length')

    target_ids = tf.placeholder(tf.int32, shape=(None, None),
                                name='target_ids')
    target_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                       name='target_seq_length')

    source_placeholders = {
        'src': source_ids,
        'src_len': source_seq_length,
        'src_sample_matrix': source_sample_matrix,
        'src_word_len': source_word_seq_length
    }
    target_placeholders = {'trg': target_ids, 'trg_len': target_seq_length}

    # Create a variable to hold the global step.
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')

    # attention model for training
    _, total_loss_avg, entropy_loss_avg, reward_loss_rmse, reward_predicted = \
        build_attention_model(params, src_vocab, trg_vocab,
                              source_placeholders,
                              target_placeholders,
                              mode=mode,
                              burn_in_step=burn_in_step,
                              increment_step=increment_step,
                              max_step=max_step)

    # attention model for evaluation, sharing the training variables
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        decoder_output_eval, _ = \
            build_attention_model(params, src_vocab, trg_vocab,
                                  source_placeholders,
                                  target_placeholders,
                                  mode=MODE.EVAL,
                                  max_step=max_step)

    # optimizer
    # NOTE: lr_rate is currently unused here; the SGD learning rate is
    # hardcoded to 0.001.
    optimizer = tf.train.GradientDescentOptimizer(0.001)
    gradients, variables = zip(*optimizer.compute_gradients(total_loss_avg))
    # clip by global norm, reusing the pre-clip norm for the summary below
    gradients_norm = tf.global_norm(gradients)
    gradients, _ = tf.clip_by_global_norm(gradients, clip_gradient_norm,
                                          use_norm=gradients_norm)
    train_op = optimizer.apply_gradients(zip(gradients, variables),
                                         global_step=global_step_tensor)

    # record loss curves
    tf.summary.scalar('total_loss', total_loss_avg)
    tf.summary.scalar('entropy_loss_avg', entropy_loss_avg)
    tf.summary.scalar('reward_predicted', reward_predicted)
    tf.summary.scalar('reward_loss_rmse', reward_loss_rmse)
    tf.summary.scalar('gradients_norm', gradients_norm)

    # Create a saver object which will save all the trainable variables
    # plus the global step.
    saver_var_list = tf.trainable_variables()
    saver_var_list.append(global_step_tensor)
    saver = tf.train.Saver(var_list=saver_var_list, max_to_keep=3)

    # GPU config (pre-allocate GPU memory up front)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = False

    # ------------------------------------
    # training
    # ------------------------------------
    with tf.Session(config=config) as sess:
        # init
        init = tf.global_variables_initializer()
        sess.run(init)

        model_path = os.path.join(model_dir, 'model.ckpt')
        last_ckpt = tf.train.latest_checkpoint(model_dir)

        # Merge all the summaries and write them out
        summary_merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(model_dir + '/train', sess.graph)

        if last_ckpt:
            optimistic_restore(sess, last_ckpt)

        tf.logging.info('Train model ...')

        # start training
        start_time = time.time()
        for step in range(1, train_steps):
            current_input = next(parallel_data_generator)
            current_input_dict = current_input._asdict()
            feed_dict = {}
            for key in source_placeholders.keys():
                feed_dict[source_placeholders[key]] = current_input_dict[key]
            for key in target_placeholders.keys():
                feed_dict[target_placeholders[key]] = current_input_dict[key]

            _, total_loss_avg_np, summary, gradients_norm_np, gradients_np, \
                reward_predicted_np, global_step = \
                sess.run([train_op, total_loss_avg, summary_merged,
                          gradients_norm, gradients, reward_predicted,
                          global_step_tensor],
                         feed_dict=feed_dict)
            train_writer.add_summary(summary, global_step)

            # stop early if the gradients blow up
            if numpy.isnan(gradients_norm_np):
                print(gradients_norm_np, gradients_np)
                break

            if step % check_every_step == 0:
                tf.logging.info('start_time: {}, {} steps / sec'.format(
                    datetime.fromtimestamp(start_time).strftime(
                        '%Y-%m-%d %H:%M:%S'),
                    check_every_step / (time.time() - start_time)))
                tf.logging.info(
                    'global_step: {}, step: {}, total_loss: {}'.format(
                        global_step, step, total_loss_avg_np))
                start_time = time.time()
                saver.save(sess, model_path, global_step=global_step)

                # decode the current batch with the shared eval model
                predicted_ids_np = \
                    sess.run(decoder_output_eval.predicted_ids,
                             feed_dict=feed_dict)

                # print eval results
                for i in range(10):
                    pids = predicted_ids_np[:, i].tolist()
                    if TIME_MAJOR:
                        sids = current_input_dict['src'][:, i].tolist()
                        tids = current_input_dict['trg'][:, i].tolist()
                    else:
                        sids = current_input_dict['src'][i, :].tolist()
                        tids = current_input_dict['trg'][i, :].tolist()
                    print('src:', src_vocab.id_to_token(sids))
                    print('prd:', trg_vocab.id_to_token(pids))
                    print('trg:', trg_vocab.id_to_token(tids))
                    print('---------------------------------')
                print('---------------------------------')
                print('---------------------------------')
                print('\n')
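
# A minimal, self-contained sketch of the gradient-clipping pattern used in
# train() above: clip by global norm and reuse the pre-clip norm for both the
# clipping op and the logged summary. The gradient values are made-up toy
# numbers; this helper is illustrative only and is not called anywhere in
# this module. Assumes TF 1.x, like the rest of this file.
def _demo_clip_by_global_norm():
    import tensorflow as tf

    # two fake gradients; global norm = sqrt(3^2 + 4^2 + 12^2) = 13
    grads = [tf.constant([3.0, 4.0]), tf.constant([12.0])]
    norm = tf.global_norm(grads)
    # every tensor is rescaled by 5 / 13 because the norm exceeds the clip
    clipped, _ = tf.clip_by_global_norm(grads, 5.0, use_norm=norm)
    with tf.Session() as sess:
        print(sess.run(norm))     # 13.0
        print(sess.run(clipped))  # [[1.15, 1.54], [4.62]] (approximately)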
def infer(src_vocab,
          src_data_file,
          trg_vocab,
          params,
          beam_size=1,
          batch_size=1,
          max_step=100,
          output_file='test.out',
          model_dir='models/'):
    save_output_dir = 'dev_outputs/'
    if not os.path.exists(save_output_dir):
        os.makedirs(save_output_dir)

    # ------------------------------------
    # prepare data (no target side is needed for inference)
    # ------------------------------------

    # load source data
    parallel_data_generator = \
        build_source_char_inputs(src_vocab,
                                 src_data_file,
                                 batch_size=batch_size,
                                 buffer_size=96,
                                 mode=MODE.INFER)

    # ------------------------------------
    # build model
    # ------------------------------------

    # placeholders
    source_ids = tf.placeholder(tf.int32, shape=(None, None),
                                name='source_ids')
    source_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                       name='source_seq_length')
    source_sample_matrix = tf.placeholder(tf.float32,
                                          shape=(None, None, None),
                                          name='source_sample_matrix')
    source_word_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                            name='source_word_seq_length')

    target_ids = None
    target_seq_length = None

    source_placeholders = {
        'src': source_ids,
        'src_len': source_seq_length,
        'src_sample_matrix': source_sample_matrix,
        'src_word_len': source_word_seq_length
    }
    target_placeholders = {'trg': target_ids, 'trg_len': target_seq_length}

    decoder_output_eval, decoder_final_state = \
        build_attention_model(params, src_vocab, trg_vocab,
                              source_placeholders,
                              target_placeholders,
                              beam_size=beam_size,
                              mode=MODE.INFER,
                              max_step=max_step)

    # GPU config
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        last_ckpt = tf.train.latest_checkpoint(model_dir)
        if last_ckpt:
            optimistic_restore(sess, last_ckpt)
        else:
            raise Exception('No checkpoint found ...')

        # tag the output file with the checkpoint's global step
        output_file_name = os.path.join(save_output_dir,
                                        output_file + last_ckpt.split('-')[-1])
        output_ = open(output_file_name, 'w')

        for step, current_input in enumerate(parallel_data_generator):
            current_input_dict = current_input._asdict()
            feed_dict = {}
            for key in source_placeholders.keys():
                feed_dict[source_placeholders[key]] = current_input_dict[key]

            # beam_ids_np: [seq_len, batch_size * beam_size]
            # predicted_ids_np: [seq_len, batch_size * beam_size]
            predicted_ids_np, beam_ids_np, log_probs_np = sess.run(
                [
                    decoder_output_eval.predicted_ids,
                    decoder_output_eval.beam_ids,
                    decoder_final_state.log_probs
                ],
                feed_dict=feed_dict)

            src_len_np = current_input_dict['src_len']
            data_batch_size = len(src_len_np)

            # follow the beam parent pointers step by step to recover the
            # full hypothesis for every beam
            gathered_pred_ids = numpy.zeros_like(beam_ids_np)
            for idx in range(beam_ids_np.shape[0]):
                gathered_pred_ids = \
                    gathered_pred_ids[:,
                                      beam_ids_np[idx] % beam_ids_np.shape[1]]
                gathered_pred_ids[idx, :] = predicted_ids_np[idx]

            # effective length of each hypothesis, up to and including EOS
            seq_lens = []
            for idx in range(beam_ids_np.shape[1]):
                pred_ids_list = gathered_pred_ids[:, idx].tolist()
                seq_lens.append(pred_ids_list.index(trg_vocab.eos_id) + 1
                                if trg_vocab.eos_id in pred_ids_list
                                else len(pred_ids_list))

            # length-normalize the log probabilities, then pick the best
            # beam for each sentence in the batch
            log_probs_np = log_probs_np / numpy.array(seq_lens)
            log_probs_np_list = numpy.split(log_probs_np, data_batch_size,
                                            axis=0)
            each_max_idx = [
                numpy.argmax(log_prob) + b * beam_size
                for b, log_prob in enumerate(log_probs_np_list)
            ]
            pids = gathered_pred_ids[:, each_max_idx]

            for b in range(data_batch_size):
                p = trg_vocab.id_to_token(pids[:, b].tolist())
                if TIME_MAJOR:
                    s = src_vocab.id_to_token(
                        current_input_dict['src'][:, b].tolist())
                else:
                    s = src_vocab.id_to_token(
                        current_input_dict['src'][b, :].tolist())
                print('src:', s)
                print('prd:', p)
                print('---------------------------')
                print('\n')
                output_.write(p + '\n')
                output_.flush()
        output_.close()
        copyfile(output_file_name, output_file)
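
# The backtracking loop in infer() above is the subtle part of beam search
# decoding: beam_ids records, at every step, which beam each live hypothesis
# was extended from, so the token matrix must be reordered step by step before
# its columns read as complete hypotheses. Below is a numpy-only sketch of the
# same gather on toy data, assuming beam_ids[t] holds plain parent indices
# (the `%` in the real code strips a per-step offset that this toy input does
# not have). Illustrative only; not called anywhere in this module.
def _demo_beam_backtracking():
    import numpy as np

    predicted_ids = np.array([[7, 8],   # step 0: token emitted per beam
                              [5, 6],   # step 1
                              [1, 2]])  # step 2
    beam_ids = np.array([[0, 1],        # step 0: parents (identity)
                         [1, 1],        # step 1: both beams extend old beam 1
                         [0, 1]])       # step 2: parents (identity)

    gathered = np.zeros_like(beam_ids)
    for t in range(beam_ids.shape[0]):
        # reorder the prefixes written so far to follow each parent pointer,
        # then append the token emitted at this step
        gathered = gathered[:, beam_ids[t] % beam_ids.shape[1]]
        gathered[t, :] = predicted_ids[t]

    print(gathered.T)  # beam 0: [8, 5, 1], beam 1: [8, 6, 2]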
def infer(src_vocab_file,
          src_embedding_dim,
          src_data_file,
          trg_vocab_file,
          trg_embedding_dim,
          trg_data_file,
          params,
          output_file,
          beam_size=1,
          batch_size=1,
          model_dir='models/'):
    # ------------------------------------
    # prepare data
    # trg_data_file may be empty.
    # ------------------------------------

    # load vocab
    src_vocab = build_vocab(src_vocab_file, src_embedding_dim, ' ')
    trg_vocab = build_vocab(trg_vocab_file, trg_embedding_dim, '')

    # load parallel data
    parallel_data_generator = \
        build_parallel_inputs(src_vocab, trg_vocab,
                              src_data_file, trg_data_file,
                              batch_size=batch_size,
                              buffer_size=96,
                              mode=MODE.INFER)

    # ------------------------------------
    # build model
    # ------------------------------------

    # placeholders
    source_ids = tf.placeholder(tf.int32, shape=(None, None),
                                name='source_ids')
    source_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                       name='source_seq_length')
    target_ids = tf.placeholder(tf.int32, shape=(None, None),
                                name='target_ids')
    target_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                       name='target_seq_length')

    decoder_output_eval, decoder_final_state = \
        build_attention_model(params, src_vocab, trg_vocab,
                              source_ids, source_seq_length,
                              target_ids, target_seq_length,
                              beam_size=beam_size,
                              mode=MODE.INFER,
                              max_step=100)

    # GPU config
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        last_ckpt = tf.train.latest_checkpoint(model_dir)
        if last_ckpt:
            optimistic_restore(sess, last_ckpt)
        else:
            raise Exception('No checkpoint found ...')

        output_ = open(output_file, 'w')
        for step, curr_data in enumerate(parallel_data_generator):
            src_np, src_len_np, trg_np, trg_len_np = curr_data

            # beam_ids_np: [seq_len, batch_size * beam_size]
            # predicted_ids_np: [seq_len, batch_size * beam_size]
            predicted_ids_np, beam_ids_np, log_probs_np = sess.run(
                [
                    decoder_output_eval.predicted_ids,
                    decoder_output_eval.beam_ids,
                    decoder_final_state.log_probs
                ],
                feed_dict={
                    source_ids: src_np,
                    source_seq_length: src_len_np,
                    target_ids: trg_np,
                    target_seq_length: trg_len_np
                })

            data_batch_size = len(src_len_np)

            # follow the beam parent pointers to recover full hypotheses,
            # as in the character-level infer() above
            gathered_pred_ids = numpy.zeros_like(beam_ids_np)
            for idx in range(beam_ids_np.shape[0]):
                gathered_pred_ids = \
                    gathered_pred_ids[:,
                                      beam_ids_np[idx] % beam_ids_np.shape[1]]
                gathered_pred_ids[idx, :] = predicted_ids_np[idx]

            # effective length of each hypothesis, up to and including EOS
            seq_lens = []
            for idx in range(beam_ids_np.shape[1]):
                pred_ids_list = gathered_pred_ids[:, idx].tolist()
                seq_lens.append(
                    pred_ids_list.index(trg_vocab.eos_id) + 1
                    if trg_vocab.eos_id in pred_ids_list
                    else len(pred_ids_list))

            # length-normalize and pick the best beam per sentence
            log_probs_np = log_probs_np / numpy.array(seq_lens)
            log_probs_np_list = numpy.split(log_probs_np, data_batch_size,
                                            axis=0)
            each_max_idx = [
                numpy.argmax(log_prob) + b * beam_size
                for b, log_prob in enumerate(log_probs_np_list)
            ]
            pids = gathered_pred_ids[:, each_max_idx]

            for b in range(data_batch_size):
                p = trg_vocab.id_to_token(pids[:, b].tolist())
                tf.logging.info(p)
                output_.write(p + '\n')
                output_.flush()
        output_.close()
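
# Both infer() variants above score finished hypotheses by length-normalized
# log probability, then take an argmax inside each sentence's block of beams.
# Below is a numpy sketch of just that selection, assuming log_probs is laid
# out as [batch * beam] with each sentence's beams contiguous; all numbers are
# invented. Illustrative only; not called anywhere in this module.
def _demo_length_normalized_selection():
    import numpy as np

    beam_size = 2
    log_probs = np.array([-4.0, -3.0, -6.0, -9.0])  # 2 sentences x 2 beams
    seq_lens = np.array([4, 2, 3, 3])               # hypothesis lengths

    scores = log_probs / seq_lens                   # per-token average
    per_sentence = np.split(scores, len(scores) // beam_size)
    # argmax within each block, offset back to the flat [batch * beam] axis
    best = [np.argmax(s) + b * beam_size for b, s in enumerate(per_sentence)]
    print(best)  # [0, 2]: beam 0 wins for both sentences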
def train(src_vocab_file,
          src_embedding_dim,
          src_data_file,
          trg_vocab_file,
          trg_embedding_dim,
          trg_data_file,
          params,
          train_step=200000,
          lr_rate=0.0005,
          batch_size=1,
          check_every_step=500,
          model_dir='models/'):
    # ------------------------------------
    # prepare data
    # ------------------------------------

    # load vocab
    src_vocab = build_vocab(src_vocab_file, src_embedding_dim, ' ')
    trg_vocab = build_vocab(trg_vocab_file, trg_embedding_dim, '')

    # load parallel data
    parallel_data_generator = \
        build_parallel_inputs(src_vocab, trg_vocab,
                              src_data_file, trg_data_file,
                              batch_size=batch_size,
                              buffer_size=96,
                              mode=MODE.TRAIN)

    # ------------------------------------
    # build model
    # ------------------------------------

    # placeholders
    source_ids = tf.placeholder(tf.int32, shape=(None, None),
                                name='source_ids')
    source_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                       name='source_seq_length')
    target_ids = tf.placeholder(tf.int32, shape=(None, None),
                                name='target_ids')
    target_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                       name='target_seq_length')

    # attention model for training
    decoder_output, losses = build_attention_model(params, src_vocab,
                                                   trg_vocab,
                                                   source_ids,
                                                   source_seq_length,
                                                   target_ids,
                                                   target_seq_length,
                                                   mode=MODE.TRAIN)

    # attention model for evaluation, sharing the training variables
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        decoder_output_eval, _ = build_attention_model(params, src_vocab,
                                                       trg_vocab,
                                                       source_ids,
                                                       source_seq_length,
                                                       target_ids,
                                                       target_seq_length,
                                                       mode=MODE.EVAL)

    # Calculate the average log perplexity per target token in each batch
    loss_avg = tf.reduce_sum(losses) / tf.to_float(
        tf.reduce_sum(target_seq_length))

    # record loss curve
    tf.summary.scalar('loss', loss_avg)

    # optimizer
    optimizer = tf.train.AdamOptimizer(lr_rate)
    gradients, variables = zip(*optimizer.compute_gradients(loss_avg))
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    train_op = optimizer.apply_gradients(zip(gradients, variables))

    # Create a saver object which will save all the variables
    saver = tf.train.Saver(max_to_keep=3)

    # GPU config
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # ------------------------------------
    # training
    # ------------------------------------
    with tf.Session(config=config) as sess:
        # init
        init = tf.global_variables_initializer()
        # global_step is tracked in Python here rather than as a variable
        global_step = 0
        model_path = os.path.join(model_dir, 'model.ckpt')
        last_ckpt = tf.train.latest_checkpoint(model_dir)

        # Merge all the summaries and write them out
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(model_dir + '/train', sess.graph)

        if last_ckpt:
            # resume from the step encoded in the checkpoint name
            global_step = int(last_ckpt.split('-')[-1])
            saver.restore(sess, last_ckpt)
        else:
            sess.run(init)

        # start training
        start_time = time.time()
        for step in range(train_step):
            src_np, src_len_np, trg_np, trg_len_np = next(
                parallel_data_generator)
            _, loss_avg_np, summary = \
                sess.run([train_op, loss_avg, merged],
                         feed_dict={source_ids: src_np,
                                    source_seq_length: src_len_np,
                                    target_ids: trg_np,
                                    target_seq_length: trg_len_np})
            train_writer.add_summary(summary, global_step)

            if step % check_every_step == 0:
                tf.logging.info('start_time: {}, {} steps / sec'.format(
                    start_time,
                    check_every_step / (time.time() - start_time)))
                tf.logging.info('step: {}, loss: {}'.format(step, loss_avg_np))
                start_time = time.time()
                saver.save(sess, model_path, global_step=global_step)

                # decode the current batch with the shared eval model
                predicted_ids_np = \
                    sess.run(decoder_output_eval.predicted_ids,
                             feed_dict={source_ids: src_np,
                                        source_seq_length: src_len_np,
                                        target_ids: trg_np,
                                        target_seq_length: trg_len_np})

                # print eval results
                for i in range(10):
                    pids = predicted_ids_np[:, i].tolist()
                    if TIME_MAJOR:
                        tids = trg_np[:, i].tolist()
                    else:
                        tids = trg_np[i, :].tolist()
                    print(trg_vocab.id_to_token(pids))
                    print(trg_vocab.id_to_token(tids))
                    print('----------------------------------')

            global_step += 1
# NOTE: this variant builds no optimizer or train_op; it only runs forward
# passes for debugging, so lr_rate, clip_gradient_norm, check_every_step and
# model_dir are unused and global_step_tensor never advances.
def train(src_vocab,
          src_data_file,
          trg_vocab,
          trg_data_file,
          params,
          batch_size=1,
          max_step=300,
          train_steps=200000,
          lr_rate=0.0005,
          clip_gradient_norm=5.,
          check_every_step=100,
          model_dir='models/',
          burn_in_step=500,
          increment_step=1000,
          mode=MODE.TRAIN):
    # ------------------------------------
    # prepare data
    # ------------------------------------

    # load parallel data
    parallel_data_generator = \
        build_parallel_char_inputs(src_vocab,
                                   trg_vocab,
                                   src_data_file,
                                   trg_data_file,
                                   batch_size=batch_size,
                                   buffer_size=36,
                                   mode=MODE.TRAIN)

    # ------------------------------------
    # build model
    # ------------------------------------

    # placeholders
    source_ids = tf.placeholder(tf.int32, shape=(None, None),
                                name='source_ids')
    source_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                       name='source_seq_length')
    source_sample_matrix = tf.placeholder(tf.float32,
                                          shape=(None, None, None),
                                          name='source_sample_matrix')

    target_ids = tf.placeholder(tf.int32, shape=(None, None),
                                name='target_ids')
    target_seq_length = tf.placeholder(tf.int32, shape=(None,),
                                       name='target_seq_length')

    source_placeholders = {
        'src': source_ids,
        'src_len': source_seq_length,
        'src_sample_matrix': source_sample_matrix
    }
    target_placeholders = {'trg': target_ids, 'trg_len': target_seq_length}

    # Create a variable to hold the global step.
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')

    # attention model for training (a single tensor in this debug mode)
    tmp = \
        build_attention_model(params, src_vocab, trg_vocab,
                              source_placeholders,
                              target_placeholders,
                              mode=mode,
                              burn_in_step=burn_in_step,
                              increment_step=increment_step,
                              max_step=max_step)

    # GPU config
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # ------------------------------------
    # training
    # ------------------------------------
    with tf.Session(config=config) as sess:
        # init
        init = tf.global_variables_initializer()
        sess.run(init)

        tf.logging.info('Train model ...')

        # start training
        start_time = time.time()
        for step in range(1, train_steps):
            current_input = next(parallel_data_generator)
            current_input_dict = current_input._asdict()
            feed_dict = {}
            for key in source_placeholders.keys():
                feed_dict[source_placeholders[key]] = current_input_dict[key]
            for key in target_placeholders.keys():
                feed_dict[target_placeholders[key]] = current_input_dict[key]

            tmp_np, global_step = \
                sess.run([tmp, global_step_tensor], feed_dict=feed_dict)
            print(global_step, tmp_np.shape)