def _cell_list(unit_type, num_units, num_layers, num_residual_layers, forget_bias, dropout, mode, dtype=None, single_cell_fn=None, residual_fn=None, use_block_lstm=False): """Create a list of RNN cells.""" if not single_cell_fn: single_cell_fn = _single_cell # Multi-GPU cell_list = [] for i in range(num_layers): utils.print_out(" cell %d" % i, new_line=False) single_cell = single_cell_fn( unit_type=unit_type, num_units=num_units, forget_bias=forget_bias, dropout=dropout, mode=mode, dtype=dtype, residual_connection=(i >= num_layers - num_residual_layers), residual_fn=residual_fn, use_block_lstm=use_block_lstm) utils.print_out("") cell_list.append(single_cell) return cell_list
def _get_learning_rate_warmup(self, hparams): """Get learning rate warmup.""" warmup_steps = hparams.warmup_steps warmup_scheme = hparams.warmup_scheme utils.print_out( " learning_rate=%g, warmup_steps=%d, warmup_scheme=%s" % (hparams.learning_rate, warmup_steps, warmup_scheme)) if not warmup_scheme: return self.learning_rate # Apply inverse decay if global steps less than warmup steps. # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3) # When step < warmup_steps, # learing_rate *= warmup_factor ** (warmup_steps - step) if warmup_scheme == "t2t": # 0.01^(1/warmup_steps): we start with a lr, 100 times smaller warmup_factor = tf.exp(tf.log(0.01) / warmup_steps) inv_decay = warmup_factor**(tf.to_float(warmup_steps - self.global_step)) else: raise ValueError("Unknown warmup scheme %s" % warmup_scheme) return tf.cond(self.global_step < hparams.warmup_steps, lambda: inv_decay * self.learning_rate, lambda: self.learning_rate, name="learning_rate_warump_cond")
def build_graph_dist_strategy(self, features, labels, mode, params): """Model function.""" del labels, params misc_utils.print_out("Running dist_strategy mode_fn") hparams = self.hparams # Create a GNMT model for training. # assert (hparams.encoder_type == "gnmt" or # hparams.attention_architecture in ["gnmt", "gnmt_v2"]) with mixed_precision_scope(): model = gnmt_model.GNMTModel(hparams, mode=mode, features=features) if mode == tf.contrib.learn.ModeKeys.INFER: sample_ids = model.sample_id reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file( hparams.tgt_vocab_file, default_value=vocab_utils.UNK) sample_words = reverse_target_vocab_table.lookup( tf.to_int64(sample_ids)) # make sure outputs is of shape [batch_size, time] or [beam_width, # batch_size, time] when using beam search. if hparams.time_major: sample_words = tf.transpose(sample_words) elif sample_words.shape.ndims == 3: # beam search output in [batch_size, time, beam_width] shape. sample_words = tf.transpose(sample_words, [2, 0, 1]) predictions = {"predictions": sample_words} # return loss, vars, grads, predictions, train_op, scaffold return None, None, None, predictions, None, None elif mode == tf.contrib.learn.ModeKeys.TRAIN: loss = model.train_loss train_op = model.update return loss, model.params, model.grads, None, train_op, None else: raise ValueError("Unknown mode in model_fn: %s" % mode)
def tokenize(hparams, file, tokenized_file): utils.print_out("tokenizing {} -> {}".format(file, tokenized_file)) with open(file, 'rb') as input_file: with open(tokenized_file, 'wb') as output_file: subprocess.run([hparams.tokenizer_file, '-l', hparams.src], stdin=input_file, stdout=output_file)
def print_variables_in_ckpt(ckpt_path): """Print a list of variables in a checkpoint together with their shapes.""" utils.print_out("# Variables in ckpt %s" % ckpt_path) reader = tf.train.NewCheckpointReader(ckpt_path) variable_map = reader.get_variable_to_shape_map() for key in sorted(variable_map.keys()): utils.print_out(" %s: %s" % (key, variable_map[key]))
def _compute_tower_grads(self, tower_loss, tower_params, learning_rate, use_fp16=False, loss_scale=None, colocate_gradients_with_ops=True): """docstring.""" if use_fp16: assert loss_scale scaled_loss = tf.multiply(tower_loss, tf.convert_to_tensor( loss_scale, dtype=tower_loss.dtype), name="scaling_loss") else: scaled_loss = tower_loss opt = self.get_optimizer(self.hparams, learning_rate) grads_and_vars = opt.compute_gradients( scaled_loss, tower_params, colocate_gradients_with_ops=self.hparams. colocate_gradients_with_ops) grads = [x for (x, _) in grads_and_vars] assert grads for g in grads: assert g.dtype == tf.float32, "grad.dtype isn't fp32: %s" % g.name # Downscale grads for var, grad in zip(tower_params, grads): if grad is None: misc_utils.print_out("%s gradient is None!" % var.name) if use_fp16: grads = [grad * tf.reciprocal(loss_scale) for grad in grads] return tower_params, grads, opt
def build_graph(self, hparams, scope=None): """Subclass must implement this method. Creates a sequence-to-sequence model with dynamic RNN decoder API. Args: hparams: Hyperparameter configurations. scope: VariableScope for the created subgraph; default "dynamic_seq2seq". Returns: A tuple of the form (logits, loss_tuple, final_context_state, sample_id), where: logits: float32 Tensor [batch_size x num_decoder_symbols]. loss: loss = the total loss / batch_size. final_context_state: the final state of decoder RNN. sample_id: sampling indices. Raises: ValueError: if encoder_type differs from mono and bi, or attention_option is not (luong | scaled_luong | bahdanau | normed_bahdanau). """ utils.print_out("# Creating %s graph ..." % self.mode) # Projection with tf.variable_scope(scope or "build_network"): with tf.variable_scope("decoder/output_projection"): self.output_layer = tf.layers.Dense(self.tgt_vocab_size, use_bias=False, name="output_projection", dtype=self.dtype) with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype): # Encoder if hparams.language_model: # no encoder for language modeling utils.print_out(" language modeling: no encoder") self.encoder_outputs = None encoder_state = None else: self.encoder_outputs, encoder_state = self._build_encoder( hparams) ## Decoder logits, sample_id = (self._build_decoder(self.encoder_outputs, encoder_state, hparams)) ## Loss if self.mode != tf.contrib.learn.ModeKeys.INFER: loss = self._compute_loss(logits, hparams.label_smoothing) else: loss = tf.constant(0.0) return logits, loss, sample_id
def train_fn(hparams): """Train function.""" model_fn = make_model_fn(hparams) input_fn = make_input_fn(hparams, tf.contrib.learn.ModeKeys.TRAIN) log_step_count_steps = hparams.log_step_count_steps save_checkpoints_steps = hparams.save_checkpoints_steps if hparams.use_dist_strategy: distribution_strategy = get_distribution_strategy(hparams.num_gpus) config = tf.estimator.RunConfig( train_distribute=distribution_strategy, log_step_count_steps=log_step_count_steps, keep_checkpoint_max=None, save_checkpoints_steps=save_checkpoints_steps) else: sess_config = tf.ConfigProto(allow_soft_placement=True) if hparams.use_autojit_xla: sess_config.graph_options.optimizer_options.global_jit_level = ( tf.OptimizerOptions.ON_1) if not hparams.use_pintohost_optimizer: sess_config.graph_options.rewrite_options.pin_to_host_optimization = ( rewriter_config_pb2.RewriterConfig.OFF) config = tf.estimator.RunConfig( log_step_count_steps=log_step_count_steps, session_config=sess_config, keep_checkpoint_max=None, save_checkpoints_steps=save_checkpoints_steps) misc_utils.print_out("sess master is %s" % config.master) estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=hparams.output_dir, config=config) benchmark_hook = BenchmarkHook(hparams.batch_size, hparams.warmup_steps + 5) train_hooks = [benchmark_hook] if hparams.profile: train_hooks.append( tf.train.ProfilerHook(output_dir=hparams.output_dir, save_steps=hparams.profile_save_steps, show_dataflow=True, show_memory=True)) max_steps = hparams.debug_num_train_steps estimator.train( input_fn=input_fn, max_steps=max_steps, hooks=train_hooks, ) return benchmark_hook.get_average_speed_and_latencies()
def get_post_init_ops(self): # Copy initialized values for variables on GPU 0 to other GPUs. global_vars = tf.global_variables() var_by_name = dict([(v.name, v) for v in global_vars]) post_init_ops = [] copy_froms = set() skipped_vars = [] for v in global_vars: split_name = v.name.split('/') # TODO(b/62630508): use more specific prefix than v or v0. if split_name[0] == 'v0' or not v.name.startswith('v'): skipped_vars.append(v) continue # Only vars starts with "v[number]" are synced. split_name[0] = 'v0' copy_from = var_by_name['/'.join(split_name)] copy_froms.add(copy_from) post_init_ops.append(v.assign(copy_from.read_value())) post_init_ops += self._warmup_ops # If copy-froms is empty, then all vars are actually saved. misc_utils.print_out('All copy-from vars(%d): ' % len(copy_froms)) for gv in copy_froms: misc_utils.print_out(gv.name) misc_utils.print_out('All skippped vars(%d): ' % len(skipped_vars)) for gv in skipped_vars: misc_utils.print_out(gv.name) assert len(skipped_vars) >= len(copy_froms) return post_init_ops
def _get_infer_maximum_iterations(self, hparams, source_sequence_length): """Maximum decoding steps at inference time.""" if hparams.tgt_max_len_infer: maximum_iterations = hparams.tgt_max_len_infer utils.print_out(" decoding maximum_iterations %d" % maximum_iterations) else: # TODO(thangluong): add decoding_length_factor flag decoding_length_factor = 2.0 max_encoder_length = tf.reduce_max(source_sequence_length) maximum_iterations = tf.to_int32( tf.round( tf.to_float(max_encoder_length) * decoding_length_factor)) return maximum_iterations
def create_or_load_model(model, model_dir, session, name): """Create translation model and initialize or load parameters in session.""" latest_ckpt = tf.train.latest_checkpoint(model_dir) if latest_ckpt: model = load_model(model, latest_ckpt, session, name) else: start_time = time.time() session.run(tf.global_variables_initializer()) session.run(tf.tables_initializer()) utils.print_out( " created %s model with fresh parameters, time %.2fs" % (name, time.time() - start_time)) global_step = model.global_step.eval(session=session) return model, global_step
def _get_learning_rate_decay(self, hparams): """Get learning rate decay.""" start_decay_step, decay_steps, decay_factor = self._get_decay_info( hparams) utils.print_out( " decay_scheme=%s, start_decay_step=%d, decay_steps %d, " "decay_factor %g" % (hparams.decay_scheme, start_decay_step, decay_steps, decay_factor)) return tf.cond( self.global_step < start_decay_step, lambda: self.learning_rate, lambda: tf.train.exponential_decay( # pylint: disable=g-long-lambda self.learning_rate, (self.global_step - start_decay_step), decay_steps, decay_factor, staircase=True), name="learning_rate_decay_cond")
def _print_varinfo(self, var_params, tower_id): # Print trainable variables misc_utils.print_out("# Trainable variables for tower: %d" % tower_id) misc_utils.print_out( "Format: <name>, <shape>, <dtype>, <(soft) device placement>") for param in var_params: misc_utils.print_out( " %s, %s, %s, %s" % (param.name, str( param.get_shape()), param.dtype.name, param.op.device)) misc_utils.print_out("Total params size: %.2f GB" % (4. * np.sum([ p.get_shape().num_elements() for p in var_params if p.get_shape().is_fully_defined() ]) / 2**30))
def run_main(flags, default_hparams, estimator_fn): """Run main.""" # Random random_seed = flags.random_seed if random_seed is not None and random_seed > 0: utils.print_out("# Set random seed to %d" % random_seed) random.seed(random_seed) np.random.seed(random_seed) tf.set_random_seed(random_seed) # Model output directory output_dir = flags.output_dir if output_dir and not tf.gfile.Exists(output_dir): utils.print_out("# Creating output directory %s ..." % output_dir) tf.gfile.MakeDirs(output_dir) # Load hparams. hparams = create_or_load_hparams(default_hparams, flags.hparams_path) # Train or Evaluation estimator_fn(hparams) return hparams
def load_model(model, ckpt_path, session, name): """Load model from a checkpoint.""" start_time = time.time() try: model.saver.restore(session, ckpt_path) except tf.errors.NotFoundError as e: utils.print_out("Can't load checkpoint") print_variables_in_ckpt(ckpt_path) utils.print_out("%s" % str(e)) session.run(tf.tables_initializer()) utils.print_out(" loaded %s model parameters from %s, time %.2fs" % (name, ckpt_path, time.time() - start_time)) return model
def _build_encoder(self, hparams): """Build a GNMT encoder.""" assert hparams.encoder_type == "gnmt" # Build GNMT encoder. num_bi_layers = 1 num_uni_layers = self.num_encoder_layers - num_bi_layers utils.print_out("# Build a GNMT encoder") utils.print_out(" num_bi_layers = %d" % num_bi_layers) utils.print_out(" num_uni_layers = %d" % num_uni_layers) # source is batch-majored source = self.features["source"] import sys print('source.shape: %s' % source.shape, file=sys.stderr) if self.time_major: # Later rnn would use time-majored inputs source = tf.transpose(source) with tf.variable_scope("encoder"): dtype = self.dtype encoder_emb_inp = tf.cast( self.encoder_emb_lookup_fn(self.embedding_encoder, source), dtype) # Build 1st bidi layer. bi_encoder_outputs, bi_encoder_state = self._build_encoder_layers_bidi( encoder_emb_inp, self.features["source_sequence_length"], hparams, dtype) # Build all the rest unidi layers encoder_state, encoder_outputs = self._build_encoder_layers_unidi( bi_encoder_outputs, self.features["source_sequence_length"], num_uni_layers, hparams, dtype) # Pass all encoder states to the decoder # except the first bi-directional layer encoder_state = (bi_encoder_state[1],) + ( (encoder_state,) if num_uni_layers == 1 else encoder_state) return encoder_outputs, encoder_state
def _create_pretrained_emb_from_txt(vocab_file, embed_file, num_trainable_tokens=3, dtype=tf.float32, scope=None): """Load pretrain embeding from embed_file, and return an embedding matrix. Args: vocab_file: Path to vocab file. embed_file: Path to a Glove formmated embedding txt file. num_trainable_tokens: Make the first n tokens in the vocab file as trainable variables. Default is 3, which is "<unk>", "<s>" and "</s>". dtype: data type. scope: tf scope name. Returns: pretrained embedding table variable. """ vocab, _ = vocab_utils.load_vocab(vocab_file) trainable_tokens = vocab[:num_trainable_tokens] utils.print_out("# Using pretrained embedding: %s." % embed_file) utils.print_out(" with trainable tokens: ") emb_dict, emb_size = vocab_utils.load_embed_txt(embed_file) for token in trainable_tokens: utils.print_out(" %s" % token) if token not in emb_dict: emb_dict[token] = [0.0] * emb_size emb_mat = np.array([emb_dict[token] for token in vocab], dtype=dtype.as_numpy_dtype()) emb_mat = tf.constant(emb_mat) emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1]) with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype) as scope: emb_mat_var = tf.get_variable("emb_mat_var", [num_trainable_tokens, emb_size]) return tf.concat([emb_mat_var, emb_mat_const], 0)
def avg_checkpoints(model_dir, num_last_checkpoints, global_step_name): """Average the last N checkpoints in the model_dir.""" checkpoint_state = tf.train.get_checkpoint_state(model_dir) if not checkpoint_state: utils.print_out("# No checkpoint file found in directory: %s" % model_dir) return None # Checkpoints are ordered from oldest to newest. checkpoints = ( checkpoint_state.all_model_checkpoint_paths[-num_last_checkpoints:]) if len(checkpoints) < num_last_checkpoints: utils.print_out( "# Skipping averaging checkpoints because not enough checkpoints is " "available.") return None avg_model_dir = os.path.join(model_dir, "avg_checkpoints") if not tf.gfile.Exists(avg_model_dir): utils.print_out( "# Creating new directory %s for saving averaged checkpoints." % avg_model_dir) tf.gfile.MakeDirs(avg_model_dir) utils.print_out("# Reading and averaging variables in checkpoints:") var_list = tf.contrib.framework.list_variables(checkpoints[0]) var_values, var_dtypes = {}, {} for (name, shape) in var_list: if name != global_step_name: var_values[name] = np.zeros(shape) for checkpoint in checkpoints: utils.print_out(" %s" % checkpoint) reader = tf.contrib.framework.load_checkpoint(checkpoint) for name in var_values: tensor = reader.get_tensor(name) var_dtypes[name] = tensor.dtype var_values[name] += tensor for name in var_values: var_values[name] /= len(checkpoints) # Build a graph with same variables in the checkpoints, and save the averaged # variables into the avg_model_dir. with tf.Graph().as_default(): tf_vars = [ tf.get_variable(v, shape=var_values[v].shape, dtype=var_dtypes[name]) for v in var_values ] placeholders = [ tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars ] assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)] saver = tf.train.Saver(tf.all_variables(), save_relative_paths=True) with tf.Session() as sess: sess.run(tf.initialize_all_variables()) for p, assign_op, (name, value) in zip(placeholders, assign_ops, six.iteritems(var_values)): sess.run(assign_op, {p: value}) # Use the built saver to save the averaged checkpoint. Only keep 1 # checkpoint and the best checkpoint will be moved to avg_best_metric_dir. saver.save(sess, os.path.join(avg_model_dir, "translate.ckpt")) return avg_model_dir
def _set_params_initializer(self, hparams, mode, features, scope, extra_args=None): """Set various params for self and initialize.""" self.mode = mode self.src_vocab_size = hparams.src_vocab_size self.tgt_vocab_size = hparams.tgt_vocab_size self.features = features self.time_major = hparams.time_major if hparams.use_char_encode: assert (not self.time_major), ("Can't use time major for" " char-level inputs.") self.dtype = tf.float16 if hparams.use_fp16 else tf.float32 # extra_args: to make it flexible for adding external customizable code self.single_cell_fn = None if extra_args: self.single_cell_fn = extra_args.single_cell_fn # Set num units self.num_units = hparams.num_units # Set num layers self.num_encoder_layers = hparams.num_encoder_layers self.num_decoder_layers = hparams.num_decoder_layers assert self.num_encoder_layers assert self.num_decoder_layers # Set num residual layers if hasattr(hparams, "num_residual_layers"): # compatible common_test_utils self.num_encoder_residual_layers = hparams.num_residual_layers self.num_decoder_residual_layers = hparams.num_residual_layers else: self.num_encoder_residual_layers = hparams.num_encoder_residual_layers self.num_decoder_residual_layers = hparams.num_decoder_residual_layers # Batch size self.batch_size = tf.size(self.features["source_sequence_length"]) # Global step global_step = tf.train.get_global_step() if global_step is not None: utils.print_out("global_step already created!") self.global_step = tf.train.get_or_create_global_step() utils.print_out("model.global_step.name: %s" % self.global_step.name) # Initializer self.random_seed = hparams.random_seed initializer = model_helper.get_initializer(hparams.init_op, self.random_seed, hparams.init_weight) tf.get_variable_scope().set_initializer(initializer) # Embeddings self.encoder_emb_lookup_fn = tf.nn.embedding_lookup self.init_embeddings(hparams, scope)
def extend_hparams(hparams): """Add new arguments to hparams.""" # Sanity checks if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0: raise ValueError("For bi, num_encoder_layers %d should be even" % hparams.num_encoder_layers) if (hparams.attention_architecture in ["gnmt"] and hparams.num_encoder_layers < 2): raise ValueError("For gnmt attention architecture, " "num_encoder_layers %d should be >= 2" % hparams.num_encoder_layers) if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]: raise ValueError("subword option must be either spm, or bpe") if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0: raise ValueError( "beam_width must greater than 0 when using beam_search" "decoder.") if hparams.mode == "translate" and not hparams.translate_file: raise ValueError( "--translate_file flag must be specified in translate mode") # Different number of encoder / decoder layers assert hparams.num_encoder_layers and hparams.num_decoder_layers if hparams.num_encoder_layers != hparams.num_decoder_layers: hparams.pass_hidden_state = False utils.print_out( "Num encoder layer %d is different from num decoder layer" " %d, so set pass_hidden_state to False" % (hparams.num_encoder_layers, hparams.num_decoder_layers)) # Set residual layers num_encoder_residual_layers = 0 num_decoder_residual_layers = 0 if hparams.residual: if hparams.num_encoder_layers > 1: num_encoder_residual_layers = hparams.num_encoder_layers - 1 if hparams.num_decoder_layers > 1: num_decoder_residual_layers = hparams.num_decoder_layers - 1 if hparams.encoder_type == "gnmt": # The first unidirectional layer (after the bi-directional layer) in # the GNMT encoder can't have residual connection due to the input is # the concatenation of fw_cell and bw_cell's outputs. num_encoder_residual_layers = hparams.num_encoder_layers - 2 # Compatible for GNMT models if hparams.num_encoder_layers == hparams.num_decoder_layers: num_decoder_residual_layers = num_encoder_residual_layers _add_argument(hparams, "num_encoder_residual_layers", num_encoder_residual_layers) _add_argument(hparams, "num_decoder_residual_layers", num_decoder_residual_layers) # Language modeling if hparams.language_model: hparams.attention = "" hparams.attention_architecture = "" hparams.pass_hidden_state = False hparams.share_vocab = True hparams.src = hparams.tgt utils.print_out( "For language modeling, we turn off attention and " "pass_hidden_state; turn on share_vocab; set src to tgt.") ## Vocab # Get vocab file names first if hparams.vocab_prefix: src_vocab_file = hparams.vocab_prefix + "." + hparams.src tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt else: raise ValueError("hparams.vocab_prefix must be provided.") # Source vocab src_vocab_size, src_vocab_file = vocab_utils.check_vocab( src_vocab_file, hparams.output_dir, check_special_token=hparams.check_special_token, sos=hparams.sos, eos=hparams.eos, unk=vocab_utils.UNK, pad_vocab=True) # Target vocab if hparams.share_vocab: utils.print_out(" using source vocab for target") tgt_vocab_file = src_vocab_file tgt_vocab_size = src_vocab_size else: tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab( tgt_vocab_file, hparams.output_dir, check_special_token=hparams.check_special_token, sos=hparams.sos, eos=hparams.eos, unk=vocab_utils.UNK) _add_argument(hparams, "src_vocab_size", src_vocab_size) _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size) _add_argument(hparams, "src_vocab_file", src_vocab_file) _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file) # Num embedding partitions _add_argument(hparams, "num_enc_emb_partitions", hparams.num_embeddings_partitions) _add_argument(hparams, "num_dec_emb_partitions", hparams.num_embeddings_partitions) # Pretrained Embeddings _add_argument(hparams, "src_embed_file", "") _add_argument(hparams, "tgt_embed_file", "") if hparams.embed_prefix: src_embed_file = hparams.embed_prefix + "." + hparams.src tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt if tf.gfile.Exists(src_embed_file): utils.print_out(" src_embed_file %s exist" % src_embed_file) hparams.src_embed_file = src_embed_file utils.print_out( "For pretrained embeddings, set num_enc_emb_partitions to 1") hparams.num_enc_emb_partitions = 1 else: utils.print_out(" src_embed_file %s doesn't exist" % src_embed_file) if tf.gfile.Exists(tgt_embed_file): utils.print_out(" tgt_embed_file %s exist" % tgt_embed_file) hparams.tgt_embed_file = tgt_embed_file utils.print_out( "For pretrained embeddings, set num_dec_emb_partitions to 1") hparams.num_dec_emb_partitions = 1 else: utils.print_out(" tgt_embed_file %s doesn't exist" % tgt_embed_file) # Evaluation metric = "bleu" best_metric_dir = os.path.join(hparams.output_dir, "best_" + metric) tf.gfile.MakeDirs(best_metric_dir) _add_argument(hparams, "best_" + metric, 0, update=False) _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir) return hparams
def main(unused_argv): experiment_start = time.time() tf.logging.set_verbosity(tf.logging.INFO) if FLAGS.use_fp16 and FLAGS.use_dist_strategy: raise ValueError("use_fp16 and use_dist_strategy aren't compatible") if FLAGS.use_fp16 + FLAGS.use_amp + FLAGS.use_fastmath > 1: raise ValueError( "Only one of use_fp16, use_amp, use_fastmath can be set") if FLAGS.use_amp: utils.print_out('Enabling TF-AMP') os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1' if FLAGS.use_fastmath: utils.print_out('Enabling FastMath') os.environ["TF_ENABLE_CUBLAS_TENSOR_OP_MATH_FP32"] = '1' os.environ["TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32"] = '1' os.environ["TF_ENABLE_CUDNN_RNN_TENSOR_OP_MATH_FP32"] = '1' # Set up hacky envvars. # Hack that affects Defun in attention_wrapper.py active_xla_option_nums = np.sum( [FLAGS.use_xla, FLAGS.use_autojit_xla, FLAGS.xla_compile]) if active_xla_option_nums > 1: raise ValueError( "Only one of use_xla, xla_compile, use_autojit_xla can be set") os.environ["use_xla"] = str(FLAGS.use_xla).lower() if FLAGS.use_xla: os.environ["use_defun"] = str(True).lower() else: os.environ["use_defun"] = str(FLAGS.use_defun).lower() utils.print_out("use_defun is %s for attention" % os.environ["use_defun"]) # TODO(jamesqin): retire this config after Cuda9.1 os.environ["use_fp32_batch_matmul"] = ( "true" if FLAGS.use_fp32_batch_matmul else "false") os.environ["xla_compile"] = "true" if FLAGS.xla_compile else "false" os.environ["force_inputs_padding"] = ("true" if FLAGS.force_inputs_padding else "false") if FLAGS.mode == "train": utils.print_out("Running training mode.") default_hparams = create_hparams(FLAGS) run_main(FLAGS, default_hparams, estimator.train_fn) elif FLAGS.mode == "infer" or FLAGS.mode == "translate": if FLAGS.mode == "infer": utils.print_out("Running inference mode.") translate_mode = False else: utils.print_out("Running translate mode on file {}.".format( FLAGS.translate_file)) translate_mode = True # Random random_seed = FLAGS.random_seed if random_seed is not None and random_seed > 0: utils.print_out("# Set random seed to %d" % random_seed) random.seed(random_seed) np.random.seed(random_seed) tf.set_random_seed(random_seed) # Model output directory output_dir = FLAGS.output_dir if output_dir and not tf.gfile.Exists(output_dir): utils.print_out("# Creating output directory %s ..." % output_dir) tf.gfile.MakeDirs(output_dir) # Load hparams. default_hparams = create_hparams(FLAGS) default_hparams.num_buckets = 1 # The estimator model_fn is written in a way allowing train hparams to be # passed in infer mode. hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path) utils.print_out("infer_hparams:") utils.print_hparams(hparams) if translate_mode: tokenize(hparams, hparams.translate_file, hparams.translate_file + ".tok") eval_sentences, eval_src_tokens, _ = iterator_utils.get_effective_epoch_size( hparams, train=False) # Run evaluation when there's a new checkpoint tf.logging.info("Starting to evaluate...") eval_start = time.time() _, (eval_speed, eval_latencies), eval_output_tokens = estimator.eval_fn( hparams, hparams.ckpt, only_translate=translate_mode) eval_end = time.time() eval_delta = eval_end - eval_start utils.print_out( "eval time for ckpt: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" % (eval_delta / 60., eval_speed, eval_speed * (eval_src_tokens + eval_output_tokens) / eval_sentences), f=sys.stderr) for lat in sorted(eval_latencies): utils.print_out("eval latency_%s for ckpt: %.2f ms" % (lat, eval_latencies[lat] * 1000)) if translate_mode: detokenize(hparams, hparams.translate_file + ".trans.tok", hparams.translate_file + ".trans") else: assert FLAGS.mode == "train_and_eval" utils.print_out("Running train and eval mode.") # Random random_seed = FLAGS.random_seed if random_seed is not None and random_seed > 0: utils.print_out("# Set random seed to %d" % random_seed) random.seed(random_seed) np.random.seed(random_seed) tf.set_random_seed(random_seed) # Model output directory output_dir = FLAGS.output_dir if output_dir and not tf.gfile.Exists(output_dir): utils.print_out("# Creating output directory %s ..." % output_dir) tf.gfile.MakeDirs(output_dir) # Load hparams. default_hparams = create_hparams(FLAGS) hparams = create_or_load_hparams(default_hparams, FLAGS.hparams_path) utils.print_out("training hparams:") utils.print_hparams(hparams) with tf.gfile.GFile(os.path.join(output_dir, "train_hparams.txt"), "w") as f: f.write(utils.serialize_hparams(hparams) + "\n") # The estimator model_fn is written in a way allowing train hparams to be # passed in infer mode. infer_hparams = tf.contrib.training.HParams(**hparams.values()) infer_hparams.num_buckets = 1 utils.print_out("infer_hparams:") utils.print_hparams(infer_hparams) with tf.gfile.GFile(os.path.join(output_dir, "infer_hparams.txt"), "w") as f: f.write(utils.serialize_hparams(infer_hparams) + "\n") epochs = 0 should_stop = epochs >= FLAGS.max_train_epochs train_sentences, train_src_tokens, train_tgt_tokens = iterator_utils.get_effective_epoch_size( hparams) eval_sentences, eval_src_tokens, _ = iterator_utils.get_effective_epoch_size( hparams, train=False) while not should_stop: utils.print_out("Starting epoch %d" % epochs) try: train_start = time.time() train_speed, _ = estimator.train_fn(hparams) except tf.errors.OutOfRangeError: utils.print_out("training hits OutOfRangeError", f=sys.stderr) train_end = time.time() train_delta = train_end - train_start utils.print_out( "training time for epoch %d: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" % (epochs + 1, train_delta / 60., train_speed, train_speed * (train_src_tokens + train_tgt_tokens) / train_sentences), f=sys.stderr) # This is probably sub-optimal, doing eval per-epoch eval_start = time.time() bleu_score, ( eval_speed, eval_latencies ), eval_output_tokens = estimator.eval_fn(infer_hparams) eval_end = time.time() eval_delta = eval_end - eval_start utils.print_out( "eval time for epoch %d: %.2f mins (%.2f sent/sec, %.2f tokens/sec)" % (epochs + 1, eval_delta / 60., eval_speed, eval_speed * (eval_src_tokens + eval_output_tokens) / eval_sentences), f=sys.stderr) for lat in sorted(eval_latencies): utils.print_out("eval latency_%s for epoch %d: %.2f ms" % (lat, epochs + 1, eval_latencies[lat] * 1000)) if FLAGS.debug or (FLAGS.target_bleu is not None and bleu_score > FLAGS.target_bleu): should_stop = True utils.print_out( "Stop job since target bleu is reached at epoch %d ." % epochs, f=sys.stderr) epochs += 1 if epochs >= FLAGS.max_train_epochs: should_stop = True utils.print_out("Stop job since max_train_epochs is reached.", f=sys.stderr) experiment_end = time.time() utils.print_out('Experiment took {} min'.format( (experiment_end - experiment_start) / 60))
def _set_train_or_infer(self, res, hparams): """Set up training.""" loss = res[1] if self.mode == tf.contrib.learn.ModeKeys.TRAIN: self.train_loss = loss self.word_count = tf.reduce_sum( self.features["source_sequence_length"]) + tf.reduce_sum( self.features["target_sequence_length"]) elif self.mode == tf.contrib.learn.ModeKeys.EVAL: self.eval_loss = loss elif self.mode == tf.contrib.learn.ModeKeys.INFER: self.infer_logits = res[0] self.infer_loss = loss self.sample_id = res[2] if self.mode != tf.contrib.learn.ModeKeys.INFER: ## Count the number of predicted words for compute ppl. self.predict_count = tf.reduce_sum( self.features["target_sequence_length"]) # Gradients and SGD update operation for training the model. # Arrange for the embedding vars to appear at the beginning. # Only build bprop if running on GPU and using dist_strategy, in which # case learning rate, grads and train_op are created in estimator model # function. with tf.name_scope("learning_rate"): self.learning_rate = tf.constant(hparams.learning_rate) # warm-up self.learning_rate = self._get_learning_rate_warmup(hparams) # decay self.learning_rate = self._get_learning_rate_decay(hparams) if (hparams.use_dist_strategy and self.mode == tf.contrib.learn.ModeKeys.TRAIN): # Gradients params = tf.trainable_variables() # Print trainable variables utils.print_out("# Trainable variables") utils.print_out( "Format: <name>, <shape>, <dtype>, <(soft) device placement>") for param in params: utils.print_out( " %s, %s, %s, %s" % (param.name, str( param.get_shape()), param.dtype.name, param.op.device)) utils.print_out("Total params size: %.2f GB" % (4. * np.sum([ p.get_shape().num_elements() for p in params if p.shape.is_fully_defined() ]) / 2**30)) # Optimizer if hparams.optimizer == "sgd": opt = tf.train.GradientDescentOptimizer(self.learning_rate) elif hparams.optimizer == "adam": opt = tf.train.AdamOptimizer(self.learning_rate) else: raise ValueError("Unknown optimizer type %s" % hparams.optimizer) assert opt is not None grads_and_vars = opt.compute_gradients( self.train_loss, params, colocate_gradients_with_ops=hparams.colocate_gradients_with_ops ) gradients = [x for (x, _) in grads_and_vars] clipped_grads, grad_norm = model_helper.gradient_clip( gradients, max_gradient_norm=hparams.max_gradient_norm) self.grad_norm = grad_norm self.params = params self.grads = clipped_grads self.update = opt.apply_gradients(list(zip(clipped_grads, params)), global_step=self.global_step) else: self.grad_norm = None self.update = None self.params = None self.grads = None
def _single_cell(unit_type, num_units, forget_bias, dropout, mode, dtype=None, residual_connection=False, residual_fn=None, use_block_lstm=False): """Create an instance of a single RNN cell.""" # dropout (= 1 - keep_prob) is set to 0 during eval and infer dropout = dropout if mode == tf.contrib.learn.ModeKeys.TRAIN else 0.0 # Cell Type if unit_type == "lstm": utils.print_out(" LSTM, forget_bias=%g" % forget_bias, new_line=False) if not use_block_lstm: single_cell = tf.nn.rnn_cell.LSTMCell(num_units, dtype=dtype, forget_bias=forget_bias) else: single_cell = tf.contrib.rnn.LSTMBlockCell(num_units, forget_bias=forget_bias) elif unit_type == "gru": utils.print_out(" GRU", new_line=False) single_cell = tf.contrib.rnn.GRUCell(num_units) elif unit_type == "layer_norm_lstm": utils.print_out(" Layer Normalized LSTM, forget_bias=%g" % forget_bias, new_line=False) single_cell = tf.contrib.rnn.LayerNormBasicLSTMCell( num_units, forget_bias=forget_bias, layer_norm=True) elif unit_type == "nas": utils.print_out(" NASCell", new_line=False) single_cell = tf.contrib.rnn.NASCell(num_units) else: raise ValueError("Unknown unit type %s!" % unit_type) # Dropout (= 1 - keep_prob) if dropout > 0.0: single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) utils.print_out(" %s, dropout=%g " % (type(single_cell).__name__, dropout), new_line=False) # Residual if residual_connection: single_cell = tf.contrib.rnn.ResidualWrapper(single_cell, residual_fn=residual_fn) utils.print_out(" %s" % type(single_cell).__name__, new_line=False) return single_cell
def create_emb_for_encoder_and_decoder(share_vocab, src_vocab_size, tgt_vocab_size, src_embed_size, tgt_embed_size, dtype=tf.float32, num_enc_partitions=0, num_dec_partitions=0, src_vocab_file=None, tgt_vocab_file=None, src_embed_file=None, tgt_embed_file=None, use_char_encode=False, scope=None): """Create embedding matrix for both encoder and decoder. Args: share_vocab: A boolean. Whether to share embedding matrix for both encoder and decoder. src_vocab_size: An integer. The source vocab size. tgt_vocab_size: An integer. The target vocab size. src_embed_size: An integer. The embedding dimension for the encoder's embedding. tgt_embed_size: An integer. The embedding dimension for the decoder's embedding. dtype: dtype of the embedding matrix. Default to float32. num_enc_partitions: number of partitions used for the encoder's embedding vars. num_dec_partitions: number of partitions used for the decoder's embedding vars. src_vocab_file: A string. The source vocabulary file. tgt_vocab_file: A string. The target vocabulary file. src_embed_file: A string. The source embedding file. tgt_embed_file: A string. The target embedding file. use_char_encode: A boolean. If true, use char encoder. scope: VariableScope for the created subgraph. Default to "embedding". Returns: embedding_encoder: Encoder's embedding matrix. embedding_decoder: Decoder's embedding matrix. Raises: ValueError: if use share_vocab but source and target have different vocab size. """ if num_enc_partitions <= 1: enc_partitioner = None else: # Note: num_partitions > 1 is required for distributed training due to # embedding_lookup tries to colocate single partition-ed embedding variable # with lookup ops. This may cause embedding variables being placed on worker # jobs. enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions) if num_dec_partitions <= 1: dec_partitioner = None else: # Note: num_partitions > 1 is required for distributed training due to # embedding_lookup tries to colocate single partition-ed embedding variable # with lookup ops. This may cause embedding variables being placed on worker # jobs. dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions) if src_embed_file and enc_partitioner: raise ValueError( "Can't set num_enc_partitions > 1 when using pretrained encoder " "embedding") if tgt_embed_file and dec_partitioner: raise ValueError( "Can't set num_dec_partitions > 1 when using pretrained decdoer " "embedding") with tf.variable_scope(scope or "embeddings", dtype=dtype, partitioner=enc_partitioner) as scope: # Share embedding if share_vocab: if src_vocab_size != tgt_vocab_size: raise ValueError( "Share embedding but different src/tgt vocab sizes" " %d vs. %d" % (src_vocab_size, tgt_vocab_size)) assert src_embed_size == tgt_embed_size utils.print_out("# Use the same embedding for source and target") vocab_file = src_vocab_file or tgt_vocab_file embed_file = src_embed_file or tgt_embed_file embedding_encoder = _create_or_load_embed("embedding_share", vocab_file, embed_file, src_vocab_size, src_embed_size, dtype) embedding_decoder = embedding_encoder else: if not use_char_encode: with tf.variable_scope("encoder", partitioner=enc_partitioner): embedding_encoder = _create_or_load_embed( "embedding_encoder", src_vocab_file, src_embed_file, src_vocab_size, src_embed_size, dtype) else: embedding_encoder = None with tf.variable_scope("decoder", partitioner=dec_partitioner): embedding_decoder = _create_or_load_embed( "embedding_decoder", tgt_vocab_file, tgt_embed_file, tgt_vocab_size, tgt_embed_size, dtype) return embedding_encoder, embedding_decoder
def build_graph(self, features, labels, mode, params): """docstring.""" del labels, params misc_utils.print_out("Running fast mode_fn") hparams = self.hparams # Create global_step tf.train.get_or_create_global_step() if mode == tf.contrib.learn.ModeKeys.INFER: # Doing inference only on one GPU inf_hparams = tf.contrib.training.HParams(**hparams.values()) inf_hparams.set_hparam("num_gpus", 1) # Inference is done in fp32 and in the same way as that of dist_strategy. inf_hparams.set_hparam("use_fp16", False) misc_utils.print_out("inference hparmas:") misc_utils.print_hparams(inf_hparams) # Create variable_mgr var_mgr = self._get_variable_mgr(inf_hparams) with mixed_precision_scope(), tf.device("gpu:0"), tf.name_scope( "tower_0"), var_mgr.create_outer_variable_scope(0): model = gnmt_model.GNMTModel(inf_hparams, mode=mode, features=features) sample_ids = model.sample_id reverse_target_vocab_table = lookup_ops.index_to_string_table_from_file( inf_hparams.tgt_vocab_file, default_value=vocab_utils.UNK) sample_words = reverse_target_vocab_table.lookup( tf.to_int64(sample_ids)) # make sure outputs is of shape [batch_size, time] or [beam_width, # batch_size, time] when using beam search. if inf_hparams.time_major: sample_words = tf.transpose(sample_words) elif sample_words.shape.ndims == 3: # beam search output in [batch_size, time, beam_width] shape. sample_words = tf.transpose(sample_words, [2, 0, 1]) predictions = {"predictions": sample_words} # return loss, vars, grads, predictions, train_op, scaffold return None, None, None, predictions, None, None elif mode == tf.contrib.learn.ModeKeys.TRAIN: num_towers = hparams.num_gpus # Shard inputs tower_features = self._shard_inputs(features, num_towers) # Create loss scale vars if necessary loss_scale, loss_scale_normal_steps = self._create_loss_scale_vars( ) # Create variable_mgr var_mgr = self._get_variable_mgr(hparams) # Build per-tower fprop and bprop devices = var_mgr.get_devices() tower_gradvars = [] tower_scopes = [] var_scopes = [] train_losses = [] learning_rates = [] batch_sizes = [] opts = [] def fprop_and_bprop(tid): """docstring.""" model = gnmt_model.GNMTModel(hparams, mode=mode, features=tower_features[tid]) # sync training. assert model.learning_rate is not None # The following handles shouldn't be built in when doing manual assert model.grad_norm is None assert model.update is None tower_loss = model.train_loss # Only check loss numerics if in fp16 if hparams.use_fp16 and hparams.check_tower_loss_numerics: tower_loss = tf.check_numerics( tower_loss, "tower_%d has Inf/NaN loss" % tid) # Cast to fp32, otherwise would easily overflow. tower_loss = tf.to_float(tower_loss) var_params, grads, opt = self._compute_tower_grads( tower_loss, var_mgr.trainable_variables_on_device(tid, tid), model.learning_rate, use_fp16=hparams.use_fp16, loss_scale=loss_scale, colocate_gradients_with_ops=hparams. colocate_gradients_with_ops) self._print_varinfo(var_params, tid) res = [model.train_loss, model.learning_rate, model.batch_size] res.extend(grads) opts.append(opt) return res def unpack_fprop_and_bprop_output(output): train_loss = output[0] learning_rate = output[1] batch_size = output[2] grads = output[3:] return train_loss, learning_rate, batch_size, grads with mixed_precision_scope(): for tid in range(num_towers): with tf.device(devices[tid % len(devices)]), tf.name_scope( "tower_%s" % tid) as scope: tower_scopes.append(scope) with var_mgr.create_outer_variable_scope( tid) as var_scope: var_scopes.append(var_scope) outputs = maybe_xla_compile( hparams, fprop_and_bprop, tid) (train_loss, learning_rate, batch_size, grads) = unpack_fprop_and_bprop_output(outputs) train_losses.append(train_loss) learning_rates.append(learning_rate) batch_sizes.append(batch_size) var_params = var_mgr.trainable_variables_on_device( tid, tid) tower_gradvars.append(list(zip(grads, var_params))) # Add summaries if hparams.show_metrics: tf.summary.scalar("learning_rate", learning_rates[0]) if loss_scale: tf.summary.scalar("loss_scale", loss_scale) if hparams.enable_auto_loss_scale: tf.summary.scalar("loss_scale_normal_steps", loss_scale_normal_steps) misc_utils.print_out("Finish building fprop and per-tower bprop.") # Aggregate gradients # The following compute the aggregated grads for each tower, stored in # opaque grad_states structure. apply_grads_devices, grad_states = var_mgr.preprocess_device_grads( tower_gradvars) master_grads = None master_params = None update_ops = [] for i, device in enumerate(apply_grads_devices): with tf.device(device), tf.name_scope(tower_scopes[i]): # Get per-tower grads. with tf.name_scope("get_gradients_to_apply"): avg_gradvars = var_mgr.get_gradients_to_apply( i, grad_states) avg_grads = [gv[0] for gv in avg_gradvars] # gradients post-processing with tf.name_scope("clip_gradients"): if hparams.clip_grads: clipped_grads, grad_norm = model_helper.gradient_clip( avg_grads, max_gradient_norm=hparams.max_gradient_norm) # summary the grad on the 1st tower if i == 0 and hparams.show_metrics: tf.summary.scalar("grad_norm", grad_norm) tf.summary.scalar( "clipped_grad_norm", tf.global_norm(clipped_grads)) else: clipped_grads = avg_grads if i == 0: master_grads = clipped_grads # Build apply-gradients ops clipped_gradvars = list( zip(clipped_grads, [gv[1] for gv in avg_gradvars])) if i == 0: master_params = [gv[1] for gv in avg_gradvars] with tf.name_scope("append_gradient_ops"): loss_scale_params = variable_mgr_util.AutoLossScaleParams( enable_auto_loss_scale=hparams. enable_auto_loss_scale, loss_scale=loss_scale, loss_scale_normal_steps=loss_scale_normal_steps, inc_loss_scale_every_n=hparams. fp16_inc_loss_scale_every_n, is_chief=True) opt = opts[i] var_mgr.append_apply_gradients_ops( grad_states, opt, clipped_gradvars, update_ops, loss_scale_params) misc_utils.print_out("Finish building grad aggregation.") assert len(update_ops) == num_towers train_op = tf.group(update_ops) with tf.control_dependencies([train_op]): global_step = tf.train.get_global_step() train_op = global_step.assign_add(1) # Compute loss on the first gpu # TODO(jamesqin): optimize it? with tf.device("gpu:0"): loss = misc_utils.weighted_avg(train_losses, batch_sizes) # Create local init_ops # TODO(jamesqin): handle resource variables! # At present if not using mirror strategy, not using resource vars. local_init_ops = [] local_init_op = tf.local_variables_initializer() with tf.control_dependencies([local_init_op]): local_init_ops.append(var_mgr.get_post_init_ops()) local_init_ops.extend([local_init_op, tf.tables_initializer()]) saveable_vars = var_mgr.savable_variables() # Add saveables for cudnn vars in master tower. saveable_objects = tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS) saveable_objects = [x for x in saveable_objects if "v0" in x.name] misc_utils.print_out("Saveable vars(%d): " % len(saveable_vars)) for mv in saveable_vars: misc_utils.print_out(mv.name) misc_utils.print_out("All global trainable vars(%d): " % len(tf.trainable_variables())) for tv in tf.trainable_variables(): misc_utils.print_out(tv.name) misc_utils.print_out("All global vars(%d): " % len(tf.global_variables())) for gv in tf.global_variables(): misc_utils.print_out(gv.name) misc_utils.print_out("master backproped params(%d): " % len(master_params)) for mp in master_params: misc_utils.print_out(mp.name) # Note the cudnn vars are skipped the init check. :( scaffold = tf.train.Scaffold( ready_op=tf.report_uninitialized_variables(saveable_vars), ready_for_local_init_op=tf.report_uninitialized_variables( saveable_vars), local_init_op=tf.group(*local_init_ops), saver=tf.train.Saver(saveable_vars + saveable_objects, save_relative_paths=True)) misc_utils.print_out("Finish building model_fn") # return loss, vars, grads, predictions, train_op, scaffold return loss, master_params, master_grads, None, train_op, scaffold
def get_metrics(hparams, model_fn, ckpt=None, only_translate=False): """Run inference and compute metrics.""" pred_estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=hparams.output_dir) benchmark_hook = BenchmarkHook(hparams.infer_batch_size) predictions = pred_estimator.predict(make_input_fn( hparams, tf.contrib.learn.ModeKeys.INFER), checkpoint_path=ckpt, hooks=[benchmark_hook]) translations = [] output_tokens = [] beam_id = 0 for prediction in predictions: # get the top translation. if beam_id == 0: for sent_id in range(hparams.infer_batch_size): if sent_id >= prediction["predictions"].shape[0]: break trans, output_length = nmt_utils.get_translation( prediction["predictions"], sent_id=sent_id, tgt_eos=hparams.eos, subword_option=hparams.subword_option) translations.append(trans) output_tokens.append(output_length) beam_id += 1 if beam_id == hparams.beam_width: beam_id = 0 if only_translate: trans_file = hparams.translate_file + '.trans.tok' else: trans_file = os.path.join( hparams.output_dir, "newstest2014_out_{}.tok.de".format( pred_estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP))) trans_dir = os.path.dirname(trans_file) if not tf.gfile.Exists(trans_dir): tf.gfile.MakeDirs(trans_dir) tf.logging.info("Writing to file %s" % trans_file) with codecs.getwriter("utf-8")(tf.gfile.GFile(trans_file, mode="wb")) as trans_f: trans_f.write("") # Write empty string to ensure file is created. for translation in translations: trans_f.write((translation + b"\n").decode("utf-8")) if only_translate: return None, benchmark_hook.get_average_speed_and_latencies(), sum( output_tokens) # Evaluation output_dir = os.path.join(pred_estimator.model_dir, "eval") tf.gfile.MakeDirs(output_dir) summary_writer = tf.summary.FileWriter(output_dir) ref_file = "%s.%s" % (hparams.test_prefix, hparams.tgt) # Hardcoded. metric = "bleu" score = get_sacrebleu(trans_file, hparams.detokenizer_file) misc_utils.print_out("bleu is %.5f" % score) with tf.Graph().as_default(): summaries = [] summaries.append(tf.Summary.Value(tag=metric, simple_value=score)) tf_summary = tf.Summary(value=list(summaries)) summary_writer.add_summary( tf_summary, pred_estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP)) summary_writer.close() return score, benchmark_hook.get_average_speed_and_latencies(), sum( output_tokens)