def check_vocab(vocab_file, out_dir, check_special_token=True, sos=None,
                eos=None, unk=None):
  """Check if vocab_file doesn't exist, create from corpus_file."""
  if tf.gfile.Exists(vocab_file):
    utils.print_out("# Vocab file %s exists" % vocab_file)
    vocab, vocab_size = load_vocab(vocab_file)
    if check_special_token:
      # Verify if the vocab starts with unk, sos, eos
      # If not, prepend those tokens & generate a new vocab file
      if not unk: unk = UNK
      if not sos: sos = SOS
      if not eos: eos = EOS
      assert len(vocab) >= 3
      if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos:
        utils.print_out("The first 3 vocab words [%s, %s, %s]"
                        " are not [%s, %s, %s]" %
                        (vocab[0], vocab[1], vocab[2], unk, sos, eos))
        vocab = [unk, sos, eos] + vocab
        vocab_size += 3
        new_vocab_file = os.path.join(out_dir, os.path.basename(vocab_file))
        with codecs.getwriter("utf-8")(
            tf.gfile.GFile(new_vocab_file, "wb")) as f:
          for word in vocab:
            f.write("%s\n" % word)
        vocab_file = new_vocab_file
  else:
    raise ValueError("vocab_file '%s' does not exist." % vocab_file)

  vocab_size = len(vocab)
  return vocab_size, vocab_file
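# Usage sketch, not part of the original module (hypothetical temp paths;
# assumes the module-level codecs/os/tf imports and the load_vocab helper):
# write a vocab file that already starts with the special tokens, then
# validate it.
import tempfile

demo_dir = tempfile.mkdtemp()
demo_vocab = os.path.join(demo_dir, "vocab.de")
with codecs.getwriter("utf-8")(tf.gfile.GFile(demo_vocab, "wb")) as f:
  for w in ["<unk>", "<s>", "</s>", "hallo", "welt"]:
    f.write("%s\n" % w)
demo_size, demo_file = check_vocab(
    demo_vocab, demo_dir, sos="<s>", eos="</s>", unk="<unk>")
# demo_size == 5 and demo_file == demo_vocab: the special tokens already
# lead the file, so no rewritten copy is produced.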
Example #2
def _single_cell(num_units,
                 forget_bias,
                 dropout,
                 mode,
                 residual_connection=False,
                 residual_fn=None,
                 global_step=None,
                 fast_reverse=False,
                 seq_len=None):
  """Create an instance of a single RNN cell."""
  # dropout (= 1 - keep_prob) is set to 0 during eval and infer
  dropout = dropout if mode == contrib_learn.ModeKeys.TRAIN else 0.0

  # Cell Type
  utils.print_out("  LSTM, forget_bias=%g" % forget_bias, new_line=False)
  single_cell = contrib_rnn.BasicLSTMCell(num_units, forget_bias=forget_bias)

  # Dropout (= 1 - keep_prob)
  enabled = (
      mode == contrib_learn.ModeKeys.TRAIN) or dropout > 0.0 or fast_reverse
  single_cell = CellWrapper(
      cell=single_cell,
      input_keep_prob=(1.0 - dropout),
      global_step=global_step,
      seq_len=seq_len,
      enabled=enabled)

  # Residual
  if residual_connection:
    single_cell = contrib_rnn.ResidualWrapper(
        single_cell, residual_fn=residual_fn)
    utils.print_out("  %s" % type(single_cell).__name__, new_line=False)

  return single_cell
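# Sketch, assuming the module's contrib_learn/contrib_rnn imports and its
# CellWrapper: one 512-unit training cell with 20% input dropout and a
# residual connection.
demo_cell = _single_cell(
    num_units=512,
    forget_bias=1.0,
    dropout=0.2,
    mode=contrib_learn.ModeKeys.TRAIN,
    residual_connection=True)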
Example #3
def _cell_list(num_units,
               num_layers,
               num_residual_layers,
               forget_bias,
               dropout,
               mode,
               single_cell_fn=None,
               residual_fn=None,
               global_step=None,
               fast_reverse=False,
               seq_len=None):
  """Create a list of RNN cells."""
  if not single_cell_fn:
    single_cell_fn = _single_cell

  # Multi-GPU
  cell_list = []
  for i in range(num_layers):
    utils.print_out("  cell %d" % i, new_line=False)
    single_cell = single_cell_fn(
        num_units=num_units,
        forget_bias=forget_bias,
        dropout=dropout,
        mode=mode,
        residual_connection=(i >= num_layers - num_residual_layers),
        residual_fn=residual_fn,
        global_step=global_step,
        fast_reverse=fast_reverse,
        seq_len=seq_len)
    utils.print_out("")
    cell_list.append(single_cell)

  return cell_list
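# Sketch: a 4-layer stack whose top 3 layers get residual connections
# (layers with i >= num_layers - num_residual_layers), combined with the
# standard TF 1.x contrib_rnn.MultiRNNCell (assumed importable here).
demo_cells = _cell_list(
    num_units=512,
    num_layers=4,
    num_residual_layers=3,
    forget_bias=1.0,
    dropout=0.2,
    mode=contrib_learn.ModeKeys.TRAIN)
demo_stack = contrib_rnn.MultiRNNCell(demo_cells)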
Example #4
    def _get_infer_maximum_iterations(self, hparams, source_sequence_length):
        """Maximum decoding steps at inference time."""
        if hparams.tgt_max_len_infer:
            maximum_iterations = hparams.tgt_max_len_infer
            utils.print_out("  decoding maximum_iterations %d" %
                            maximum_iterations)
        else:
            decoding_length_factor = 2.0
            max_encoder_length = tf.reduce_max(source_sequence_length)
            maximum_iterations = tf.to_int32(
                tf.round(
                    tf.to_float(max_encoder_length) * decoding_length_factor))
        return maximum_iterations
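    # Worked example: with tgt_max_len_infer unset and a longest source
    # sentence of 25 tokens in the batch, decoding runs for at most
    # round(25 * 2.0) = 50 steps.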
Example #5
def get_metric(hparams, predictions, current_step):
    """Run inference and compute metric."""
    predicted_ids = []
    for prediction in predictions:
        predicted_ids.append(prediction["predictions"])

    if hparams.examples_to_infer < len(predicted_ids):
        predicted_ids = predicted_ids[0:hparams.examples_to_infer]
    translations = _convert_ids_to_strings(hparams.tgt_vocab_file,
                                           predicted_ids)

    trans_file = os.path.join(
        hparams.out_dir, "newstest2014_out_{}.tok.de".format(current_step))
    trans_dir = os.path.dirname(trans_file)
    if not tf.gfile.Exists(trans_dir):
        tf.gfile.MakeDirs(trans_dir)
    tf.logging.info("Writing to file %s" % trans_file)
    with codecs.getwriter("utf-8")(tf.gfile.GFile(trans_file,
                                                  mode="wb")) as trans_f:
        trans_f.write("")  # Write empty string to ensure file is created.
        for translation in translations:
            sentence = nmt_utils.get_translation(
                translation,
                tgt_eos=hparams.eos,
                subword_option=hparams.subword_option)
            trans_f.write((sentence + b"\n").decode("utf-8"))

    # Evaluation
    output_dir = os.path.join(hparams.out_dir, "eval")
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)

    ref_file = "%s.%s" % (hparams.test_prefix, hparams.tgt)

    if hparams.use_REDACTED:
        score = evaluation_utils.evaluate(ref_file, trans_file)
    else:
        score = get_sacrebleu(trans_file, hparams.detokenizer_file)
    tf_summary = tf.Summary(
        value=[tf.Summary.Value(tag="sacrebleu", simple_value=score)])
    summary_writer.add_summary(tf_summary, current_step)

    misc_utils.print_out("  %s: %.1f" % ("sacrebleu", score))

    summary_writer.close()
    return score
Example #6
    def build_graph(self, hparams):
        """Subclass must implement this method.

    Creates a sequence-to-sequence model with dynamic RNN decoder API.
    Args:
      hparams: Hyperparameter configurations.

    Returns:
      A tuple of the form (logits, predicted_ids) for infererence and
      (loss, None) for training.
      where:
        logits: float32 Tensor [batch_size x num_decoder_symbols]
        loss: float32 scalar
        predicted_ids: predicted ids from beam search.
    """
        utils.print_out("# Creating %s graph ..." % self.mode)

        # Projection
        with tf.variable_scope("build_network"):
            with tf.variable_scope("decoder/output_projection",
                                   reuse=tf.AUTO_REUSE):
                self.output_layer = tf.get_variable(
                    "output_projection", [self.num_units, self.tgt_vocab_size])

        with tf.variable_scope("dynamic_seq2seq",
                               dtype=self.dtype,
                               reuse=tf.AUTO_REUSE):
            if hparams.activation_dtype == "bfloat16":
                # The branch condition already holds, so install the bfloat16
                # variable getter directly.
                tf.get_variable_scope().set_custom_getter(
                    utils.bfloat16_var_getter)
                logits_or_loss, decoder_cell_outputs, predicted_ids = (
                    self._build_model(hparams))
                if decoder_cell_outputs is not None:
                    decoder_cell_outputs = tf.cast(decoder_cell_outputs,
                                                   tf.float32)
            else:
                logits_or_loss, decoder_cell_outputs, predicted_ids = (
                    self._build_model(hparams))

        return logits_or_loss, predicted_ids
Example #7
def load_embed_txt(embed_file):
  """Load embed_file into a python dictionary.

  Note: the embed_file should be a Glove/word2vec formatted txt file. Here is
  an example assuming embed_size=5:

  the -0.071549 0.093459 0.023738 -0.090339 0.056123
  to 0.57346 0.5417 -0.23477 -0.3624 0.4037
  and 0.20327 0.47348 0.050877 0.002103 0.060547

  For word2vec format, the first line will be: <num_words> <emb_size>.

  Args:
    embed_file: file path to the embedding file.
  Returns:
    a dictionary that maps each word to its vector, and the embedding dimension.
  """
  emb_dict = dict()
  emb_size = None

  is_first_line = True
  with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f:
    for line in f:
      tokens = line.rstrip().split(" ")
      if is_first_line:
        is_first_line = False
        if len(tokens) == 2:  # header line
          emb_size = int(tokens[1])
          continue
      word = tokens[0]
      vec = list(map(float, tokens[1:]))
      emb_dict[word] = vec
      if emb_size:
        if emb_size != len(vec):
          utils.print_out(
              "Ignoring %s since embeding size is inconsistent." % word)
          del emb_dict[word]
      else:
        emb_size = len(vec)
  return emb_dict, emb_size
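# Round-trip sketch, not part of the original module (hypothetical temp file;
# assumes the module-level codecs/os/tf imports): write a 2-word,
# 3-dimensional GloVe-style file and load it back.
import tempfile

demo_embed = os.path.join(tempfile.mkdtemp(), "embed.txt")
with codecs.getwriter("utf-8")(tf.gfile.GFile(demo_embed, "wb")) as f:
  f.write("the 0.1 0.2 0.3\n")
  f.write("to 0.4 0.5 0.6\n")
demo_dict, demo_size = load_embed_txt(demo_embed)
# demo_size == 3 and demo_dict["the"] == [0.1, 0.2, 0.3]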
Example #8
    def _get_learning_rate_warmup(self, hparams):
        """Get learning rate warmup."""
        warmup_steps = hparams.warmup_steps
        warmup_scheme = hparams.warmup_scheme
        utils.print_out(
            "  learning_rate=%g, warmup_steps=%d, warmup_scheme=%s" %
            (hparams.learning_rate, warmup_steps, warmup_scheme))

        # Apply inverse decay while the global step is less than warmup_steps.
        # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
        # When step < warmup_steps,
        #   learning_rate *= warmup_factor ** (warmup_steps - step)
        if warmup_scheme == "t2t":
            # 0.01^(1/warmup_steps): start with a learning rate 100x smaller.
            warmup_factor = tf.exp(tf.log(0.01) / warmup_steps)
            inv_decay = warmup_factor**(tf.to_float(warmup_steps -
                                                    self.global_step))
        else:
            raise ValueError("Unknown warmup scheme %s" % warmup_scheme)

        return tf.cond(self.global_step < hparams.warmup_steps,
                       lambda: inv_decay * self.learning_rate,
                       lambda: self.learning_rate,
                       name="learning_rate_warump_cond")
Example #9
def run_main(flags, default_hparams, estimator_fn):
  """Run main."""
  # Job
  jobid = flags.jobid
  utils.print_out("# Job id %d" % jobid)

  # Random
  random_seed = flags.random_seed
  if random_seed is not None and random_seed > 0:
    utils.print_out("# Set random seed to %d" % random_seed)
    random.seed(random_seed + jobid)
    np.random.seed(random_seed + jobid)
    tf.set_random_seed(random_seed)

  mlp_log.mlperf_print("cache_clear", True)
  mlp_log.mlperf_print("init_start", None)
  mlp_log.mlperf_print("submission_benchmark", "resnet")
  mlp_log.mlperf_print("submission_division", "closed")
  mlp_log.mlperf_print("submission_org", "google")
  mlp_log.mlperf_print("submission_platform", "tpu-v3-%d" % FLAGS.num_shards)
  mlp_log.mlperf_print("submission_status", "research")

  mlp_log.mlperf_print("global_batch_size", FLAGS.batch_size)
  mlp_log.mlperf_print("opt_learning_rate_alt_decay_func", "True")
  mlp_log.mlperf_print("opt_base_learning_rate", FLAGS.learning_rate)
  mlp_log.mlperf_print("opt_learning_rate_decay_interval", FLAGS.decay_interval)
  mlp_log.mlperf_print("opt_learning_rate_decay_factor", FLAGS.decay_factor)
  mlp_log.mlperf_print("opt_learning_rate_decay_steps", FLAGS.decay_steps)
  mlp_log.mlperf_print("opt_learning_rate_remain_steps", FLAGS.decay_start)
  mlp_log.mlperf_print("opt_learning_rate_alt_warmup_func", FLAGS.warmup_scheme)
  mlp_log.mlperf_print("opt_learning_rate_warmup_steps", FLAGS.warmup_steps)
  mlp_log.mlperf_print(
      "max_sequence_length", FLAGS.src_max_len, metadata={"method": "discard"})
  mlp_log.mlperf_print("train_samples", FLAGS.num_examples_per_epoch)
  mlp_log.mlperf_print("eval_samples", FLAGS.examples_to_infer)

  # Model output directory
  out_dir = flags.out_dir
  if out_dir and not tf.gfile.Exists(out_dir):
    utils.print_out("# Creating output directory %s ..." % out_dir)
    tf.gfile.MakeDirs(out_dir)

  # Load hparams.
  hparams = create_or_load_hparams(default_hparams, flags.hparams_path)

  # Train or Evaluation
  return estimator_fn(hparams)
Example #10
    def _set_train_or_infer(self, res):
        """Set up training."""
        if self.mode == contrib_learn.ModeKeys.INFER:
            self.predicted_ids = res[1]

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == contrib_learn.ModeKeys.TRAIN:
            loss = res[0]
            # Gradients
            self.gradients = [
                tf.convert_to_tensor(g) for g in tf.gradients(loss, params)
            ]

        # Print trainable variables
        utils.print_out("# Trainable variables")
        utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
Example #11
def _create_pretrained_emb_from_txt(
    vocab_file, embed_file, num_trainable_tokens=3, dtype=tf.float32,
    scope=None):
  """Load pretrain embeding from embed_file, and return an embedding matrix.

  Args:
    vocab_file: Path to vocab file.
    embed_file: Path to a Glove formatted embedding txt file.
    num_trainable_tokens: Make the first n tokens in the vocab file trainable
      variables. Default is 3, which covers "<unk>", "<s>" and "</s>".
    dtype: data type.
    scope: tf scope name.

  Returns:
    pretrained embedding table variable.
  """
  vocab, _ = vocab_utils.load_vocab(vocab_file)
  trainable_tokens = vocab[:num_trainable_tokens]

  utils.print_out("# Using pretrained embedding: %s." % embed_file)
  utils.print_out("  with trainable tokens: ")

  emb_dict, emb_size = vocab_utils.load_embed_txt(embed_file)
  for token in trainable_tokens:
    utils.print_out("    %s" % token)
    if token not in emb_dict:
      emb_dict[token] = [0.0] * emb_size

  emb_mat = np.array(
      [emb_dict[token] for token in vocab], dtype=dtype.as_numpy_dtype(0))
  emb_mat = tf.constant(emb_mat)
  emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1])
  with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype) as scope:
    emb_mat_var = tf.get_variable(
        "emb_mat_var", [num_trainable_tokens, emb_size])
  return tf.concat([emb_mat_var, emb_mat_const], 0)
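# Usage sketch with hypothetical paths (not from the original module): build
# an embedding table whose first 3 rows ("<unk>", "<s>", "</s>") remain
# trainable while the pretrained rows stay constant.
demo_emb = _create_pretrained_emb_from_txt(
    vocab_file="/tmp/vocab.de",
    embed_file="/tmp/glove.de.txt")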
Example #12
def create_emb_for_encoder_and_decoder(src_vocab_size,
                                       tgt_vocab_size,
                                       src_embed_size,
                                       tgt_embed_size,
                                       dtype=tf.float32,
                                       num_enc_partitions=0,
                                       num_dec_partitions=0,
                                       src_vocab_file=None,
                                       tgt_vocab_file=None,
                                       src_embed_file=None,
                                       tgt_embed_file=None,
                                       scope=None):
  """Create embedding matrix for both encoder and decoder.

  Args:
    src_vocab_size: An integer. The source vocab size.
    tgt_vocab_size: An integer. The target vocab size.
    src_embed_size: An integer. The embedding dimension for the encoder's
      embedding.
    tgt_embed_size: An integer. The embedding dimension for the decoder's
      embedding.
    dtype: dtype of the embedding matrix. Default to float32.
    num_enc_partitions: number of partitions used for the encoder's embedding
      vars.
    num_dec_partitions: number of partitions used for the decoder's embedding
      vars.
    src_vocab_file: A string. The source vocabulary file.
    tgt_vocab_file: A string. The target vocabulary file.
    src_embed_file: A string. The source embedding file.
    tgt_embed_file: A string. The target embedding file.
    scope: VariableScope for the created subgraph. Default to "embeddings".

  Returns:
    embedding_encoder: Encoder's embedding matrix.
    embedding_decoder: Decoder's embedding matrix.

  Raises:
    ValueError: if source and target have different vocab sizes.
  """
  if num_enc_partitions <= 1:
    enc_partitioner = None
  else:
    # Note: num_partitions > 1 is required for distributed training because
    # embedding_lookup tries to colocate a single-partition embedding variable
    # with its lookup ops, which may cause embedding variables to be placed on
    # worker jobs.
    enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions)

  if num_dec_partitions <= 1:
    dec_partitioner = None
  else:
    # Same rationale as for the encoder partitioner above.
    dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions)

  if src_embed_file and enc_partitioner:
    raise ValueError(
        "Can't set num_enc_partitions > 1 when using pretrained encoder "
        "embedding")

  if tgt_embed_file and dec_partitioner:
    raise ValueError(
        "Can't set num_dec_partitions > 1 when using pretrained decdoer "
        "embedding")

  with tf.variable_scope(
      scope or "embeddings", dtype=dtype, partitioner=enc_partitioner) as scope:
    if src_vocab_size != tgt_vocab_size:
      raise ValueError("Share embedding but different src/tgt vocab sizes"
                       " %d vs. %d" % (src_vocab_size, tgt_vocab_size))
    assert src_embed_size == tgt_embed_size
    utils.print_out("# Use the same embedding for source and target")
    vocab_file = src_vocab_file or tgt_vocab_file
    embed_file = src_embed_file or tgt_embed_file

    embedding_encoder = _create_or_load_embed(
        "embedding_share", vocab_file, embed_file,
        src_vocab_size, src_embed_size, dtype)
    embedding_decoder = embedding_encoder

  return embedding_encoder, embedding_decoder
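# Usage sketch with a hypothetical shared vocab path: this helper requires
# equal source/target vocab and embedding sizes, and here uses no
# partitioning (a single embedding variable).
demo_enc_emb, demo_dec_emb = create_emb_for_encoder_and_decoder(
    src_vocab_size=32000,
    tgt_vocab_size=32000,
    src_embed_size=1024,
    tgt_embed_size=1024,
    src_vocab_file="/tmp/vocab.share")
# demo_dec_emb is demo_enc_emb: the decoder reuses the encoder table.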
Example #13
def extend_hparams(hparams):
  """Add new arguments to hparams."""
  # Sanity checks
  if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
    raise ValueError("subword option must be either spm, or bpe")
  if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
    raise ValueError("beam_width must greater than 0 when using beam_search"
                     "decoder.")

  # Different number of encoder / decoder layers
  assert hparams.num_encoder_layers == hparams.num_decoder_layers

  # The first unidirectional layer (after the bi-directional layer) in
  # the GNMT encoder can't have a residual connection because its input is
  # the concatenation of the fw_cell and bw_cell outputs.
  num_encoder_residual_layers = hparams.num_encoder_layers - 2
  num_decoder_residual_layers = num_encoder_residual_layers
  _add_argument(hparams, "num_encoder_residual_layers",
                num_encoder_residual_layers)
  _add_argument(hparams, "num_decoder_residual_layers",
                num_decoder_residual_layers)

  ## Vocab
  # Get vocab file names first
  if hparams.vocab_prefix:
    src_vocab_file = six.ensure_str(
        hparams.vocab_prefix) + "." + six.ensure_str(hparams.src)
    tgt_vocab_file = six.ensure_str(
        hparams.vocab_prefix) + "." + six.ensure_str(hparams.tgt)
  else:
    raise ValueError("hparams.vocab_prefix must be provided.")

  # Source vocab
  src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
      src_vocab_file,
      hparams.out_dir,
      check_special_token=hparams.check_special_token,
      sos=hparams.sos,
      eos=hparams.eos,
      unk=vocab_utils.UNK)

  # Target vocab
  utils.print_out("  using source vocab for target")
  tgt_vocab_file = src_vocab_file
  tgt_vocab_size = src_vocab_size
  _add_argument(hparams, "src_vocab_size", src_vocab_size)
  _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
  _add_argument(hparams, "src_vocab_file", src_vocab_file)
  _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

  # Num embedding partitions
  _add_argument(
      hparams, "num_enc_emb_partitions", hparams.num_embeddings_partitions)
  _add_argument(
      hparams, "num_dec_emb_partitions", hparams.num_embeddings_partitions)

  # Pretrained Embeddings
  _add_argument(hparams, "src_embed_file", "")
  _add_argument(hparams, "tgt_embed_file", "")

  return hparams