Example #1
def _cell_list(num_units,
               num_layers,
               num_residual_layers,
               forget_bias,
               dropout,
               mode,
               single_cell_fn=None,
               residual_fn=None,
               global_step=None,
               fast_reverse=False,
               seq_len=None):
  """Create a list of RNN cells."""
  if not single_cell_fn:
    single_cell_fn = _single_cell

  # Multi-GPU
  cell_list = []
  for i in range(num_layers):
    utils.print_out("  cell %d" % i, new_line=False)
    single_cell = single_cell_fn(
        num_units=num_units,
        forget_bias=forget_bias,
        dropout=dropout,
        mode=mode,
        residual_connection=(i >= num_layers - num_residual_layers),
        residual_fn=residual_fn,
        global_step=global_step,
        fast_reverse=fast_reverse,
        seq_len=seq_len)
    utils.print_out("")
    cell_list.append(single_cell)

  return cell_list
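A hedged usage sketch: the returned list is typically stacked into a single
multi-layer cell. The hyperparameter values below are illustrative, not taken
from the snippet above.

# Illustrative only; assumes the same contrib_rnn / contrib_learn environment
# as _cell_list above.
cells = _cell_list(
    num_units=1024,
    num_layers=4,
    num_residual_layers=2,
    forget_bias=1.0,
    dropout=0.2,
    mode=contrib_learn.ModeKeys.TRAIN)
encoder_cell = contrib_rnn.MultiRNNCell(cells)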
Example #2
def check_vocab(vocab_file,
                out_dir,
                check_special_token=True,
                sos=None,
                eos=None,
                unk=None):
    """Check if vocab_file doesn't exist, create from corpus_file."""
    if tf.gfile.Exists(vocab_file):
        utils.print_out("# Vocab file %s exists" % vocab_file)
        vocab, vocab_size = load_vocab(vocab_file)
        if check_special_token:
            # Verify if the vocab starts with unk, sos, eos
            # If not, prepend those tokens & generate a new vocab file
            if not unk: unk = UNK
            if not sos: sos = SOS
            if not eos: eos = EOS
            assert len(vocab) >= 3
            if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos:
                utils.print_out("The first 3 vocab words [%s, %s, %s]"
                                " are not [%s, %s, %s]" %
                                (vocab[0], vocab[1], vocab[2], unk, sos, eos))
                vocab = [unk, sos, eos] + vocab
                vocab_size += 3
                new_vocab_file = os.path.join(out_dir,
                                              os.path.basename(vocab_file))
                with codecs.getwriter("utf-8")(tf.gfile.GFile(
                        new_vocab_file, "wb")) as f:
                    for word in vocab:
                        f.write("%s\n" % word)
                vocab_file = new_vocab_file
    else:
        raise ValueError("vocab_file '%s' does not exist." % vocab_file)

    vocab_size = len(vocab)
    return vocab_size, vocab_file
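A hedged usage sketch (paths are hypothetical): if the first three entries of
the file are not the special tokens, a patched copy is written under out_dir
and its path is returned.

src_vocab_size, src_vocab_file = check_vocab(
    "/tmp/vocab.bpe.32000.en",  # hypothetical path
    out_dir="/tmp/out",
    sos="<s>", eos="</s>", unk="<unk>")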
Example #3
def _single_cell(num_units,
                 forget_bias,
                 dropout,
                 mode,
                 residual_connection=False,
                 residual_fn=None,
                 global_step=None,
                 fast_reverse=False,
                 seq_len=None):
  """Create an instance of a single RNN cell."""
  # dropout (= 1 - keep_prob) is set to 0 during eval and infer
  dropout = dropout if mode == contrib_learn.ModeKeys.TRAIN else 0.0

  # Cell Type
  utils.print_out("  LSTM, forget_bias=%g" % forget_bias, new_line=False)
  single_cell = contrib_rnn.BasicLSTMCell(num_units, forget_bias=forget_bias)

  # Dropout (= 1 - keep_prob)
  enabled = (
      mode == contrib_learn.ModeKeys.TRAIN) or dropout > 0.0 or fast_reverse
  single_cell = CellWrapper(
      cell=single_cell,
      input_keep_prob=(1.0 - dropout),
      global_step=global_step,
      seq_len=seq_len,
      enabled=enabled)

  # Residual
  if residual_connection:
    single_cell = contrib_rnn.ResidualWrapper(
        single_cell, residual_fn=residual_fn)
    utils.print_out("  %s" % type(single_cell).__name__, new_line=False)

  return single_cell
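A hedged note on the mode switch: for EVAL/INFER the dropout argument is
overridden to 0.0, so the wrapper keeps every activation. Values below are
illustrative.

infer_cell = _single_cell(
    num_units=1024, forget_bias=1.0, dropout=0.2,
    mode=contrib_learn.ModeKeys.INFER)  # dropout is forced to 0.0 here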
Example #4
  def _get_infer_maximum_iterations(self, hparams, source_sequence_length):
    """Maximum decoding steps at inference time."""
    if hparams.tgt_max_len_infer:
      maximum_iterations = hparams.tgt_max_len_infer
      utils.print_out("  decoding maximum_iterations %d" % maximum_iterations)
    else:
      # Cap decoding at 2x the longest source sentence in the batch, e.g. a
      # 25-token source allows round(25 * 2.0) = 50 decoder steps.
      decoding_length_factor = 2.0
      max_encoder_length = tf.reduce_max(source_sequence_length)
      maximum_iterations = tf.to_int32(
          tf.round(tf.to_float(max_encoder_length) * decoding_length_factor))
    return maximum_iterations
Example #5
def get_metric(hparams, predictions, current_step):
    """Run inference and compute metric."""
    predicted_ids = []
    for prediction in predictions:
        predicted_ids.append(prediction["predictions"])

    if hparams.examples_to_infer < len(predicted_ids):
        predicted_ids = predicted_ids[0:hparams.examples_to_infer]
    translations = _convert_ids_to_strings(hparams.tgt_vocab_file,
                                           predicted_ids)

    trans_file = os.path.join(
        hparams.out_dir, "newstest2014_out_{}.tok.de".format(current_step))
    trans_dir = os.path.dirname(trans_file)
    if not tf.gfile.Exists(trans_dir):
        tf.gfile.MakeDirs(trans_dir)
    tf.logging.info("Writing to file %s" % trans_file)
    with codecs.getwriter("utf-8")(tf.gfile.GFile(trans_file,
                                                  mode="wb")) as trans_f:
        trans_f.write("")  # Write empty string to ensure file is created.
        for translation in translations:
            sentence = nmt_utils.get_translation(
                translation,
                tgt_eos=hparams.eos,
                subword_option=hparams.subword_option)
            trans_f.write((sentence + b"\n").decode("utf-8"))

    # Evaluation
    output_dir = os.path.join(hparams.out_dir, "eval")
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)

    ref_file = "%s.%s" % (hparams.test_prefix, hparams.tgt)

    if hparams.use_REDACTED:
        score = evaluation_utils.evaluate(ref_file, trans_file)
    else:
        score = get_sacrebleu(trans_file, hparams.detokenizer_file)
    # tf.Summary is a plain proto, so no separate graph context is needed.
    summaries = [tf.Summary.Value(tag="sacrebleu", simple_value=score)]
    tf_summary = tf.Summary(value=summaries)
    summary_writer.add_summary(tf_summary, current_step)

    misc_utils.print_out("  %s: %.1f" % ("sacrebleu", score))

    summary_writer.close()
    return score
Example #6
def load_embed_txt(embed_file):
    """Load embed_file into a python dictionary.

  Note: the embed_file should be a Glove/word2vec formatted txt file. Here is
  an example assuming embed_size=5:

  the -0.071549 0.093459 0.023738 -0.090339 0.056123
  to 0.57346 0.5417 -0.23477 -0.3624 0.4037
  and 0.20327 0.47348 0.050877 0.002103 0.060547

  For word2vec format, the first line will be: <num_words> <emb_size>.

  Args:
    embed_file: file path to the embedding file.
  Returns:
    a dictionary mapping each word to its vector, and the embedding dimension.
  """
    emb_dict = dict()
    emb_size = None

    is_first_line = True
    with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f:
        for line in f:
            tokens = line.rstrip().split(" ")
            if is_first_line:
                is_first_line = False
                if len(tokens) == 2:  # header line
                    emb_size = int(tokens[1])
                    continue
            word = tokens[0]
            vec = list(map(float, tokens[1:]))
            emb_dict[word] = vec
            if emb_size:
                if emb_size != len(vec):
                    utils.print_out(
                        "Ignoring %s since embeding size is inconsistent." %
                        word)
                    del emb_dict[word]
            else:
                emb_size = len(vec)
    return emb_dict, emb_size
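A hedged round-trip sketch (path and vectors are illustrative): write a tiny
GloVe-style file with the helpers used throughout these examples, then load it
back.

with codecs.getwriter("utf-8")(tf.gfile.GFile("/tmp/tiny.vec", "wb")) as f:
    f.write("the -0.07 0.09 0.02\n")
    f.write("to 0.57 0.54 -0.23\n")
emb_dict, emb_size = load_embed_txt("/tmp/tiny.vec")
assert emb_size == 3 and sorted(emb_dict) == ["the", "to"]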
Example #7
  def build_graph(self, hparams):
    """Subclass must implement this method.

    Creates a sequence-to-sequence model with dynamic RNN decoder API.
    Args:
      hparams: Hyperparameter configurations.

    Returns:
      A tuple of the form (logits, predicted_ids) for inference and
      (loss, None) for training, where:
        logits: float32 Tensor [batch_size x num_decoder_symbols].
        loss: float32 scalar.
        predicted_ids: predicted ids from beam search.
    """
    utils.print_out("# Creating %s graph ..." % self.mode)

    # Projection
    with tf.variable_scope("build_network"):
      with tf.variable_scope("decoder/output_projection", reuse=tf.AUTO_REUSE):
        self.output_layer = tf.get_variable(
            "output_projection", [self.num_units, self.tgt_vocab_size])

    with tf.variable_scope(
        "dynamic_seq2seq", dtype=self.dtype, reuse=tf.AUTO_REUSE):
      if hparams.activation_dtype == "bfloat16":
        tf.get_variable_scope().set_custom_getter(utils.bfloat16_var_getter)
        logits_or_loss, decoder_cell_outputs, predicted_ids = self._build_model(
            hparams)
        if decoder_cell_outputs is not None:
          decoder_cell_outputs = tf.cast(decoder_cell_outputs, tf.float32)
      else:
        logits_or_loss, decoder_cell_outputs, predicted_ids = self._build_model(
            hparams)

    return logits_or_loss, predicted_ids
Example #8
  def _get_learning_rate_warmup(self, hparams):
    """Get learning rate warmup."""
    warmup_steps = hparams.warmup_steps
    warmup_scheme = hparams.warmup_scheme
    utils.print_out("  learning_rate=%g, warmup_steps=%d, warmup_scheme=%s" %
                    (hparams.learning_rate, warmup_steps, warmup_scheme))

    # Apply inverse decay if global steps less than warmup steps.
    # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
    # When step < warmup_steps,
    #   learning_rate *= warmup_factor ** (warmup_steps - step)
    if warmup_scheme == "t2t":
      # 0.01^(1/warmup_steps): we start with an lr 100 times smaller
      warmup_factor = tf.exp(tf.log(0.01) / warmup_steps)
      inv_decay = warmup_factor**(tf.to_float(warmup_steps - self.global_step))
    else:
      raise ValueError("Unknown warmup scheme %s" % warmup_scheme)

    return tf.cond(
        self.global_step < hparams.warmup_steps,
        lambda: inv_decay * self.learning_rate,
        lambda: self.learning_rate,
        name="learning_rate_warump_cond")
Example #9
def run_main(flags, default_hparams, estimator_fn):
    """Run main."""
    # Job
    jobid = flags.jobid
    utils.print_out("# Job id %d" % jobid)

    # Random
    random_seed = flags.random_seed
    if random_seed is not None and random_seed > 0:
        utils.print_out("# Set random seed to %d" % random_seed)
        random.seed(random_seed + jobid)
        np.random.seed(random_seed + jobid)
        tf.set_random_seed(random_seed)

    mlp_log.mlperf_print("cache_clear", True)
    mlp_log.mlperf_print("init_start", None)
    mlp_log.mlperf_print("submission_benchmark", "resnet")
    mlp_log.mlperf_print("submission_division", "closed")
    mlp_log.mlperf_print("submission_org", "google")
    mlp_log.mlperf_print("submission_platform", "tpu-v3-%d" % FLAGS.num_shards)
    mlp_log.mlperf_print("submission_status", "research")

    mlp_log.mlperf_print("global_batch_size", FLAGS.batch_size)
    mlp_log.mlperf_print("opt_learning_rate_alt_decay_func", "True")
    mlp_log.mlperf_print("opt_base_learning_rate", FLAGS.learning_rate)
    mlp_log.mlperf_print("opt_learning_rate_decay_interval",
                         FLAGS.decay_interval)
    mlp_log.mlperf_print("opt_learning_rate_decay_factor", FLAGS.decay_factor)
    mlp_log.mlperf_print("opt_learning_rate_decay_steps", FLAGS.decay_steps)
    mlp_log.mlperf_print("opt_learning_rate_remain_steps", FLAGS.decay_start)
    mlp_log.mlperf_print("opt_learning_rate_alt_warmup_func",
                         FLAGS.warmup_scheme)
    mlp_log.mlperf_print("opt_learning_rate_warmup_steps", FLAGS.warmup_steps)
    mlp_log.mlperf_print("max_sequence_length",
                         FLAGS.src_max_len,
                         metadata={"method": "discard"})
    mlp_log.mlperf_print("train_samples", FLAGS.num_examples_per_epoch)
    mlp_log.mlperf_print("eval_samples", FLAGS.examples_to_infer)

    # Model output directory
    out_dir = flags.out_dir
    if out_dir and not tf.gfile.Exists(out_dir):
        utils.print_out("# Creating output directory %s ..." % out_dir)
        tf.gfile.MakeDirs(out_dir)

    # Load hparams.
    hparams = create_or_load_hparams(default_hparams, flags.hparams_path)

    # Train or Evaluation
    return estimator_fn(hparams)
Example #10
  def _set_train_or_infer(self, res):
    """Set up training."""
    if self.mode == contrib_learn.ModeKeys.INFER:
      self.predicted_ids = res[1]

    params = tf.trainable_variables()

    # Gradients and SGD update operation for training the model.
    # Arrange for the embedding vars to appear at the beginning.
    if self.mode == contrib_learn.ModeKeys.TRAIN:
      loss = res[0]
      # Gradients
      self.gradients = [
          tf.convert_to_tensor(g) for g in tf.gradients(loss, params)
      ]

    # Print trainable variables
    utils.print_out("# Trainable variables")
    utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
    for param in params:
      utils.print_out("  %s, %s, %s" % (param.name, str(param.get_shape()),
                                        param.op.device))
Example #11
def _create_pretrained_emb_from_txt(
    vocab_file, embed_file, num_trainable_tokens=3, dtype=tf.float32,
    scope=None):
  """Load pretrain embeding from embed_file, and return an embedding matrix.

  Args:
    vocab_file: Path to vocab file.
    embed_file: Path to a Glove formatted embedding txt file.
    num_trainable_tokens: Make the first n tokens in the vocab file trainable
      variables. Default is 3, which covers "<unk>", "<s>" and "</s>".
    dtype: data type.
    scope: tf scope name.

  Returns:
    pretrained embedding table variable.
  """
  vocab, _ = vocab_utils.load_vocab(vocab_file)
  trainable_tokens = vocab[:num_trainable_tokens]

  utils.print_out("# Using pretrained embedding: %s." % embed_file)
  utils.print_out("  with trainable tokens: ")

  emb_dict, emb_size = vocab_utils.load_embed_txt(embed_file)
  for token in trainable_tokens:
    utils.print_out("    %s" % token)
    if token not in emb_dict:
      emb_dict[token] = [0.0] * emb_size

  emb_mat = np.array(
      [emb_dict[token] for token in vocab], dtype=dtype.as_numpy_dtype(0))
  emb_mat = tf.constant(emb_mat)
  emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1])
  with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype) as scope:
    emb_mat_var = tf.get_variable(
        "emb_mat_var", [num_trainable_tokens, emb_size])
  return tf.concat([emb_mat_var, emb_mat_const], 0)
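A numpy-only sketch of the slice/concat split above (toy shapes, no TF): only
the first num_trainable_tokens rows become a variable; the rest stay constant.

import numpy as np

emb_mat = np.arange(20.0).reshape(5, 4)  # 5 vocab words, emb_size=4 (toy)
num_trainable_tokens = 3
trainable_rows = emb_mat[:num_trainable_tokens]  # rows 0-2 -> emb_mat_var
constant_rows = emb_mat[num_trainable_tokens:]   # rows 3-4 -> emb_mat_const
# tf.concat([emb_mat_var, emb_mat_const], 0) restores the original row order.
assert np.array_equal(
    np.concatenate([trainable_rows, constant_rows], 0), emb_mat)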
Example #12
def create_emb_for_encoder_and_decoder(src_vocab_size,
                                       tgt_vocab_size,
                                       src_embed_size,
                                       tgt_embed_size,
                                       dtype=tf.float32,
                                       num_enc_partitions=0,
                                       num_dec_partitions=0,
                                       src_vocab_file=None,
                                       tgt_vocab_file=None,
                                       src_embed_file=None,
                                       tgt_embed_file=None,
                                       scope=None):
  """Create embedding matrix for both encoder and decoder.

  Args:
    src_vocab_size: An integer. The source vocab size.
    tgt_vocab_size: An integer. The target vocab size.
    src_embed_size: An integer. The embedding dimension for the encoder's
      embedding.
    tgt_embed_size: An integer. The embedding dimension for the decoder's
      embedding.
    dtype: dtype of the embedding matrix. Default to float32.
    num_enc_partitions: number of partitions used for the encoder's embedding
      vars.
    num_dec_partitions: number of partitions used for the decoder's embedding
      vars.
    src_vocab_file: A string. The source vocabulary file.
    tgt_vocab_file: A string. The target vocabulary file.
    src_embed_file: A string. The source embedding file.
    tgt_embed_file: A string. The target embedding file.
    scope: VariableScope for the created subgraph. Default to "embedding".

  Returns:
    embedding_encoder: Encoder's embedding matrix.
    embedding_decoder: Decoder's embedding matrix.

  Raises:
    ValueError: if source and target have different vocab size.
  """
  if num_enc_partitions <= 1:
    enc_partitioner = None
  else:
    # Note: num_partitions > 1 is required for distributed training because
    # embedding_lookup tries to colocate a single-partition embedding variable
    # with the lookup ops, which may cause embedding variables to be placed on
    # worker jobs.
    enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions)

  if num_dec_partitions <= 1:
    dec_partitioner = None
  else:
    # Note: num_partitions > 1 is required for distributed training because
    # embedding_lookup tries to colocate a single-partition embedding variable
    # with the lookup ops, which may cause embedding variables to be placed on
    # worker jobs.
    dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions)

  if src_embed_file and enc_partitioner:
    raise ValueError(
        "Can't set num_enc_partitions > 1 when using pretrained encoder "
        "embedding")

  if tgt_embed_file and dec_partitioner:
    raise ValueError(
        "Can't set num_dec_partitions > 1 when using pretrained decdoer "
        "embedding")

  with tf.variable_scope(
      scope or "embeddings", dtype=dtype, partitioner=enc_partitioner) as scope:
    if src_vocab_size != tgt_vocab_size:
      raise ValueError("Share embedding but different src/tgt vocab sizes"
                       " %d vs. %d" % (src_vocab_size, tgt_vocab_size))
    assert src_embed_size == tgt_embed_size
    utils.print_out("# Use the same embedding for source and target")
    vocab_file = src_vocab_file or tgt_vocab_file
    embed_file = src_embed_file or tgt_embed_file

    embedding_encoder = _create_or_load_embed(
        "embedding_share", vocab_file, embed_file,
        src_vocab_size, src_embed_size, dtype)
    embedding_decoder = embedding_encoder

  return embedding_encoder, embedding_decoder
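A hedged usage sketch: this variant always shares one embedding table, so the
source and target sizes must match (sizes and path below are illustrative).

embedding_encoder, embedding_decoder = create_emb_for_encoder_and_decoder(
    src_vocab_size=32000,
    tgt_vocab_size=32000,  # must equal src_vocab_size in this variant
    src_embed_size=1024,
    tgt_embed_size=1024,   # must equal src_embed_size in this variant
    src_vocab_file="/tmp/vocab.bpe.32000.en")  # hypothetical path
assert embedding_decoder is embedding_encoder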
Example #13
def extend_hparams(hparams):
    """Add new arguments to hparams."""
    # Sanity checks
    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")
    if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
        raise ValueError(
            "beam_width must be greater than 0 when using the beam_search "
            "decoder.")

    # Different number of encoder / decoder layers
    assert hparams.num_encoder_layers == hparams.num_decoder_layers

    # The first unidirectional layer (after the bi-directional layer) in
    # the GNMT encoder can't have a residual connection because its input is
    # the concatenation of the fw_cell and bw_cell outputs.
    num_encoder_residual_layers = hparams.num_encoder_layers - 2
    num_decoder_residual_layers = num_encoder_residual_layers
    _add_argument(hparams, "num_encoder_residual_layers",
                  num_encoder_residual_layers)
    _add_argument(hparams, "num_decoder_residual_layers",
                  num_decoder_residual_layers)

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = six.ensure_str(
            hparams.vocab_prefix) + "." + six.ensure_str(hparams.src)
        tgt_vocab_file = six.ensure_str(
            hparams.vocab_prefix) + "." + six.ensure_str(hparams.tgt)
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    utils.print_out("  using source vocab for target")
    tgt_vocab_file = src_vocab_file
    tgt_vocab_size = src_vocab_size
    _add_argument(hparams, "src_vocab_size", src_vocab_size)
    _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
    _add_argument(hparams, "src_vocab_file", src_vocab_file)
    _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

    # Num embedding partitions
    _add_argument(hparams, "num_enc_emb_partitions",
                  hparams.num_embeddings_partitions)
    _add_argument(hparams, "num_dec_emb_partitions",
                  hparams.num_embeddings_partitions)

    # Pretrained Embeddings
    _add_argument(hparams, "src_embed_file", "")
    _add_argument(hparams, "tgt_embed_file", "")

    return hparams
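_add_argument is not shown in these examples. A plausible sketch, assuming the
usual guard-then-set pattern (add_hparam is the tf.contrib.training.HParams
API; the update flag is an assumption):

def _add_argument(hparams, key, value, update=True):
    """Set hparams.key to value, adding the hparam if it does not exist."""
    if hasattr(hparams, key):
        if update:
            setattr(hparams, key, value)
    else:
        hparams.add_hparam(key, value)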